In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Decision Tree

In [2]:
from sklearn.tree import DecisionTreeClassifier

In [3]:
X_train = pd.read_csv('Datasets/CV0_train_subset_0.csv', index_col=0)
y_train= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_train.index]["damage_grade"]
X_test = pd.read_csv('Datasets/CV0_test_subset_0.csv', index_col=0)
y_test= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_test.index]["damage_grade"]

Since the training of Decision Trees does not take much time to compute, I will optimize the mean score of the 10 subsets.

In [4]:
spaceTree  = [Integer(100, 1000, name="min_samples_split"),
             Integer(1, 50, name="min_samples_leaf")]

In [5]:
@use_named_args(spaceTree)
def objectiveTree(**params):
    tree.set_params(**params)
    generalization = pd.Series(index=range(10))
    for i in generalization.index:
        X_train = pd.read_csv('Datasets/CV0_train_subset_{}.csv'.format(i), index_col=0)
        y_train= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_train.index]["damage_grade"]
        X_test = pd.read_csv('Datasets/CV0_test_subset_{}.csv'.format(i), index_col=0)
        y_test= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_test.index]["damage_grade"]
        tree.fit(X_train,y_train)
        predict = tree.predict(X_test)
        generalization[i] = f1_score(y_test, predict, average='micro')
    e = generalization.mean()
    print(e, [(a, params[a]) for a in params])
    return 1 - e

In [6]:
tree = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=None, min_samples_leaf=None)
res_tree = gp_minimize(objectiveTree, spaceTree, n_calls=50, random_state=0)
1-res_tree.fun, res_tree.x[0], res_tree.x[1]

0.7168199735239507 [('min_samples_split', 634), ('min_samples_leaf', 42)]
0.7160064678527557 [('min_samples_split', 872), ('min_samples_leaf', 43)]
0.7166012525133267 [('min_samples_split', 661), ('min_samples_leaf', 20)]
0.7174569487379261 [('min_samples_split', 368), ('min_samples_leaf', 4)]
0.7178636979660713 [('min_samples_split', 345), ('min_samples_leaf', 24)]
0.7161446103057756 [('min_samples_split', 831), ('min_samples_leaf', 25)]
0.717230555486106 [('min_samples_split', 454), ('min_samples_leaf', 42)]
0.7173073017514285 [('min_samples_split', 404), ('min_samples_leaf', 33)]
0.7170079909927364 [('min_samples_split', 431), ('min_samples_leaf', 48)]
0.7180440439298754 [('min_samples_split', 226), ('min_samples_leaf', 44)]
0.7118084700211732 [('min_samples_split', 100), ('min_samples_leaf', 1)]
0.7153004086021333 [('min_samples_split', 985), ('min_samples_leaf', 1)]
0.715511456782589 [('min_samples_split', 1000), ('min_samples_leaf', 50)]
0.7173725408328997 [('min_samples_split', 

(0.7180440439298754, 226, 44)

# Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In order to be clearer, I have previously narrowed the space search. Here is displayed the final step. Random forest training takes a lot of time to compute, so I will optimize only subset0 and then see if it generalizes well.

In [8]:
spaceForest  = [Integer(2, 30, name="min_samples_split"),
                Integer(1, 5, name="min_samples_leaf"),
                Integer(700, 2000, name="n_estimators")]

In [9]:
@use_named_args(spaceForest)
def objectiveForest(**params):
    forest.set_params(**params)
    forest.fit(X_train,y_train)
    predict = forest.predict(X_test)
    e = f1_score(y_test, predict, average='micro')
    print(e, [(a, params[a]) for a in params])
    return 1 - e

In [10]:
X_train = pd.read_csv('Datasets/CV0_train_subset_0.csv', index_col=0)
y_train= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_train.index]["damage_grade"]
X_test = pd.read_csv('Datasets/CV0_test_subset_0.csv', index_col=0)
y_test= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_test.index]["damage_grade"]
forest = RandomForestClassifier(min_samples_split=225, min_samples_leaf=45, n_jobs=-1)
res_forest = gp_minimize(objectiveForest, spaceForest, n_calls=25, random_state=0)
1-res_forest.fun, res_forest.x[0], res_forest.x[1], res_forest.x[2]

0.735111281657713 [('min_samples_split', 19), ('min_samples_leaf', 4), ('n_estimators', 1815)]
0.7344205679201842 [('min_samples_split', 26), ('min_samples_leaf', 3), ('n_estimators', 1200)]
0.7338066001534919 [('min_samples_split', 10), ('min_samples_leaf', 1), ('n_estimators', 1054)]
0.7348426707597853 [('min_samples_split', 15), ('min_samples_leaf', 4), ('n_estimators', 1324)]
0.7351496546431312 [('min_samples_split', 13), ('min_samples_leaf', 4), ('n_estimators', 1139)]
0.736415963161934 [('min_samples_split', 20), ('min_samples_leaf', 2), ('n_estimators', 1944)]
0.7361089792785878 [('min_samples_split', 6), ('min_samples_leaf', 4), ('n_estimators', 1316)]
0.7352264006139678 [('min_samples_split', 24), ('min_samples_leaf', 3), ('n_estimators', 1583)]
0.7354950115118957 [('min_samples_split', 22), ('min_samples_leaf', 3), ('n_estimators', 1399)]
0.7356485034535687 [('min_samples_split', 23), ('min_samples_leaf', 1), ('n_estimators', 1316)]
0.7353415195702225 [('min_samples_split', 2

(0.736415963161934, 20, 2, 1944)

I must pay attention here : if the best model would have been the most prone to overfitting (2000 estimators, 1 min samples leaf and 1 samples split) it would have been a clue that I were in the wrong direction. Fortunately, the optimization find a better model.

In [11]:
generalization = pd.Series(index=range(10))
for i in generalization.index:
    forest = RandomForestClassifier(min_samples_split=res_forest.x[0], min_samples_leaf=res_forest.x[1],
                                    n_estimators=res_forest.x[0], n_jobs=-1)
    X_train = pd.read_csv('Datasets/CV0_train_subset_{}.csv'.format(i), index_col=0)
    y_train= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_train.index]["damage_grade"]
    X_test = pd.read_csv('Datasets/CV0_test_subset_{}.csv'.format(i), index_col=0)
    y_test= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_test.index]["damage_grade"]
    forest.fit(X_train,y_train)
    predict = forest.predict(X_test)
    generalization[i] = f1_score(y_test, predict, average='micro')
generalization

0    0.732924
1    0.734152
2    0.730200
3    0.728665
4    0.736685
5    0.727974
6    0.735303
7    0.732579
8    0.732348
9    0.732359
dtype: float64

Generalization is good, and this is my best model so far !

# Nearest neighbors classifier

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
X_train = pd.read_csv('Datasets/CV0_train_subset_0.csv', index_col=0)
y_train= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_train.index]["damage_grade"]
X_test = pd.read_csv('Datasets/CV0_test_subset_0.csv', index_col=0)
y_test= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_test.index]["damage_grade"]

In order to be clearer, I have previously narrowed the space search. Here is displayed the final step. Nearest neighbors classsifiers training takes a lot of time to compute, so I will optimize only subset0 and then see if it generalizes well.

In [14]:
spaceKNN  = [Integer(20, 30, name="n_neighbors"),
             Integer(25, 55, name="leaf_size")]

In [15]:
@use_named_args(spaceKNN)
def objectiveKNN(**params):
    knn.set_params(**params)
    knn.fit(X_train,y_train)
    predict = knn.predict(X_test)
    e = f1_score(y_test, predict, average='micro')
    print(e, [(a, params[a]) for a in params])
    return 1 - e

In [16]:
knn = KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto', leaf_size=10, p=2,
                            metric='minkowski', metric_params=None, n_jobs=-1)

In [17]:
res_KNN = gp_minimize(objectiveKNN, spaceKNN, n_calls=25, random_state=0)
1-res_KNN.fun, res_KNN.x[0], res_KNN.x[1]

0.709669992325403 [('n_neighbors', 26), ('leaf_size', 50)]
0.7080583269378358 [('n_neighbors', 29), ('leaf_size', 50)]
0.709669992325403 [('n_neighbors', 26), ('leaf_size', 37)]
0.7081734458940907 [('n_neighbors', 23), ('leaf_size', 27)]
0.7082118188795087 [('n_neighbors', 23), ('leaf_size', 39)]
0.7085571757482733 [('n_neighbors', 28), ('leaf_size', 39)]
0.7089025326170376 [('n_neighbors', 24), ('leaf_size', 50)]
0.7082118188795087 [('n_neighbors', 23), ('leaf_size', 44)]
0.7089025326170376 [('n_neighbors', 24), ('leaf_size', 54)]
0.7066385264773599 [('n_neighbors', 21), ('leaf_size', 51)]
0.7080199539524175 [('n_neighbors', 30), ('leaf_size', 25)]
0.7082885648503453 [('n_neighbors', 20), ('leaf_size', 25)]
0.7080199539524175 [('n_neighbors', 30), ('leaf_size', 25)]
0.7081734458940907 [('n_neighbors', 30), ('leaf_size', 55)]
0.7082885648503453 [('n_neighbors', 20), ('leaf_size', 25)]
0.7080199539524175 [('n_neighbors', 20), ('leaf_size', 55)]
0.7080199539524175 [('n_neighbors', 30), (

(0.709669992325403, 26, 50)

In [18]:
generalization = pd.Series(index=range(10))
for i in generalization.index:
    knn = KNeighborsClassifier(n_neighbors=res_KNN.x[0], weights='uniform', algorithm='auto', leaf_size=res_KNN.x[1], p=2,
                            metric='minkowski', metric_params=None, n_jobs=-1)
    X_train = pd.read_csv('Datasets/CV0_train_subset_{}.csv'.format(i), index_col=0)
    y_train= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_train.index]["damage_grade"]
    X_test = pd.read_csv('Datasets/CV0_test_subset_{}.csv'.format(i), index_col=0)
    y_test= pd.read_csv('Datasets/train_labels.csv', index_col=0).loc[X_test.index]["damage_grade"]
    knn.fit(X_train,y_train)
    predict = knn.predict(X_test)
    generalization[i] = f1_score(y_test, predict, average='micro')
generalization

0    0.709670
1    0.706523
2    0.707598
3    0.704873
4    0.710860
5    0.707483
6    0.709823
7    0.709785
8    0.709593
9    0.707149
dtype: float64

Generalization is very good. However, I expected better results from this model.