In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [2]:
df_train = pd.read_csv('stand_train.csv', index_col='building_id')
df_test = pd.read_csv('stand_test.csv', index_col='building_id')

In [3]:
df_train.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-0.983414,-0.518705,1.629055,-0.178274,0.0471,-0.45946,-0.226419,0.038365,1,1,...,0,0,0,0,0,0,0,0,1,0
28830,-0.734459,0.481998,-0.945017,-0.178274,-0.224765,-0.00411,0.816109,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
94947,0.883744,-0.819158,0.744612,-0.178274,-0.224765,-0.687135,-0.226419,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
590882,1.008221,-0.685893,1.216589,-0.178274,-0.224765,-0.45946,-0.226419,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
201944,-0.361028,-1.381296,-1.308119,1.195989,0.0471,-0.00411,1.858636,0.038365,1,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
df_test.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,0.387545,-0.261582,1.382754,1.190666,-0.089223,-0.231598,0.29571,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0
99355,-0.982399,-1.362313,1.569122,-0.18288,-0.021116,1.13893,-0.22869,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0
890251,1.010246,-1.657455,1.036602,-0.18288,-0.293547,-0.916862,-0.22869,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0
745817,1.508407,-1.609071,-1.542679,-1.556426,-0.361655,2.509459,-1.277491,2.407631,0,0,...,0,0,0,0,0,0,0,0,1,0
421793,0.387545,-1.004273,0.468179,1.190666,-0.157331,-0.003176,0.820111,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0


# split model first

In [5]:
X = df_train.drop(columns=['damage_grade'])
y = df_train['damage_grade']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [7]:
classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, learning_rate=0.02,max_depth=10, nfold=5, seed=101)

In [8]:
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.02, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nfold=5, nthread=None, num_class=3, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=101, silent=None, subsample=1, verbosity=1)

In [9]:
y_pred = classifier.predict(X_test)

In [10]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.67      0.39      0.49      7636
           2       0.70      0.85      0.77     44607
           3       0.73      0.55      0.63     25938

    accuracy                           0.71     78181
   macro avg       0.70      0.60      0.63     78181
weighted avg       0.71      0.71      0.70     78181



# Use f-beta score

In [11]:
from sklearn.metrics import fbeta_score, make_scorer

fO5_scorer = make_scorer(fbeta_score, beta=2, average = 'micro')

In [12]:
classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, learning_rate=0.02,max_depth=20, seed=101)

In [13]:
from sklearn.model_selection import GridSearchCV

In [20]:
grid_search = GridSearchCV(classifier, param_grid={'learning_rate':[0.005]}, scoring = fO5_scorer, cv =5)

In [21]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.02, max_delta_step=0,
                                     max_depth=20, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, num_class=3,
                                     objective='multi:softmax', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=101, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.005]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False,


In [22]:
y_pred = grid_search.predict(X_test)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.63      0.48      0.54      7636
           2       0.73      0.82      0.78     44607
           3       0.73      0.62      0.67     25938

    accuracy                           0.72     78181
   macro avg       0.70      0.64      0.66     78181
weighted avg       0.72      0.72      0.72     78181



In [24]:
predict = grid_search.predict(df_test)

In [25]:
submit = pd.read_csv('submission_format.csv')

In [26]:
submit.drop(columns=['damage_grade'], inplace=True)

In [27]:
submit['damage_grade'] = predict

In [28]:
submit.head()

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,2
4,421793,3


In [29]:
submit.to_csv('pram_sub1.csv', index = False)

### Second try with larger parameter

In [30]:
classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, seed=101)

In [31]:
params = {
    'learning_rate': [0.005, 0.01, 0.05],
    'max_depth': [10,20,50]
}

In [32]:
grid_search2 = GridSearchCV(classifier, param_grid=params, scoring = fO5_scorer, cv =5)

In [33]:
grid_search2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, num_class=3,
                                     objective='multi:softmax', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=101, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.005, 0.01, 0.05],
                         'max_depth': [10, 20, 50]},
             p

In [34]:
y_pred = grid_search2.predict(X_test)

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.65      0.50      0.57      7636
           2       0.74      0.83      0.79     44607
           3       0.74      0.64      0.68     25938

    accuracy                           0.74     78181
   macro avg       0.71      0.66      0.68     78181
weighted avg       0.73      0.74      0.73     78181



In [42]:
grid_search2.best_params_

{'learning_rate': 0.05, 'max_depth': 20}

In [36]:
predict2 = grid_search2.predict(df_test)

In [37]:
submit = pd.read_csv('submission_format.csv')

In [38]:
submit.drop(columns=['damage_grade'], inplace=True)

In [39]:
submit['damage_grade'] = predict2

In [40]:
submit.head()

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,2
3,745817,2
4,421793,3


In [41]:
submit.to_csv('pram_sub2.csv', index = False)

* Grid Search on parameter improved model
* Grid Search took over 4 hours

In [51]:
from sklearn.metrics import fbeta_score, make_scorer

fO5_scorer = make_scorer(fbeta_score, beta=2, average = 'micro')

In [52]:
classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, seed=101)

In [53]:
params = {
    'learning_rate': [0.05],
    'max_depth': [20],
    'gamma': [0.5]
}

In [54]:
grid_search3 = GridSearchCV(classifier, param_grid=params, scoring = fO5_scorer, cv =5)

In [55]:
# eval_metric function can change evaluation metric object: eg. [“auc”,“error”,“logloss”]

grid_search3.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], early_stopping_rounds=10)

[0]	validation_0-merror:0.192561	validation_1-merror:0.288791
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.188351	validation_1-merror:0.286873
[2]	validation_0-merror:0.186087	validation_1-merror:0.285402
[3]	validation_0-merror:0.184278	validation_1-merror:0.284276
[4]	validation_0-merror:0.182107	validation_1-merror:0.283662
[5]	validation_0-merror:0.180457	validation_1-merror:0.282959
[6]	validation_0-merror:0.179163	validation_1-merror:0.281859
[7]	validation_0-merror:0.177267	validation_1-merror:0.280925
[8]	validation_0-merror:0.175869	validation_1-merror:0.28026
[9]	validation_0-merror:0.174383	validation_1-merror:0.279735
[10]	validation_0-merror:0.172788	validation_1-merror:0.279058
[11]	validation_0-merror:0.17116	validation_1-merror:0.278303
[12]	validation_0-merror:0.170157	validation_1-merror:0.277932
[13]	validation_0-merror:0.168

[34]	validation_0-merror:0.147544	validation_1-merror:0.271754
[35]	validation_0-merror:0.1467	validation_1-merror:0.271703
[36]	validation_0-merror:0.146141	validation_1-merror:0.271844
[37]	validation_0-merror:0.145456	validation_1-merror:0.271127
[38]	validation_0-merror:0.144677	validation_1-merror:0.270846
[39]	validation_0-merror:0.1438	validation_1-merror:0.270616
[40]	validation_0-merror:0.143318	validation_1-merror:0.270513
[41]	validation_0-merror:0.142386	validation_1-merror:0.270321
[42]	validation_0-merror:0.141969	validation_1-merror:0.270334
[43]	validation_0-merror:0.141393	validation_1-merror:0.270526
[44]	validation_0-merror:0.140851	validation_1-merror:0.270411
[45]	validation_0-merror:0.140303	validation_1-merror:0.27004
[46]	validation_0-merror:0.13982	validation_1-merror:0.269388
[47]	validation_0-merror:0.139277	validation_1-merror:0.269541
[48]	validation_0-merror:0.138822	validation_1-merror:0.269324
[49]	validation_0-merror:0.13828	validation_1-merror:0.269324

[67]	validation_0-merror:0.131756	validation_1-merror:0.268339
[68]	validation_0-merror:0.131466	validation_1-merror:0.268505
[69]	validation_0-merror:0.130978	validation_1-merror:0.268237
[70]	validation_0-merror:0.13077	validation_1-merror:0.268275
[71]	validation_0-merror:0.130293	validation_1-merror:0.268185
[72]	validation_0-merror:0.129821	validation_1-merror:0.268339
[73]	validation_0-merror:0.129487	validation_1-merror:0.268173
[74]	validation_0-merror:0.128889	validation_1-merror:0.268249
[75]	validation_0-merror:0.128412	validation_1-merror:0.267866
[76]	validation_0-merror:0.127886	validation_1-merror:0.267789
[77]	validation_0-merror:0.12736	validation_1-merror:0.267623
[78]	validation_0-merror:0.127015	validation_1-merror:0.267994
[79]	validation_0-merror:0.126603	validation_1-merror:0.268173
[80]	validation_0-merror:0.125995	validation_1-merror:0.267891
[81]	validation_0-merror:0.125672	validation_1-merror:0.268032
[82]	validation_0-merror:0.125249	validation_1-merror:0.2

[4]	validation_0-merror:0.182326	validation_1-merror:0.280836
[5]	validation_0-merror:0.180665	validation_1-merror:0.279953
[6]	validation_0-merror:0.179421	validation_1-merror:0.279697
[7]	validation_0-merror:0.177892	validation_1-merror:0.277983
[8]	validation_0-merror:0.176088	validation_1-merror:0.277638
[9]	validation_0-merror:0.174542	validation_1-merror:0.277024
[10]	validation_0-merror:0.172788	validation_1-merror:0.276896
[11]	validation_0-merror:0.171302	validation_1-merror:0.276346
[12]	validation_0-merror:0.170042	validation_1-merror:0.276525
[13]	validation_0-merror:0.168753	validation_1-merror:0.276384
[14]	validation_0-merror:0.167854	validation_1-merror:0.276205
[15]	validation_0-merror:0.166637	validation_1-merror:0.275208
[16]	validation_0-merror:0.165218	validation_1-merror:0.275297
[17]	validation_0-merror:0.164099	validation_1-merror:0.274798
[18]	validation_0-merror:0.162915	validation_1-merror:0.274734
[19]	validation_0-merror:0.16166	validation_1-merror:0.274376

[32]	validation_0-merror:0.126198	validation_1-merror:0.26775
[33]	validation_0-merror:0.125452	validation_1-merror:0.267149
[34]	validation_0-merror:0.124504	validation_1-merror:0.267111
[35]	validation_0-merror:0.123435	validation_1-merror:0.26706
[36]	validation_0-merror:0.122629	validation_1-merror:0.26683
[37]	validation_0-merror:0.121752	validation_1-merror:0.266676
[38]	validation_0-merror:0.1211	validation_1-merror:0.266791
[39]	validation_0-merror:0.120442	validation_1-merror:0.26683
[40]	validation_0-merror:0.119811	validation_1-merror:0.266561
[41]	validation_0-merror:0.11883	validation_1-merror:0.266842
[42]	validation_0-merror:0.117909	validation_1-merror:0.266753
[43]	validation_0-merror:0.116939	validation_1-merror:0.266702
[44]	validation_0-merror:0.116215	validation_1-merror:0.266855
[45]	validation_0-merror:0.115558	validation_1-merror:0.266471
[46]	validation_0-merror:0.114763	validation_1-merror:0.266382
[47]	validation_0-merror:0.114138	validation_1-merror:0.266369

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, num_class=3,
                                     objective='multi:softmax', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=101, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'gamma': [0.5], 'learning_rate': [0.05],
                         'max_depth': [20]},
             pre_di

In [20]:
train_pred = grid_search3.predict(X_train)
test_pred = grid_search3.predict(X_test)

In [21]:
print('Trainset classification report')
print(classification_report(y_train, train_pred))

Trainset classification report
              precision    recall  f1-score   support

           1       0.97      0.90      0.93     17488
           2       0.91      0.96      0.94    103652
           3       0.93      0.87      0.90     61280

    accuracy                           0.92    182420
   macro avg       0.94      0.91      0.92    182420
weighted avg       0.93      0.92      0.92    182420



In [22]:
print('Testset classification report')
print(classification_report(y_test, test_pred))

Testset classification report
              precision    recall  f1-score   support

           1       0.65      0.50      0.57      7636
           2       0.74      0.83      0.79     44607
           3       0.74      0.64      0.68     25938

    accuracy                           0.74     78181
   macro avg       0.71      0.66      0.68     78181
weighted avg       0.73      0.74      0.73     78181

