# Model training

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
# Notebook-wide plotting setup: seaborn 'darkgrid' theme and inline figure rendering.
sns.set_style('darkgrid')
%matplotlib inline

In [2]:
# Read the prepared train and test tables, using building_id as the row index of both.
df, df_test = (
    pd.read_csv(csv_path, index_col='building_id')
    for csv_path in ('train_ready.csv', 'test_ready.csv')
)

In [3]:
# Preview the first rows of the training table.
df.head()
# damage_grade is the target variable; it is split off from the feature columns further down.

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
28830,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0
94947,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
590882,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
201944,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# Preview the first rows of df_test (read from test_ready.csv).
df_test.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,0,1,0,...,0,0,0,0,0,0,0,0,1,0
99355,6,141,11987,2,25,13,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
890251,22,19,10044,2,5,4,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
745817,26,39,633,1,0,19,3,0,0,0,...,0,0,0,0,0,0,0,0,1,0
421793,17,289,7970,3,15,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0


## Train and test split

In [5]:
# Target is the damage_grade column; the features are every remaining column.
y = df['damage_grade']
X = df.drop('damage_grade', axis=1)

In [6]:
# Sanity-check the feature matrix obtained by dropping damage_grade from df.
X.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
28830,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0
94947,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
590882,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
201944,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Sanity-check the target series (the damage_grade column of df).
y.head()

building_id
802906    3
28830     2
94947     3
590882    2
201944    3
Name: damage_grade, dtype: int64

In [8]:
# Hold out 30% of the labelled rows for evaluation; the fixed random_state keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [9]:
# Report how many rows ended up in each partition.
print('Trainset size: ', X_train.shape[0])
print('Testset size: ', X_test.shape[0])

Trainset size:  182420
Testset size:  78181


In [10]:
# Baseline model: XGBoost classifier for the three damage grades, otherwise default settings.
# NOTE(review): the repr recorded after fitting shows objective='multi:softprob' rather than
# the 'multi:softmax' requested here -- the wrapper appears to substitute it; confirm against
# the installed xgboost version.
classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    seed=101,
)

### Model fitting

In [11]:
# Train the baseline classifier on the training partition.
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None, num_class=3,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=101, silent=None,
       subsample=1, verbosity=1)

### Evaluate

In [12]:
# Predict labels for the held-out partition produced by train_test_split.
y_pred = classifier.predict(X_test)

  if diff:


In [13]:
# Per-class precision / recall / F1 of the baseline model on the held-out partition.
print(classification_report(y_test, y_pred))
# Recall is too low on classes 1 and 3 (0.35 and 0.42 in the recorded run, versus 0.88 for class 2).

             precision    recall  f1-score   support

          1       0.62      0.35      0.45      7636
          2       0.66      0.88      0.75     44607
          3       0.73      0.42      0.53     25938

avg / total       0.68      0.67      0.65     78181



In [14]:
# Confusion matrix of the baseline model. In the recorded output each row sums to that
# class's support from the classification report, i.e. rows are true labels and columns
# are predicted labels.
print(confusion_matrix(y_test, y_pred))

[[ 2662  4906    68]
 [ 1563 39068  3976]
 [  103 15026 10809]]


In [15]:
# Overall accuracy of the baseline model on the held-out partition.
print(accuracy_score(y_test, y_pred))

0.6720174978575357


### Tune XGBoost parameters

In [16]:
# Hand-picked alternative to the baseline: a lower learning rate with much deeper trees.
# The former `nfold=5` argument was removed: `nfold` belongs to `xgb.cv`, not to the
# estimator, so passing it here never triggered cross-validation -- it was merely stored on
# the estimator as an extra attribute. This model is still fit once on the training partition.
# TODO: search these settings with the already-imported GridSearchCV for real tuning.
cv_classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    learning_rate=0.02,
    max_depth=10,
    seed=101,
)

In [17]:
# Fit the second classifier on the same training partition as the baseline.
cv_classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.02,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nfold=5, nthread=None, num_class=3,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=101, silent=None,
       subsample=1, verbosity=1)

In [18]:
# Rebinds y_pred: from here on it holds cv_classifier's predictions, not the baseline's.
y_pred = cv_classifier.predict(X_test)

  if diff:


In [19]:
# Per-class precision / recall / F1 for whatever y_pred currently holds
# (cv_classifier's predictions when the cells are run in order).
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.67      0.39      0.49      7636
          2       0.70      0.85      0.77     44607
          3       0.73      0.55      0.63     25938

avg / total       0.71      0.71      0.70     78181



In [24]:
# Overall accuracy on the held-out partition for the current y_pred
# (cv_classifier's predictions when the cells are run in order).
print(accuracy_score(y_test, y_pred))

0.7079853161254013


# Submission

In [25]:
# Predict damage grades for df_test with cv_classifier; the result is written into the
# submission frame further down.
predict = cv_classifier.predict(df_test)

  if diff:


In [31]:
# Load the submission template; its damage_grade column is overwritten with predictions below.
submission = pd.read_csv('submission_format.csv')

In [34]:
# The former `submission.drop(columns='damage_grade')` call was removed: its result was
# discarded (no assignment, no inplace), so it never modified `submission`. The assignment
# below already replaces the column's values.
# NOTE(review): `predict` is assigned positionally -- this assumes submission_format.csv lists
# the buildings in the same order as test_ready.csv; confirm before submitting.
submission['damage_grade'] = predict

In [36]:
# Write the submission file; index=False keeps the DataFrame's row index out of the CSV.
submission.to_csv('submit1.csv', index=False)