# Approach: Under Sampling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
df_train = pd.read_csv('stand_train.csv', index_col='building_id')
df_test = pd.read_csv('stand_test.csv', index_col='building_id')

In [3]:
len(df_train)

260601

In [4]:
len(df_test)

86868

In [5]:
df_train['damage_grade'].value_counts()

2    148259
3     87218
1     25124
Name: damage_grade, dtype: int64

## Split data by damage

In [7]:
damage_1 = df_train[df_train['damage_grade'] == 1]
damage_2 = df_train[df_train['damage_grade'] == 2]
damage_3 = df_train[df_train['damage_grade'] == 3]

In [8]:
damage_1.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
475515,0.759267,-0.916078,1.639476,-0.178274,-0.360698,-0.00411,0.294845,0.038365,0,0,...,0,0,0,0,0,0,1,0,1,0
989500,1.50613,0.448076,-1.443596,-1.552536,-0.360698,1.134266,-0.747683,0.038365,0,0,...,0,0,0,0,0,0,0,0,1,0
864809,-0.112073,-0.959693,-0.082788,-0.178274,-0.360698,0.223565,1.337372,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
440102,-0.858937,-1.318298,-0.272292,-1.552536,-0.292732,0.678916,-1.268946,2.42849,0,0,...,0,0,0,0,0,0,0,0,1,0
402996,1.50613,0.147623,-1.512706,-1.552536,-0.360698,1.589617,-1.268946,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0


In [9]:
damage_2.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28830,-0.734459,0.481998,-0.945017,-0.178274,-0.224765,-0.00411,0.816109,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
590882,1.008221,-0.685893,1.216589,-0.178274,-0.224765,-0.45946,-0.226419,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
333020,-0.734459,-0.346671,-0.046314,-0.178274,-0.224765,0.223565,-0.226419,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
441126,-1.730277,0.135508,0.263584,-0.178274,-0.156799,-0.00411,0.294845,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
452227,0.385835,1.390626,-0.618116,-1.552536,-0.224765,-0.00411,-0.747683,0.038365,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
damage_3.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-0.983414,-0.518705,1.629055,-0.178274,0.0471,-0.45946,-0.226419,0.038365,1,1,...,0,0,0,0,0,0,0,0,1,0
94947,0.883744,-0.819158,0.744612,-0.178274,-0.224765,-0.687135,-0.226419,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
201944,-0.361028,-1.381296,-1.308119,1.195989,0.0471,-0.00411,1.858636,0.038365,1,0,...,0,0,0,0,0,0,0,0,1,0
728451,-0.609982,-0.547781,1.592854,-0.178274,-0.020866,-1.142486,-0.747683,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
7962,0.385835,1.012637,1.626312,-0.178274,-0.088832,0.223565,0.294845,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0


In [60]:
# Shuffle the dataframe before under sampling to prevent selection bias

from sklearn.utils import shuffle

In [61]:
damage1= shuffle(damage_1)
damage2= shuffle(damage_2)
damage3= shuffle(damage_3)

In [62]:
damage2 = damage2.iloc[:70000]
damage3 = damage3.iloc[:40000]

In [63]:
print(len(damage1))
print(len(damage2))
print(len(damage3))

25124
70000
40000


In [64]:
df_train = pd.concat([damage1, damage2, damage3], axis=0)

In [65]:
df_train = shuffle(df_train)

In [66]:
df_train.to_csv('under_train.csv')

## Train test split

In [67]:
X = df_train.drop(columns=['damage_grade'])
y = df_train['damage_grade']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

## Model fitting

In [69]:
classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, learning_rate=0.02,max_depth=10, nfold=5, seed=101)

In [70]:
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.02,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nfold=5, nthread=None, num_class=3,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=101, silent=None,
       subsample=1, verbosity=1)

In [71]:
y_pred = classifier.predict(X_test)

  if diff:


In [72]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.72      0.59      0.65      7434
          2       0.67      0.82      0.74     20969
          3       0.73      0.53      0.62     12135

avg / total       0.70      0.69      0.68     40538



In [73]:
print(accuracy_score(y_test, y_pred))

0.6908579604321871


## Submission

In [81]:
submit = pd.read_csv('submission_format.csv', index_col='building_id')

In [82]:
submit.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,1
99355,1
890251,1
745817,1
421793,1


In [83]:
submit.drop(columns=['damage_grade'], inplace=True)

In [85]:
submit['damage_grade'] = classifier.predict(df_test)

  if diff:


In [86]:
submit.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


In [87]:
submit.to_csv('under_submit.csv')

## under sampling did not improve model

# Next approach:
* Oversampling using SMOTE