## Normalize data

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [76]:
df_train = pd.read_csv('train_ready.csv', index_col= 'building_id')
df_test = pd.read_csv('test_ready.csv', index_col= 'building_id')

In [77]:
# Get numerical columns
num_col = []

for col in df_train.columns:
    if len(df_train[col].unique()) > 2:
        num_col.append(col)

In [78]:
num_col.remove('damage_grade')
print(num_col,'\n')
print('Count: ',len(num_col))

['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families'] 

Count:  8


In [79]:
# get binary columns
other_col = []

for col in df_train.columns:
    other_col.append(col)

for col in num_col:
    other_col.remove(col)

In [80]:
print(other_col,'\n')
print('Count: ',len(other_col))

['has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'has_secondary_use', 'has_secondary_use_agriculture', 'has_secondary_use_hotel', 'has_secondary_use_rental', 'has_secondary_use_institution', 'has_secondary_use_school', 'has_secondary_use_industry', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'has_secondary_use_other', 'damage_grade', 'land_surface_condition_o', 'land_surface_condition_t', 'foundation_type_i', 'foundation_type_r', 'foundation_type_u', 'foundation_type_w', 'roof_type_q', 'roof_type_x', 'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x', 'ground_floor_type_z', 'other_f

In [81]:
df_train[num_col].describe()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families
count,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0,260601.0
mean,13.900353,701.074685,6257.876148,2.129723,26.535029,8.018051,5.434365,0.983949
std,8.033617,412.710734,3646.369645,0.727665,73.565937,4.392231,1.918418,0.418389
min,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0
25%,7.0,350.0,3073.0,2.0,10.0,5.0,4.0,1.0
50%,12.0,702.0,6270.0,2.0,15.0,7.0,5.0,1.0
75%,21.0,1050.0,9412.0,2.0,30.0,9.0,6.0,1.0
max,30.0,1427.0,12567.0,9.0,995.0,100.0,32.0,9.0


## Normalize Train data

In [82]:
scaler = StandardScaler()
print(scaler.fit(df_train[num_col]))

StandardScaler(copy=True, with_mean=True, with_std=True)


In [83]:
df_train.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
28830,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0
94947,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
590882,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
201944,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [84]:
transformed_df = pd.DataFrame(scaler.transform(df_train[num_col]), columns=num_col, index=df_train.index)
df_train.drop(columns=num_col, inplace=True)
stand_df = pd.concat([transformed_df, df_train], axis=1)

In [87]:
stand_df.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-0.983414,-0.518705,1.629055,-0.178274,0.0471,-0.45946,-0.226419,0.038365,1,1,...,0,0,0,0,0,0,0,0,1,0
28830,-0.734459,0.481998,-0.945017,-0.178274,-0.224765,-0.00411,0.816109,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
94947,0.883744,-0.819158,0.744612,-0.178274,-0.224765,-0.687135,-0.226419,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
590882,1.008221,-0.685893,1.216589,-0.178274,-0.224765,-0.45946,-0.226419,0.038365,0,1,...,0,0,0,0,0,0,0,0,1,0
201944,-0.361028,-1.381296,-1.308119,1.195989,0.0471,-0.00411,1.858636,0.038365,1,0,...,0,0,0,0,0,0,0,0,1,0


## Normalize Test data

In [90]:
scaler = StandardScaler()
print(scaler.fit(df_test[num_col]))

StandardScaler(copy=True, with_mean=True, with_std=True)


In [91]:
df_test.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,0,1,0,...,0,0,0,0,0,0,0,0,1,0
99355,6,141,11987,2,25,13,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
890251,22,19,10044,2,5,4,5,0,1,0,...,0,0,0,0,0,0,0,0,1,0
745817,26,39,633,1,0,19,3,0,0,0,...,0,0,0,0,0,0,0,0,1,0
421793,17,289,7970,3,15,8,7,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [92]:
transformed_df = pd.DataFrame(scaler.transform(df_test[num_col]), columns=num_col, index=df_test.index)
df_test.drop(columns=num_col, inplace=True)
df_test_std = pd.concat([transformed_df, df_test], axis=1)

In [93]:
df_test_std.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,0.387545,-0.261582,1.382754,1.190666,-0.089223,-0.231598,0.29571,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0
99355,-0.982399,-1.362313,1.569122,-0.18288,-0.021116,1.13893,-0.22869,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0
890251,1.010246,-1.657455,1.036602,-0.18288,-0.293547,-0.916862,-0.22869,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0
745817,1.508407,-1.609071,-1.542679,-1.556426,-0.361655,2.509459,-1.277491,2.407631,0,0,...,0,0,0,0,0,0,0,0,1,0
421793,0.387545,-1.004273,0.468179,1.190666,-0.157331,-0.003176,0.820111,0.039984,0,1,...,0,0,0,0,0,0,0,0,1,0


# Data all ready

In [94]:
stand_df.to_csv('stand_train.csv')
df_test_std.to_csv('stand_test.csv')