# Model flow on comp_data_household 


## Import packages and data

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from earthquake_damage.ml_logic.preprocessor import cus_imputation, preprocess_features, preprocess_targets
from earthquake_damage.data.main import train_test_val

## Imputation and preprocess the data 

In [4]:
cus_imputation(filename = 'comp_data_household')


Imputation...

✅  There are 26 vaules missing in the dataset.

✅ df_imputed, with shape (747137, 44)
✅ df_imputed saved to /Users/caobai/code/chantalwuer/earthquake_damage/processed_data/df_imputed.csv


In [5]:
preprocess_features()
preprocess_targets()


Preprocess features...

✅ X_processed, with shape (747137, 79)
✅ X_processed saved to /Users/caobai/code/chantalwuer/earthquake_damage/processed_data/X_processed.csv

Preprocess target...

✅ y processed, with shape (747137,)
✅ y_processed saved to /Users/caobai/code/chantalwuer/earthquake_damage/processed_data/y_processed.csv


In [6]:
X = pd.read_csv('/Users/caobai/code/chantalwuer/earthquake_damage/processed_data/X_processed.csv')
y = pd.read_csv('/Users/caobai/code/chantalwuer/earthquake_damage/processed_data/y_processed.csv')


In [7]:
X.shape, y.shape

((747137, 79), (747137, 1))

In [8]:
a = pd.read_csv('/Users/caobai/code/chantalwuer/earthquake_damage/processed_data/df_imputed.csv')
a.columns 

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'age_building', 'land_surface_condition',
       'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',
       'position', 'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'damage_grade', 'legal_ownership_status', 'count_families',
       'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_h

## Check data statistics

In [21]:
X.isna().sum().sum()

0

In [22]:
X.duplicated().sum()

0

In [25]:
y.value_counts()

damage_grade
4               276274
3               181467
2               132168
1                83609
0                73619
dtype: int64

In [10]:
X_train, X_test, X_val, y_train, y_test, y_val = train_test_val()

In [11]:
y_train = y_train['damage_grade']
y_val = y_val['damage_grade']
y_test = y_test['damage_grade']

## PCA on dimensionality reduction 
### The accuracy & f1_micro score is better without PCA 

In [28]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)


In [29]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_pca, y_train)


In [30]:
X_val_pca = pca.transform(X_val)
y_val_pred_pca = knn.predict(X_val_pca)


In [31]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_val, y_val_pred_pca)

0.3802300040908671

In [32]:
knn.fit(X_train, y_train)
y_val_pred = knn.predict(X_val)

In [33]:
balanced_accuracy_score(y_val, y_val_pred)

0.4038442715541331

In [34]:
from sklearn.metrics import f1_score

print('f1-micro with pca',f1_score(y_test, y_val_pred_pca, average='micro'))
print('f1-micro without pca',f1_score(y_test, y_val_pred, average='micro'))


f1-micro with pca 0.25027884109180787
f1-micro without pca 0.2532412488511747


In [14]:
from sklearn.metrics import f1_score
f1_score(y_val, y_val_pred, average='micro')

0.5820506643110172

### Use model.feature_importances_ to select features

In [123]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_jobs=-1)

xgb_model.fit(X_train, y_train)

y_val_pred = xgb_model.predict(X_val)


In [127]:
a

Unnamed: 0,0
pipeline-1__has_superstructure_mud_mortar_stone,0.308947
pipeline-1__district_id,0.057673
pipeline-2__foundation_type_Mud mortar-Stone/Brick,0.05259
pipeline-2__ground_floor_type_RC,0.048562
pipeline-1__has_superstructure_stone_flag,0.04268
pipeline-2__roof_type_RCC/RB/RBC,0.04214
pipeline-1__vdcmun_id,0.042011
pipeline-2__other_floor_type_TImber/Bamboo-Mud,0.030566
pipeline-1__ward_id,0.024305
pipeline-1__has_superstructure_cement_mortar_brick,0.021347


In [128]:
pd.set_option('display.max_rows', None)
a = pd.DataFrame(xgb_model.feature_importances_, index=X_train.columns).sort_values(by=0, ascending=False)
model_feature = a.head(76).index
model_feature 

Index(['pipeline-1__has_superstructure_mud_mortar_stone',
       'pipeline-1__district_id',
       'pipeline-2__foundation_type_Mud mortar-Stone/Brick',
       'pipeline-2__ground_floor_type_RC',
       'pipeline-1__has_superstructure_stone_flag',
       'pipeline-2__roof_type_RCC/RB/RBC', 'pipeline-1__vdcmun_id',
       'pipeline-2__other_floor_type_TImber/Bamboo-Mud', 'pipeline-1__ward_id',
       'pipeline-1__has_superstructure_cement_mortar_brick',
       'pipeline-1__has_superstructure_adobe_mud',
       'pipeline-2__foundation_type_RC', 'pipeline-1__household_id',
       'pipeline-1__has_superstructure_mud_mortar_brick',
       'pipeline-1__has_superstructure_rc_engineered',
       'pipeline-2__other_floor_type_Not applicable',
       'pipeline-2__foundation_type_Bamboo/Timber',
       'pipeline-2__ground_floor_type_Timber',
       'pipeline-1__has_superstructure_timber',
       'pipeline-2__roof_type_Bamboo/Timber-Light roof',
       'pipeline-1__has_secondary_use', 'pipeline-2_

In [129]:
xgb_model.fit(X_train[model_feature], y_train)

y_val_pred_model_feature = xgb_model.predict(X_val[model_feature])

f1_score(y_val, y_val_pred_model_feature, average='micro')

0.5820238955662036

### Use SelectPrecentile to select top 8 features from the dataset

In [113]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectFromModel, SelectPercentile, f_classif 

selection = SelectPercentile(f_classif, percentile=10)
X_new = selection.fit_transform(X_train, y_train)

X_new = pd.DataFrame(X_new)
X_new.shape
X_new.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [114]:
mask = selection.get_support()
list = X_train.columns[mask]
X_val[list].head(3)
X_new.columns = list
list

Index(['pipeline-1__has_superstructure_mud_mortar_stone',
       'pipeline-1__has_superstructure_cement_mortar_brick',
       'pipeline-2__foundation_type_Mud mortar-Stone/Brick',
       'pipeline-2__foundation_type_RC', 'pipeline-2__roof_type_RCC/RB/RBC',
       'pipeline-2__ground_floor_type_Mud', 'pipeline-2__ground_floor_type_RC',
       'pipeline-2__other_floor_type_RCC/RB/RBC'],
      dtype='object')

In [115]:
xgb_model.fit(X_new, y_train)

y_val_pred = xgb_model.predict(X_val[list])

f1_score(y_val, y_val_pred, average='micro')

0.43316290565802035

In [116]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectFromModel, SelectPercentile

clf = Pipeline([
  ('feature_selection', SelectFromModel(xgb_model)),
  ('classification', xgb_model)
])

In [39]:
# for the baseline model, we use the xgboost model with all features 
from sklearn.model_selection import cross_val_score

xgb_model_score = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='f1_micro')
xgb_model_score = xgb_model_score.mean()
xgb_model_score 

0.5826116884482643

# GridSearch best parameter for XGBoostclassifier

In [44]:
from sklearn.model_selection import GridSearchCV 
