# Model flow on comp_data_household 


## Import packages and data

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from earthquake_damage.ml_logic.preprocessor import cus_imputation, preprocess_features, preprocess_targets
from earthquake_damage.data.main import train_test_val

## Imputation and preprocess the data 

In [4]:
cus_imputation(filename = 'comp_data_household')


Imputation...

✅  There are 26 vaules missing in the dataset.

✅ df_imputed, with shape (747137, 44)
✅ df_imputed saved to /Users/caobai/code/chantalwuer/earthquake_damage/processed_data/df_imputed.csv


In [5]:
preprocess_features()
preprocess_targets()


Preprocess features...

✅ X_processed, with shape (747137, 79)
✅ X_processed saved to /Users/caobai/code/chantalwuer/earthquake_damage/processed_data/X_processed.csv

Preprocess target...

✅ y processed, with shape (747137,)
✅ y_processed saved to /Users/caobai/code/chantalwuer/earthquake_damage/processed_data/y_processed.csv


In [6]:
X = pd.read_csv('/Users/caobai/code/chantalwuer/earthquake_damage/processed_data/X_processed.csv')
y = pd.read_csv('/Users/caobai/code/chantalwuer/earthquake_damage/processed_data/y_processed.csv')


In [7]:
X.shape, y.shape

((747137, 79), (747137, 1))

## Check data statistics

In [21]:
X.isna().sum().sum()

0

In [22]:
X.duplicated().sum()

0

In [25]:
y.value_counts()

damage_grade
4               276274
3               181467
2               132168
1                83609
0                73619
dtype: int64

In [10]:
X_train, X_test, X_val, y_train, y_test, y_val = train_test_val()

In [11]:
y_train = y_train['damage_grade']
y_val = y_val['damage_grade']
y_test = y_test['damage_grade']

## PCA on dimensionality reduction 
### The accuracy & f1_micro score is better without PCA 

In [28]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)


In [29]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_pca, y_train)


In [30]:
X_val_pca = pca.transform(X_val)
y_val_pred_pca = knn.predict(X_val_pca)


In [31]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_val, y_val_pred_pca)

0.3802300040908671

In [32]:
knn.fit(X_train, y_train)
y_val_pred = knn.predict(X_val)

In [33]:
balanced_accuracy_score(y_val, y_val_pred)

0.4038442715541331

In [34]:
from sklearn.metrics import f1_score

print('f1-micro with pca',f1_score(y_test, y_val_pred_pca, average='micro'))
print('f1-micro without pca',f1_score(y_test, y_val_pred, average='micro'))


f1-micro with pca 0.25027884109180787
f1-micro without pca 0.2532412488511747


In [14]:
from sklearn.metrics import f1_score
f1_score(y_val, y_val_pred, average='micro')

0.5820506643110172

### Use model.feature_importances_ to select features is better than selectpercentile method

In [123]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_jobs=-1)

xgb_model.fit(X_train, y_train)

y_val_pred = xgb_model.predict(X_val)


In [128]:
pd.set_option('display.max_rows', None)
a = pd.DataFrame(xgb_model.feature_importances_, index=X_train.columns).sort_values(by=0, ascending=False)
model_feature = a.head(76).index
model_feature 

Index(['pipeline-1__has_superstructure_mud_mortar_stone',
       'pipeline-1__district_id',
       'pipeline-2__foundation_type_Mud mortar-Stone/Brick',
       'pipeline-2__ground_floor_type_RC',
       'pipeline-1__has_superstructure_stone_flag',
       'pipeline-2__roof_type_RCC/RB/RBC', 'pipeline-1__vdcmun_id',
       'pipeline-2__other_floor_type_TImber/Bamboo-Mud', 'pipeline-1__ward_id',
       'pipeline-1__has_superstructure_cement_mortar_brick',
       'pipeline-1__has_superstructure_adobe_mud',
       'pipeline-2__foundation_type_RC', 'pipeline-1__household_id',
       'pipeline-1__has_superstructure_mud_mortar_brick',
       'pipeline-1__has_superstructure_rc_engineered',
       'pipeline-2__other_floor_type_Not applicable',
       'pipeline-2__foundation_type_Bamboo/Timber',
       'pipeline-2__ground_floor_type_Timber',
       'pipeline-1__has_superstructure_timber',
       'pipeline-2__roof_type_Bamboo/Timber-Light roof',
       'pipeline-1__has_secondary_use', 'pipeline-2_

In [129]:
xgb_model.fit(X_train[model_feature], y_train)

y_val_pred_model_feature = xgb_model.predict(X_val[model_feature])

f1_score(y_val, y_val_pred_model_feature, average='micro')

0.5820238955662036

### Use SelectPrecentile to select top 8 features from the dataset

# GridSearch best parameter for XGBoostclassifier

In [135]:
from sklearn.model_selection import GridSearchCV 
import warnings
warnings.filterwarnings('ignore')


In [136]:
grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.3, 0.5,1],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [137]:
model = XGBClassifier(n_jobs=-1, n_estimators=100, random_state=42)
search = GridSearchCV(model, grid, scoring="f1_micro", n_jobs=-1, cv=3, verbose=1)


In [138]:
search.fit(X_train[model_feature], y_train) 

Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [140]:
search.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 1,
 'max_depth': 7,
 'reg_lambda': 10,
 'subsample': 0.8}

In [141]:
search.best_score_

0.5925735434158835

In [142]:
search.best_estimator_

In [None]:
best_model = XGBClassifier(n_jobs=-1, n_estimators=100, random_state=42, colsample_bytree=0.5,
    gamma=0, learning_rate=1, max_depth=7, reg_lambda=10, subsample=0.8)
best_model.fit(X_train[model_feature], y_train) 

In [None]:
y_val_best = best_model.predict(X_val[model_feature])
f1_score(y_val, y_val_best, average='micro') 

In [168]:
X_train[model_feature].shape
X_val[model_feature].shape

(112071, 76)

# Build a neural netword 

In [152]:
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras import layers

In [203]:
model = Sequential()
model.add(layers.Dense(10, input_dim=X_train.shape[1], activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [204]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 10)                800       
                                                                 
 dense_13 (Dense)            (None, 8)                 88        
                                                                 
 dense_14 (Dense)            (None, 5)                 45        
                                                                 
Total params: 933
Trainable params: 933
Non-trainable params: 0
_________________________________________________________________


In [205]:
es = EarlyStopping(patience=3, restore_best_weights=True)

In [206]:
from keras.utils import to_categorical
y_train_cat = to_categorical(y_train)
y_val_cat = to_categorical(y_val)
y_test_cat = to_categorical(y_test)
y_train_cat.shape


(522995, 5)

In [194]:
X_train.iloc[:20, :10].shape
y_train_cat[:20].shape

(20, 5)

In [207]:
from earthquake_damage.data.main import reduce_memory_df

In [210]:
a = reduce_memory_df(X_train)
b = reduce_memory_df(pd.DataFrame(y_train)) 
b= to_categorical(b)

Original memory usage of df is 162 MB
New memory usage of df is 162 MB
Original memory usage of df is 4 MB
New memory usage of df is 0 MB


In [211]:
model.fit(a, b, 
          epochs=20, batch_size=32, verbose=1,
          callbacks=[es], validation_data=0.1)


Epoch 1/20


ValueError: Creating variables on a non-first call to a function decorated with tf.function.