In [324]:
from sklearn import datasets
import pandas as pd
import numpy as np
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from tensorflow.keras import backend as K

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.metrics import f1_score, plot_confusion_matrix, confusion_matrix
from sklearn.compose import ColumnTransformer

from imblearn.pipeline import Pipeline as imbPipeline

from category_encoders import TargetEncoder

import warnings
warnings.filterwarnings('ignore')

In [226]:
X_train = pd.read_csv('Data/X_train.csv', index_col='building_id')
X_test = pd.read_csv('Data/X_test.csv', index_col='building_id')
y_train = pd.read_csv('Data/y_train.csv', index_col='building_id')
y_test = pd.read_csv('Data/y_test.csv', index_col='building_id')

### Reshape y_train for the Keras model

I need y_train in the shape (len(y_train), 3).  I also need to re-encode y so that 1, 2, 3 goes to 0, 1, 2, so that I can use np.utils.to_categorical.  When I do my predictions, I will reverse this encoding.

In [143]:
y_train.values.reshape(1, -1)

array([[2, 2, 2, ..., 2, 3, 2]])

In [165]:
y_train['target'] = y_train.damage_grade.apply(lambda x: x - 1)

In [178]:
dummy_y_train = np_utils.to_categorical(y_train.target)

In [179]:
dummy_y_train.shape

(195450, 3)

### Column Transformer

I will be doing the ususal transformations:  dropping unimportant columns, log-transforming and scaling the integer columns, and target encoding the categorical columns.

In [217]:
binary_cols = []
for col in X_train.columns:
    if col.startswith('has'):
        binary_cols.append(col)

cat_cols = list(X_train.select_dtypes(include='object').columns)

integer_cols = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families']

geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

all_cols = geo_cols + cat_cols + integer_cols + binary_cols

In [218]:
binary_cols_dropped = binary_cols.copy()
for col in binary_cols_dropped:
    if col.startswith('has_secondary'):
        binary_cols_dropped.remove(col)
binary_cols_dropped.append('has_secondary_use')

cat_cols_dropped = cat_cols.copy()
cat_cols_dropped.remove('legal_ownership_status')
cat_cols_dropped.remove('plan_configuration')

In [219]:
def log_transform(x):
    return np.log(x+1)

In [224]:
function_transformer = FunctionTransformer(log_transform)

ohe_pipe = imbPipeline([('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
integer_pipe = imbPipeline([
    ('function', function_transformer),
    ('ss', StandardScaler())
])
target_pipe = imbPipeline([('target', TargetEncoder(cols=geo_cols))])

transformer = ColumnTransformer([
    ('binary', 'passthrough', binary_cols_dropped),
    ('categorical', ohe_pipe, cat_cols_dropped),
    ('geo', target_pipe, geo_cols),
    ('integer', integer_pipe, integer_cols),
])

In [230]:
transformed_X_train = transformer.fit_transform(X_train, y_train)

In [232]:
transformed_X_train.shape

(195450, 49)

The final X_train has 49 features.

### Build the neural network

In [381]:
keras0 = Sequential()
keras0.add(Dense(64, input_dim=49, activation='relu'))
keras0.add(Dense(32, activation='relu'))
keras0.add(Dense(16, activation='relu'))
keras0.add(Dense(3, activation='softmax'))

keras0.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [382]:
start = time.time()
keras0.fit(transformed_X_train, dummy_y_train, epochs=1, batch_size=1000)
end = time.time()
print(f'Runtime: {end-start} seconds')

Runtime: 0.7515280246734619 seconds


### Create a cross validation procedure

In [392]:
def custom_cross_val(name, X=transformed_X_train, y=dummy_y_train, epochs=1,\
                     batch_size=1000, n_splits=5, shuffle=True):
    
    """Takes in a model and performs a cross-validation and returns the average micro-averaged F1 score.
    
    Parameters
    ------------
    name: the name of the model
    X:  default is 'transformed_X_train'
    y:  default is 'dummy_y_train
    epochs: default is 1
    batch_size: default is 1000
    n_splits: number of splits in the KFold, default is 5
    shuffle:  whether to shuffle the splits, default is True
    """
    
    #Create a list to hold f1-micro scores
    f1_micro_scores = []
    #Instantiate a KFold object
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=42)
    
    for train_ind, val_ind in kf.split(X):
        #Fit the model
        name.fit(X[train_ind], y[train_ind], epochs=epochs, batch_size=batch_size)
        #Make predictions
        y_pred_adj = name.predict_classes(X[val_ind])
        #Calculate the f1-micro score
        f1_micro = f1_score(np.argmax(y[val_ind], axis=1), y_pred_adj, average='micro')
        print(f1_micro)
        #Append the score to the list 
        f1_micro_scores.append(f1_micro)
        
    return np.mean(f1_micro_scores)

In [393]:
custom_cross_val(keras0)

0.7533128677411103
0.74778715784088
0.7520081862368893
0.7518802762854949
0.7466103862880532


0.7503197748784856

In [389]:
#Instantiate a KFold object and 
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [390]:
start = time.time()
f1_micro_keras0 = []
for train_ind, val_ind in kf.split(transformed_X_train):
    #Fit the model
    keras0.fit(transformed_X_train[train_ind], dummy_y_train[train_ind], epochs=1, batch_size=1000)
    #Make predictions
    y_pred_adj = keras0.predict_classes(transformed_X_train[val_ind])
    #Calculate the f1-micro score
    f1_micro = f1_score(np.argmax(dummy_y_train[val_ind], axis=1), y_pred_adj, average='micro')
    #Append the score to the list 
    f1_micro_keras0.append(f1_micro)
end = time.time()
print(f'Run time: {end-start}')
run_time_keras0 = end-start

Run time: 4.622860908508301


In [391]:
f1_micro_keras0 = np.mean(f1_micro_keras0)
f1_micro_keras0

0.7502327961115375

### Transform X_test

In [281]:
transformed_X_test = transformer.transform(X_test)

### Make predictions on X_test and score the results

After predicting classes, I will reverse transform the predictions so that 0, 1, 2 is mapped to 1, 2, 3 (the original target.)

In [371]:
y_pred = keras0.predict_classes(transformed_X_test)

In [372]:
y_pred_df = pd.DataFrame(y_pred, columns=['y_pred_adj'], index=X_test.index)

In [373]:
y_pred_df['y_pred'] = y_pred_df['y_pred_adj'].apply(lambda x: x + 1)

In [374]:
keras_score = f1_score(y_test, y_pred_df.y_pred, average='micro')
keras_score

0.7317309020582954