# Model Building

This notebook trains and examins different ML classifiers on training data dataset. 

* K Nearest neighbours
* SVM
* Random forest
* Logistic regression
* Decision Tree
* ADABoost
* ANN

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

import pickle

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight

In [2]:
df_tr = pd.read_csv('./data/train_features_processed.csv')
y_tr = pd.read_csv('./data/train_labels_processed.csv')
y_tr = y_tr.squeeze()

In [3]:
df_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Age      891 non-null    float64
 1   SibSp    891 non-null    float64
 2   Parch    891 non-null    float64
 3   Fare     891 non-null    float64
 4   female   891 non-null    float64
 5   C        891 non-null    float64
 6   Q        891 non-null    float64
 7   Class_2  891 non-null    float64
 8   Class_3  891 non-null    float64
dtypes: float64(9)
memory usage: 62.8 KB


## 1. Splitting training data
 Split training dataset into training dataset, cross validation dataset and test dataset - 55%,25% and 20% resp. 

In [4]:
X_train_temp, X_test, y_train_temp, y_test = train_test_split(df_tr,
                                                              y_tr,
                                                              test_size=0.2,
                                                              random_state=42)

In [5]:
X_train, X_cv, y_train, y_cv = train_test_split(X_train_temp,
                                                y_train_temp,
                                                test_size=0.25)

In [6]:
type(y_train)

pandas.core.series.Series

## 2. Model development

In [7]:
def bestModel(model, X_train, X_cv, y_train, y_cv, grid):
    model_cv = GridSearchCV(model, grid, cv=4)
    model_cv.fit(X_train, y_train)
    print("tuned hpyerparameters :(best parameters) ", model_cv.best_params_)
    print("accuracy :", model_cv.best_score_)
    mod_best= model_cv.best_estimator_
    return mod_best


def testModel(model, X_train, X_cv, y_train, y_cv):
    model.fit(X_train,y_train)
    y_pred_tr = model.predict(X_train)
    y_pred_cv = model.predict(X_cv)
    precision_tr, recall_tr, fscore_tr, support_tr = precision_recall_fscore_support(
        y_train, y_pred_tr, average='macro')
    precision_cv, recall_cv, fscore_cv, support_cv = precision_recall_fscore_support(
        y_cv, y_pred_cv, average='macro')
    report= np.array([precision_tr, recall_tr, fscore_tr, precision_cv, recall_cv, fscore_cv])
    return report,model

def testResult(model_trained,X_test):
    y_pred_test=model_trained.predict(X_test)
    return y_pred_test

### 1. Logistic Regression

In [8]:
logReg = LogisticRegression(class_weight='auto', max_iter=200)
grid = {
    'C': np.logspace(-5, 0, 40),
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']
}
logReg_best=bestModel(logReg, X_train, X_cv, y_train, y_cv, grid)

tuned hpyerparameters :(best parameters)  {'C': 0.02894266124716749, 'solver': 'newton-cg'}
accuracy : 0.7920547637751094


### 2. K-Nearest neighbours

In [None]:
knn = KNeighborsClassifier()
grid = {
    'n_neighbors': np.int0(np.linspace(3, 22, 20)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
knn_best=bestModel(knn, X_train, X_cv, y_train, y_cv, grid)

### 3. Decision tree

In [None]:
decTree = DecisionTreeClassifier()
grid = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}
decTree_best=bestModel(decTree, X_train, X_cv, y_train, y_cv, grid)

### 4. Random forest

In [None]:
rnFr = RandomForestClassifier()
grid = {
    'n_estimators': np.array([10,20,30]),
    'criterion': ['gini', 'entropy']
}
rnFr_best=bestModel(rnFr, X_train, X_cv, y_train, y_cv, grid)

### 5. SVM

In [None]:
svm = SVC()
grid = {
    'C': np.logspace(-5, 2, 10),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
svm_best=bestModel(svm, X_train, X_cv, y_train, y_cv, grid)

### 6. ADABoost

In [None]:
adab = AdaBoostClassifier()
grid = {'algorithm': ['SAMME', 'SAMME.R']}
adab_best=bestModel(adab, X_train, X_cv, y_train, y_cv, grid)

### 7. XGBoost

In [None]:
xgbst = xgb.XGBClassifier(use_label_encoder=False, objective='binary:logistic')
grid = {'n_estimators': np.array([50, 70, 100])}
xgbst_best=bestModel(xgbst, X_train, X_cv, y_train, y_cv, grid)

### 8. Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout

In [None]:
ann_mod=Sequential()
ann_mod.add(Dense(units=9,activation='relu'))
ann_mod.add(Dropout(0.5))
ann_mod.add(Dense(units=18,activation='relu'))
ann_mod.add(Dropout(0.5))
ann_mod.add(Dense(units=1,activation='sigmoid'))
ann_mod.compile(optimizer='adam',loss='binary_crossentropy')

In [None]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25)

In [None]:
ann_mod.fit(x=X_train.values, 
          y=y_train.values, 
          epochs=600,
          validation_data=(X_cv.values, y_cv.values), verbose=1,
          callbacks=[early_stop])

In [None]:
model_loss=pd.DataFrame(ann_mod.history.history)
model_loss.plot()

In [None]:
y_pred_tr = ann_mod.predict_classes(X_train.values)
y_pred_cv = ann_mod.predict_classes(X_cv.values)
precision_tr, recall_tr, fscore_tr, support_tr = precision_recall_fscore_support(
    y_train.values, y_pred_tr, average='macro')
precision_cv, recall_cv, fscore_cv, support_cv = precision_recall_fscore_support(
    y_cv.values, y_pred_cv, average='macro')

In [None]:
print(precision_tr, recall_tr, fscore_tr, support_tr)
print(precision_cv, recall_cv, fscore_cv, support_cv)

In [None]:
ann_mod.save('ann_titanic.h5')


## 3. Model testing

In [None]:
model_list=[logReg_best,knn_best,decTree_best,rnFr_best,svm_best,adab_best,xgbst_best]

In [None]:
for model in model_list:
    print(str(model).split('(')[0])

In [None]:
report_d={}
for model in model_list:
    name=str(model).split('(')[0]
    filename = './models/'+name + '_titanic.model'    
    report,mod=testModel(model, X_train, X_cv, y_train, y_cv)
    pickle.dump(model, open(filename, 'wb'))
    report_d[name]=report

In [None]:
report_d