# Model Building

This notebook trains and examins different ML classifiers on training data dataset. 

* K Nearest neighbours
* SVM
* Random forest
* Logistic regression
* ANN

In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report,accuracy_score,roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

In [2]:
df_tr=pd.read_csv('./data/train_features_processed.csv')
y_tr=pd.read_csv('./data/train_labels_processed.csv')
y_tr=y_tr.squeeze()

In [3]:
df_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Age      891 non-null    float64
 1   SibSp    891 non-null    float64
 2   Parch    891 non-null    float64
 3   Fare     891 non-null    float64
 4   female   891 non-null    float64
 5   C        891 non-null    float64
 6   Q        891 non-null    float64
 7   Class_2  891 non-null    float64
 8   Class_3  891 non-null    float64
dtypes: float64(9)
memory usage: 62.8 KB


## 1. Splitting training data
 Split training dataset into training dataset, cross validation dataset and test dataset - 55%,25% and 20% resp. 

In [4]:
X_train_temp, X_test, y_train_temp, y_test = train_test_split(df_tr, y_tr, test_size=0.2, random_state=42)

In [5]:
X_train, X_cv, y_train, y_cv = train_test_split(X_train_temp, y_train_temp, test_size=0.25)

In [6]:
type(y_train)

pandas.core.series.Series

## 2. Model development

### 1. Logistic Regression

In [18]:
logReg=LogisticRegression(class_weight='auto',max_iter=200)
grid={'C':np.logspace(0,1,20),'solver':['newton-cg', 'lbfgs', 'sag', 'saga']}
logReg_cv=GridSearchCV(logReg,grid,cv=10)
logReg_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logReg_cv.best_params_)
print("accuracy :",logReg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 4.281332398719393, 'solver': 'newton-cg'}
accuracy : 0.803319357092942


In [19]:
logReg_fin=LogisticRegression(C=4.2813,solver='newton-cg',class_weight='auto',max_iter=200)
logReg_fin.fit(X_train,y_train)

y_pred_tr=logReg_fin.predict(X_train)

In [23]:
print(classification_report(y_train,y_pred_tr))
print(roc_auc_score(y_train,y_pred_tr))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       325
           1       0.79      0.70      0.74       209

    accuracy                           0.81       534
   macro avg       0.80      0.79      0.80       534
weighted avg       0.81      0.81      0.81       534

0.7901361796098638


In [25]:
y_pred_cv=logReg_fin.predict(X_cv)
print(classification_report(y_cv,y_pred_cv))
print(roc_auc_score(y_cv,y_pred_cv))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       119
           1       0.71      0.63      0.67        59

    accuracy                           0.79       178
   macro avg       0.77      0.75      0.76       178
weighted avg       0.79      0.79      0.79       178

0.7505341119498647
