### Improving my models (Tuning)

#### Import the essential libraries

In [95]:
import numpy as np
import pandas as pd
from datetime import datetime
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

SEED = np.random.seed(9)

In [96]:
path = '../../data/processed/'

train = pd.read_csv(f'{path}train_processed.csv')
test = pd.read_csv(f'{path}test_processed.csv')

In [97]:
X = train.drop(['PassengerId','Survived'], axis=1)
y = train['Survived']

In [98]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

#### Logistic Regression - Parameters

In [99]:
rl_clf = LogisticRegression(random_state=SEED)

In [100]:
rl_parameters = {
    'penalty': ['l2'],
    'C': [0.05, 0.1, 0.4, 0.8],
    'solver': ['lbfgs'],
    'max_iter': [500, 700, 1000, 1200, 1500]
}

#### Random Forest Classifier - Parameters

In [101]:
rf_clf = RandomForestClassifier(random_state=SEED)

In [102]:
rf_parameters = {
    'n_estimators': [450, 500, 550],
    'criterion': ['gini', 'entropy'],
    'max_depth': [7, 8, 9],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4]
}

#### MLP Classifier - Parameters

In [103]:
mlp_clf = MLPClassifier(random_state=SEED)

In [113]:
mlp_parameters = {
    'solver': ['adam', 'lbfgs'],
    'alpha': [10.0**(-15), 10.0**(-20), 10.0**(-25)],
    'max_iter': [800, 1_000, 1_500],
    'hidden_layer_sizes': [(50, 50), (100, 50)]
}

### Using Grid Seach

- let's use a stopwatch to see how much time has passed

In [105]:
def stopwatch():
    start_time = datetime.now()
    print(f"Time: {start_time.hour}:{start_time.minute}:{start_time.second}")

#### Logistic Regression - Grid Search

In [106]:
stopwatch()

rl_kfold = KFold(shuffle=True, random_state=SEED, n_splits=8)
rl_grid_search = GridSearchCV(estimator=rl_clf, param_grid=rl_parameters, scoring='accuracy',cv=rl_kfold)
rl_grid_search = rl_grid_search.fit(X_train, y_train)

stopwatch()

Time: 14:27:16
Time: 14:27:17


##### Best Score 

In [107]:
rl_grid_search.best_score_.round(3)

0.82

##### Best Params

In [108]:
rl_grid_search.best_params_

{'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}

#### Random Forest - Grid Search

In [109]:
stopwatch()

rf_kfold = KFold(shuffle=True, random_state=SEED, n_splits=8)
rf_grid_search = GridSearchCV(estimator=rf_clf, param_grid=rf_parameters, scoring='accuracy',cv=rf_kfold)
rf_grid_search = rf_grid_search.fit(X_train, y_train)

stopwatch()

Time: 14:27:17
Time: 14:40:9


##### Best Score

In [110]:
rf_grid_search.best_score_.round(3)

0.848

##### Best Params

In [111]:
rf_grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 550}

#### MLP Classifier - Grid Search

In [114]:
start = stopwatch()

mlp_kfold = KFold(shuffle=True, random_state=42, n_splits=8)
mlp_grid_search = GridSearchCV(estimator=mlp_clf, param_grid=mlp_parameters, scoring='accuracy',cv=mlp_kfold)
mlp_grid_search = mlp_grid_search.fit(X_train, y_train)

end = stopwatch()

Time: 14:58:47
Time: 15:7:4


##### Best Score

In [116]:
mlp_grid_search.best_score_.round(3)

0.826

##### Best Params

In [117]:
mlp_grid_search.best_params_

{'alpha': 1e-15,
 'hidden_layer_sizes': (50, 50),
 'max_iter': 1500,
 'solver': 'adam'}

### Making predictions using the best results

In [131]:
rl_best = rl_grid_search.best_estimator_
y_pred_rl = rl_best.predict(X_val)

In [132]:
rf_best = rf_grid_search.best_estimator_
y_pred_rf = rf_best.predict(X_val)

In [133]:
mlp_best = mlp_grid_search.best_estimator_
y_pred_mlp = mlp_best.predict(X_val)

### Evaluating accuracy after tuning our models

- Logistic Regression

In [134]:
lr_ac = accuracy_score(y_val, y_pred_rl)
print(f"Accuracy Score (Logistic Regression): {lr_ac:.3f}")

Accuracy Score (Logistic Regression): 0.777


In [135]:
confusion_matrix(y_val, y_pred_rl)

array([[90, 10],
       [30, 49]])

- Random Forest

In [136]:
rf_ac = accuracy_score(y_val, y_pred_rf)
print(f"Accuracy Score (Random Forest): {lr_ac:.3f}")

Accuracy Score (Random Forest): 0.777


In [137]:
confusion_matrix(y_val, y_pred_rf)

array([[91,  9],
       [34, 45]])

- MLP

In [138]:
mlp_ac = accuracy_score(y_val, y_pred_mlp)
print(f"Accuracy Score (MLP): {mlp_ac:.3f}")

Accuracy Score (MLP): 0.799


In [139]:
confusion_matrix(y_val, y_pred_mlp)

array([[94,  6],
       [30, 49]])

### Ranking the accuracy of ML models

In [140]:
rank = pd.DataFrame({'Models': ['Logistic','RandomForest','MLP'],
                     'Accuracy': [lr_ac,rf_ac,mlp_ac]})

rank

Unnamed: 0,Models,Accuracy
0,Logistic,0.776536
1,RandomForest,0.759777
2,MLP,0.798883


### Making predictions for the test data

In [141]:
test.sample(3)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,IsMale,Relatives,Embarked
116,1008,3,0.0,0,0,-0.33963,1,0,1
25,917,3,1.803922,1,0,0.00194,1,1,0
229,1121,2,0.705882,0,0,-0.061608,1,0,0


In [142]:
X_test = test.drop('PassengerId',axis=1)

In [148]:
y_pred = rf_best.predict(X_test)

- Creating a new column with predictions in the test dataset

In [149]:
test['Survived'] = y_pred

In [150]:
submission = test[['PassengerId','Survived']]

In [151]:
submission.to_csv('../../data/predict/submission3.csv', index=False)

Public Score: 0.765

### Saving the model

In [154]:
joblib.dump(rf_grid_search, '../../saved_models/model_rf.pkl')

['../../saved_models/model_rf.pkl']