### Improving my models (Tuning)

#### Import the essential libraries

In [21]:
import numpy as np
import pandas as pd
from datetime import datetime
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

SEED = np.random.seed(9)

In [22]:
path = '../../data/processed/'

train = pd.read_csv(f'{path}train_processed.csv')
test = pd.read_csv(f'{path}test_processed.csv')

In [23]:
X = train.drop(['PassengerId','Survived'], axis=1)
y = train['Survived']

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

#### Logistic Regression - Parameters

In [25]:
rl_clf = LogisticRegression(random_state=SEED)

In [26]:
rl_parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear', 'saga'],
    'max_iter': [1000, 10000, 50000, 72000]
}

#### Random Forest Classifier - Parameters

In [27]:
rf_clf = RandomForestClassifier(random_state=SEED)

In [28]:
rf_parameters = {
    'n_estimators': [100, 200, 500, 1_000],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [2, 4, 6, 8, None],
    'max_features': ['sqrt', 'log2', None]
}

#### MLP Classifier - Parameters

In [29]:
mlp_clf = MLPClassifier(random_state=SEED)

In [30]:
mlp_parameters = {
    'solver':  ['lbfgs', 'sgd', 'adam'],
    'alpha': [10.0**(-1), 10.0**(-5), 10.0**(-7), 10.0**(-10)],
    'max_iter': [500, 1_000,5_000, 10_000]
}

### Using Grid Seach

- let's use a stopwatch to see how much time has passed

In [34]:
def stopwatch():
    start_time = datetime.now()
    print(f"Time: {start_time.hour}:{start_time.minute}:{start_time.second}")

#### Logistic Regression - Grid Search

In [35]:
stopwatch()

rl_kfold = KFold(shuffle=True, random_state=SEED, n_splits=8)
rl_grid_search = GridSearchCV(estimator=rl_clf, param_grid=rl_parameters, scoring='accuracy',cv=rl_kfold)
rl_grid_search = rl_grid_search.fit(X_train, y_train)

stopwatch()

Time: 18:45:52
Time: 18:45:57


##### Best Score 

In [40]:
rl_grid_search.best_score_.round(3)

0.816

##### Best Params

In [43]:
rl_grid_search.best_params_

{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}

#### Random Forest - Grid Search

In [37]:
stopwatch()

rf_kfold = KFold(shuffle=True, random_state=SEED, n_splits=8)
rf_grid_search = GridSearchCV(estimator=rf_clf, param_grid=rf_parameters, scoring='accuracy',cv=rf_kfold)
rf_grid_search = rf_grid_search.fit(X_train, y_train)

stopwatch()

Time: 19:1:25
Time: 19:21:38


##### Best Score

In [42]:
rf_grid_search.best_score_.round(3)

0.848

##### Best Params

In [44]:
rf_grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 500}

#### MLP Classifier - Grid Search

In [45]:
start = stopwatch()

mlp_kfold = KFold(shuffle=True, random_state=42, n_splits=8)
mlp_grid_search = GridSearchCV(estimator=mlp_clf, param_grid=mlp_parameters, scoring='accuracy',cv=mlp_kfold)
mlp_grid_search = mlp_grid_search.fit(X_train, y_train)

end = stopwatch()

Time: 19:27:38
Time: 19:35:52


##### Best Score

In [47]:
mlp_grid_search.best_score_.round(3)

0.833

##### Best Params

In [49]:
mlp_grid_search.best_params_

{'alpha': 1e-10, 'max_iter': 1000, 'solver': 'adam'}

### Making predictions using the best results

In [60]:
rl_best = rl_grid_search.best_estimator_
y_pred_rl = rl_best.predict(X_val)

In [61]:
rf_best = rf_grid_search.best_estimator_
y_pred_rf = rf_best.predict(X_val)

In [62]:
best_mlp = mlp_grid_search.best_estimator_
y_pred_mlp = best_mlp.predict(X_val)

### Evaluating accuracy after tuning our models

- Logistic Regression

In [63]:
lr_ac = accuracy_score(y_val, y_pred_rl)
print(f"Accuracy Score (Logistic Regression): {lr_ac:.3f}")

Accuracy Score (Logistic Regression): 0.777


In [66]:
confusion_matrix(y_val, y_pred_rl)

array([[90, 10],
       [30, 49]])

- Random Forest

In [64]:
rf_ac = accuracy_score(y_val, y_pred_rf)
print(f"Accuracy Score (Random Forest): {lr_ac:.3f}")

Accuracy Score (Random Forest): 0.777


In [67]:
confusion_matrix(y_val, y_pred_rf)

array([[91,  9],
       [33, 46]])

- MLP

In [65]:
mlp_ac = accuracy_score(y_val, y_pred_mlp)
print(f"Accuracy Score (MLP): {mlp_ac:.3f}")

Accuracy Score (MLP): 0.771


In [68]:
confusion_matrix(y_val, y_pred_mlp)

array([[90, 10],
       [31, 48]])

### Ranking the accuracy of ML models

In [69]:
rank = pd.DataFrame({'Models': ['Logistic','RandomForest','MLP'],
                     'Accuracy': [lr_ac,rf_ac,mlp_ac]})

rank

Unnamed: 0,Models,Accuracy
0,Logistic,0.776536
1,RandomForest,0.765363
2,MLP,0.77095


### Making predictions for the test data

In [70]:
test.sample(3)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,IsMale,Relatives,Embarked
61,953,2,0.392157,0,0,-0.040425,1,0,0
69,961,1,2.588235,1,4,10.529728,0,5,0
118,1010,1,0.705882,0,0,2.575283,1,0,1


In [71]:
X_test = test.drop('PassengerId',axis=1)

In [72]:
y_pred = rf_best.predict(X_test)

- Creating a new column with predictions in the test dataset

In [73]:
test['Survived'] = y_pred

In [74]:
submission = test[['PassengerId','Survived']]

In [75]:
submission.to_csv('../../data/predict/submission2.csv', index=False)

Public Score: 0.765