# Best Model Selection

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# other libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

In [3]:
# load dataset
df = sns.load_dataset('titanic')
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
y = df['survived']

X = pd.get_dummies(X, columns=['sex'])  # one-hot encoding
X['age'] = X['age'].fillna(X['age'].mean())  # fill missing values with mean age

In [4]:
X.isnull().sum()  # check for missing values

pclass        0
age           0
sibsp         0
parch         0
fare          0
sex_female    0
sex_male      0
dtype: int64

In [5]:
# splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (712, 7)
X_test shape:  (179, 7)
y_train shape:  (712,)
y_test shape:  (179,)


In [7]:
# select best model

models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), SVC(), KNeighborsClassifier()]
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'KNN']

accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append([model_name, accuracy])

    precision = precision_score(y_test, y_pred)
    precision_scores.append([model_name, precision])

    recall = recall_score(y_test, y_pred)
    recall_scores.append([model_name, recall])

    f1 = f1_score(y_test, y_pred)
    f1_scores.append([model_name, f1])

In [8]:
def sorted_scores(scores):
    return sorted(scores, key=lambda x: x[1], reverse=True) 

In [9]:
# print accuracy Scores
for model in sorted_scores(accuracy_scores):
    print("Accuracy Score: ", f'{model[0]} : {model[1]:.2f}')

Accuracy Score:  Logistic Regression : 0.81
Accuracy Score:  Random Forest : 0.81
Accuracy Score:  Decision Tree : 0.75
Accuracy Score:  KNN : 0.69
Accuracy Score:  SVM : 0.66


In [10]:
# print precision Scores
for model in sorted_scores(precision_scores):
    print("Precision Score: ", f'{model[0]} : {model[1]:.2f}')

Precision Score:  Logistic Regression : 0.80
Precision Score:  Random Forest : 0.79
Precision Score:  SVM : 0.76
Precision Score:  Decision Tree : 0.71
Precision Score:  KNN : 0.65


In [11]:
# print recall Scores
for model in sorted_scores(recall_scores):
    print("Recall Score: ", f'{model[0]} : {model[1]:.2f}')

Recall Score:  Random Forest : 0.74
Recall Score:  Logistic Regression : 0.72
Recall Score:  Decision Tree : 0.68
Recall Score:  KNN : 0.54
Recall Score:  SVM : 0.26


In [12]:
# print f1 scores
for model in sorted_scores(f1_scores):
    print("F1 Score: ", f'{model[0]} : {model[1]:.2f}')

F1 Score:  Random Forest : 0.76
F1 Score:  Logistic Regression : 0.76
F1 Score:  Decision Tree : 0.69
F1 Score:  KNN : 0.59
F1 Score:  SVM : 0.38


In [13]:
print(accuracy_scores)

[['Logistic Regression', 0.8100558659217877], ['Decision Tree', 0.7541899441340782], ['Random Forest', 0.8100558659217877], ['SVM', 0.659217877094972], ['KNN', 0.6871508379888268]]


In [15]:
# create data frame from above lists
accuracy_df = pd.DataFrame(accuracy_scores, columns=['Model', 'Accuracy'])
precision_df = pd.DataFrame(precision_scores, columns=['Model', 'Precision'])
recall_df = pd.DataFrame(recall_scores, columns=['Model', 'Recall'])
f1_df = pd.DataFrame(f1_scores, columns=['Model', 'F1'])

# combine all dataframes
df = pd.merge(accuracy_df, precision_df, on='Model')
df = pd.merge(df, recall_df, on='Model')
df = pd.merge(df, f1_df, on='Model')

df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.810056,0.80303,0.716216,0.757143
1,Decision Tree,0.75419,0.714286,0.675676,0.694444
2,Random Forest,0.810056,0.785714,0.743243,0.763889
3,SVM,0.659218,0.76,0.256757,0.383838
4,KNN,0.687151,0.645161,0.540541,0.588235
