In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import pickle

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/Matthew1906/COMP67450001-FinalProject/main/datasets/processed.csv')

In [3]:
df['satisfaction'] = np.where(df['satisfaction'] == 'satisfied', 1,0 )
df['satisfaction'].value_counts()

0    58879
1    45025
Name: satisfaction, dtype: int64

### Data Understanding

In [4]:
df.sample(5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
34469,Female,Loyal Customer,23,Business travel,Business,1830,2,4,4,4,...,2,4,4,3,2,3,2,0,27.0,0
69879,Female,Loyal Customer,53,Business travel,Eco,481,5,4,4,4,...,5,5,5,5,5,5,1,1,1.0,1
40172,Female,Loyal Customer,12,Personal Travel,Eco,329,0,3,0,4,...,4,2,1,2,2,4,2,140,137.0,1
103531,Male,Loyal Customer,34,Business travel,Business,2723,5,5,5,5,...,4,4,3,4,3,4,5,0,0.0,1
85380,Female,Loyal Customer,27,Business travel,Eco,1183,3,3,3,3,...,3,1,1,3,1,4,3,81,62.0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  object 
 1   Customer Type                      103904 non-null  object 
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  object 
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    1039

In [6]:
df.describe()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
count,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0
mean,39.379706,1189.448375,2.729683,3.060296,2.756901,2.976883,3.202129,3.250375,3.439396,3.358158,3.382363,3.351055,3.631833,3.30429,3.640428,3.286351,14.815618,15.133392,0.433333
std,15.114964,997.147281,1.327829,1.525075,1.398929,1.277621,1.329533,1.349509,1.319088,1.332991,1.288354,1.315605,1.180903,1.265396,1.175663,1.312273,38.230901,38.649776,0.495538
min,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0,0.0
50%,40.0,843.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0,0.0
75%,51.0,1743.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0,1.0
max,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0,1.0


In [7]:
cat = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
continuous =['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
num = df.describe().columns.tolist()

In [8]:
for i in cat:
  print(i)
  print(df[i].value_counts(), end = '\n\n')

Gender
Female    52727
Male      51177
Name: Gender, dtype: int64

Customer Type
Loyal Customer       84923
disloyal Customer    18981
Name: Customer Type, dtype: int64

Type of Travel
Business travel    71655
Personal Travel    32249
Name: Type of Travel, dtype: int64

Class
Business    49665
Eco         46745
Eco Plus     7494
Name: Class, dtype: int64



### Splitting the dataset

In [9]:
train_df, test_df = train_test_split(df, random_state = 42, test_size = 0.2)

In [10]:
train_df.sample(5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
48424,Female,Loyal Customer,28,Business travel,Eco Plus,666,3,4,1,4,...,3,4,2,3,3,4,3,15,8.0,0
69849,Male,disloyal Customer,34,Business travel,Eco,371,2,0,2,2,...,3,5,2,1,2,3,3,56,40.0,0
28755,Male,Loyal Customer,52,Personal Travel,Eco,162,2,5,0,1,...,2,5,5,5,3,4,2,0,13.0,0
6870,Male,Loyal Customer,38,Business travel,Eco,692,5,5,5,5,...,5,3,2,1,2,5,5,0,0.0,1
47881,Female,Loyal Customer,21,Business travel,Business,2438,3,2,2,2,...,3,1,1,2,3,4,3,0,0.0,0


In [11]:
test_df.sample(5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
80358,Female,disloyal Customer,36,Business travel,Business,1390,2,2,2,2,...,3,5,5,4,4,4,3,1,0.0,0
44301,Female,Loyal Customer,63,Personal Travel,Eco,473,4,4,4,2,...,1,1,4,1,4,1,4,0,0.0,0
98677,Female,Loyal Customer,39,Business travel,Business,2865,1,5,5,5,...,1,1,1,1,1,1,1,0,0.0,0
3619,Female,Loyal Customer,9,Personal Travel,Eco,972,2,4,2,3,...,4,5,4,4,5,5,4,0,10.0,0
2212,Male,Loyal Customer,23,Business travel,Business,3414,2,2,2,2,...,5,5,5,5,3,5,5,1,0.0,1


### Remove Outliers

In [12]:
train_df[continuous].describe()

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes
count,83123.0,83123.0,83123.0,83123.0
mean,39.34135,1188.664714,14.851605,15.164094
std,15.123682,996.940695,38.494636,38.930686
min,7.0,31.0,0.0,0.0
25%,27.0,414.0,0.0,0.0
50%,40.0,842.0,0.0,0.0
75%,51.0,1739.0,12.0,13.0
max,85.0,4983.0,1592.0,1584.0


In [13]:
def removeOutlier(df, column):
    Q1=df[column].quantile(0.25)
    Q3=df[column].quantile(0.75)
    IQR=Q3-Q1
    df_final=df[~((df[column]<(Q1-1.5*IQR)) | (df[column]>(Q3+1.5*IQR)))]
    return df_final

In [14]:
print(train_df.shape)
train_df = removeOutlier(train_df, 'Departure Delay in Minutes')
print(train_df.shape)

(83123, 23)
(71441, 23)


### Preprocess Numerical Columns

In [15]:
def preprocessData(scaler, df, column, type):
  if type == 'train':
    df[column] = scaler.fit_transform(df[[column]])
  elif type == 'test':
    df[column] = scaler.transform(df[column])
  return df

In [16]:
minmax = MinMaxScaler()
for i in num:
  print(i)
  train_df = preprocessData(minmax, train_df, i, 'train')
  test_df = preprocessData(minmax, test_df, [i], 'test')

Age
Flight Distance
Inflight wifi service
Departure/Arrival time convenient
Ease of Online booking
Gate location
Food and drink
Online boarding
Seat comfort
Inflight entertainment
On-board service
Leg room service
Baggage handling
Checkin service
Inflight service
Cleanliness
Departure Delay in Minutes
Arrival Delay in Minutes
satisfaction


### Encode Categorical Columns

In [17]:
test_df.sample(5)

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
74431,Female,Loyal Customer,0.461538,Business travel,Business,0.317851,1.0,1.0,1.0,1.0,...,0.8,0.6,0.8,0.75,0.5,0.8,0.6,5.7,1.0,1.0
66821,Male,Loyal Customer,0.551282,Personal Travel,Eco,0.114903,0.8,0.4,0.8,0.8,...,0.2,0.6,0.4,0.5,0.5,0.8,0.2,0.0,0.0,1.0
59438,Female,Loyal Customer,0.512821,Personal Travel,Eco,0.054321,0.4,0.8,0.4,0.6,...,0.4,0.4,0.4,0.25,1.0,0.4,1.0,0.966667,0.175182,0.0
25873,Female,Loyal Customer,0.076923,Personal Travel,Eco,0.210824,0.4,1.0,0.6,0.4,...,1.0,0.8,0.8,1.0,0.5,0.8,1.0,0.6,0.094891,0.0
20154,Female,disloyal Customer,0.115385,Business travel,Eco,0.028675,0.4,0.8,0.4,0.6,...,0.4,0.4,0.2,0.5,0.75,0.6,0.4,3.5,0.708029,0.0


In [18]:
for i in cat:
  print(train_df[i].value_counts())

Female    36215
Male      35226
Name: Gender, dtype: int64
Loyal Customer       58445
disloyal Customer    12996
Name: Customer Type, dtype: int64
Business travel    49154
Personal Travel    22287
Name: Type of Travel, dtype: int64
Business    34292
Eco         32091
Eco Plus     5058
Name: Class, dtype: int64


In [19]:
def getDummies(df2, column):
  df2_ex = pd.DataFrame()
  df2_ex = pd.get_dummies(df2[column])

  df2 = pd.concat([df2, df2_ex], axis = 1)
  df2 = df2.drop(column, axis = 1)
  return df2

In [20]:
one_hot_columns = ['Gender', 'Type of Travel']
ordinal_columns = ['Class', 'Customer Type']

In [21]:
for i in one_hot_columns:
  train_df = getDummies(train_df, i)
  test_df = getDummies(test_df, i)  

In [22]:
def ordinalEncode(df, map, column):
  df[column] = df[column].map(map)
  return df
class_map = {'Business' : 2,
             'Eco Plus' : 1,
             'Eco': 0}
cType_map = {'Loyal Customer': 1,
             'disloyal Customer': 0}             

In [23]:
train_df['Class'].value_counts()

Business    34292
Eco         32091
Eco Plus     5058
Name: Class, dtype: int64

In [24]:
train_df = ordinalEncode(train_df, class_map, 'Class')
train_df = ordinalEncode(train_df, cType_map, 'Customer Type')
test_df = ordinalEncode(test_df, class_map, 'Class')
test_df = ordinalEncode(test_df, cType_map, 'Customer Type')

In [25]:
train_df.sample(5)

Unnamed: 0,Customer Type,Age,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,...,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Female,Male,Business travel,Personal Travel
63621,1,0.576923,2,0.167609,1.0,1.0,1.0,1.0,0.6,0.8,...,0.5,1.0,0.8,0.0,0.0,1.0,0,1,1,0
4781,1,0.307692,0,0.082189,0.8,0.8,0.8,0.2,0.8,0.8,...,1.0,0.8,0.8,0.0,0.007299,0.0,0,1,0,1
97192,1,0.358974,0,0.542407,0.6,1.0,0.6,0.6,0.6,0.4,...,0.25,0.8,0.4,0.233333,0.138686,0.0,1,0,0,1
80526,1,0.141026,2,0.372375,0.2,0.2,0.2,0.2,0.8,1.0,...,1.0,0.8,0.8,0.0,0.0,1.0,0,1,1,0
66334,1,0.512821,0,0.043821,1.0,0.8,0.8,0.8,1.0,1.0,...,0.5,0.6,1.0,0.0,0.0,1.0,0,1,1,0


### Model Selection

In [26]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 85 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [27]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.40.0-cp37-cp37m-manylinux2010_x86_64.whl (564 kB)
[K     |████████████████████████████████| 564 kB 5.2 MB/s 
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.40.0 slicer-0.0.7


In [28]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, recall_score, accuracy_score, roc_auc_score, precision_score, auc, roc_curve

In [29]:
models = {
    "Logistic Regression" : LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "LDA" : LinearDiscriminantAnalysis(), 
    "SGD" : SGDClassifier(),
    "Gaussian": GaussianNB(),
    "Random Forest" : RandomForestClassifier(), 
    "Gradient Boosting" : GradientBoostingClassifier(),
    "XGBoost" : XGBClassifier(),
    "CatBoost" : CatBoostClassifier(),
    "LGBM" : LGBMClassifier(),
    "KNN" : KNeighborsClassifier(n_neighbors=3)
}

In [30]:
X_train = train_df.drop("satisfaction", axis='columns')
y_train = train_df['satisfaction']
X_test = test_df.drop('satisfaction', axis='columns')
y_test = test_df['satisfaction']

In [31]:
scores = []
probability = {}
for model in models:
  classifier = models[model]
  classifier.fit(X_train, y_train)
  predicts = classifier.predict(X_test)
  try:
    score = classifier.predict_proba(X_test)[:,1]
    roc = roc_auc_score(y_test, score, average='weighted')
    probability[model] = score
  except:
    roc = 0
  scores.append([
    model, 
    accuracy_score(y_test, predicts),
    f1_score(y_test, predicts, average ='weighted'),
    precision_score(y_test, predicts),
    recall_score(y_test, predicts),
    roc
  ])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Learning rate set to 0.063763
0:	learn: 0.6057037	total: 67.4ms	remaining: 1m 7s
1:	learn: 0.5175633	total: 87.4ms	remaining: 43.6s
2:	learn: 0.4444911	total: 107ms	remaining: 35.6s
3:	learn: 0.4088680	total: 127ms	remaining: 31.5s
4:	learn: 0.3768392	total: 147ms	remaining: 29.2s
5:	learn: 0.3446428	total: 167ms	remaining: 27.6s
6:	learn: 0.3208842	total: 186ms	remaining: 26.4s
7:	learn: 0.2910685	total: 211ms	remaining: 26.2s
8:	learn: 0.2793013	total: 231ms	remaining: 25.4s
9:	learn: 0.2647521	total: 252ms	remaining: 25s
10:	learn: 0.2492099	total: 273ms	remaining: 24.5s
11:	learn: 0.2357348	total: 292ms	remaining: 24s
12:	learn: 0.2273777	total: 311ms	remaining: 23.6s
13:	learn: 0.2161293	total: 331ms	remaining: 23.3s
14:	learn: 0.2097410	total: 351ms	remaining: 23.1s
15:	learn: 0.2010471	total: 371ms	remaining: 22.8s
16:	learn: 0.1941085	total: 395ms	remaining: 22.8s
17:	learn: 0.1906422	total: 419ms	remaining: 22.9s
18:	learn: 0.1856781	total: 439ms	remaining: 22.7s
19:	learn: 0.

In [32]:
model_comparison_df = pd.DataFrame(scores)
model_comparison_df.rename({0:'Model', 1: 'Accuracy', 2: 'F1', 3: 'Precision', 4: 'Recall', 5: 'ROC'}, axis = 1, inplace=True)
model_comparison_df.sort_values('Recall', ascending=False).reset_index(). drop('index', axis = 1, inplace=True)
model_comparison_df

Unnamed: 0,Model,Accuracy,F1,Precision,Recall,ROC
0,Logistic Regression,0.866753,0.865966,0.877955,0.806793,0.916526
1,Decision Tree,0.9411,0.941135,0.927886,0.937914,0.94074
2,LDA,0.866176,0.865526,0.87232,0.812197,0.918199
3,SGD,0.869256,0.867931,0.899685,0.788266,0.0
4,Gaussian,0.841105,0.840107,0.847517,0.775364,0.879621
5,Random Forest,0.961166,0.961084,0.971249,0.938796,0.99343
6,Gradient Boosting,0.943314,0.943213,0.947787,0.92082,0.987916
7,XGBoost,0.939656,0.939531,0.945496,0.914424,0.987658
8,CatBoost,0.962466,0.962394,0.971445,0.941663,0.994979
9,LGBM,0.962273,0.962195,0.972292,0.94034,0.994717


Will be using the CatBoost Algorithm with the highest performance out of 10 classification algorithms

In [31]:
grid = {'max_depth': [5, 6, 7],
        'n_estimators':[300, 400, 500],
        'learning_rate' : [0.01, 0.05, 0.1, 0.15]}

In [32]:
cbc = CatBoostClassifier()

In [33]:
gscv = GridSearchCV (estimator = cbc, param_grid = grid, scoring ='accuracy', cv = 5)

In [34]:
gscv.fit(X_train, y_train)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
200:	learn: 0.0636799	total: 3.97s	remaining: 1.96s
201:	learn: 0.0635687	total: 3.99s	remaining: 1.93s
202:	learn: 0.0634076	total: 4.01s	remaining: 1.91s
203:	learn: 0.0631990	total: 4.03s	remaining: 1.89s
204:	learn: 0.0630627	total: 4.04s	remaining: 1.87s
205:	learn: 0.0630159	total: 4.06s	remaining: 1.85s
206:	learn: 0.0629130	total: 4.09s	remaining: 1.83s
207:	learn: 0.0628773	total: 4.11s	remaining: 1.82s
208:	learn: 0.0627804	total: 4.14s	remaining: 1.8s
209:	learn: 0.0626852	total: 4.16s	remaining: 1.78s
210:	learn: 0.0625046	total: 4.18s	remaining: 1.76s
211:	learn: 0.0623911	total: 4.2s	remaining: 1.74s
212:	learn: 0.0622747	total: 4.22s	remaining: 1.72s
213:	learn: 0.0621948	total: 4.24s	remaining: 1.7s
214:	learn: 0.0621108	total: 4.26s	remaining: 1.68s
215:	learn: 0.0620835	total: 4.27s	remaining: 1.66s
216:	learn: 0.0619584	total: 4.29s	remaining: 1.64s
217:	learn: 0.0618455	total: 4.31s	remaining: 1.62s
21

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostClassifier object at 0x7f00c06d2e90>,
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.15],
                         'max_depth': [5, 6, 7],
                         'n_estimators': [300, 400, 500]},
             scoring='accuracy')

In [35]:
#returns the estimator with the best performance
print(gscv.best_estimator_, end = '\n\n')

#returns the best score
print(gscv.best_score_, end = '\n\n')

#returns the best parameters
print(gscv.best_params_)

<catboost.core.CatBoostClassifier object at 0x7f00d6544550>

0.9627804851804577

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 400}


In [36]:
final_model = CatBoostClassifier(learning_rate=0.1, max_depth = 7, n_estimators= 400)
final_model.fit(X_train, y_train)

0:	learn: 0.5468935	total: 22.8ms	remaining: 9.1s
1:	learn: 0.4249368	total: 45.6ms	remaining: 9.07s
2:	learn: 0.3637575	total: 69.1ms	remaining: 9.14s
3:	learn: 0.3203396	total: 92.1ms	remaining: 9.12s
4:	learn: 0.2681264	total: 116ms	remaining: 9.18s
5:	learn: 0.2459512	total: 139ms	remaining: 9.12s
6:	learn: 0.2212577	total: 161ms	remaining: 9.05s
7:	learn: 0.2092383	total: 185ms	remaining: 9.05s
8:	learn: 0.1942657	total: 207ms	remaining: 8.99s
9:	learn: 0.1824168	total: 234ms	remaining: 9.12s
10:	learn: 0.1760044	total: 260ms	remaining: 9.19s
11:	learn: 0.1699009	total: 285ms	remaining: 9.2s
12:	learn: 0.1629295	total: 308ms	remaining: 9.16s
13:	learn: 0.1553620	total: 330ms	remaining: 9.11s
14:	learn: 0.1519693	total: 358ms	remaining: 9.2s
15:	learn: 0.1488424	total: 382ms	remaining: 9.16s
16:	learn: 0.1462022	total: 404ms	remaining: 9.1s
17:	learn: 0.1440555	total: 428ms	remaining: 9.09s
18:	learn: 0.1393287	total: 458ms	remaining: 9.19s
19:	learn: 0.1377307	total: 483ms	remaini

<catboost.core.CatBoostClassifier at 0x7f00bfe58a50>

In [38]:
predicts = final_model.predict(X_test)
print(f"Final Model's accuracy  : {accuracy_score(y_test, predicts)}")
print(f"Final Model's F1 Score  : {f1_score(y_test, predicts, average ='weighted')}")
print(f"Final Model's Precision : {precision_score(y_test, predicts)}")
print(f"Final Model's Recall    : {recall_score(y_test, predicts)}")

Final Model's accuracy  : 0.9628025600307973
Final Model's F1 Score  : 0.9627364843877363
Final Model's Precision : 0.9708252923146782
Final Model's Recall    : 0.9430966034406705


### Saving the model

In [39]:
with open('model.pickle','wb') as pickle_file:
    pickle.dump(final_model, pickle_file)

In [40]:
from google.colab import  files
files.download('model.pickle')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>