# Classification Models and Classification Metrics

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics 

In [2]:
def classificationMetrics(y, yhat, yprob=None):
    """Summarise binary-classification performance as a dict of metrics.

    Parameters
    ----------
    y : array-like
        True class labels.
    yhat : array-like
        Predicted hard class labels; used for accuracy, precision, recall
        and f1.
    yprob : array-like, optional
        Predicted scores/probabilities for the positive class. A 2-D array
        with two columns (e.g. the output of ``predict_proba``) is reduced
        to its second column. When omitted, ``yhat`` is used instead, which
        keeps the previous behaviour but means 'Log-loss' and 'AUC' are
        computed from hard labels rather than probabilities.

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall', 'f1-score', 'Log-loss' and
        'AUC'. Precision, recall and f1 are the entries at index 1 of the
        per-class arrays returned by ``precision_recall_fscore_support``.
    """
    precision, recall, f1, _support = metrics.precision_recall_fscore_support(y, yhat)

    # Log-loss and ROC AUC are defined on probabilities/scores; prefer them
    # when the caller supplies them and fall back to the labels otherwise.
    if yprob is None:
        scores = yhat
    else:
        scores = np.asarray(yprob)
        if scores.ndim == 2 and scores.shape[1] == 2:
            # Keep only the positive-class column of a predict_proba matrix.
            scores = scores[:, 1]

    return {
        'Accuracy': metrics.accuracy_score(y, yhat),
        'Precision': precision[1],
        'Recall': recall[1],
        'f1-score': f1[1],
        'Log-loss': metrics.log_loss(y, scores),
        'AUC': metrics.roc_auc_score(y, scores),
    }


## Data

In [3]:
# Read flat_file3.csv into a DataFrame and keep only rows without missing values.
df = pd.read_csv("flat_file3.csv", index_col=False).dropna()

# Show the resulting frame (pandas abbreviates long frames to their first and last rows).
df

Unnamed: 0,restaurant_name,claimed,price_level,vegetarian_friendly,vegan_options,gluten_free,open_days_per_week,working_shifts_per_week,Certificate of Excellence 2011,Certificate of Excellence 2012,...,Street Parking,Table Service,Takeout,Television,Valet Parking,Validated Parking,Waterfront,Wheelchair Accessible,Wine and Beer,province_Code
1,"""Au Gentil Marquis"" Pâtisserie Cyrille Pertuisot",1.0,3.0,0,0,0,5,10.0,0,0,...,1,0,0,0,0,0,0,0,0,55
5,"""La Maree"" Poissonnerie & Restaurant",1.0,2.0,0,0,0,5,5.0,0,0,...,1,0,1,0,0,0,0,1,1,41
6,#Les2C,1.0,2.0,0,0,0,6,8.0,0,0,...,0,1,0,0,0,0,0,0,0,14
11,0 d'Attente,1.0,2.0,1,0,0,7,12.0,0,0,...,0,0,0,0,0,0,0,0,0,-1
16,1 Coeur Pour 2,1.0,2.0,0,0,0,1,1.0,0,0,...,0,0,0,0,0,0,0,0,0,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154916,İstanbul kebab Feytiat,1.0,1.0,0,0,0,6,11.0,0,0,...,0,0,0,0,0,0,0,0,0,42
154918,İstanbul restaurant,1.0,1.0,0,0,0,7,14.0,0,0,...,0,0,0,0,0,0,0,0,0,18
154919,Ōdate Sushi,1.0,2.0,0,0,0,6,11.0,0,0,...,0,0,0,0,0,0,0,0,0,39
154920,Ōtsu sushi bar,1.0,2.0,0,0,0,6,12.0,0,0,...,0,0,0,0,0,0,0,0,0,85


In [4]:
# Summary statistics of the loaded frame.
# NOTE(review): the saved output under this cell lists columns such as
# Pregnancies/Glucose/Outcome, which are not columns of this frame — it looks
# like a leftover from another dataset; re-run the cell to refresh it.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
# Class balance of the target column.
df.loc[:, 'high_rated_popular'].value_counts()

high_rated_popular
0    52403
1    25912
Name: count, dtype: int64

In [13]:
# Remove the restaurant name so only the remaining columns are used downstream.
# The cell rebinds `df`, so a second run would otherwise raise KeyError because
# the column is already gone; errors='ignore' keeps the cell safe to re-run.
df = df.drop(columns=['restaurant_name'], errors='ignore')


In [14]:
# Features: every column except the target. Target: the high_rated_popular flag.
X = df.loc[:, ~df.columns.isin(['high_rated_popular'])]
y = df.loc[:, 'high_rated_popular']

In [15]:
# Column names, non-null counts and dtypes after the restaurant_name column was dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78315 entries, 1 to 154923
Data columns (total 87 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   claimed                         78315 non-null  float64
 1   price_level                     78315 non-null  float64
 2   vegetarian_friendly             78315 non-null  int64  
 3   vegan_options                   78315 non-null  int64  
 4   gluten_free                     78315 non-null  int64  
 5   open_days_per_week              78315 non-null  int64  
 6   working_shifts_per_week         78315 non-null  float64
 7   Certificate of Excellence 2011  78315 non-null  int64  
 8   Certificate of Excellence 2012  78315 non-null  int64  
 9   Certificate of Excellence 2013  78315 non-null  int64  
 10  Certificate of Excellence 2014  78315 non-null  int64  
 11  Certificate of Excellence 2015  78315 non-null  int64  
 12  Certificate of Excellence 2016  7831

In [8]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Models

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import xgboost as xgb

In [11]:
# Empty frame that the model sections below extend with one row of metrics per model.
models_list = pd.DataFrame()

### Logistic Regression

In [13]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [17]:
# Logistic regression fit on the training split (fit returns the estimator itself).
mod1 = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
pred1 = mod1.predict(X_test)

In [18]:
# Logistic regression refit on the complete dataset.
# NOTE(review): this rebinds mod1/pred1, replacing the model fit on the train
# split, and the predictions below are for the same rows the model was fit on,
# so the metrics derived from pred1 are in-sample.
mod1 = LogisticRegression(random_state=1, max_iter=1000).fit(X, y)

pred1 = mod1.predict(X)

In [19]:
# Metric summary for the logistic-regression predictions.
classificationMetrics(y=y, yhat=pred1)

{'Accuracy': 0.913873459745898,
 'Precision': 0.8942042696721648,
 'Recall': 0.8389549243593701,
 'f1-score': 0.865698982537881,
 'Log-loss': 3.104315164522699,
 'AUC': 0.8949368824418837}

In [20]:
# One-row frame: the model label followed by its metric columns.
model_dict = dict(model="Logistic Regression")
new_row = pd.DataFrame([dict(model_dict, **classificationMetrics(y, pred1))])

# Append the row to the running comparison table and display it.
models_list = pd.concat((models_list, new_row), ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.913873,0.894204,0.838955,0.865699,3.104315,0.894937


In [21]:
# Confusion table: actual labels in rows, logistic-regression predictions in columns.
pd.crosstab(index=y, columns=pred1)

col_0,0,1
high_rated_popular,Unnamed: 1_level_1,Unnamed: 2_level_1
0,49831,2572
1,4173,21739


### Decision Tree

In [22]:
# Decision tree with a fixed seed.
# NOTE(review): fit on the full X, y rather than on X_train, y_train.
mod2 = DecisionTreeClassifier(random_state=1)
mod2.fit(X=X, y=y)

In [23]:
# Hard class predictions from the decision tree for every row of X.
pred2 = mod2.predict(X)

# One-row frame: the model label followed by its metric columns.
model_dict = dict(model="Decision Tree")
new_row = pd.DataFrame([dict(model_dict, **classificationMetrics(y, pred2))])

# Append the row to the running comparison table and display it.
models_list = pd.concat((models_list, new_row), ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.913873,0.894204,0.838955,0.865699,3.104315,0.894937
1,Decision Tree,0.999898,1.0,0.999691,0.999846,0.003682,0.999846


In [24]:
# Confusion table: actual labels in rows, decision-tree predictions in columns.
pd.crosstab(index=y, columns=pred2)

col_0,0,1
high_rated_popular,Unnamed: 1_level_1,Unnamed: 2_level_1
0,52403,0
1,8,25904


### Random Forest

In [25]:
# Random forest with a fixed seed.
# NOTE(review): fit on the full X, y rather than on X_train, y_train.
mod3 = RandomForestClassifier(random_state=1)
mod3.fit(X=X, y=y)

In [26]:
# Hard class predictions from the random forest for every row of X.
pred3 = mod3.predict(X)

# One-row frame: the model label followed by its metric columns.
model_dict = dict(model="RandomForest")
new_row = pd.DataFrame([dict(model_dict, **classificationMetrics(y, pred3))])

# Append the row to the running comparison table and display it.
models_list = pd.concat((models_list, new_row), ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.913873,0.894204,0.838955,0.865699,3.104315,0.894937
1,Decision Tree,0.999898,1.0,0.999691,0.999846,0.003682,0.999846
2,RandomForest,0.999885,0.999961,0.999691,0.999826,0.004142,0.999836


In [27]:
# Confusion table: actual labels in rows, random-forest predictions in columns.
pd.crosstab(index=y, columns=pred3)

col_0,0,1
high_rated_popular,Unnamed: 1_level_1,Unnamed: 2_level_1
0,52402,1
1,8,25904


### Adaptive Boosting (ADABoost)

In [28]:
# AdaBoost with a fixed seed.
# NOTE(review): fit on the full X, y rather than on X_train, y_train.
mod4 = AdaBoostClassifier(random_state=1)
mod4.fit(X=X, y=y)



In [29]:
# Hard class predictions from AdaBoost for every row of X.
pred4 = mod4.predict(X)

# One-row frame: the model label followed by its metric columns.
model_dict = dict(model="ADABoost")
new_row = pd.DataFrame([dict(model_dict, **classificationMetrics(y, pred4))])

# Append the row to the running comparison table and display it.
models_list = pd.concat((models_list, new_row), ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.913873,0.894204,0.838955,0.865699,3.104315,0.894937
1,Decision Tree,0.999898,1.0,0.999691,0.999846,0.003682,0.999846
2,RandomForest,0.999885,0.999961,0.999691,0.999826,0.004142,0.999836
3,ADABoost,0.910758,0.900487,0.82101,0.858914,3.216614,0.888073


In [27]:
# Confusion table: actual labels in rows, AdaBoost predictions in columns.
pd.crosstab(index=y, columns=pred4)

col_0,0,1
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
0,444,56
1,79,189


### Gradient Boosting Machine (GBM)

In [30]:
# Gradient boosting with a fixed seed.
# NOTE(review): fit on the full X, y rather than on X_train, y_train.
mod5 = GradientBoostingClassifier(random_state=1)
mod5.fit(X=X, y=y)

In [31]:
# Hard class predictions from gradient boosting for every row of X.
pred5 = mod5.predict(X)

# One-row frame: the model label followed by its metric columns.
model_dict = dict(model="GBM")
new_row = pd.DataFrame([dict(model_dict, **classificationMetrics(y, pred5))])

# Append the row to the running comparison table and display it.
models_list = pd.concat((models_list, new_row), ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.913873,0.894204,0.838955,0.865699,3.104315,0.894937
1,Decision Tree,0.999898,1.0,0.999691,0.999846,0.003682,0.999846
2,RandomForest,0.999885,0.999961,0.999691,0.999826,0.004142,0.999836
3,ADABoost,0.910758,0.900487,0.82101,0.858914,3.216614,0.888073
4,GBM,0.915814,0.890551,0.850031,0.869819,3.034359,0.899187


In [32]:
# Confusion table: actual labels in rows, gradient-boosting predictions in columns.
pd.crosstab(index=y, columns=pred5)

col_0,0,1
high_rated_popular,Unnamed: 1_level_1,Unnamed: 2_level_1
0,49696,2707
1,3886,22026


### Support Vector Machine (SVM)

In [33]:
# Support vector classifier.
# NOTE(review): fit on the full X, y rather than on X_train, y_train; no
# random_state is set here, unlike the other estimators, and probability=True
# is requested although only mod6.predict is used in the cells shown.
mod6 = SVC(probability=True)
mod6.fit(X=X, y=y)

In [32]:
# Hard class predictions from the SVM for every row of X.
pred6 = mod6.predict(X)

# One-row frame: the model label followed by its metric columns.
model_dict = dict(model="SVM")
new_row = pd.DataFrame([dict(model_dict, **classificationMetrics(y, pred6))])

# Append the row to the running comparison table and display it.
models_list = pd.concat((models_list, new_row), ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.78125,0.735849,0.58209,0.65,7.884549,0.735045
1,Decision Tree,1.0,1.0,1.0,1.0,2.220446e-16,1.0
2,RandomForest,1.0,1.0,1.0,1.0,2.220446e-16,1.0
3,ADABoost,0.824219,0.771429,0.705224,0.736842,6.335798,0.796612
4,GBM,0.90625,0.911765,0.809701,0.857708,3.379093,0.883851
5,SVM,0.768229,0.755682,0.496269,0.599099,8.353868,0.705134


In [33]:
# Confusion table: actual labels in rows, SVM predictions in columns.
pd.crosstab(index=y, columns=pred6)

col_0,0,1
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
0,457,43
1,135,133


In [34]:
### XGBoost

In [35]:
# XGBoost classifier with library defaults.
# NOTE(review): fit on the full X, y rather than on X_train, y_train; no
# random_state is set here, unlike most of the other estimators.
mod7 = xgb.XGBClassifier()
mod7.fit(X=X, y=y)

In [36]:
# Hard class predictions from XGBoost for every row of X.
pred7 = mod7.predict(X)

# One-row frame: the model label followed by its metric columns.
model_dict = dict(model="XGB")
new_row = pd.DataFrame([dict(model_dict, **classificationMetrics(y, pred7))])

# Append the row to the running comparison table and display it.
models_list = pd.concat((models_list, new_row), ignore_index=True)
models_list

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Logistic Regression,0.78125,0.735849,0.58209,0.65,7.884549,0.735045
1,Decision Tree,1.0,1.0,1.0,1.0,2.220446e-16,1.0
2,RandomForest,1.0,1.0,1.0,1.0,2.220446e-16,1.0
3,ADABoost,0.824219,0.771429,0.705224,0.736842,6.335798,0.796612
4,GBM,0.90625,0.911765,0.809701,0.857708,3.379093,0.883851
5,SVM,0.768229,0.755682,0.496269,0.599099,8.353868,0.705134
6,XGB,1.0,1.0,1.0,1.0,2.220446e-16,1.0


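## Model Selection

> **Note:** every row in the comparison table comes from a model fit on the full `X`, `y` and scored on those same rows, so the metrics are in-sample; the `X_test`/`y_test` split is not used for them. The saved tables below also do not match the per-model outputs shown earlier (e.g. Logistic Regression accuracy 0.781 here vs 0.914 above), so re-run the notebook before comparing models.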

In [37]:
# Rank the models from highest to lowest accuracy.
models_list.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
1,Decision Tree,1.0,1.0,1.0,1.0,2.220446e-16,1.0
2,RandomForest,1.0,1.0,1.0,1.0,2.220446e-16,1.0
6,XGB,1.0,1.0,1.0,1.0,2.220446e-16,1.0
4,GBM,0.90625,0.911765,0.809701,0.857708,3.379093,0.883851
3,ADABoost,0.824219,0.771429,0.705224,0.736842,6.335798,0.796612
0,Logistic Regression,0.78125,0.735849,0.58209,0.65,7.884549,0.735045
5,SVM,0.768229,0.755682,0.496269,0.599099,8.353868,0.705134


In [38]:
# Rank the models from highest to lowest AUC.
models_list.sort_values(by='AUC', ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
1,Decision Tree,1.0,1.0,1.0,1.0,2.220446e-16,1.0
2,RandomForest,1.0,1.0,1.0,1.0,2.220446e-16,1.0
6,XGB,1.0,1.0,1.0,1.0,2.220446e-16,1.0
4,GBM,0.90625,0.911765,0.809701,0.857708,3.379093,0.883851
3,ADABoost,0.824219,0.771429,0.705224,0.736842,6.335798,0.796612
0,Logistic Regression,0.78125,0.735849,0.58209,0.65,7.884549,0.735045
5,SVM,0.768229,0.755682,0.496269,0.599099,8.353868,0.705134
