In [1]:
# Importing libraries 
import numpy as np
import pandas as pd
from sklearn import tree
from time import time
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,mean_absolute_error, classification_report,roc_auc_score, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that files
# stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
# Loading/reading the data set using pandas.
# NOTE: "Shared with me" is a view in the Drive web UI, not a directory that
# drive.mount() exposes, so reading through it raised FileNotFoundError.
# Add a shortcut to the shared "Hands_On_ML_Projects" folder in My Drive
# (right-click -> "Add shortcut to Drive") and read the file through My Drive.
dt = '/content/gdrive/My Drive/Hands_On_ML_Projects/Data Source/dataset.csv'
dataset = pd.read_csv(dt)

FileNotFoundError: ignored

In [None]:
# Keep every column except the last one as the model's input features
input_data = dataset.iloc[:, :-1]

In [None]:
# Displaying the first five rows of the input features
input_data.head(5)

Unnamed: 0,Alt,Bar,Fri,Hun,Pat,Price,Rain,Res,Type,Est
0,Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10
1,No,No,No,Yes,Some,$$$,No,Yes,French,0-11
2,Yes,No,No,Yes,Some,$$$,No,No,Thai,0-12
3,Yes,Yes,No,Yes,Full,$$$,No,Yes,Thai,0-13
4,Yes,Yes,No,Yes,Full,$$$,Yes,NO,Thai,> 60


In [None]:
# Peeking at the data set using pandas' describe() (count / unique / top / freq per column)
input_data.describe()

Unnamed: 0,Alt,Bar,Fri,Hun,Pat,Price,Rain,Res,Type,Est
count,25,25,25,25,25,25,25,25,25,25
unique,2,2,2,2,3,2,2,3,4,23
top,Yes,No,No,Yes,Some,$$$,No,Yes,French,30 - 60
freq,16,17,24,23,13,18,15,14,10,2


In [None]:
# Representing our categorical variables using One-Hot-Encoding (Dummy Variables).
# Each dummy column is prefixed with its source column name (e.g. "Alt_Yes",
# "Bar_Yes") so that categories shared by several features, such as "Yes"/"No",
# do not produce duplicate column names after concatenation.
dummies = [pd.get_dummies(dataset[c], prefix=c) for c in input_data.columns]

features = pd.concat(dummies, axis = 1)

# Feature matrix: use every row of the encoded frame. A hard-coded row slice
# would leave X and y with different lengths whenever the CSV row count changes.
X = features.values
le = LabelEncoder()

# Target vector: the 'WillWait' column encoded as integer class labels
y = le.fit_transform(dataset['WillWait'].values)

In [None]:
# Hold out 30% of the rows for testing (70/30 split); the fixed seed keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Evaluating & printing the model
def model_evaluation(y_test, y_pred):
    """Print confusion-matrix counts, rates, error metrics and a classification report.

    Parameters
    ----------
    y_test : array-like
        True binary labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels, using the same encoding and length as ``y_test``.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Pin the label order so the matrix is always 2x2, even when only one
    # class appears in y_test/y_pred (otherwise a 1x1 matrix is returned and
    # indexing the missing cells raises IndexError).
    cf_1 = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Row-major layout for labels [0, 1]: [[TN, FP], [FN, TP]]
    TN, FP, FN, TP = cf_1.ravel()

    def _rate(hits, total):
        # Percentage rounded to 2 decimals; nan when the rate is undefined
        # because the class has no samples.
        return round((hits / total) * 100, 2) if total else float('nan')

    TPR = _rate(TP, TP + FN)  # TPR = TP/(TP + FN)
    TNR = _rate(TN, TN + FP)  # TNR = TN/(TN + FP)

    # The previously computed accuracy, precision and ROC-AUC were never
    # reported, and roc_auc_score raises when y_test holds a single class, so
    # they are no longer computed here. Accuracy and precision still appear in
    # the classification report below.

    # Finding the MAE and RMSE values
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Displaying the performance metrics of the model
    print(f'TP - true positives: {TP}\nFP - false positives: {FP}\nTN - true negativess: '
          f'{TN}\nFN - false negatives: {FN}\nTPR - true positive rate: {TPR}%\n'
          f'TNR - true negative rate: {TNR}%')
    print("Mean Absolute Error (MAE) : {:.2}".format(mae))
    print("Root Mean Squared Error (RMSE) : {:.2}\n".format(rmse))

    print(classification_report(y_test, y_pred))
    print(end='\n')

In [None]:
# Method for DecisionTreeClassifier_model
def decisionTreeClassifier_model(X_train, X_test, y_train, y_test, random_state=42):
    """Train an entropy-based decision tree, report timings and metrics, return predictions.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for prediction respectively.
    y_train, y_test : array-like
        Labels for the training rows and the held-out rows.
    random_state : int or None, default 42
        Seed passed to ``DecisionTreeClassifier`` so that repeated runs build
        the same tree. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    y_pred
        The labels predicted by the fitted tree for ``X_test``.
    """
    # Create classifier (seeded so the reported metrics are reproducible)
    model = DecisionTreeClassifier(criterion='entropy', random_state=random_state)

    # Fit the classifier on the training features and labels.
    # The elapsed time is captured right after fit() so it excludes printing.
    t0 = time()
    model.fit(X_train, y_train)
    train_seconds = time() - t0
    print('\nPerfomance Report:\n')
    print("Training time:", round(train_seconds, 3), "s")

    # Predicting on the held-out features
    t1 = time()
    y_pred = model.predict(X_test)
    predict_seconds = time() - t1
    print("Prediction time:", round(predict_seconds, 3), "s\n")

    # Calling/displaying model evaluation function
    model_evaluation(y_test, y_pred)

    return y_pred

In [None]:
# Training the tree and printing its metrics via the user-defined
# decisionTreeClassifier_model function; the predictions are kept for later cells.
y_pred = decisionTreeClassifier_model(X_train, X_test, y_train, y_test)


Perfomance Report:

Training time: 0.002 s
Prediction time: 0.0 s

TP - true positives: 2
FP - false positives: 2
TN - true negativess: 2
FN - false negatives: 2
TPR - true positive rate: 50.0%
TNR - true negative rate: 50.0%
Mean Absolute Error (MAE) : 0.5
Root Mean Squared Error (RMSE) : 0.71

              precision    recall  f1-score   support

           0       0.50      0.50      0.50         4
           1       0.50      0.50      0.50         4

    accuracy                           0.50         8
   macro avg       0.50      0.50      0.50         8
weighted avg       0.50      0.50      0.50         8




In [None]:
# Side-by-side view of the true labels and the model's predictions
df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
df

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,1,0
3,0,1
4,1,1
5,0,0
6,1,0
7,0,1


In [None]:
# Error rate: fraction of test rows whose prediction differs from the true label
mismatches = y_pred != y_test
error_rate = np.mean(mismatches)
print('Error Rate: {}'.format(error_rate))

Error Rate: 0.5
