<a href="https://colab.research.google.com/github/donib-irakihda/fusemachines-ai-fellowship/blob/main/Evaluation_Metrics_Decision_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [20]:
DATASET_URL = "https://storage.googleapis.com/codehub-data/1-lv1-1-student-mat.csv"
df = pd.read_csv(DATASET_URL, sep=";")
print("Shape of data", df.shape)
df.head()

Shape of data (395, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [21]:
X = df[df.columns.difference(['G3'])]
y = df['G3']

y.loc[y< 10] = int(0)
y.loc[(y >= 10) & (y < 15)] = int(1)
y.loc[y >= 15] = int(2)

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [23]:
## Data Preprocessing

from sklearn.preprocessing import LabelEncoder

def preprocess(df):
  numeric = ['age', 'traveltime', 'Medu','Fedu', 'studytime', 'failures', 'famrel','freetime', 'goout', 'Dalc', 'Walc', 'health' , 'absences', 'G1', 'G2']
  cat_nominal = ['Mjob', 'Fjob', 'reason', 'guardian']
  cat_binary = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup','famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
  
  df[cat_binary] = df[cat_binary].apply(LabelEncoder().fit_transform)

  df = pd.get_dummies(df, columns=cat_nominal, prefix=cat_nominal)

  return df

X_train, x_test = preprocess(X_train), preprocess(X_test)

In [24]:
# Train different classifiers

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier

model_LR = LogisticRegression()

model_DT = DecisionTreeClassifier()

model_dummy = DummyClassifier(strategy= 'most_frequent')

model_LR.fit(X_train, y_train)
model_DT.fit(X_train, y_train)
model_dummy.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [25]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix

class_names = ["Grade A", "Grade B", "Grade C"]

from sklearn.metrics import ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay.from_estimator(model_LR, X_test, y_test, display_labels=class_names, cmap=plt.cm.Blues)

print("Total number of points in Test Set, Grade A: ", np.array(y_test.value_counts())[2])
print("Total number of points in Test Set, Grade B: ", np.array(y_test.value_counts())[1])
print("Total number of points in Test Set, Grade c: ", np.array(y_test.value_counts())[0])

plt.show()

ValueError: ignored

In [26]:
y_pred_LR = model_LR.predict(X_test)

ValueError: ignored

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Exercise
# Macro precision, recall and F1 score
precision_LR_macro = precision_score(y_test, y_pred_LR, average='macro')
recall_LR_macro = recall_score(y_test, y_pred_LR, average='macro')
f1score_LR_macro = f1_score(y_test, y_pred_LR,average='macro')

# Exercise
# Micro precision, recall and F1 score
precision_LR_micro = precision_score(y_test, y_pred_LR, average='micro')
recall_LR_micro = recall_score(y_test, y_pred_LR, average='micro')
f1score_LR_micro = f1_score(y_test, y_pred_LR,average='micro')

# Exercise
# Weighted precision, recall and F1 score
precision_LR_W = precision_score(y_test, y_pred_LR, average='weighted')
recall_LR_W = recall_score(y_test, y_pred_LR, average='weighted')
f1score_LR_W = f1_score(y_test, y_pred_LR,average='weighted')




print(f"Logistic Regression | Macro Precision - {round(precision_LR_macro,2)}, Macro Recall - { round(recall_LR_macro,2)}, Macro F1 Score - { round(f1score_LR_macro,2)}") 
print(f"Logistic Regression | Micro Precision - {round(precision_LR_micro,2)}, Micro Recall - { round(recall_LR_micro,2)}, Micro F1 Score - { round(f1score_LR_micro,2)}") 
print(f"Logistic Regression | Weighted Precision - {round(precision_LR_W,2)}, Weighted Recall - { round(recall_LR_W,2)}, Weighted F1 Score - { round(f1score_LR_W,2)}") 

In [27]:
# Import LabelBinarizer
from sklearn.preprocessing import label_binarize

print("Before Label Binarizer\n", y_test[0:5])

# 3 Labels : 0 - C, 1 - B, 2 - A  
labels = [0, 1, 2] 

# LabelBinarizer
y_test = label_binarize(y_test, classes=labels)
print("After Label Binarizer \n", y_test[0:5])

Before Label Binarizer
 146    0
379    1
247    0
197    1
368    1
Name: G3, dtype: int64
After Label Binarizer 
 [[1 0 0]
 [0 1 0]
 [1 0 0]
 [0 1 0]
 [0 1 0]]


In [28]:
# Probability Prediction
# Exercise
pred_LR = model_LR.predict_proba(X_test)

print(pred_LR[0:5])

ValueError: ignored

In [29]:
from sklearn.metrics import precision_recall_curve

### PRECISION-RECALL FOR LOGISTIC REGRESSION MODEL

precision_lr = dict()  # Dictionary to store Precision 
recall_lr = dict()     # Dictionary to store Recall 

for i in range(3):
    precision_lr[i], recall_lr[i], _ = precision_recall_curve(y_test[:, i], pred_LR[:, i])

# Plot Precision-recall curve for a each class
plt.figure(dpi=(120))
for i in range(3):
    plt.plot(recall_lr[i], precision_lr[i],label='class {}'.format(i))
    plt.plot([0, 1], [0, 1], 'k--')  

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc="best")

plt.show()

NameError: ignored

In [30]:
# Probability Prediction
pred_LR = model_LR.predict_proba(X_test)
pred_DT = model_DT.predict_proba(X_test)
pred_dummy = model_dummy.predict_proba(X_test)

ValueError: ignored

In [31]:
from sklearn.metrics import roc_auc_score

# Exercise
# Using OVR ( One Vs Rest Approach ) and Micro Averaging.
auc_score_LR = roc_auc_score(y_test, pred_LR, multi_class = 'ovr', average ='macro')
auc_score_DT = roc_auc_score(y_test, pred_DT, multi_class = 'ovr', average ='macro')
auc_score_Dummy = roc_auc_score(y_test, pred_dummy, multi_class = 'ovr', average ='macro')

print("AUC Score of Logistic Regression is ", auc_score_LR)
print("AUC Score of Decision Tree is ", auc_score_DT)
print("AUC Score of Dummy Classifier is ", auc_score_Dummy)

NameError: ignored

In [32]:
### ROC CURVE FOR LOGISTIC REGRESSION MODEL

# Compute ROC curve and ROC area for each class
from sklearn.metrics import roc_curve, auc

fpr = dict()     # Dictionary to Store False Positive Rate
tpr = dict()     # Dictionary to Store True Positive Rate
roc_auc = dict() # Dictionary to Store AUC Score

for i in range(3):
    # TPR and FPR for Logisitc Regression Model
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], pred_LR[:, i])
    # ROC Score
    roc_auc[i] = auc(fpr[i], tpr[i]) 



# Plot ROC curve for a specific class
plt.figure(dpi=(150))
for i in range(3):
    plt.plot(fpr[i], tpr[i], label='(AUC  = %0.2f)' %roc_auc[i] ) # TPR and FPR
    plt.plot([0, 1], [0, 1], 'k--')                               # Diagonal : Dummy Classifier 
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic - Logistic Regression')
    plt.legend(loc="lower right")
plt.text(0.52, 0.2, 'Grade C - ')  
plt.text(0.52, 0.12, 'Grade B - ')    
plt.text(0.52, 0.03, 'Grade A - ')    

plt.show()

NameError: ignored