# Prediction involving decision trees and student performance data

using data set from https://archive.ics.uci.edu/ml/datasets/Student+Performance


In [1]:
# imports
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import sklearn as sklearn
from sklearn import tree
import matplotlib.pyplot as plt
%matplotlib qt

In [2]:
# import dataset to pandas dataframe
dataset = pd.read_csv('student_dataset/student-por.csv', sep=';')
dataset

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


In [3]:
# add label column 'pass' (0,1)
dataset['pass'] = dataset.apply(lambda row: 1 if (row['G1']+row['G2']+row['G3'])>=35 else 0 ,axis=1)
dataset = dataset.drop(columns=['G1','G2','G3'], axis=1)
dataset

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,pass
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,4,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,2,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,6,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,0,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,yes,no,5,4,2,1,2,5,4,0
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,yes,no,4,3,4,1,1,1,4,1
646,MS,F,18,U,GT3,T,1,1,other,other,...,no,no,1,1,1,1,1,5,6,0
647,MS,M,17,U,LE3,T,3,1,services,services,...,yes,no,2,4,5,3,4,2,6,0


In [4]:
cols = dataset.columns
numeric_cols = dataset._get_numeric_data().columns
categorical_cols = list(set(cols) - set(numeric_cols))
categorical_cols
len(categorical_cols)

['schoolsup',
 'address',
 'romantic',
 'nursery',
 'school',
 'higher',
 'reason',
 'internet',
 'paid',
 'Mjob',
 'famsup',
 'Pstatus',
 'famsize',
 'guardian',
 'sex',
 'activities',
 'Fjob']

17

In [5]:
# use 1-hot encoding on categorical columns
dataset = pd.get_dummies(data=dataset, columns=categorical_cols)
dataset

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_other,sex_F,sex_M,activities_no,activities_yes,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher
0,18,4,4,2,2,0,4,3,4,1,...,0,1,0,1,0,0,0,0,0,1
1,17,1,1,1,2,0,5,3,3,1,...,0,1,0,1,0,0,0,1,0,0
2,15,1,1,1,2,0,4,3,2,2,...,0,1,0,1,0,0,0,1,0,0
3,15,4,2,1,3,0,3,2,2,1,...,0,1,0,0,1,0,0,0,1,0
4,16,3,3,1,2,0,4,3,2,1,...,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,19,2,3,1,3,1,5,4,2,1,...,0,1,0,0,1,0,0,1,0,0
645,18,3,1,1,2,0,4,3,4,1,...,0,1,0,1,0,0,0,0,1,0
646,18,1,1,2,2,0,1,1,1,1,...,0,1,0,0,1,0,0,1,0,0
647,17,3,1,2,1,0,2,4,5,3,...,0,0,1,1,0,0,0,0,1,0


In [6]:
# shuffle rows and split to train(500) and test(149)
dataset = dataset.sample(frac=1)

train = dataset[:500]
test = dataset[500:]

X_train = train.drop(['pass'], axis=1)
y_train = train['pass']

X_test = test.drop(['pass'], axis=1)
y_test = test['pass']

dataset_att = dataset.drop(['pass'], axis=1)
passed = dataset['pass']


X_train
y_train
X_test
y_test

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_other,sex_F,sex_M,activities_no,activities_yes,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher
447,17,2,1,3,1,0,5,5,5,5,...,0,0,1,0,1,0,0,1,0,0
387,18,2,2,1,1,0,5,4,2,1,...,0,0,1,1,0,0,0,1,0,0
192,16,4,2,1,1,0,4,3,3,3,...,0,0,1,0,1,0,0,0,1,0
200,16,1,2,1,1,0,3,3,3,1,...,0,0,1,0,1,0,0,0,1,0
210,17,4,4,1,1,0,5,2,3,1,...,0,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,15,4,3,1,2,0,4,4,4,2,...,0,1,0,0,1,0,0,0,1,0
428,15,2,1,2,1,0,1,3,4,1,...,0,1,0,1,0,0,0,1,0,0
445,15,1,1,4,2,0,5,4,5,2,...,0,0,1,0,1,0,0,1,0,0
263,18,2,2,1,2,0,5,5,4,3,...,0,0,1,1,0,0,0,0,1,0


447    0
387    1
192    0
200    0
210    1
      ..
64     1
428    0
445    0
263    0
231    0
Name: pass, Length: 500, dtype: int64

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_other,sex_F,sex_M,activities_no,activities_yes,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher
449,15,1,2,2,1,0,5,1,2,1,...,0,1,0,1,0,0,0,0,1,0
565,16,1,1,2,1,0,4,5,5,2,...,0,0,1,0,1,0,0,1,0,0
288,18,2,1,1,2,0,4,3,4,1,...,0,1,0,1,0,0,0,1,0,0
215,16,2,3,1,2,0,4,4,3,1,...,0,1,0,0,1,0,0,1,0,0
512,18,1,1,1,2,1,4,3,2,1,...,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,17,3,3,1,2,0,4,3,4,1,...,0,0,1,0,1,0,0,1,0,0
601,18,2,2,2,2,0,1,3,1,1,...,0,1,0,1,0,0,0,1,0,0
438,17,2,3,2,1,0,5,5,5,1,...,0,1,0,1,0,0,0,1,0,0
162,15,2,1,4,1,0,4,5,5,2,...,0,0,1,1,0,0,0,1,0,0


449    1
565    0
288    1
215    1
512    0
      ..
191    0
601    0
438    0
162    0
493    0
Name: pass, Length: 149, dtype: int64

In [7]:
# check the pass distebution
print(f'{np.sum(passed)} out of {len(passed)}({round(float(np.sum(passed)*100)/len(passed),2)}%))')

328 out of 649(50.54%))


In [8]:
# Building the desicion tree of depth 5 using entropy 
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5)
model = model.fit(X_train, y_train)

fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(model,
                   filled=True)

In [9]:
# model score using test set
model.score(X_test, y_test)

0.6577181208053692

In [10]:
scores = sklearn.model_selection.cross_val_score(estimator=model,
                                                 X=dataset_att,
                                                 y=passed,
                                                 cv=5
                                                )
print(f'accuracy: {round(scores.mean(),2)} (+/-{round(scores.std(),2)})')

accuracy: 0.7 (+/-0.03)


In [11]:
# test different max_depth hyperparameter
depth_acc = np.empty((20,3), float)

for i in range(1,21):
    model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=i)
    scores = sklearn.model_selection.cross_val_score(estimator=model,
                                                     X=dataset_att,
                                                     y=passed,
                                                     cv=5
                                                    )
    depth_acc[i-1,0] = i
    depth_acc[i-1,1] = scores.mean()
    depth_acc[i-1,2] = scores.std()*2
    
    print(f'Max Depth: {i}, accuracy: {round(scores.mean(),2)} (+/-{round(scores.std(),2)})')
    
fig, ax = plt.subplots()
ax.errorbar(depth_acc[:,0],depth_acc[:,1], depth_acc[:,2])
plt.show()

Max Depth: 1, accuracy: 0.61 (+/-0.02)
Max Depth: 2, accuracy: 0.69 (+/-0.03)
Max Depth: 3, accuracy: 0.7 (+/-0.04)
Max Depth: 4, accuracy: 0.71 (+/-0.04)
Max Depth: 5, accuracy: 0.69 (+/-0.03)
Max Depth: 6, accuracy: 0.69 (+/-0.04)
Max Depth: 7, accuracy: 0.71 (+/-0.05)
Max Depth: 8, accuracy: 0.68 (+/-0.04)
Max Depth: 9, accuracy: 0.67 (+/-0.05)
Max Depth: 10, accuracy: 0.67 (+/-0.04)
Max Depth: 11, accuracy: 0.68 (+/-0.05)
Max Depth: 12, accuracy: 0.67 (+/-0.05)
Max Depth: 13, accuracy: 0.66 (+/-0.04)
Max Depth: 14, accuracy: 0.65 (+/-0.04)
Max Depth: 15, accuracy: 0.67 (+/-0.04)
Max Depth: 16, accuracy: 0.67 (+/-0.04)
Max Depth: 17, accuracy: 0.65 (+/-0.05)
Max Depth: 18, accuracy: 0.64 (+/-0.06)
Max Depth: 19, accuracy: 0.68 (+/-0.05)
Max Depth: 20, accuracy: 0.64 (+/-0.04)


<ErrorbarContainer object of 3 artists>