# Student Performance Analysis

In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import statsmodels.api as sm
from sklearn import metrics
from sklearn.metrics import classification_report

In [93]:
# load datasets for two subjects, Math and Portuguese
mat = pd.read_csv("student-mat.csv", sep=',')
por = pd.read_csv("student-por.csv", sep=',')
df = pd.concat([mat,por])

In [94]:
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0,1044.0
mean,16.726054,2.603448,2.387931,1.522989,1.970307,0.264368,3.935824,3.201149,3.15613,1.494253,2.284483,3.543103,4.434866,11.213602,11.246169,11.341954
std,1.239975,1.124907,1.099938,0.731727,0.834353,0.656142,0.933401,1.031507,1.152575,0.911714,1.285105,1.424703,6.210017,2.983394,3.285071,3.864796
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,9.0,9.0,10.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [95]:
# rename column labels
df.columns = ['school','sex','age','address','family_size','parents_status','mother_education','father_education',
        'mother_job','father_job','reason','guardian','commute_time','study_time','failures','school_support',
        'family_support','paid_classes','activities','nursery','desire_higher_edu','internet','romantic',
        'family_quality', 'free_time','go_out','weekday_alcohol_usage','weekend_alcohol_usage','health',
        'absences','period1_score','period2_score','final_score']

In [96]:
# convert final_score to categorical variable # Good:15~20 Fair:10~14 Poor:0~9
## Distributing final scores in three categories for the purpose of classification
#- Students with 16<= score <=20 get 'good'
#- Students with 12<= score <=15 get 'fair'
#- Students with  0 <=score <=11 get 'poor'

df['final_grade'] = 'na'
df.loc[(df.final_score >= 16) & (df.final_score <= 20), 'final_grade'] = 'good' 
df.loc[(df.final_score >= 12) & (df.final_score <= 15), 'final_grade'] = 'fair' 
df.loc[(df.final_score >= 0) & (df.final_score <= 11), 'final_grade'] = 'poor' 
df.head(5)

Unnamed: 0,school,sex,age,address,family_size,parents_status,mother_education,father_education,mother_job,father_job,...,free_time,go_out,weekday_alcohol_usage,weekend_alcohol_usage,health,absences,period1_score,period2_score,final_score,final_grade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,poor
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,poor
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,poor
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,fair
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,poor


# Prepare Dataset for Modelling

In [97]:
dfd = df.drop([ 'final_score'], axis=1)

In [98]:
# label encode final_grade
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [99]:
dfd.final_grade = le.fit_transform(dfd.final_grade)

In [100]:
# dataset train_test_split
from sklearn.cross_validation import train_test_split
X = dfd.drop('final_grade',axis=1)
y = dfd.final_grade
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [101]:
# get dummy varibles 
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [102]:
# see total number of features
len(list(X_train))

58

### Random Forest performs much better with a large number of datasets than when data is small while Support Vector Machine performs better with a smaller number of data sets. In general SVMs are good because the training algorithm is efficient, and it has a regularisation parameter, which causes you to think about regularisation and over-fitting.

### 2. Decision Tree Classification

In [103]:
# find the optimal # of minimum samples leaf
from sklearn.tree import DecisionTreeClassifier
msl=[]
for i in range(1,58):
    tree = DecisionTreeClassifier(min_samples_leaf=i)
    t= tree.fit(X_train, y_train)
    ts=t.score(X_test, y_test)
    msl.append(ts)
msl = pd.Series(msl)
msl.where(msl==msl.max()).dropna()

25    0.901274
26    0.901274
27    0.901274
28    0.901274
29    0.901274
30    0.901274
31    0.901274
32    0.901274
33    0.901274
34    0.901274
35    0.901274
36    0.901274
37    0.901274
38    0.901274
39    0.901274
40    0.901274
41    0.901274
42    0.901274
43    0.901274
44    0.901274
45    0.901274
46    0.901274
47    0.901274
48    0.901274
49    0.901274
50    0.901274
51    0.901274
52    0.901274
53    0.901274
54    0.901274
55    0.901274
56    0.901274
dtype: float64

In [104]:
# final model
tree = DecisionTreeClassifier(min_samples_leaf=17)
t= tree.fit(X_train, y_train)
print("Decisioin Tree Model Score" , ":" , t.score(X_train, y_train) , "," , 
      "Cross Validation Score" ,":" , t.score(X_test, y_test))

Decisioin Tree Model Score : 0.9123287671232877 , Cross Validation Score : 0.8821656050955414


In [105]:
res = tree.predict(X_test)

In [106]:
report = classification_report(y_test, res)
print(report)

             precision    recall  f1-score   support

          0       0.86      0.86      0.86       133
          1       0.79      0.92      0.85        36
          2       0.93      0.89      0.91       145

avg / total       0.88      0.88      0.88       314



### 3. Random Forest Classification

In [107]:
# find a good # of estimators
from sklearn.ensemble import RandomForestClassifier

ne=[]
for i in range(1,58):
    forest = RandomForestClassifier()
    f = forest.fit(X_train, y_train)
    fs = f.score(X_test, y_test)
    ne.append(fs)
ne = pd.Series(ne)
ne.where(ne==ne.max()).dropna()

17    0.894904
49    0.894904
dtype: float64

In [108]:
# find a good # of min_samples_leaf
from sklearn.ensemble import RandomForestClassifier

ne=[]
for i in range(1,58):
    forest = RandomForestClassifier(n_estimators=36, min_samples_leaf=i)
    f = forest.fit(X_train, y_train)
    fs = f.score(X_test, y_test)
    ne.append(fs)
ne = pd.Series(ne)
ne.where(ne==ne.max()).dropna()

8    0.894904
dtype: float64

In [109]:
# final model
forest = RandomForestClassifier(n_estimators=36, min_samples_leaf=2)
f = forest.fit(X_train, y_train)
print("Random Forest Model Score" , ":" , f.score(X_train, y_train) , "," ,
      "Cross Validation Score" ,":" , f.score(X_test, y_test))

Raondom Forest Model Score : 0.9698630136986301 , Cross Validation Score : 0.8885350318471338


In [110]:
res = f.predict(X_test)

In [111]:
report = classification_report(y_test, res)
print(report)

             precision    recall  f1-score   support

          0       0.86      0.89      0.87       133
          1       1.00      0.67      0.80        36
          2       0.90      0.94      0.92       145

avg / total       0.89      0.89      0.89       314



### 4. Support Vector Classification

In [112]:
from sklearn.svm import SVC
svc = SVC()
s= svc.fit(X_train, y_train)
print("SVC Model Score" , ":" , s.score(X_train, y_train) , "," ,
      "Cross Validation Score" ,":" , s.score(X_test, y_test))

SVC Model Score : 0.9273972602739726 , Cross Validation Score : 0.8694267515923567


In [113]:
res = s.predict(X_test)

In [114]:
report = classification_report(y_test, res)
print(report)

             precision    recall  f1-score   support

          0       0.85      0.85      0.85       133
          1       0.88      0.81      0.84        36
          2       0.89      0.90      0.89       145

avg / total       0.87      0.87      0.87       314



### As visible from the above results, DecisionTree, RandomForest and Support Vector Classifiers perform almost equally.

### 5. Ada Boost Classification

In [115]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=3)
af = ada.fit(X_train, y_train)
print("Ada Boost Model Score" , ":" , af.score(X_train, y_train) , "," ,
      "Cross Validation Score" ,":" , af.score(X_test, y_test))

Ada Boost Model Score : 0.8863013698630137 , Cross Validation Score : 0.8757961783439491


In [116]:
res = af.predict(X_test)

In [117]:
report = classification_report(y_test, res)
print(report)

             precision    recall  f1-score   support

          0       0.93      0.77      0.84       133
          1       0.67      0.97      0.80        36
          2       0.91      0.94      0.93       145

avg / total       0.89      0.88      0.88       314



- Ada Boost classifier builds decision boundary using several different types of classifiers. Hence, it doesn't perform as well as the above 3 machine learning models. It gives close to 80% precision,recall and f1 scores.

### 6. Multi Layer Perceptron Classification

In [118]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(activation = "tanh", solver = "adam", alpha = 0.0001, max_iter = 20000, )
sf = mlp.fit(X_train, y_train)
print("Multi Layer Perceptron Model Score" , ":" , sf.score(X_train, y_train) , "," ,
      "Cross Validation Score" ,":" , sf.score(X_test, y_test))

Multi Layer Perceptron Model Score : 0.9246575342465754 , Cross Validation Score : 0.8503184713375797


In [119]:
res = mlp.predict(X_test)

In [120]:
report = classification_report(y_test, res)
print(report)

             precision    recall  f1-score   support

          0       0.85      0.79      0.82       133
          1       0.87      0.75      0.81        36
          2       0.84      0.93      0.89       145

avg / total       0.85      0.85      0.85       314



### As per cross validation score random forests do the best