In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import OneHotEncoder # one-hot encoding
from sklearn.decomposition import PCA # for PCA
from sklearn import tree # decision tree
from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.model_selection import KFold # K-fold validation
from sklearn import metrics
from sklearn.metrics import classification_report # for model evaluation metrics

In [69]:
""" Data input """
data = pd.read_csv('student-mat.csv', sep=";")

""" Data preprocessing """

# one-hot representation
data = pd.get_dummies(data)

# add new col based on G3
data['pass'] = data['G3'].map(lambda x: 'pass' if x >= 10 else 'fail')
data['eval'] = 'A'
data.loc[data['G3']<16, 'eval'] = 'B'
data.loc[data['G3']<14, 'eval'] = 'C'
data.loc[data['G3']<12, 'eval'] = 'D'
data.loc[data['G3']<10, 'eval'] = 'E'

# shuffle
data = data.sample(frac=1)

""" Train-Test Split """
X = data.iloc[:,0:-3].values
y_eval = data.iloc[:,-1].values
y_pass = data.iloc[:,-2].values

kf = KFold(n_splits=3)

array(['pass', 'pass', 'pass', 'fail', 'fail', 'pass', 'pass', 'fail',
       'fail', 'pass', 'pass', 'pass', 'fail', 'fail', 'pass', 'fail',
       'pass', 'pass', 'pass', 'fail', 'fail', 'fail', 'pass', 'pass',
       'pass', 'pass', 'pass', 'pass', 'fail', 'pass', 'pass', 'pass',
       'fail', 'pass', 'pass', 'pass', 'pass', 'pass', 'pass', 'pass',
       'pass', 'pass', 'pass', 'fail', 'pass', 'pass', 'pass', 'pass',
       'pass', 'pass', 'fail', 'pass', 'pass', 'fail', 'pass', 'pass',
       'fail', 'pass', 'pass', 'pass', 'pass', 'pass', 'fail', 'pass',
       'fail', 'fail', 'pass', 'pass', 'fail', 'pass', 'pass', 'pass',
       'pass', 'pass', 'fail', 'pass', 'fail', 'fail', 'fail', 'pass',
       'pass', 'pass', 'pass', 'fail', 'pass', 'fail', 'pass', 'fail',
       'pass', 'fail', 'pass', 'pass', 'pass', 'fail', 'fail', 'pass',
       'fail', 'fail', 'pass', 'pass', 'pass', 'pass', 'pass', 'fail',
       'pass', 'fail', 'pass', 'pass', 'fail', 'pass', 'pass', 'pass',
      

In [34]:
""" PCA """

def pca(dim, feature):
    pca_dim = PCA(dim)
    feature_pca = pca.fit_transform(feature)
    
    return pd.DataFrame(feature_pca)

In [39]:
""" Model Construciton """

# decision tree
DT_clf = tree.DecisionTreeClassifier()

for train_index, test_index in kf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    DT_clf = DT_clf.fit(X_train, y_train)
    y_predicted = DT_clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predicted)
    print(classification_report(y_test, y_predicted))

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        81
           1       1.00      1.00      1.00        51

    accuracy                           1.00       132
   macro avg       1.00      1.00      1.00       132
weighted avg       1.00      1.00      1.00       132

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        49

    accuracy                           1.00       132
   macro avg       1.00      1.00      1.00       132
weighted avg       1.00      1.00      1.00       132

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        32

    accuracy                           1.00       131
   macro avg       1.00      1.00      1.00       131
weighted avg       1.00      1.00      1.00       131

