## Decision Trees



In [1]:
#imports
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm

import scipy
from scipy.stats import norm
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,6)

from sklearn import metrics
from sklearn.cross_validation import train_test_split, cross_val_predict, cross_val_score, KFold

from sklearn import tree

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, f_classif

from sklearn import svm

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('ggplot')

#import data
# Load the pre-cleaned dataset from the notebook's working directory (relative path).
# Later cells read the `Dalc` and `Walc` columns plus every column named in `predictors`
# from this frame, and add derived `Dalc_high` / `Walc_high` columns to it.
Cleaned_data = pd.read_csv("CleanData.csv")

In [32]:
#select best predictors
# Candidate feature columns. Order matters: the selector's boolean support mask
# is mapped back to these names by position.
predictors = ['Fedu', 'G1', 'G2', 'G3', 'Medu', 'absences', 'age', 
                'failures', 'famrel', 'freetime', 'goout', 'health', 'math', 
                'port', 'studytime', 'traveltime', 'school_num', 'sex_num','address_num', 
                'famsize_num', 'Pstatus_num', 'Mjob_num', 'Fjob_num','reason_num', 
                'guardian_num', 'schoolsup_num', 'famsup_num', 'activities_num','nursery_num', 
                'higher_num', 'internet_num', 'romantic_num', 'paid_num']

# Binarise the alcohol-consumption scales: levels 1-2 -> 0, levels 3-5 -> 1.
# Any value missing from the mapping becomes NaN.
Cleaned_data['Dalc_high'] = Cleaned_data['Dalc'].map({1:0, 2:0, 3:1, 4:1, 5:1})
Cleaned_data['Walc_high'] = Cleaned_data['Walc'].map({1:0, 2:0, 3:1, 4:1, 5:1})

# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; it returns the same NumPy array.
y_dalc = Cleaned_data['Dalc_high'].values
y_walc = Cleaned_data['Walc_high'].values
X = Cleaned_data[predictors].values


def _k_best_labels(features, target, labels, k=10):
    """Pick the k highest-scoring features with the ANOVA F-test.

    Parameters
    ----------
    features : array of shape (n_samples, len(labels))
        Feature matrix whose columns are in the same order as `labels`.
    target : array of shape (n_samples,)
        Class labels used to score each feature.
    labels : list of str
        Column names corresponding to the columns of `features`.
    k : int, default 10
        Number of features to keep.

    Returns
    -------
    (chosen, selector, reduced) : the names of the kept columns (in `labels`
    order), the fitted SelectKBest instance, and the reduced feature matrix.
    """
    selector = SelectKBest(f_classif, k=k)
    reduced = selector.fit_transform(features, target)
    # get_support() is a boolean mask aligned with the feature columns.
    chosen = [label for label, keep in zip(labels, selector.get_support()) if keep]
    return chosen, selector, reduced


# NOTE(review): the selector is fitted on every row, including rows that later
# fall into the test half of train_test_split, so the chosen labels have seen
# test targets. Consider fitting the selector on the training split only.
best_labels_dalc, selector, X_new = _k_best_labels(X, y_dalc, predictors)
print(best_labels_dalc)

# `selector` and `X_new` end up holding the Walc fit, as in the original cell.
best_labels_walc, selector, X_new = _k_best_labels(X, y_walc, predictors)
print(best_labels_walc)

#SOURCE: https://github.com/datascience-course/2016-datascience-labs/blob/master/lab10-classification/lab-10-classification.ipynb

['G1', 'G2', 'G3', 'absences', 'age', 'freetime', 'goout', 'studytime', 'sex_num', 'guardian_num']
['G1', 'G2', 'G3', 'absences', 'failures', 'freetime', 'goout', 'health', 'studytime', 'sex_num']


In [33]:
#split into test and training data
# Rebuild the full feature matrix so this cell does not depend on `X` surviving
# unchanged from the selection cell. `.values` replaces `as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0.
X = Cleaned_data[predictors].values

# 50/50 split with a fixed seed; both targets use the same seed and test size.
XTrain_dalc, XTest_dalc, yTrain_dalc, yTest_dalc = train_test_split(
    X, y_dalc, random_state=1, test_size=0.5)

XTrain_walc, XTest_walc, yTrain_walc, yTest_walc = train_test_split(
    X, y_walc, random_state=1, test_size=0.5)

In [113]:
#use all predictors in classification (target: Walc_high)
# max_depth and min_samples_split limit how far the tree can grow.
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be resolved differently from
# run to run and change the reported accuracy.
decisionTree = tree.DecisionTreeClassifier(max_depth=5, min_samples_split=100,
                                           random_state=1)

# fit the tree with the training data
decisionTree = decisionTree.fit(XTrain_walc, yTrain_walc)

# training-set predictions, kept so train vs. test accuracy can be compared
y_pred_train = decisionTree.predict(XTrain_walc)

# predict on the held-out half and report accuracy and the confusion matrix
# (rows = true class, columns = predicted class)
y_pred = decisionTree.predict(XTest_walc)
print('Accuracy on test data, all predictors= ', metrics.accuracy_score(y_true = yTest_walc, y_pred = y_pred))
print(metrics.confusion_matrix(y_true = yTest_walc, y_pred = y_pred))

#SOURCE:https://github.com/datascience-course/2016-datascience-labs/tree/master/lecture9-decision-trees

Accuracy on test data, all predictors=  0.731301939058
[[207  10]
 [ 87  57]]


In [168]:
# use all predictors in classification (target: Dalc_high)
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be resolved differently from
# run to run and change the reported accuracy.
decisionTree = tree.DecisionTreeClassifier(max_depth=10, min_samples_split=2,
                                           random_state=1)

# fit the tree with the training data
decisionTree = decisionTree.fit(XTrain_dalc, yTrain_dalc)

# training-set predictions, kept so train vs. test accuracy can be compared
y_pred_train = decisionTree.predict(XTrain_dalc)

# predict on the held-out half and report accuracy and the confusion matrix
# (rows = true class, columns = predicted class)
# NOTE(review): the recorded confusion matrix for this split shows 314 true
# negatives-class rows vs 47 positives, so always predicting 0 would score
# about 0.87 accuracy; judge this model against that baseline.
y_pred = decisionTree.predict(XTest_dalc)
print('Accuracy on test data, all predictors= ', metrics.accuracy_score(y_true = yTest_dalc, y_pred = y_pred))
print(metrics.confusion_matrix(y_true = yTest_dalc, y_pred = y_pred))

#SOURCE:https://github.com/datascience-course/2016-datascience-labs/tree/master/lecture9-decision-trees

Accuracy on test data, all predictors=  0.828254847645
[[291  23]
 [ 39   8]]


In [83]:
#split into test and training data using only the best predictors for each target
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
X_dalc = Cleaned_data[best_labels_dalc].values
X_walc = Cleaned_data[best_labels_walc].values

# Same seed and test size as the all-predictor split, so the comparison between
# the two feature sets uses an identically configured 50/50 split.
XTrain_dalc_best, XTest_dalc_best, yTrain_dalc_best, yTest_dalc_best = train_test_split(
    X_dalc, y_dalc, random_state=1, test_size=0.5)

XTrain_walc_best, XTest_walc_best, yTrain_walc_best, yTest_walc_best = train_test_split(
    X_walc, y_walc, random_state=1, test_size=0.5)

In [115]:
#use best predictors in classification (target: Walc_high)
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be resolved differently from
# run to run and change the reported accuracy.
decisionTree = tree.DecisionTreeClassifier(max_depth=5, min_samples_split=100,
                                           random_state=1)

# fit the tree with the training data
decisionTree = decisionTree.fit(XTrain_walc_best, yTrain_walc_best)

# training-set predictions, kept so train vs. test accuracy can be compared
y_pred_train = decisionTree.predict(XTrain_walc_best)

# predict on the held-out half and report accuracy and the confusion matrix
# (rows = true class, columns = predicted class).
# The label now says "best predictors": this model is trained on the
# SelectKBest subset, not on every column.
y_pred = decisionTree.predict(XTest_walc_best)
print('Accuracy on test data, best predictors= ', metrics.accuracy_score(y_true = yTest_walc_best, y_pred = y_pred))
print(metrics.confusion_matrix(y_true = yTest_walc_best, y_pred = y_pred))

#SOURCE:https://github.com/datascience-course/2016-datascience-labs/tree/master/lecture9-decision-trees

Accuracy on test data, all predictors=  0.731301939058
[[207  10]
 [ 87  57]]


In [124]:
#use best predictors in classification (target: Dalc_high)
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be resolved differently from
# run to run and change the reported accuracy.
decisionTree = tree.DecisionTreeClassifier(max_depth=7, min_samples_split=2,
                                           random_state=1)

# fit the tree with the training data
decisionTree = decisionTree.fit(XTrain_dalc_best, yTrain_dalc_best)

# training-set predictions, kept so train vs. test accuracy can be compared
y_pred_train = decisionTree.predict(XTrain_dalc_best)

# predict on the held-out half and report accuracy and the confusion matrix
# (rows = true class, columns = predicted class).
# The label now says "best predictors": this model is trained on the
# SelectKBest subset, not on every column.
# NOTE(review): the recorded confusion matrix for this split shows 314
# class-0 rows vs 47 class-1 rows, so always predicting 0 would score about
# 0.87 accuracy; judge this model against that baseline.
y_pred = decisionTree.predict(XTest_dalc_best)
print('Accuracy on test data, best predictors= ', metrics.accuracy_score(y_true = yTest_dalc_best, y_pred = y_pred))
print(metrics.confusion_matrix(y_true = yTest_dalc_best, y_pred = y_pred))

#SOURCE:https://github.com/datascience-course/2016-datascience-labs/tree/master/lecture9-decision-trees

Accuracy on test data, all predictors=  0.850415512465
[[297  17]
 [ 37  10]]
