# Project 2: 
### Building a Student Intervention System

In [2]:
# Import libraries
import numpy as np
import pandas as pd

In [4]:
# Read student data
student_data = pd.read_csv("dataset/student-data.csv")
print(student_data)
print ("Student data read successfully!")

    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
5       GP   M   16       U     LE3       T     4     3  services     other   
6       GP   M   16       U     LE3       T     2     2     other     other   
7       GP   F   17       U     GT3       A     4     4     other   teacher   
8       GP   M   15       U     LE3       A     3     2  services     other   
9       GP   M   15       U     GT3       T     3     4     other     other   
10      GP   F   15       U     GT3       T     4     4   teacher    health   
11      GP   F   15       U     GT3       T     2   

In [5]:
n_students = student_data.shape[0]
n_features = len(list(student_data.columns[:-1]))
n_passed = len(student_data[student_data['passed'] == 'yes'])
n_failed = len(student_data[student_data['passed'] == 'no'])
grad_rate = float((n_passed / n_students) * 100)
print ("Total number of students: {}".format(n_students))
print ("Number of students who passed: {}".format(n_passed))
print ("Number of students who failed: {}".format(n_failed))
print ("Number of features: {}".format(n_features))
print ("Graduation rate of the class: {:.2f}%".format(grad_rate))

Total number of students: 395
Number of students who passed: 265
Number of students who failed: 130
Number of features: 30
Graduation rate of the class: 0.00%


## Preparing the Data

### Identify feature and target columns
It is often the case that the data you obtain contains non-numeric features. This can be a problem, as most machine learning algorithms expect numeric data to perform computations with.

In [7]:
# Extract feature (X) and target (y) columns
feature_cols = list(student_data.columns[:-1])  # all columns but last are features
target_col = student_data.columns[-1]  # last column is the target/label
print ("Feature column(s):-\n{}".format(feature_cols))
print ("Target column: {}".format(target_col))

X_all = student_data[feature_cols]  # feature values for all students
y_all = student_data[target_col]  # corresponding targets/labels
print ("\nFeature values:-")
print (X_all.head())  # print the first 5 rows

Feature column(s):-
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']
Target column: passed

Feature values:-
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

    ...    higher internet  romantic  famrel  freetime goout Dalc Walc health  \
0   ...       yes       no        no       4         3     4    1    1      3   
1   ...    

### Preprocess feature columns

As you can see, there are several non-numeric columns that need to be converted! Many of them are simply `yes`/`no`, e.g. `internet`. These can be reasonably converted into `1`/`0` (binary) values.

Other columns, like `Mjob` and `Fjob`, have more than two values, and are known as _categorical variables_. The recommended way to handle such a column is to create as many columns as possible values (e.g. `Fjob_teacher`, `Fjob_other`, `Fjob_services`, etc.), and assign a `1` to one of them and `0` to all others.

In [8]:
# Preprocess feature columns
def preprocess_features(X):
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty

    # Check each column
    for col, col_data in X.iteritems():
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'

        outX = outX.join(col_data)  # collect column(s) in output dataframe
    return outX

X_all = preprocess_features(X_all)

print ("Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns)))

Processed feature columns (48):-
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


### Split data into training and test sets

In [9]:
# First, decide how many training vs test samples you want
num_all = student_data.shape[0]  # same as len(student_data)
num_train = 300  # about 75% of the data
num_test = num_all - num_train

# Note: Shuffle the data or randomly select samples to avoid any bias due to ordering in the dataset
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_all, y_all, test_size = num_test, random_state = 42)

print ("Training set: {} samples".format(X_train.shape[0]))
print ("Test set: {} samples".format(X_test.shape[0]))
# Note: If you need a validation set, extract it from within training data

Training set: 300 samples
Test set: 95 samples


## Training and Evaluating Models


In [10]:
# Train a model
import time

def train_classifier(clf, X_train, y_train):
    print ("Training {}...".format(clf.__class__.__name__))
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print ("Done!\nTraining time (secs): {:.3f}".format(end - start))
    
# Support Vector Machine (SVM)
from sklearn import svm
clf_svm = svm.SVC()

# Fit model to training data
train_classifier(clf_svm, X_train, y_train)  # note: using entire training set here
print(clf_svm)

Training SVC...
Done!
Training time (secs): 0.012
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [11]:
# Predict on training set and compute F1 score
from sklearn.metrics import f1_score

def predict_labels(clf, features, target):
    print ("Predicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print ("Done!\nPrediction time (secs): {:.3f}".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')

train_f1_score = predict_labels(clf_svm, X_train, y_train)
print ("F1 score for training set: {}".format(train_f1_score))

Predicting labels using SVC...
Done!
Prediction time (secs): 0.040
F1 score for training set: 0.876068376068


In [12]:
# Predict on test data
test_f1_score = predict_labels(clf_svm, X_test, y_test)
print ("F1 score for test set: {}".format(test_f1_score))

Predicting labels using SVC...
Done!
Prediction time (secs): 0.002
F1 score for test set: 0.783783783784


In [13]:
# Train and predict using different training set sizes
def train_predict(clf, X_train, y_train, X_test, y_test):
    print ("------------------------------------------")
    print ("Training set size: {}".format(len(X_train)))
    train_classifier(clf, X_train, y_train)
    print ("F1 score for training set: {}".format(predict_labels(clf, X_train, y_train)))
    print ("F1 score for test set: {}".format(predict_labels(clf, X_test, y_test)))

# TODO: Run the helper function above for desired subsets of training data
# Note: Keep the test set constant

X_train_subset_200 = X_train[:200]
y_train_subset_200 = y_train[:200]
train_predict(clf_svm, X_train_subset_200, y_train_subset_200, X_test, y_test)

X_train_subset_100 = X_train[:100]
y_train_subset_100 = y_train[:100]
train_predict(clf_svm, X_train_subset_100, y_train_subset_100, X_test, y_test)


------------------------------------------
Training set size: 200
Training SVC...
Done!
Training time (secs): 0.003
Predicting labels using SVC...
Done!
Prediction time (secs): 0.002
F1 score for training set: 0.867924528302
Predicting labels using SVC...
Done!
Prediction time (secs): 0.001
F1 score for test set: 0.781456953642
------------------------------------------
Training set size: 100
Training SVC...
Done!
Training time (secs): 0.001
Predicting labels using SVC...
Done!
Prediction time (secs): 0.001
F1 score for training set: 0.877697841727
Predicting labels using SVC...
Done!
Prediction time (secs): 0.001
F1 score for test set: 0.774647887324


In [15]:
# Train and predict using two other models

# Decision Tree

from sklearn import tree
decision_tree = tree.DecisionTreeClassifier()
train_classifier(decision_tree, X_train, y_train)
train_f1_score = predict_labels(decision_tree, X_train, y_train)
print ("F1 score for training set: {}".format(train_f1_score))
test_f1_score = predict_labels(decision_tree, X_test, y_test)
print ("F1 score for test set: {}".format(test_f1_score))

X_train_subset_200 = X_train[:200]
y_train_subset_200 = y_train[:200]
train_predict(decision_tree, X_train_subset_200, y_train_subset_200, X_test, y_test)

X_train_subset_100 = X_train[:100]
y_train_subset_100 = y_train[:100]
train_predict(decision_tree, X_train_subset_100, y_train_subset_100, X_test, y_test)



Training DecisionTreeClassifier...
Done!
Training time (secs): 0.002
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 1.0
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.612903225806
------------------------------------------
Training set size: 200
Training DecisionTreeClassifier...
Done!
Training time (secs): 0.001
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 1.0
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for test set: 0.75
------------------------------------------
Training set size: 100
Training DecisionTreeClassifier...
Done!
Training time (secs): 0.001
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time (secs): 0.000
F1 score for training set: 1.0
Predicting labels using DecisionTreeClassifier...
Done!
Prediction time 

In [16]:
# K-Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

neighbor = KNeighborsClassifier(n_neighbors=3)
train_classifier(neighbor, X_train, y_train)
train_f1_score = predict_labels(neighbor, X_train, y_train)
print ("F1 score for training set: {}".format(train_f1_score))
test_f1_score = predict_labels(neighbor, X_test, y_test)
print ("F1 score for test set: {}".format(test_f1_score))

X_train_subset_200 = X_train[:200]
y_train_subset_200 = y_train[:200]
train_predict(neighbor, X_train_subset_200, y_train_subset_200, X_test, y_test)

X_train_subset_100 = X_train[:100]
y_train_subset_100 = y_train[:100]
train_predict(neighbor, X_train_subset_100, y_train_subset_100, X_test, y_test)

Training KNeighborsClassifier...
Done!
Training time (secs): 0.006
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.008
F1 score for training set: 0.880733944954
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.002
F1 score for test set: 0.731343283582
------------------------------------------
Training set size: 200
Training KNeighborsClassifier...
Done!
Training time (secs): 0.000
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.002
F1 score for training set: 0.896551724138
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.001
F1 score for test set: 0.741258741259
------------------------------------------
Training set size: 100
Training KNeighborsClassifier...
Done!
Training time (secs): 0.001
Predicting labels using KNeighborsClassifier...
Done!
Prediction time (secs): 0.001
F1 score for training set: 0.832116788321
Predicting labels using KNeighborsClassifier.

## Choosing the Best Model

In [29]:
# Fine-tune model and report the best F1 score with the SVM classifier
from sklearn import grid_search

parameters = {'kernel':('linear', 'rbf', 'poly'), 'C':[1], 'gamma':[0.1]}
tune = grid_search.GridSearchCV(clf_svm, parameters, scoring = 'f1_weighted')
tune.fit(X_train, y_train)

train_f1_score = predict_labels(tune, X_train, y_train)
print ("F1 score for training set: {}".format(train_f1_score))

# Predict on test data
test_f1_score = predict_labels(tune, X_test, y_test)
print ("F1 score for test set: {}".format(test_f1_score))

Predicting labels using GridSearchCV...
Done!
Prediction time (secs): 0.003
F1 score for training set: 0.854586129754
Predicting labels using GridSearchCV...
Done!
Prediction time (secs): 0.001
F1 score for test set: 0.781954887218
