# ASSISTments Data Mining Competition 2017 - Optional Semester Project

## Imports and constants

In [1]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.decomposition import PCA, TruncatedSVD
import xgboost as xgb

# Directory prefix (including the trailing slash) that is prepended to data file names when loading.
DATA_DIR = 'Data/'
# Scorer names passed to cross_validate in runCV; the per-fold results are read back
# in print_scores under the corresponding 'test_<name>' keys.
SCORING = ['accuracy', 'roc_auc', 'neg_mean_squared_error']



## Loading the data

We load the student log records into two DataFrames: one for the training students and one for the test students.

In [2]:
# Load the pickled train and test student logs from DATA_DIR.
# NOTE(review): unpickling can execute arbitrary code — only load these files from a trusted source.
student_logs = pd.read_pickle(DATA_DIR + 'student_train_logs')
student_test_logs = pd.read_pickle(DATA_DIR + 'student_test_logs')
# Preview the first training log rows.
student_logs.head()

Unnamed: 0,ITEST_id,SY ASSISTments Usage,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,AveResConf,AveResFrust,...,confidence(CONFUSED),confidence(FRUSTRATED),confidence(OFF TASK),confidence(GAMING),RES_BORED,RES_CONCENTRATING,RES_CONFUSED,RES_FRUSTRATED,RES_OFFTASK,RES_GAMING
184889,9,1,0.185138,0.099734,0.438492,504,0.277149,0.644744,0.098078,0.162771,...,0.0,0.0,0.83871,0.008522,0.376427,0.320317,0.0,0.0,0.785585,0.000264
184890,9,1,0.185138,0.099734,0.438492,504,0.277149,0.644744,0.098078,0.162771,...,0.0,0.091463,0.0,0.047821,0.156027,0.225154,0.0,0.009561,0.0,0.001483
184891,9,1,0.185138,0.099734,0.438492,504,0.277149,0.644744,0.098078,0.162771,...,0.0,0.091463,0.280702,0.047821,0.156027,0.665929,0.0,0.009561,0.149121,0.001483
184892,9,1,0.185138,0.099734,0.438492,504,0.277149,0.644744,0.098078,0.162771,...,0.0,0.091463,0.6,0.047821,0.156027,0.780156,0.0,0.009561,0.468252,0.001483
184893,9,1,0.185138,0.099734,0.438492,504,0.277149,0.644744,0.098078,0.162771,...,0.378151,0.0,0.578947,0.18697,0.376427,0.195349,0.060808,0.0,0.440265,0.005797


We load the training labels into a DataFrame indexed by `ITEST_id` (sorted by that index) and drop duplicated rows.
We load the test labels the same way, also indexed by `ITEST_id` and with duplicated rows dropped.

In [3]:
# Training labels: indexed by ITEST_id, sorted by that index, exact duplicate rows removed.
# The path is built from DATA_DIR (like the test labels below) so that changing the
# data directory in one place relocates every input file.
# Note: drop_duplicates compares column values only; the ITEST_id index is not part of the comparison.
train_labels = (
    pd.read_csv(DATA_DIR + 'training_label.csv', index_col='ITEST_id')
    .sort_index()
    .drop_duplicates(subset=None, keep='first')
)

# Test labels: same treatment, but the file order is kept (no sort).
test_labels = (
    pd.read_csv(DATA_DIR + 'validation_test_label.csv', index_col='ITEST_id')
    .drop_duplicates(subset=None, keep='first')
)

In [4]:
# Show the dimensions and the first rows of the de-duplicated training labels.
print(train_labels.shape)
train_labels.head()

(467, 4)


Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS,isSTEM
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,2,0.438492,32,1
27,1,0.348837,21,0
33,2,0.686391,52,0
35,2,0.379658,34,0
37,3,0.305785,-999,0


In [5]:
# Show the dimensions and the first rows of the de-duplicated test labels.
print(test_labels.shape)
test_labels.head()

(172, 3)


Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,2,0.438492,32
101,4,0.403553,29
161,1,0.483425,40
164,2,0.256983,9
176,2,0.575949,50


## Balance data

In [6]:
# Partition the labelled students by the value of their isSTEM flag.
STEM_labels = train_labels.loc[train_labels['isSTEM'].eq(1)]
nonSTEM_labels = train_labels.loc[train_labels['isSTEM'].eq(0)]

In [7]:
# Undersample the non-STEM class: keep only its first len(STEM_labels) rows (in current
# row order), so that both classes contribute the same number of students.
nonSTEM_labels = nonSTEM_labels.iloc[:len(STEM_labels)]
# Recombine both classes and restore ITEST_id order.
train_labels_balanced = pd.concat([nonSTEM_labels, STEM_labels]).sort_index()
train_labels_balanced.head()

Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS,isSTEM
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,2,0.438492,32,1
27,1,0.348837,21,0
33,2,0.686391,52,0
35,2,0.379658,34,0
37,3,0.305785,-999,0


In [8]:
# ITEST_ids of the balanced training subset; used further down to pick feature rows and labels.
train_idx = train_labels_balanced.index
train_idx

Int64Index([   9,   27,   33,   35,   37,   41,   77,   87,  114,  126,
            ...
            7179, 7469, 7609, 7613, 7679, 7704, 7723, 7737, 7769, 7775],
           dtype='int64', name='ITEST_id', length=234)

In [9]:
# Keep only the log rows that belong to students of the balanced training subset.
# NOTE(review): student_logs_balanced is not referenced by any later cell visible here —
# the features below are aggregated from the full student_logs frame. Confirm it is still needed.
student_logs_balanced = student_logs[student_logs['ITEST_id'].isin(train_idx)]
student_logs_balanced.shape

(121104, 71)

## Feature engineering

We compute the mean and the standard deviation of every log column per student (grouped by `ITEST_id`), place the two sets of columns side by side, and replace NA values with 0.

In [None]:
# Group the training logs per student once and reuse the grouping for both statistics.
train_logs_by_student = student_logs.groupby('ITEST_id')
# Columns: all per-student means first, followed by all per-student standard deviations.
# NA entries in the aggregated frame are replaced by 0.
students_features = pd.concat(
    [train_logs_by_student.mean(), train_logs_by_student.std()],
    axis=1,
).fillna(0)
print(students_features.shape)
students_features.head()

In [None]:
# Same aggregation as for the training logs, applied to the test logs:
# per-student means followed by per-student standard deviations, NA replaced by 0.
test_logs_by_student = student_test_logs.groupby('ITEST_id')
students_test_features = pd.concat(
    [test_logs_by_student.mean(), test_logs_by_student.std()],
    axis=1,
).fillna(0)
print(students_test_features.shape)
students_test_features.head()

## Algorithms

In [None]:
def runCV(clf, X_train, y_train, X_test, k):
    """Cross-validate ``clf``, refit it on all training data and print its test predictions.

    Parameters
    ----------
    clf : estimator offering ``fit`` and ``predict`` (and optionally ``predict_proba``).
    X_train, y_train : training features and targets.
    X_test : features to predict after the final fit.
    k : value forwarded as ``cv`` to ``cross_validate``.

    Prints the cross-validation summary (via ``print_scores``) followed by one
    comma-terminated line of predictions: the second ``predict_proba`` column when
    the estimator provides probabilities, otherwise the output of ``predict``.
    """
    cv_scores = cross_validate(clf, X_train, y_train, cv=k, scoring=SCORING, return_train_score=False)
    print_scores(cv_scores)

    clf.fit(X_train, y_train)

    # Fall back to hard predictions only when the estimator has no usable predict_proba.
    # A bare ``except`` here used to hide every other failure as well (bad input shapes,
    # errors while printing, even KeyboardInterrupt) and could print a second,
    # different prediction list after a partial first one.
    try:
        probabilities = clf.predict_proba(X_test)
    except AttributeError:
        print_prediction_scalar(clf.predict(X_test))
    else:
        print_prediction(probabilities)

In [None]:
def print_scores(scores):
    """Print mean and spread (two standard deviations) of the cross-validation metrics.

    Parameters
    ----------
    scores : mapping with the per-fold arrays ``'test_accuracy'``, ``'test_roc_auc'``
        and ``'test_neg_mean_squared_error'``.

    The negated MSE is converted to RMSE fold by fold, and both the reported mean and
    the reported spread are computed from those per-fold RMSE values, so the "+/-"
    figure is in the same unit as the value it qualifies.
    """
    accuracy = np.asarray(scores['test_accuracy'])
    auc = np.asarray(scores['test_roc_auc'])
    # The scorer yields -MSE; flip the sign before taking the square root.
    rmse = np.sqrt(-np.asarray(scores['test_neg_mean_squared_error']))

    print('Scores')
    print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy.mean(), accuracy.std() * 2))
    print("AUC: %0.2f (+/- %0.2f)" % (auc.mean(), auc.std() * 2))
    print("RMSE: %0.2f (+/- %0.2f)" % (rmse.mean(), rmse.std() * 2))

In [None]:
def print_prediction(prediction, reverse=False):
    """Print the second entry of every row in ``prediction``, each followed by a comma.

    Parameters
    ----------
    prediction : iterable of indexable rows; element ``[1]`` of each row is printed.
    reverse : when truthy, ``1 - row[1]`` is printed instead of ``row[1]``.

    Everything is written on a single line with no trailing newline.
    """
    for row in prediction:
        second = row[1]
        print(1 - second if reverse else second, end=',')

In [None]:
def print_prediction_scalar(prediction, reverse=False):
    """Print every value in ``prediction``, each followed by a comma.

    Parameters
    ----------
    prediction : iterable of scalar predictions.
    reverse : when truthy, ``1 - value`` is printed instead of ``value``.

    Everything is written on a single line with no trailing newline.
    """
    shown = ((1 - value) if reverse else value for value in prediction)
    for item in shown:
        print(item, end=',')

We build the training feature matrix and the training targets from the per-student features and the `train_labels` table.

In [None]:
# One float feature vector per balanced-training student, stacked in train_idx order.
X_train = np.array(
    [students_features.loc[student_id].tolist() for student_id in train_idx],
    dtype=float,
)
# Matching isSTEM targets, looked up by the same ids in the same order.
y_train = np.array([train_labels.loc[student_id, 'isSTEM'] for student_id in train_idx])
X_train.shape

And our test data similarly

In [None]:
# Test students in the order in which they appear in the test label table.
test_ids = test_labels.index.tolist()
# One float feature vector per test student, stacked in test_ids order.
X_test = np.array(
    [students_test_features.loc[student_id].tolist() for student_id in test_ids],
    dtype=float,
)
X_test.shape

### Support vector machine (SVM)

We start by fitting the data to an SVM model with an RBF kernel

In [None]:
# RBF-kernel support vector classifier with C=1, evaluated with 7-fold cross-validation.
# NOTE(review): probability=True is not set, so predict_proba is presumably unavailable
# and runCV will print hard class labels for this model — confirm that is intended.
clf = svm.SVC(C=1, kernel='rbf')
runCV(clf, X_train, y_train, X_test, 7)

### Logistic Regression

In [None]:
# Logistic regression that chooses its regularisation strength C from the listed grid.
# NOTE(review): the grid jumps from 1 to 1e2 (1e1 is missing) — confirm this is intentional.
clf = linear_model.LogisticRegressionCV(Cs=[1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4])
print('Using normal features')
# Evaluate with 7-fold cross-validation, then print the test predictions.
runCV(clf, X_train, y_train, X_test, 7)

### Random Forest

In [None]:
# Random forest with 350 trees of depth <= 30, trained on all available cores.
# random_state is fixed so that the randomised tree construction — and therefore the
# printed cross-validation scores and test predictions — are identical on every run.
clf = RandomForestClassifier(max_depth=30, n_estimators=350, n_jobs=-1, random_state=42)
runCV(clf, X_train, y_train, X_test, 7)

### Ridge classifier

In [None]:
# Ridge classifier that picks its regularisation strength alpha from the listed candidates.
clf = linear_model.RidgeClassifierCV(alphas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# Evaluate with 7-fold cross-validation, then print the test predictions.
runCV(clf, X_train, y_train, X_test, 7)

### XGBoost

In [None]:

def xgboost_eval(X_train, y_train, nfold=7, num_boost_round=10):
    """Grid-search XGBoost ``max_depth``/``eta`` by cross-validation, then predict the test set.

    Parameters
    ----------
    X_train, y_train : training features and binary targets.
    nfold : number of cross-validation folds used for every parameter combination.
    num_boost_round : number of boosting rounds, used both during cross-validation
        and for the final model so that the selected parameters are evaluated under
        the same conditions as the model that is actually trained.

    For every (depth, eta) pair the cross-validated test error, RMSE and AUC of the
    final boosting round are read, and the pair maximising ``(1 - rmse) + auc`` wins.
    The winning metrics and parameters are printed, a booster is trained on the full
    training data with them, and its predictions for the test set are printed as one
    comma-terminated line.

    Note: the test features are read from the notebook-level variable ``X_test``;
    they are not passed in as an argument.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)

    depths = [3, 5, 7, 10]
    etas = [0.01, 0.02, 0.05, 0.1, 0.15, 0.2]

    best_error = 0
    best_auc = 0
    best_rmse = 1
    # Start below any reachable score so that the first combination is always accepted
    # and best_param can never stay empty.
    max_score = float('-inf')
    best_param = {}

    for depth in depths:
        for eta in etas:
            # specify parameters via map
            param = {'max_depth': depth, 'eta': eta, 'silent': 1, 'objective': 'binary:logistic'}
            # The third positional argument of xgb.cv is num_boost_round, not the fold
            # count, so both are passed by keyword here.
            cv = xgb.cv(
                param,
                dtrain,
                num_boost_round=num_boost_round,
                nfold=nfold,
                metrics=['error', 'rmse', 'auc'],
            )

            # xgb.cv returns one row per boosting round; the last row describes the
            # model after all rounds, which is what gets trained below. Averaging
            # over rounds would mix in the weaker early-round models.
            final_round = cv.iloc[-1]

            auc = final_round['test-auc-mean']
            rmse = final_round['test-rmse-mean']
            error = final_round['test-error-mean']
            score = 1 - rmse + auc

            if score > max_score:
                max_score = score
                best_auc = auc
                best_rmse = rmse
                best_param = param
                best_error = error

    print('Best error:', best_error)
    print('Best auc:', best_auc)
    print('Best rmse:', best_rmse)
    print('Best score:', max_score)
    print('Best params:', best_param)
    print('')

    # Train the final model with the selected parameters and the same number of rounds.
    bst = xgb.train(best_param, dtrain, num_boost_round=num_boost_round)
    # make prediction
    print_prediction_scalar(bst.predict(dtest))

In [None]:
# Run the XGBoost parameter search on the training data and print the test predictions.
xgboost_eval(X_train, y_train)