# ASSISTments Data Mining Competition 2017 - Optional Semester Project

## Imports and constants

In [1]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.decomposition import PCA, TruncatedSVD

DATA_DIR = 'Data/'
SCORING = ['accuracy', 'roc_auc', 'neg_mean_squared_error']

## Loading the data

We create a DataFrame with all the students logs information

In [2]:
student_logs = pd.concat([
    pd.read_csv(DATA_DIR + 'student_log_' + str(i) + '.csv') for i in range(1, 11)
], ignore_index=True)

  if self.run_code(code, result):


A DataFrame with train labels with index as the ITEST_id and we also drop duplicates
And a DataFrame with test labels with an index on ITEST_id

In [3]:
train_labels = pd.read_csv('Data/training_label.csv', index_col='ITEST_id')
train_labels.drop_duplicates(subset=None, keep='first', inplace=True)

test_labels = pd.read_csv(DATA_DIR + 'validation_test_label.csv', index_col='ITEST_id')

In [4]:
print(train_labels.shape)
train_labels.head()

(467, 4)


Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS,isSTEM
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,2,0.438492,32,1
27,1,0.348837,21,0
33,2,0.686391,52,0
35,2,0.379658,34,0
37,3,0.305785,-999,0


In [5]:
print(test_labels.shape)
test_labels.head()

(172, 3)


Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,2,0.438492,32
101,4,0.403553,29
161,1,0.483425,40
164,2,0.256983,9
176,2,0.575949,50


## Feature engineering

We take the mean of the values grouped by student id and put 0 values instead NA values.

In [6]:
students_features = student_logs.groupby('ITEST_id').mean()
students_features = students_features.fillna(0)
print(students_features.shape)
students_features.head()

(1709, 71)


Unnamed: 0_level_0,AveCarelessness,AveCorrect,AveKnow,AveResBored,AveResConf,AveResEngcon,AveResFrust,AveResGaming,AveResOfftask,NumActions,...,timeOver80,timeSinceSkill,timeTaken,totalFrAttempted,totalFrPastWrongCount,totalFrPercentPastWrong,totalFrSkillOpportunities,totalFrSkillOpportunitiesByScaffolding,totalFrTimeOnSkill,totalTimeByPercentCorrectForskill
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,1056.0,...,0.049242,1208484.0,22.264205,317.567235,1.254735,0.207548,6.567235,1.149585,202.714962,835.534599
9,0.099734,0.438492,0.185138,0.277149,0.098078,0.644744,0.162771,0.005554,0.213378,504.0,...,0.138889,574204.4,42.494048,127.646825,1.234127,0.236869,5.269841,0.883345,311.454365,1639.285807
11,0.090318,0.326957,0.124832,0.262076,0.147169,0.643418,0.118977,0.083459,0.208981,575.0,...,0.132174,574821.2,46.958261,126.706087,1.092174,0.288723,4.066087,0.914445,446.502609,2712.01906
25,0.165527,0.476718,0.284904,0.261453,0.137153,0.660054,0.087906,0.016774,0.22768,451.0,...,0.135255,639341.6,42.83592,132.569845,1.199557,0.288261,5.372506,0.977675,329.567627,1640.829868
27,0.069297,0.348837,0.142031,0.330226,0.122658,0.551367,0.09542,0.0369,0.34809,129.0,...,0.271318,125170.8,92.457364,33.333333,1.51938,0.315728,2.534884,0.819767,823.953488,5476.928203


## Algorithms

In [47]:
def runCV(clf, X_train, y_train, X_test, k):
    print_scores(cross_validate(clf, X_train, y_train, cv=k, scoring=SCORING, return_train_score=False))
    clf.fit(X_train, y_train)
    print_prediction(clf.predict(X_test))

In [8]:
def print_scores(scores):
    print('Scores')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2))
    print("AUC: %0.2f (+/- %0.2f)" % (scores['test_roc_auc'].mean(), scores['test_roc_auc'].std() * 2))
    print("RMSE: %0.2f (+/- %0.2f)" % (np.sqrt(-scores['test_neg_mean_squared_error']).mean(), scores['test_neg_mean_squared_error'].std() * 2))

In [58]:
def print_prediction(prediction):
    for pred in prediction:
        print(pred, end=',')

We make our training and test data from the students logs and the train_labels file

In [9]:
train_ids = train_labels.index.tolist()
X_train = np.array([np.array(students_features.loc[student_id].tolist(), dtype=float) for student_id in train_ids])
y_train = np.array([train_labels['isSTEM'].loc[student_id] for student_id in train_ids])

And our test data similarly

In [10]:
test_ids = test_labels.index.tolist()
X_test = np.array([np.array(students_features.loc[student_id].tolist(), dtype=float) for student_id in test_ids])

### Linear Support vector machine (SVM)

We start by fitting the data to a SVM model

In [59]:
clf = svm.SVC(C=1)
runCV(clf, X_train, y_train, X_test, 7)

Scores
Accuracy: 0.75 (+/- 0.01)
AUC: 0.50 (+/- 0.00)
RMSE: 0.50 (+/- 0.01)
1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,

### Logistic Regression

In [60]:
clf = linear_model.LogisticRegressionCV(Cs=[1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4])
runCV(clf, X_train, y_train, X_test, 7)

Scores
Accuracy: 0.74 (+/- 0.03)
AUC: 0.63 (+/- 0.11)
RMSE: 0.51 (+/- 0.03)
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

### Random Forest

In [61]:
clf = RandomForestClassifier()
runCV(clf, X_train, y_train, X_test, 7)

Scores
Accuracy: 0.70 (+/- 0.04)
AUC: 0.53 (+/- 0.14)
RMSE: 0.55 (+/- 0.04)
1,0,0,0,1,1,0,1,1,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,

### Dimensionality reduction

**TruncatedSVD**

In [62]:
svd = TruncatedSVD(n_components=15)

In [63]:
X_train_truncated = svd.fit_transform(X_train)
X_test_truncated = svd.fit_transform(X_test)
X_train_truncated.shape

(467, 15)

In [64]:
clf = linear_model.LogisticRegressionCV()
runCV(clf, X_train_truncated, y_train, X_test_truncated, 7)

Scores
Accuracy: 0.74 (+/- 0.03)
AUC: 0.64 (+/- 0.12)
RMSE: 0.51 (+/- 0.03)
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

In [None]:
clf.predict(svd.fit_transform(X_test))