# ASSISTments Data Mining Competition 2017 - Optional Semester Project

## Imports and constants

In [1]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.decomposition import PCA, TruncatedSVD
import xgboost as xgb

DATA_DIR = 'Data/'
SCORING = ['accuracy', 'roc_auc', 'neg_mean_squared_error']



## Loading the data

We create a DataFrame with all the students logs information

In [2]:
student_logs = pd.read_pickle(DATA_DIR + 'student_train_logs')
student_test_logs = pd.read_pickle(DATA_DIR + 'student_test_logs')

A DataFrame with train labels with index as the ITEST_id and we also drop duplicates
And a DataFrame with test labels with an index on ITEST_id

In [3]:
train_labels = pd.read_csv('Data/training_label.csv', index_col='ITEST_id').sort_index()
train_labels.drop_duplicates(subset=None, keep='first', inplace=True)

test_labels = pd.read_csv(DATA_DIR + 'validation_test_label.csv', index_col='ITEST_id')
test_labels.drop_duplicates(subset=None, keep='first', inplace=True)

In [4]:
print(train_labels.shape)
train_labels.head()

(467, 4)


Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS,isSTEM
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,2,0.438492,32,1
27,1,0.348837,21,0
33,2,0.686391,52,0
35,2,0.379658,34,0
37,3,0.305785,-999,0


In [5]:
print(test_labels.shape)
test_labels.head()

(172, 3)


Unnamed: 0_level_0,SchoolId,AveCorrect,MCAS
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,2,0.438492,32
101,4,0.403553,29
161,1,0.483425,40
164,2,0.256983,9
176,2,0.575949,50


## Feature engineering

We take the mean of the values grouped by student id and put 0 values instead NA values.

In [6]:
students_features = pd.concat([student_logs.groupby('ITEST_id').mean(), student_logs.groupby('ITEST_id').std()], axis=1)
students_features = students_features.fillna(0)
print(students_features.shape)
students_features.head()

(467, 116)


Unnamed: 0_level_0,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,AveResConf,AveResFrust,AveResOfftask,AveResGaming,...,confidence(CONFUSED),confidence(FRUSTRATED),confidence(OFF TASK),confidence(GAMING),RES_BORED,RES_CONCENTRATING,RES_CONFUSED,RES_FRUSTRATED,RES_OFFTASK,RES_GAMING
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.185138,0.099734,0.438492,504.0,0.277149,0.644744,0.098078,0.162771,0.213378,0.005554,...,0.289391,0.35111,0.209475,0.122681,0.125372,0.160885,0.228893,0.333023,0.226656,0.023455
27,0.142031,0.069297,0.348837,129.0,0.330226,0.551367,0.122658,0.09542,0.34809,0.0369,...,0.314896,0.284357,0.306335,0.194901,0.135316,0.193386,0.257577,0.253684,0.336302,0.16129
33,0.459813,0.202787,0.686391,169.0,0.260426,0.650769,0.059971,0.061834,0.296286,0.010954,...,0.241292,0.21849,0.276394,0.120402,0.147747,0.153652,0.186395,0.214031,0.299967,0.050044
35,0.255164,0.158848,0.379658,993.0,0.222796,0.650079,0.069987,0.164347,0.153147,0.2368,...,0.254827,0.354863,0.198341,0.352301,0.112172,0.153547,0.208444,0.332016,0.199667,0.363046
37,0.071909,0.046183,0.305785,121.0,0.326384,0.653445,0.125325,0.094714,0.451467,0.016241,...,0.314989,0.287924,0.292861,0.152899,0.123891,0.137297,0.255149,0.273458,0.320803,0.068074


In [7]:
students_test_features = pd.concat([student_test_logs.groupby('ITEST_id').mean(), student_test_logs.groupby('ITEST_id').std()], axis=1)
students_test_features = students_test_features.fillna(0)
print(students_test_features.shape)
students_test_features.head()

(172, 116)


Unnamed: 0_level_0,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,AveResConf,AveResFrust,AveResOfftask,AveResGaming,...,confidence(CONFUSED),confidence(FRUSTRATED),confidence(OFF TASK),confidence(GAMING),RES_BORED,RES_CONCENTRATING,RES_CONFUSED,RES_FRUSTRATED,RES_OFFTASK,RES_GAMING
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.185138,0.099734,0.438492,504.0,0.277149,0.644744,0.098078,0.162771,0.213378,0.005554,...,0.289391,0.35111,0.209475,0.122681,0.125372,0.160885,0.228893,0.333023,0.226656,0.023455
101,0.195441,0.117634,0.403553,394.0,0.252399,0.619236,0.123829,0.241243,0.195611,0.069793,...,0.320853,0.411071,0.203885,0.229437,0.119954,0.18228,0.28072,0.398842,0.216833,0.202453
161,0.301437,0.145797,0.483425,362.0,0.262823,0.618728,0.103445,0.048144,0.211666,0.134778,...,0.296977,0.202576,0.232868,0.299762,0.136568,0.190959,0.245751,0.191347,0.241681,0.312756
164,0.117598,0.08144,0.256983,179.0,0.252204,0.597314,0.135032,0.105664,0.243513,0.214158,...,0.327819,0.300253,0.297758,0.347104,0.129691,0.18223,0.278749,0.257089,0.299434,0.337269
176,0.318761,0.154002,0.575949,316.0,0.235391,0.645941,0.086661,0.082399,0.208176,0.014226,...,0.276198,0.258004,0.212216,0.141028,0.122831,0.169517,0.216289,0.256821,0.224683,0.051597


## Polynomial basis

In [8]:
def build_poly(x, degree):
    """ Apply a polynomial basis to all the X features. """
    # First, we find the combinations of columns for which we have to
    # compute the product
    m, n = x.shape

    combinations = {}

    # Add combinations of same column power
    for i in range(n * degree):
        if i < n:
            combinations[i] = [i]
        else:
            col_number = i - n
            cpt = 2
            while col_number >= n:
                col_number -= n
                cpt += 1
            combinations[i] = [col_number] * cpt

    # Add combinations of products between columns
    cpt = i + 1

    for i in range(n):
        for j in range(i + 1, n):
            combinations[cpt] = [i, j]
            cpt = cpt + 1

    # Now we can fill a new matrix with the column combinations
    eval_poly = np.zeros(
        shape=(m, n + len(combinations))
    )

    for i, c in combinations.items():
        eval_poly[:, i] = x[:, c].prod(1)

    # Add square root
    for i in range(0, n):
        eval_poly[:, len(combinations) + i] = np.abs(x[:, i]) ** 0.5

    return eval_poly

## Algorithms

In [9]:
def runCV(clf, X_train, y_train, X_test, k):
    print_scores(cross_validate(clf, X_train, y_train, cv=k, scoring=SCORING, return_train_score=False))
    clf.fit(X_train, y_train)
    try:
        print_prediction(clf.predict_proba(X_test))
    except:
        print_prediction_scalar(clf.predict(X_test))

In [10]:
def print_scores(scores):
    print('Scores')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2))
    print("AUC: %0.2f (+/- %0.2f)" % (scores['test_roc_auc'].mean(), scores['test_roc_auc'].std() * 2))
    print("RMSE: %0.2f (+/- %0.2f)" % (np.sqrt(-scores['test_neg_mean_squared_error']).mean(), scores['test_neg_mean_squared_error'].std() * 2))

In [24]:
def print_prediction(prediction, reverse=False):
    for pred in prediction:
        if reverse:
            print(1 - pred[1], end=',')
        else:
            print(pred[1], end=',')

In [25]:
def print_prediction_scalar(prediction, reverse=False):
    for pred in prediction:
        if reverse:
            print(1 - pred, end=',')
        else:
            print(pred, end=',')

We make our training and test data from the students logs and the train_labels file

In [13]:
train_ids = train_labels.index.tolist()
X_train = np.array([np.array(students_features.loc[student_id].tolist(), dtype=float) for student_id in train_ids])
y_train = np.array([train_labels['isSTEM'].loc[student_id] for student_id in train_ids])
X_train.shape

(467, 116)

And our test data similarly

In [14]:
test_ids = test_labels.index.tolist()
X_test = np.array([np.array(students_test_features.loc[student_id].tolist(), dtype=float) for student_id in test_ids])
X_test.shape

(172, 116)

In [15]:
X_train_poly = build_poly(X_train, 7)
X_test_poly = build_poly(X_test, 7)
print(X_train_poly.shape)
print(X_test_poly.shape)

(467, 7598)
(172, 7598)


### Linear Support vector machine (SVM)

We start by fitting the data to a SVM model

In [16]:
clf = svm.SVC(C=1, kernel='rbf')
runCV(clf, X_train, y_train, X_test, 7)

Scores
Accuracy: 0.75 (+/- 0.01)
AUC: 0.50 (+/- 0.00)
RMSE: 0.50 (+/- 0.01)
1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,

### Logistic Regression

In [17]:
clf = linear_model.LogisticRegressionCV(Cs=[1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4])
print('Using normal features')
runCV(clf, X_train, y_train, X_test, 7)

print('')
print('')
print('Using polynomial basis features')
runCV(clf, X_train_poly, y_train, X_test_poly, 7)

Using normal features
Scores
Accuracy: 0.75 (+/- 0.03)
AUC: 0.60 (+/- 0.16)
RMSE: 0.50 (+/- 0.03)
0.308592386503,0.224720065234,0.408501062218,0.219069893005,0.413899731742,0.496149100852,0.295953360359,0.340942649492,0.383869839261,0.17120569712,0.395820287011,0.435435046,0.241968021974,0.275363480023,0.368771402591,0.29449609442,0.282124560918,0.291547920871,0.492921026923,0.165747243159,0.43688248803,0.0938093149067,0.344679450889,0.21048520592,0.213463637,0.261268176103,0.14127640576,0.352377509647,0.29589538893,0.0754701055033,0.194844780389,0.183430263917,0.263598127647,0.341017457709,0.230778873486,0.157454063844,0.403073671405,0.170411484924,0.317351367413,0.58828685697,0.353567044455,0.287277339063,0.210894809212,0.297730372443,0.227972260048,0.272974307695,0.204043044521,0.357747191589,0.395812107262,0.237013855296,0.184214103117,0.150879136056,0.348384147868,0.090738568109,0.223690417566,0.324341832749,0.210366673233,0.343524792885,0.166031354387,0.311895473823,0.29519882545

### Random Forest

In [18]:
clf = RandomForestClassifier(max_depth=3, random_state=42)
runCV(clf, X_train, y_train, X_test, 7)

Scores
Accuracy: 0.73 (+/- 0.04)
AUC: 0.59 (+/- 0.14)
RMSE: 0.52 (+/- 0.04)
0.247614832036,0.316045394942,0.327267239052,0.140960674274,0.32899366943,0.349464166637,0.332607678586,0.450675115625,0.351277469542,0.092466378638,0.329344777863,0.375135984685,0.397929170422,0.265617345836,0.331042186922,0.32899366943,0.34635773044,0.123640544164,0.308012191077,0.153952879661,0.346158792497,0.226789959149,0.414880774143,0.170375790546,0.141445131563,0.34635773044,0.170375790546,0.19578815163,0.216941393967,0.168383672051,0.236139411293,0.452284396162,0.191806195556,0.3637223516,0.170375790546,0.225429369819,0.328295411216,0.182435741966,0.257438044845,0.563696566923,0.339853445524,0.259207076365,0.213991657773,0.231205303228,0.170375790546,0.282920659207,0.200348883063,0.254474756597,0.303317666166,0.25544507556,0.170375790546,0.123640544164,0.340601386512,0.146664148924,0.344895852261,0.4267963723,0.162860210416,0.357345050724,0.208571648068,0.376504432816,0.243530708344,0.259249363127,0.18

## Ridge classifier

In [19]:
clf = linear_model.RidgeClassifierCV(alphas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
runCV(clf, X_train, y_train, X_test, 7)

Scores
Accuracy: 0.73 (+/- 0.05)
AUC: 0.61 (+/- 0.10)
RMSE: 0.52 (+/- 0.05)
0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

### XGBoost

In [26]:

def xgboost_eval(X_train, y_train):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)

    depths=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    etas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    best_error = 0
    best_auc = 0
    best_rmse = 1
    max_score = 0
    best_param = {}

    for depth in depths:
        for eta in etas:
            # specify parameters via map
            param = {'max_depth':depth, 'eta':eta, 'silent':1, 'objective':'binary:logistic' }
            num_round = 7
            cv = xgb.cv(param, dtrain, num_round, 7, metrics=['error', 'rmse', 'auc'])

            results = cv.mean()

            auc = results['test-auc-mean']
            rmse = results['test-rmse-mean']
            error = results['test-error-mean']
            score = 1-rmse + auc

            if(score > max_score):
                max_score = score
                best_auc = auc
                best_rmse = rmse
                best_param = param
                best_error = error
                
    print('Best error:', best_error)
    print('Best auc:', best_auc)
    print('Best rmse:', best_rmse)
    print('Best score:', max_score)
    print('Best params:', best_param)
    print('')
    
    bst = xgb.train(best_param, dtrain, num_round)
    # make prediction
    print_prediction_scalar(bst.predict(dtest), reverse=True)

In [27]:
xgboost_eval(X_train, y_train)

Best error: 0.254174428571
Best auc: 0.620783591837
Best rmse: 0.432035959184
Best score: 1.18874763265
Best params: {'max_depth': 2, 'eta': 0.5, 'silent': 1, 'objective': 'binary:logistic'}

0.791966870427,0.698512643576,0.680508673191,0.877837114036,0.791966870427,0.457718253136,0.695687144995,0.631220251322,0.557380139828,0.898668780923,0.695687144995,0.3203343153,0.650717198849,0.639113634825,0.543503195047,0.416115760803,0.502973049879,0.738614469767,0.618422478437,0.884690895677,0.592821985483,0.870789140463,0.388364315033,0.890178203583,0.912566497922,0.612875670195,0.856516167521,0.91235011816,0.791966870427,0.707428187132,0.465460479259,0.65446215868,0.900293014944,0.834999606013,0.762708589435,0.804235503078,0.812836632133,0.812392234802,0.834999606013,0.289002895355,0.63231998682,0.667752206326,0.861751794815,0.791966870427,0.834999606013,0.803529307246,0.924038499594,0.754893958569,0.461096763611,0.915073946118,0.663186222315,0.94176056236,0.695687144995,0.72202450037,0.692

### Dimensionality reduction

**TruncatedSVD**

In [None]:
svd = TruncatedSVD(n_components=15)

In [None]:
X_train_truncated = svd.fit_transform(X_train)
X_test_truncated = svd.fit_transform(X_test)
X_train_truncated.shape

In [None]:
clf = linear_model.LogisticRegressionCV()
runCV(clf, X_train_truncated, y_train, X_test_truncated, 7)

In [None]:
clf.predict(svd.fit_transform(X_test))