In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb

# Constants
DATA_DIR = 'Data/'
SCORING = ['accuracy', 'roc_auc', 'neg_mean_squared_error']



In [2]:
student_logs = pd.read_pickle('student_train_logs')
student_test_logs = pd.read_pickle('student_test_logs')

In [3]:
train_idx = student_logs['ITEST_id'].unique()

In [4]:
train_labels = pd.read_csv('Data/training_label.csv', index_col='ITEST_id').sort_index()
train_labels.drop_duplicates(subset=None, keep='first', inplace=True)

In [5]:
test_labels = pd.read_csv(DATA_DIR + 'validation_test_label.csv', index_col='ITEST_id')
test_labels.drop_duplicates(subset=None, keep='first', inplace=True)

In [6]:
students_features = pd.concat([student_logs.groupby('ITEST_id').mean(), student_logs.groupby('ITEST_id').std()], axis=1)
students_features = students_features.fillna(0)
print(students_features.shape)
students_features.head()

(467, 116)


Unnamed: 0_level_0,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,AveResConf,AveResFrust,AveResOfftask,AveResGaming,...,confidence(CONFUSED),confidence(FRUSTRATED),confidence(OFF TASK),confidence(GAMING),RES_BORED,RES_CONCENTRATING,RES_CONFUSED,RES_FRUSTRATED,RES_OFFTASK,RES_GAMING
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.185138,0.099734,0.438492,504.0,0.277149,0.644744,0.098078,0.162771,0.213378,0.005554,...,0.289391,0.35111,0.209475,0.122681,0.125372,0.160885,0.228893,0.333023,0.226656,0.023455
27,0.142031,0.069297,0.348837,129.0,0.330226,0.551367,0.122658,0.09542,0.34809,0.0369,...,0.314896,0.284357,0.306335,0.194901,0.135316,0.193386,0.257577,0.253684,0.336302,0.16129
33,0.459813,0.202787,0.686391,169.0,0.260426,0.650769,0.059971,0.061834,0.296286,0.010954,...,0.241292,0.21849,0.276394,0.120402,0.147747,0.153652,0.186395,0.214031,0.299967,0.050044
35,0.255164,0.158848,0.379658,993.0,0.222796,0.650079,0.069987,0.164347,0.153147,0.2368,...,0.254827,0.354863,0.198341,0.352301,0.112172,0.153547,0.208444,0.332016,0.199667,0.363046
37,0.071909,0.046183,0.305785,121.0,0.326384,0.653445,0.125325,0.094714,0.451467,0.016241,...,0.314989,0.287924,0.292861,0.152899,0.123891,0.137297,0.255149,0.273458,0.320803,0.068074


In [7]:
students_test_features = pd.concat([student_test_logs.groupby('ITEST_id').mean(), student_test_logs.groupby('ITEST_id').std()], axis=1)
students_test_features = students_test_features.fillna(0)
print(students_test_features.shape)
students_test_features.head()

(172, 116)


Unnamed: 0_level_0,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,AveResConf,AveResFrust,AveResOfftask,AveResGaming,...,confidence(CONFUSED),confidence(FRUSTRATED),confidence(OFF TASK),confidence(GAMING),RES_BORED,RES_CONCENTRATING,RES_CONFUSED,RES_FRUSTRATED,RES_OFFTASK,RES_GAMING
ITEST_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.185138,0.099734,0.438492,504.0,0.277149,0.644744,0.098078,0.162771,0.213378,0.005554,...,0.289391,0.35111,0.209475,0.122681,0.125372,0.160885,0.228893,0.333023,0.226656,0.023455
101,0.195441,0.117634,0.403553,394.0,0.252399,0.619236,0.123829,0.241243,0.195611,0.069793,...,0.320853,0.411071,0.203885,0.229437,0.119954,0.18228,0.28072,0.398842,0.216833,0.202453
161,0.301437,0.145797,0.483425,362.0,0.262823,0.618728,0.103445,0.048144,0.211666,0.134778,...,0.296977,0.202576,0.232868,0.299762,0.136568,0.190959,0.245751,0.191347,0.241681,0.312756
164,0.117598,0.08144,0.256983,179.0,0.252204,0.597314,0.135032,0.105664,0.243513,0.214158,...,0.327819,0.300253,0.297758,0.347104,0.129691,0.18223,0.278749,0.257089,0.299434,0.337269
176,0.318761,0.154002,0.575949,316.0,0.235391,0.645941,0.086661,0.082399,0.208176,0.014226,...,0.276198,0.258004,0.212216,0.141028,0.122831,0.169517,0.216289,0.256821,0.224683,0.051597


In [8]:
def runCV(clf, X_train, y_train, X_test, k):
    print_scores(cross_validate(clf, X_train, y_train, cv=k, scoring=SCORING, return_train_score=False))
    clf.fit(X_train, y_train)
    print_prediction(clf.predict(X_test))

In [9]:
def print_scores(scores):
    print('Scores')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2))
    print("AUC: %0.2f (+/- %0.2f)" % (scores['test_roc_auc'].mean(), scores['test_roc_auc'].std() * 2))
    print("RMSE: %0.2f (+/- %0.2f)" % (np.sqrt(-scores['test_neg_mean_squared_error']).mean(), scores['test_neg_mean_squared_error'].std() * 2))

In [10]:
def print_prediction(prediction):
    for pred in prediction:
        print(pred, end=',')

In [11]:
train_ids = train_labels.index.tolist()
X_train = np.array([np.array(students_features.loc[student_id].tolist(), dtype=float) for student_id in train_ids])
y_train = np.array([train_labels['isSTEM'].loc[student_id] for student_id in train_ids])

In [12]:
test_ids = test_labels.index.tolist()
X_test = np.array([np.array(students_test_features.loc[student_id].tolist(), dtype=float) for student_id in test_ids])
X_test.shape

(172, 116)

## XGBoost

In [13]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [14]:
depths=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
etas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

In [15]:
best_auc = 0
best_rmse = 1
max_score = 0
best_param = {}

for depth in depths:
    for eta in etas:
        # specify parameters via map
        param = {'max_depth':depth, 'eta':eta, 'silent':1, 'objective':'binary:logistic' }
        num_round = 7
        cv = xgb.cv(param, dtrain, num_round, 7, metrics=['rmse', 'auc'])
        auc = cv.mean()['test-auc-mean']
        rmse = cv.mean()['test-rmse-mean']
        score = 1-rmse + auc
        
        if(score > max_score):
            max_score = score
            best_auc = auc
            best_rmse = rmse
            best_param = param

In [16]:
print('Best auc:', best_auc)
print('Best rmse:', best_rmse)
print('Best score:', max_score)
print('Best params:', best_param)

Best auc: 0.620783591837
Best rmse: 0.432035959184
Best score: 1.18874763265
Best params: {'max_depth': 2, 'eta': 0.5, 'silent': 1, 'objective': 'binary:logistic'}


In [17]:
bst = xgb.train(best_param, dtrain, num_round)
# make prediction
print_prediction(bst.predict(dtest))

0.208033,0.301487,0.319491,0.122163,0.208033,0.542282,0.304313,0.36878,0.44262,0.101331,0.304313,0.679666,0.349283,0.360886,0.456497,0.583884,0.497027,0.261386,0.381578,0.115309,0.407178,0.129211,0.611636,0.109822,0.0874335,0.387124,0.143484,0.0876499,0.208033,0.292572,0.53454,0.345538,0.099707,0.165,0.237291,0.195764,0.187163,0.187608,0.165,0.710997,0.36768,0.332248,0.138248,0.208033,0.165,0.196471,0.0759615,0.245106,0.538903,0.0849261,0.336814,0.0582394,0.304313,0.277975,0.3072,0.571053,0.208033,0.36768,0.182129,0.44262,0.109822,0.261386,0.208033,0.485728,0.345538,0.3662,0.340671,0.500019,0.814174,0.500803,0.500803,0.143484,0.377043,0.088212,0.459744,0.157038,0.367121,0.273772,0.304313,0.309991,0.413663,0.500019,0.182129,0.234878,0.0822154,0.217424,0.269713,0.109822,0.172767,0.726563,0.461913,0.0759615,0.571053,0.413663,0.314741,0.622852,0.109822,0.694286,0.217424,0.196471,0.273772,0.397554,0.144623,0.243102,0.0849261,0.500803,0.571053,0.0849261,0.517591,0.414289,0.36768,0.234878,0.2