In [23]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import pickle as pkl
from skopt import BayesSearchCV

## Import data

In [2]:
path_cwd= os.getcwd()
path_data=path_cwd+'/Data/'

#train = pd.read_csv(path_data+ "train.csv")
#train.to_parquet(path_data+ "train.parquet", engine="fastparquet")
#train = pd.read_parquet(path_data+ "train.parquet", engine="fastparquet")
sample = pd.read_csv(path_data+'sample_submission.csv')
labels = pd.read_csv(path_data+'train_labels.csv')
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

test = pd.read_csv(path_data+'test.csv')
#times = pd.read_parquet("df_times.parquet", engine="fastparquet")

## Preprocessing

In [3]:
def preprocessing(df):
    # drop useless columns
    df.drop(['fullscreen','hq','music','level','room_coor_x','room_coor_y','screen_coor_x','screen_coor_y',
                    'hover_duration',], axis=1, inplace=True)
    # extract times
    times = df.elapsed_time.diff().fillna(0)
    times[times < 0] = 0
    df['times'] = times
    df.drop(['elapsed_time'], axis=1, inplace=True)
    # create room_changed column
    change = df['room_fqid'].ne(df['room_fqid'].shift().bfill()).astype(int)
    change[df['index'] == 0] = 1 # manually correct where session starts
    df['room_changed'] = change.cumsum()
    return df

In [4]:
train = pd.read_parquet(path_data + "train.parquet", engine="fastparquet")
train = preprocessing(train)

In [5]:
train.head()

Unnamed: 0,session_id,index,event_name,name,page,text,fqid,room_fqid,text_fqid,level_group,times,room_changed
0,20090312431273200,0,cutscene_click,basic,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4,0.0,1
1,20090312431273200,1,person_click,basic,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4,1323.0,1
2,20090312431273200,2,person_click,basic,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4,0.0,1
3,20090312431273200,3,person_click,basic,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4,316.0,1
4,20090312431273200,4,person_click,basic,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4,716.0,1


In [6]:
train['room_changed']

0                 1
1                 1
2                 1
3                 1
4                 1
             ...   
26296941    1704610
26296942    1704610
26296943    1704610
26296944    1704610
26296945    1704610
Name: room_changed, Length: 26296946, dtype: int32

In [7]:
times1 = train[train['level_group']=='0-4'].drop(['level_group'],axis = 1).reset_index(drop=True)
times2 = train[train['level_group']=='5-12'].drop(['level_group'],axis = 1).reset_index(drop=True)
times3 = train[train['level_group']=='13-22'].drop(['level_group'],axis = 1).reset_index(drop=True)

In [8]:
# save to parquet times
times1.to_parquet(path_data+ "times1.parquet", engine="fastparquet")
times2.to_parquet(path_data+ "times2.parquet", engine="fastparquet")
times3.to_parquet(path_data+ "times3.parquet", engine="fastparquet")

## Features extraction

In [None]:
# recover data without running preprocessing
times1 = pd.read_parquet(path_data+ "times1.parquet", engine="fastparquet")
times2 = pd.read_parquet(path_data+ "times2.parquet", engine="fastparquet")
times3 = pd.read_parquet(path_data+ "times3.parquet", engine="fastparquet")

In [4]:
def notification_text(level):
    if level == '0-4':
        useful = ['Gramps is in trouble for losing papers?','This looks like a clue!',
                "I'll record this in my notebook.","The slip is from 1916 but the team didn't start until 1974!"]
        useless = ['Found it!',"This can't be right!",'Gramps is a great historian!',
                "That's it!",'Hooray, a boring old shirt.','A boring old shirt.','Gramps is the best historian ever!']
    elif level == '5-12':
        useful = ['This place was around in 1916! I can start there!','Youmans was a suffragist!',
                  "And look! She's wearing the shirt!"]
        useless = ["It's a match!",'Theodora Youmans must be the owner!','She helped get votes for women!',
                   'Wells! What was he doing here? I should ask the librarian.','Hey, this is Youmans!','I should go to the Capitol and tell everyone!']
    elif level == '13-22':
        useful = ["That hoofprint doesn't match the flag!",'\\Ecology flag, by Ron Cobb.\\',"Hey! That's Governor Nelson in front of our flag!",
                  "And look! She's wearing the shirt!",'Ooh... \\Ecology flag, by Ron Cobb.\\']
        useless = ['Those are the same glasses!', "The archivist must've taken Teddy!",'Look at all those activists!', 'This is perfect for the exhibit.',
       'I should go to the Capitol and tell Mrs. M!','Hey, this is Youmans!','I should go to the Capitol and tell everyone!']
    return useful, useless

In [5]:
def extract_identifiers(times,level):
    # rooms
    rooms = times['room_fqid'].unique()
    # page
    notebook_t = times[times['page'].isnull() == 0]
    pages_id = list(notebook_t['page'].unique())
    # objects
    object_t = times[times['event_name'] == 'object_click']
    objects_id = list(object_t['fqid'].unique())
    # people
    people_t = times[times['event_name'] == 'person_click']
    people_id = people_t.fqid.unique()
    # observations
    obs_t = times[times['event_name'] == 'observation_click']
    obs_id = list(obs_t['fqid'].unique())
    # notifications
    noti_t = times[times['event_name'] == 'notification_click']
    useful = notification_text(level)
    noti_useful_t = noti_t[noti_t['text'].isin(useful)]
    noti_useful_id = list(noti_useful_t['text'].unique())
    # create dictionary
    identifiers = {'rooms' : rooms, 'pages_id' : pages_id, 'objects_id' : objects_id, 'persons': people_id, 'obs_id': obs_id, 'noti_useful_id': noti_useful_id}
    return identifiers

In [6]:
def feature_extractor(grouped,ids, identifiers, stats, stats_names, general_name):
    '''
    :param grouped: group by dataframe indexed by id and ONE other column whose values must be in identifiers
    :param identifiers: list of all possible identifiers
    :param stats: functions to use for aggregating
    :param stats_names: names for the fucntions precedeed by _
    :param general_name: followed by _
    :return: a pandas dataframe
    '''
    d = {general_name + str(identifier) + stat: [] for identifier in identifiers for stat in stats_names}
    for id in ids:
        if id not in grouped.index.get_level_values('session_id'):
            for identifier in identifiers:
                for stat, stat_name in zip(stats,stats_names):
                    d[general_name + str(identifier) + stat_name] +=[0]
        else:
            match = grouped.loc[id] # contains list of times for that player
            for identifier in identifiers:
                for stat, stat_name in zip(stats,stats_names):
                    if identifier in match.index:
                        d[general_name + str(identifier)+ stat_name] += [stat(match.loc[identifier].times)]
                    else:
                        d[general_name + str(identifier) + stat_name] +=[0]
    return pd.DataFrame(d, index=ids)

In [14]:
def features(times1, level):
    ids = times1['session_id'].unique()
    identifiers = extract_identifiers(times1,level)
    # times per room
    group_rooms = times1.groupby(by=['session_id','room_fqid','room_changed']).agg({'times':'sum'})
    group_rooms = group_rooms.reset_index(level=[0,1]).groupby(by=['session_id', 'room_fqid']).agg({'times': lambda x: list(x)})
    rooms_df=feature_extractor(group_rooms,ids,identifiers['rooms'],stats=[np.mean, np.std, len],stats_names=['_mean','_std', '_n'], general_name='rooms_')
    # checkpoint
    checkpoint_times = times1[times1['event_name'] == 'checkpoint']
    group_checkpoint = checkpoint_times.groupby(by=['session_id','event_name']).agg({'times':lambda x: list(x)})
    checkpoint_df = feature_extractor(group_checkpoint,ids, identifiers = ['checkpoint'], 
                                      stats=[np.mean, np.std, len], stats_names=['_mean','_std', '_n'], general_name='checkpoint_')
    # page
    notebook_times = times1[times1['page'].isnull() == 0]
    notebook_times = notebook_times[notebook_times['name']!='open']
    group_pages = notebook_times.groupby(by=['session_id','page']).agg({'times':lambda x: list(x)})
    pages_df = feature_extractor(group_pages,ids, identifiers = identifiers['pages_id'], stats=[np.mean, np.std, len], stats_names=['_mean','_std', '_n'], general_name='page_')
    # objects
    object_times = times1[(times1['event_name'] == 'object_click') | (times1['event_name'] == 'object_click')]
    group_objects = object_times.groupby(by=['session_id','fqid']).agg({'times':lambda x: list(x)})
    objects_df = feature_extractor( group_objects,ids, identifiers = identifiers['objects_id'], stats=[np.mean, np.std, len], stats_names=['_mean','_std', '_n'], general_name='object_')
    # cut-scene
    row_filter = times1.event_name=='cutscene_click'
    column_filter= ['session_id', 'room_fqid','times']
    cut_scene_clicks=times1.loc[row_filter,column_filter]
    grouped=cut_scene_clicks.groupby(by=['session_id', 'room_fqid']).agg({'times': lambda x: list(x)})
    cut_scene_clicks_df=feature_extractor(grouped,ids,identifiers['rooms'],stats=[np.mean, np.std],stats_names=['_mean','_std'], general_name='cutscene_')
    # people
    row_filter = times1.event_name=='person_click'
    column_filter= ['session_id', 'fqid','times']
    person_clicks=times1.loc[row_filter,column_filter]
    grouped = person_clicks.groupby(by=['session_id', 'fqid']).agg({'times': lambda x: list(x)})
    person_clicks_df=feature_extractor(grouped,ids,identifiers['persons'],stats=[np.mean, np.std,len],stats_names=['_mean','_std','_n'], general_name='persons_')
    # observation
    obs_times = times1[times1['event_name'] == 'observation_click']
    group_obs = obs_times.groupby(by=['session_id','fqid']).agg({'times':lambda x: list(x)})
    obs_df = feature_extractor(group_obs,ids, identifiers = identifiers['obs_id'], stats=[np.mean, np.std, len], stats_names=['_mean','_std', '_n'], general_name='obs_')
    # notification
    useful = notification_text(level)
    noti_times = times1[times1['event_name'] == 'notification_click']
    noti_useful = noti_times[noti_times['text'].isin(useful)]
    group_noti_useful = noti_useful.groupby(by=['session_id','text']).agg({'times':lambda x: list(x)})
    noti_useful_df = feature_extractor(group_noti_useful,ids, identifiers = identifiers['noti_useful_id'], stats=[np.mean, np.std, len], stats_names=['_mean','_std', '_n'], general_name='notification_')
    # aggregate features
    features = pd.concat([rooms_df, checkpoint_df, pages_df, objects_df, cut_scene_clicks_df, person_clicks_df, obs_df, noti_useful_df],axis=1)
    return features

In [15]:
features1 = features(times1,'0-4')
features1.to_csv(path_data + 'features1_new.csv')
features2 = features(times2,'5-12')
features2.to_csv(path_data + 'features2_new.csv')
features3 = features(times3,'13-22')
features3.to_csv(path_data + 'features3_new.csv')

## Features selection with Random Forest

In [3]:
features1 = pd.read_csv(path_data + 'features1_new.csv', index_col=0)
features2 = pd.read_csv(path_data + 'features2_new.csv', index_col=0)
features3 = pd.read_csv(path_data + 'features3_new.csv', index_col=0)

In [4]:
features1 = features1.loc[:, (features1 != features1.iloc[0]).any()] 
features2 = features2.loc[:, (features2 != features2.iloc[0]).any()]
features3 = features3.loc[:, (features3 != features3.iloc[0]).any()]

In [5]:
features1.head()

Unnamed: 0,rooms_tunic.historicalsociety.closet_mean,rooms_tunic.historicalsociety.closet_std,rooms_tunic.historicalsociety.closet_n,rooms_tunic.historicalsociety.basement_mean,rooms_tunic.historicalsociety.basement_std,rooms_tunic.historicalsociety.basement_n,rooms_tunic.historicalsociety.entry_mean,rooms_tunic.historicalsociety.entry_std,rooms_tunic.historicalsociety.entry_n,rooms_tunic.historicalsociety.collection_mean,...,obs_doorblock_n,obs_block_tomap2_mean,obs_block_tomap2_std,obs_block_tomap2_n,obs_block_tomap1_mean,obs_block_tomap1_std,obs_block_tomap1_n,obs_block_0_mean,obs_block_0_std,obs_block_0_n
20090312431273200,24519.5,13504.5,2,4153.0,1531.050837,3,10835.75,12478.724201,4,26613.0,...,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0
20090312433251036,20429.0,0.0,1,4814.0,0.0,1,23916.0,13814.0,2,89960.0,...,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0
20090312455206810,9887.5,8286.5,2,105393.5,104027.5,2,17012.0,8105.0,2,32396.0,...,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0
20090313091715820,23842.0,0.0,1,14187.0,0.0,1,16209.0,5947.0,2,26409.0,...,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0
20090313571836404,37649.0,0.0,1,3833.0,0.0,1,27308.5,6925.5,2,40223.0,...,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0


In [17]:
lst_valid = []
lst_pred = []
lst_pred_selected = []
conf_matrix = {}
conf_matrix_selected = {}
original_f1score = []
light_f1score = []
features_to_keep = {}
flag1 = 0
flag2 = 0
for q_no in range(1,19):
    if q_no <= 3:
        features = features1
    elif q_no <= 13:
        features = features2
        if flag1 == 0:
            print('Score original: ',f1_score(lst_valid, lst_pred,average='macro'))
            print('Score light: ',f1_score(lst_valid, lst_pred_selected,average='macro'))
            flag1 = 1
    else:
        if flag2 == 0:
            print('Score original: ',f1_score(lst_valid, lst_pred,average='macro'))
            print('Score light: ',f1_score(lst_valid, lst_pred_selected,average='macro'))
            flag2 = 1
        features = features3
    X_train, X_test, y_train, y_test = train_test_split(features, labels.loc[labels.q==q_no].set_index('session')['correct'], test_size=0.2, random_state=42,
                                                    stratify=labels.loc[labels.q==q_no].set_index('session')['correct'])
    lst_valid+=y_test.tolist()
    X_test = (X_test-X_train.mean())/X_train.std()
    X_train = (X_train-X_train.mean())/X_train.std()
    clf = RandomForestClassifier(class_weight='balanced').fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    original_f1score.append(f1_score(y_pred, y_test))
    idx = np.argsort(clf.feature_importances_)
    features_to_keep[f'{q_no}'] = idx[:len(idx)//2]
    X_train_selected = X_train.iloc[:,idx[:len(idx)//2]]
    X_test_selected = X_test.iloc[:,idx[:len(idx)//2]]
    clf_selected = RandomForestClassifier(class_weight='balanced').fit(X_train_selected, y_train)
    y_pred_selected = clf_selected.predict(X_test_selected)
    lst_pred+=y_pred.tolist()
    lst_pred_selected+=y_pred_selected.tolist()
    light_f1score.append(f1_score(y_pred_selected, y_test))
    conf_matrix[f'{q_no}'] = confusion_matrix(y_pred, y_test)
    conf_matrix_selected[f'{q_no}'] = confusion_matrix(y_pred_selected, y_test)
print('Score original: ',f1_score(lst_valid, lst_pred,average='macro'))
print('Score light: ',f1_score(lst_valid, lst_pred_selected,average='macro'))


Score original:  0.5827721728611109
Score light:  0.5426943482574975
Score original:  0.661041759185482
Score light:  0.6274734588242333
Score original:  0.6487155442282666
Score light:  0.613756739675164


In [18]:
features_light = {}
for q_no in range(1,19):
    if q_no <= 3:
        features = features1
    elif q_no <= 13:
        features = features2
    else:
        features = features3
    features_light[f'{q_no}'] = features.iloc[:,features_to_keep[f'{q_no}']]

In [20]:
with open(path_data + 'features_light.pkl', 'wb') as f:
    pkl.dump(features_light, f)

## Model training with Features Light

In [7]:
with open(path_data + 'features_light.pkl', 'rb') as f:
    features_light = pkl.load(f)

In [24]:
lst_valid = []
lst_pred = []
conf_matrix = {}
best_params = {}
for q_no in range(1,19):
    features = features_light[f'{q_no}']
    X_train, X_test, y_train, y_test = train_test_split(features, labels.loc[labels.q==q_no].set_index('session')['correct'], test_size=0.2, random_state=42,
                                                    stratify=labels.loc[labels.q==q_no].set_index('session')['correct'])
    lst_valid+=y_test.tolist()
    X_test = (X_test-X_train.mean())/X_train.std()
    X_train = (X_train-X_train.mean())/X_train.std()
    rf = RandomForestClassifier(class_weight='balanced')
    parameters = {'bootstrap': [True, False],
                'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                'min_samples_leaf': [1, 2, 4],
                'min_samples_split': [2, 5, 10],
                'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500]}
    rs = BayesSearchCV(rf, parameters, n_iter=10, verbose=1, scoring='f1_macro')
    rs.fit(X_train, y_train)
    y_pred = rs.predict(X_test)
    best_params[f'{q_no}'] = rs.best_params_
    lst_pred+=y_pred.tolist()
    conf_matrix[f'{q_no}'] = confusion_matrix(y_pred, y_test)
    print('Score: ',f1_score(lst_valid, lst_pred,average='macro'))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Score:  0.5850595380500765
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1

In [26]:
# save best_params with pickle
with open(path_data + 'best_params.pkl', 'wb') as f:
    pkl.dump(best_params, f)

In [27]:
for i in range(1,19):
    print(conf_matrix[str(i)])

[[ 532  826]
 [ 752 2603]]
[[  11  301]
 [  89 4312]]
[[  33  361]
 [ 278 4041]]
[[ 292  502]
 [ 659 3260]]
[[1058  755]
 [1071 1829]]
[[ 431  718]
 [ 625 2939]]
[[ 563  810]
 [ 681 2659]]
[[ 869 1074]
 [ 935 1835]]
[[ 607  958]
 [ 636 2512]]
[[1105  738]
 [1226 1644]]
[[ 819  937]
 [ 861 2096]]
[[ 282 1088]
 [ 364 2979]]
[[2318  643]
 [1098  654]]
[[ 516  754]
 [ 862 2581]]
[[1154  756]
 [1292 1511]]
[[ 262  647]
 [ 988 2816]]
[[ 441  827]
 [1030 2415]]
[[  26  131]
 [ 207 4349]]


In [None]:
# Observation: the light model with xgboost has same f1 score inside the for, but much worse overall (the same metric that they use)
# from .65 to .59