In [1]:
# ! pip install polars

In [2]:
import numpy as np
import pandas as pd
import polars as pl
import logging

from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

from functools import reduce

In [3]:
df_data = pd.read_csv('../Data/train.csv')

In [4]:
df_data.shape

(26296946, 20)

In [5]:
df = df_data.copy()

#### 1. Calculate the diff

In [6]:
df['elapsed_time_diff'] = -df.groupby('session_id')['elapsed_time'].diff(periods=-1)
df['elapsed_time_diff'] = df['elapsed_time_diff'].dropna()

In [7]:
def replace_negative_with_median(group, col):
    median_value = group[col].median()
    group[col] = group[col].apply(lambda x: x if x >= 0 else median_value)
    return group

df1 = df.groupby(['session_id', 'level_group']).apply(replace_negative_with_median, col = 'elapsed_time_diff')
df1['elapsed_time_diff'] = pd.to_numeric(df1['elapsed_time_diff'], errors='coerce')

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df1 = df.groupby(['session_id', 'level_group']).apply(replace_negative_with_median, col = 'elapsed_time_diff')


In [8]:
df1['elapsed_time_diff'].describe()

count    2.629695e+07
mean     7.233179e+03
std      1.162084e+06
min     -5.200000e+01
25%      5.000000e+02
50%      9.650000e+02
75%      1.666000e+03
max      1.743228e+09
Name: elapsed_time_diff, dtype: float64

#### 2. Create features of extreme elapsed time ratio 

In [13]:
def extreme_elapsed_time_ratio(group, col):
    return group[col].sum() / group[col].count()

In [14]:
extreme_elapsed_time_feature_list = []

In [15]:
extreme_elapsed_time_feature_df = df1[['session_id', 'level_group']].drop_duplicates()

In [16]:
for i in [10000, 20000, 50000, 100000, 200000, 500000]:
    df1[f'elapsed_time_over{i}'] = np.where(df1['elapsed_time_diff'] > i, 1, 0)
    tmp = df1.groupby(['session_id', 'level_group']).apply(extreme_elapsed_time_ratio, col=f'elapsed_time_over{i}').reset_index().rename(columns={0: f'elapsed_time_over{i}_ratio'})
    extreme_elapsed_time_feature_df = extreme_elapsed_time_feature_df.merge(tmp, on=['session_id', 'level_group'], how='left')

In [17]:
extreme_elapsed_time_feature_df

Unnamed: 0,session_id,level_group,elapsed_time_over10000_ratio,elapsed_time_over20000_ratio,elapsed_time_over50000_ratio,elapsed_time_over100000_ratio,elapsed_time_over200000_ratio,elapsed_time_over500000_ratio
0,20090312431273200,0-4,0.012121,0.012121,0.000000,0.000000,0.000000,0.000000
1,20090312431273200,5-12,0.003378,0.003378,0.003378,0.003378,0.003378,0.000000
2,20090312431273200,13-22,0.002381,0.002381,0.000000,0.000000,0.000000,0.000000
3,20090312433251036,0-4,0.014388,0.014388,0.000000,0.000000,0.000000,0.000000
4,20090312433251036,5-12,0.002532,0.002532,0.002532,0.002532,0.002532,0.000000
...,...,...,...,...,...,...,...,...
70681,22100219442786200,5-12,0.003597,0.003597,0.003597,0.003597,0.000000,0.000000
70682,22100219442786200,13-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
70683,22100221145014656,0-4,0.009479,0.009479,0.004739,0.000000,0.000000,0.000000
70684,22100221145014656,5-12,0.027778,0.006944,0.005208,0.003472,0.003472,0.003472


In [18]:
df1[df1['level_group'] == '13-22']['elapsed_time_diff'].describe()

count    1.347170e+07
mean     7.112364e+03
std      1.242947e+06
min     -5.200000e+01
25%      4.900000e+02
50%      9.340000e+02
75%      1.612000e+03
max      1.743228e+09
Name: elapsed_time_diff, dtype: float64

In [19]:
# extreme_elapsed_time_feature_df.to_csv('../data/feature_part3/extreme_elapsed_time_feature.csv')

#### 3. Swapping rate of level

In [20]:
df1['level_diff'] = df1.groupby('session_id')['level'].diff().fillna(0)
# df1['level_diff'] = pd.to_numeric(df['level_diff'], errors='coerce')

In [21]:
def level_swapping_ratio(group):
    col = (group['level_diff'] < 0)
    return col.sum() / len(col)

In [22]:
level_swapping_feature_df = df1.groupby(['session_id', 'level_group']).apply(level_swapping_ratio).reset_index().rename(columns={0: 'level_swapping_ratio'})

In [23]:
level_swapping_feature_df[level_swapping_feature_df['level_swapping_ratio'] != 0]

Unnamed: 0,session_id,level_group,level_swapping_ratio
249,20100017310338576,0-4,0.004444
504,20100110332615344,0-4,0.002976
645,20100112152091080,0-4,0.003497
714,20100113211402190,0-4,0.003817
741,20100113403690710,0-4,0.003984
...,...,...,...
69747,22090216055554660,0-4,0.005128
69916,22090314145897064,13-22,0.001261
70017,22090409201202380,0-4,0.003215
70299,22090510463732496,0-4,0.002299


In [24]:
# level_swapping_feature_df.to_csv('../data/feature_part3/level_swapping_feature.csv')

#### 4. Reading speed

In [25]:
df1.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,...,music,level_group,elapsed_time_diff,elapsed_time_over10000,elapsed_time_over20000,elapsed_time_over50000,elapsed_time_over100000,elapsed_time_over200000,elapsed_time_over500000,level_diff
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,1323.0,0,0,0,0,0,0,0.0
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,749.0,0,0,0,0,0,0,0.0
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,316.0,0,0,0,0,0,0,0.0
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,716.0,0,0,0,0,0,0,0.0
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,...,1,0-4,1560.0,0,0,0,0,0,0,0.0


In [26]:
def replace_text(value):
    if (pd.notna(value) and value.startswith('\\u')) or value == 'undefined':
        return None
    else:
        return value

In [27]:
df1['text'] = df1['text'].apply(replace_text)

In [28]:
df1.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,...,music,level_group,elapsed_time_diff,elapsed_time_over10000,elapsed_time_over20000,elapsed_time_over50000,elapsed_time_over100000,elapsed_time_over200000,elapsed_time_over500000,level_diff
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,1323.0,0,0,0,0,0,0,0.0
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,749.0,0,0,0,0,0,0,0.0
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,316.0,0,0,0,0,0,0,0.0
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,...,1,0-4,716.0,0,0,0,0,0,0,0.0
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,...,1,0-4,1560.0,0,0,0,0,0,0,0.0


In [29]:
df1['text_length'] = df1['text'].apply(lambda x: len(x) if x is not None else np.nan)

In [30]:
df1['reading_speed'] = df1['text_length'] / df1['elapsed_time_diff']

In [31]:
reading_speed = df1[['session_id', 'level_group', 'reading_speed']]

In [32]:
reading_speed_feature_df = pd.DataFrame()
reading_speed_feature_df['reading_speed_mean'] = reading_speed.groupby(['session_id', 'level_group']).mean()
reading_speed_feature_df['reading_speed_median'] = reading_speed.groupby(['session_id', 'level_group']).median()
reading_speed_feature_df['reading_speed_range'] = reading_speed.groupby(['session_id', 'level_group']).max() - reading_speed.groupby(['session_id', 'level_group']).min()

In [33]:
reading_speed_feature_df = reading_speed_feature_df.reset_index()

In [177]:
reading_speed_feature_df.loc[np.isinf(reading_speed_feature_df['reading_speed_median'])]

Unnamed: 0,session_id,level_group,reading_speed_mean,reading_speed_median,reading_speed_range


In [34]:
# reading_speed_feature_df.to_csv('../data/feature_part3/reading_speed_feature.csv')

#### 5. merge features

In [66]:
df_session_event_elapsed_time = pd.read_csv('../data/feature_others/session_event_elapsed_time.csv')
df_session_room_elapsed_time = pd.read_csv('../data/feature_others/session_room_elapsed_time.csv')
df_session_event_frequency = pd.read_csv('../data/feature_others/session_event_frequency.csv')

In [67]:
df_session_event_elapsed_time.columns = ['session_id', 'level_group', 'navigate_click_event_elapsed_time', 'person_click_event_elapsed_time', 'cutscene_click_event_elapsed_time', 'object_click_event_elapsed_time']

In [68]:
for i in range(2, len(df_session_room_elapsed_time.columns)):
    df_session_room_elapsed_time = df_session_room_elapsed_time.rename(columns={df_session_room_elapsed_time.columns[i]: df_session_room_elapsed_time.columns[i] + '_room' + '_elapsed_time'})

In [69]:
df_session_room_elapsed_time

Unnamed: 0,session_id,level_group,tunic.capitol_0.hall_room_elapsed_time,tunic.capitol_1.hall_room_elapsed_time,tunic.capitol_2.hall_room_elapsed_time,tunic.drycleaner.frontdesk_room_elapsed_time,tunic.flaghouse.entry_room_elapsed_time,tunic.historicalsociety.basement_room_elapsed_time,tunic.historicalsociety.cage_room_elapsed_time,tunic.historicalsociety.closet_room_elapsed_time,...,tunic.historicalsociety.collection_room_elapsed_time,tunic.historicalsociety.collection_flag_room_elapsed_time,tunic.historicalsociety.entry_room_elapsed_time,tunic.historicalsociety.frontdesk_room_elapsed_time,tunic.historicalsociety.stacks_room_elapsed_time,tunic.humanecology.frontdesk_room_elapsed_time,tunic.kohlcenter.halloffame_room_elapsed_time,tunic.library.frontdesk_room_elapsed_time,tunic.library.microfiche_room_elapsed_time,tunic.wildlife.center_room_elapsed_time
0,20090312431273200,0-4,30837.0,,,,,10610.0,,53062.0,...,25565.0,,45630.0,,5267.0,,26800.0,,,
1,20090312431273200,13-22,,5664.0,1971.0,,27499.0,56605.0,63964.0,,...,,11933.0,74457.0,18481.0,20034.0,,,57745.0,17778.0,86357.0
2,20090312431273200,5-12,6983.0,13439.0,,32178.0,,6334.0,,,...,,,20406.0,48980.0,30494.0,34880.0,,32080.0,14065.0,
3,20090312433251036,0-4,37409.0,,,,,5369.0,,25243.0,...,89645.0,,46768.5,,,,31138.5,,,
4,20090312433251036,13-22,,4506.0,190045.5,113583.0,104822.5,86904.5,143655.5,,...,,54224.0,575743.5,80724.0,145011.0,95466.5,170335.5,591256.0,79866.0,188703.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70681,22100219442786200,13-22,,8570.0,1970.0,,42515.0,53300.0,85947.0,,...,,23123.0,91015.0,33252.0,36439.0,,,36190.0,15691.0,123487.0
70682,22100219442786200,5-12,10254.0,1972.0,,29386.0,,6901.0,,,...,,,20082.0,66639.0,42628.0,34718.0,,52754.0,28409.0,
70683,22100221145014656,0-4,78817.0,,,,,3700.0,,86361.0,...,109624.0,,91757.0,,9458.0,,60101.5,,,
70684,22100221145014656,13-22,,13878.0,61427.0,,48647.0,108379.0,174144.0,,...,,23353.0,190459.0,90784.0,83800.0,,,47286.0,34592.0,308457.0


In [70]:
df_session_event_frequency.columns = ['session_id', 'level_group', 'navigate_click_event_frequency', 'person_click_event_frequency', 'cutscene_click_event_frequency', 'object_click_event_frequency']

In [71]:
df_session_event_frequency

Unnamed: 0,session_id,level_group,navigate_click_event_frequency,person_click_event_frequency,cutscene_click_event_frequency,object_click_event_frequency
0,20090312431273200,0-4,0.490909,0.133333,0.169697,0.066667
1,20090312431273200,13-22,0.404762,0.292857,0.142857,0.047619
2,20090312431273200,5-12,0.347973,0.351351,0.040541,0.094595
3,20090312433251036,0-4,0.352518,0.129496,0.258993,0.107914
4,20090312433251036,13-22,0.491133,0.111796,0.050116,0.063994
...,...,...,...,...,...,...
91,20100008103581056,13-22,0.470297,0.221122,0.125413,0.057756
92,20100008103581056,5-12,0.315476,0.351190,0.032738,0.119048
93,20100008262217530,0-4,0.389831,0.152542,0.228814,0.067797
94,20100008262217530,13-22,0.361111,0.270833,0.175926,0.050926


In [73]:
df_keyue = pd.read_csv('../data/feature_others/features_keyue.csv')
df_keyue

Unnamed: 0,session_id,level_group,map_hover_duration,object_hover_duration,room_change_ratio,text_change_ratio,bingo_time_mean,first_bingo_elapsed_time
0,20090312431273200,0-4,129.500000,4649.500000,0.073171,0.295082,,
1,20090312431273200,13-22,257.000000,1592.307692,0.071599,0.179487,193390.250000,1219035.0
2,20090312431273200,5-12,281.444444,1264.142857,0.071186,0.184000,20302.714286,346295.0
3,20090312433251036,0-4,284.333333,2035.400000,0.043478,0.200000,,
4,20090312433251036,13-22,587.236559,1095.621212,0.061728,0.219298,292371.700000,2317036.0
...,...,...,...,...,...,...,...,...
62701,22100219442786200,13-22,182.076923,2207.727273,0.071090,0.191617,166319.750000,1136990.0
62702,22100219442786200,5-12,322.100000,1887.222222,0.072202,0.165217,24204.000000,368860.0
62703,22100221145014656,0-4,292.500000,5025.111111,0.038095,0.225352,,
62704,22100221145014656,13-22,430.215385,1445.050000,0.080369,0.165179,447364.000000,5205501.0


In [74]:
df_rae = extreme_elapsed_time_feature_df.merge(level_swapping_feature_df, on=['session_id', 'level_group']).\
             merge(reading_speed_feature_df, on=['session_id', 'level_group'])

In [75]:
df_rae

Unnamed: 0,session_id,level_group,elapsed_time_over10000_ratio,elapsed_time_over20000_ratio,elapsed_time_over50000_ratio,elapsed_time_over100000_ratio,elapsed_time_over200000_ratio,elapsed_time_over500000_ratio,level_swapping_ratio,reading_speed_mean,reading_speed_median,reading_speed_range
0,20090312431273200,0-4,0.012121,0.012121,0.000000,0.000000,0.000000,0.000000,0.0,0.051334,0.038718,0.283814
1,20090312431273200,5-12,0.003378,0.003378,0.003378,0.003378,0.003378,0.000000,0.0,0.052444,0.042458,0.539224
2,20090312431273200,13-22,0.002381,0.002381,0.000000,0.000000,0.000000,0.000000,0.0,0.048174,0.041204,0.232685
3,20090312433251036,0-4,0.014388,0.014388,0.000000,0.000000,0.000000,0.000000,0.0,0.081025,0.052737,0.474079
4,20090312433251036,5-12,0.002532,0.002532,0.002532,0.002532,0.002532,0.000000,0.0,0.042012,0.030738,0.180984
...,...,...,...,...,...,...,...,...,...,...,...,...
70681,22100219442786200,5-12,0.003597,0.003597,0.003597,0.003597,0.000000,0.000000,0.0,0.059911,0.035328,0.360941
70682,22100219442786200,13-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.041844,0.029834,0.284616
70683,22100221145014656,0-4,0.009479,0.009479,0.004739,0.000000,0.000000,0.000000,0.0,0.032673,0.017817,0.258853
70684,22100221145014656,5-12,0.027778,0.006944,0.005208,0.003472,0.003472,0.003472,0.0,0.028798,0.016803,0.443814


In [77]:
dfs = [df_rae, df_keyue, df_session_event_elapsed_time, df_session_room_elapsed_time, df_session_event_frequency]

features = reduce(lambda left, right: pd.merge(left, right, on=['session_id', 'level_group'], how='left'), dfs)

In [78]:
features

Unnamed: 0,session_id,level_group,elapsed_time_over10000_ratio,elapsed_time_over20000_ratio,elapsed_time_over50000_ratio,elapsed_time_over100000_ratio,elapsed_time_over200000_ratio,elapsed_time_over500000_ratio,level_swapping_ratio,reading_speed_mean,...,tunic.historicalsociety.stacks_room_elapsed_time,tunic.humanecology.frontdesk_room_elapsed_time,tunic.kohlcenter.halloffame_room_elapsed_time,tunic.library.frontdesk_room_elapsed_time,tunic.library.microfiche_room_elapsed_time,tunic.wildlife.center_room_elapsed_time,navigate_click_event_frequency,person_click_event_frequency,cutscene_click_event_frequency,object_click_event_frequency
0,20090312431273200,0-4,0.012121,0.012121,0.000000,0.000000,0.000000,0.000000,0.0,0.051334,...,5267.0,,26800.0,,,,0.490909,0.133333,0.169697,0.066667
1,20090312431273200,5-12,0.003378,0.003378,0.003378,0.003378,0.003378,0.000000,0.0,0.052444,...,30494.0,34880.0,,32080.0,14065.0,,0.347973,0.351351,0.040541,0.094595
2,20090312431273200,13-22,0.002381,0.002381,0.000000,0.000000,0.000000,0.000000,0.0,0.048174,...,20034.0,,,57745.0,17778.0,86357.0,0.404762,0.292857,0.142857,0.047619
3,20090312433251036,0-4,0.014388,0.014388,0.000000,0.000000,0.000000,0.000000,0.0,0.081025,...,,,31138.5,,,,0.352518,0.129496,0.258993,0.107914
4,20090312433251036,5-12,0.002532,0.002532,0.002532,0.002532,0.002532,0.000000,0.0,0.042012,...,86247.0,36406.5,,57243.0,64811.5,,0.291139,0.245570,0.027848,0.187342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70681,22100219442786200,5-12,0.003597,0.003597,0.003597,0.003597,0.000000,0.000000,0.0,0.059911,...,42628.0,34718.0,,52754.0,28409.0,,,,,
70682,22100219442786200,13-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.041844,...,36439.0,,,36190.0,15691.0,123487.0,,,,
70683,22100221145014656,0-4,0.009479,0.009479,0.004739,0.000000,0.000000,0.000000,0.0,0.032673,...,9458.0,,60101.5,,,,,,,
70684,22100221145014656,5-12,0.027778,0.006944,0.005208,0.003472,0.003472,0.003472,0.0,0.028798,...,321551.0,169636.0,,70492.0,70252.0,,,,,


In [107]:
mean_by_session = df.groupby(['session_id', 'level_group'])[['fullscreen', 'hq', 'music']].mean().reset_index()

In [108]:
mean_by_session

Unnamed: 0,session_id,level_group,fullscreen,hq,music
0,20090312431273200,0-4,0.0,0.0,1.0
1,20090312431273200,13-22,0.0,0.0,1.0
2,20090312431273200,5-12,0.0,0.0,1.0
3,20090312433251036,0-4,0.0,0.0,0.0
4,20090312433251036,13-22,0.0,0.0,0.0
...,...,...,...,...,...
70681,22100219442786200,13-22,0.0,0.0,1.0
70682,22100219442786200,5-12,0.0,0.0,1.0
70683,22100221145014656,0-4,0.0,0.0,1.0
70684,22100221145014656,13-22,0.0,0.0,1.0


In [157]:
test_0_4 = pd.read_csv('../data/feature_others/feature_test_click_0-4.csv')
test_5_12 = pd.read_csv('../data/feature_others/feature_test_click_5-12.csv')
test_13_22 = pd.read_csv('../data/feature_others/feature_test_click_13-22.csv')

In [162]:
test_0_4.shape, test_5_12.shape, test_13_22.shape

((4713, 3), (4713, 3), (4713, 3))

In [158]:
test_0_4['level_group'] = '0-4'
test_5_12['level_group'] = '5-12'
test_13_22['level_group'] = '13-22'

test_click = pd.concat([test_0_4, test_5_12, test_13_22], axis=0)

In [179]:
features_test = test_click.merge(mean_by_session, on=['session_id', 'level_group'], how='left').\
                            merge(features, on=['session_id', 'level_group'], how='left')

In [180]:
features_test

Unnamed: 0,session_id,fraction_of_common_click,level_group,fullscreen,hq,music,elapsed_time_over10000_ratio,elapsed_time_over20000_ratio,elapsed_time_over50000_ratio,elapsed_time_over100000_ratio,...,tunic.historicalsociety.stacks_room_elapsed_time,tunic.humanecology.frontdesk_room_elapsed_time,tunic.kohlcenter.halloffame_room_elapsed_time,tunic.library.frontdesk_room_elapsed_time,tunic.library.microfiche_room_elapsed_time,tunic.wildlife.center_room_elapsed_time,navigate_click_event_frequency,person_click_event_frequency,cutscene_click_event_frequency,object_click_event_frequency
0,20090313571836404,0.317308,0-4,0.0,0.0,1.0,0.017857,0.017857,0.000000,0.000000,...,,,26317.0,,,,0.312500,0.169643,0.232143,0.080357
1,20090315085850788,0.050955,0-4,1.0,0.0,1.0,0.023952,0.017964,0.017964,0.000000,...,14976.0,,48747.5,,,,0.437126,0.107784,0.185629,0.053892
2,20090315170769824,0.588235,0-4,0.0,0.0,1.0,0.012658,0.012658,0.000000,0.000000,...,3348.0,,79949.0,,,,0.335443,0.113924,0.170886,0.272152
3,20090316152177500,0.448454,0-4,0.0,0.0,1.0,0.024752,0.009901,0.004950,0.000000,...,11247.0,,54863.0,,,,0.534653,0.133663,0.133663,0.054455
4,20090317080721164,0.321918,0-4,0.0,0.0,1.0,0.019737,0.013158,0.006579,0.000000,...,27842.0,,51834.0,,,,0.427632,0.125000,0.177632,0.131579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14134,22090618050629904,0.837647,13-22,0.0,0.0,1.0,0.017391,0.008696,0.004348,0.000000,...,131548.0,,,37597.0,70274.0,181225.0,,,,
14135,22100208551963804,0.785294,13-22,0.0,0.0,1.0,0.023499,0.020888,0.020888,0.018277,...,847126.0,,,38427.0,135483.0,113041.0,,,,
14136,22100209282577744,0.854610,13-22,1.0,1.0,1.0,0.003378,0.001689,0.001689,0.001689,...,35828.0,,,36822.0,22644.0,193954.0,,,,
14137,22100211280762644,0.816384,13-22,0.0,0.0,1.0,0.000000,0.000000,0.000000,0.000000,...,31537.5,,,31518.0,15004.0,101665.0,,,,


In [135]:
features1 = features[features['level_group'] == '0-4'].drop('level_group', axis=1)
features2 = features[features['level_group'] == '5-12'].drop('level_group', axis=1)
features3 = features[features['level_group'] == '13-22'].drop('level_group', axis=1)

In [136]:
mean_by_session1 = mean_by_session[mean_by_session['level_group'] == '0-4'].drop('level_group', axis=1)
mean_by_session2 = mean_by_session[mean_by_session['level_group'] == '5-12'].drop('level_group', axis=1)
mean_by_session3 = mean_by_session[mean_by_session['level_group'] == '13-22'].drop('level_group', axis=1)

In [137]:
features_test_0_4 = test_0_4.merge(features1, on='session_id').merge(mean_by_session1, on='session_id')
features_test_5_12 = test_5_12.merge(features2, on='session_id').merge(mean_by_session2, on='session_id')
features_test_13_22 = test_13_22.merge(features3, on='session_id').merge(mean_by_session3, on='session_id')

In [138]:
features_test_0_4 = features_test_0_4.dropna(axis=1, how='all')
features_test_5_12 = features_test_5_12.dropna(axis=1, how='all')
features_test_13_22 = features_test_13_22.dropna(axis=1, how='all')

In [139]:
features_test_5_12.shape

(4713, 42)

In [141]:
features_test_0_4.to_csv('../data/feature_merged/features_test_0-4.csv')
features_test_5_12.to_csv('../data/feature_merged/features_test_5-12.csv')
features_test_13_22.to_csv('../data/feature_merged/features_test_13-22.csv')

In [163]:
train_0_4 = pd.read_csv('../data/feature_others/feature_train_click_0-4.csv')
train_5_12 = pd.read_csv('../data/feature_others/feature_train_click_5-12.csv')
train_13_22 = pd.read_csv('../data/feature_others/feature_train_click_13-22.csv')

In [164]:
train_0_4['level_group'] = '0-4'
train_5_12['level_group'] = '5-12'
train_13_22['level_group'] = '13-22'

In [165]:
train_click = pd.concat([train_0_4, train_5_12, train_13_22], axis=0)

In [166]:
train_click

Unnamed: 0,session_id,fraction_of_common_click,level_group
0,20090312431273200,0.865385,0-4
1,20090312433251036,0.861538,0-4
2,20090312455206810,0.737931,0-4
3,20090313091715820,0.755952,0-4
4,20090314035813970,0.653061,0-4
...,...,...,...
18844,22090619362224080,0.831094,13-22
18845,22100212552203824,0.891700,13-22
18846,22100213081672770,0.816635,13-22
18847,22100215460321130,0.743719,13-22


In [128]:
features_train_0_4 = train_0_4.merge(features1, on='session_id').merge(mean_by_session1, on='session_id')
features_train_5_12 = train_5_12.merge(features2, on='session_id').merge(mean_by_session2, on='session_id')
features_train_13_22 = train_13_22.merge(features3, on='session_id').merge(mean_by_session3, on='session_id')

In [129]:
features_train_0_4 = features_train_0_4.dropna(axis=1, how='all')
features_train_5_12 = features_train_5_12.dropna(axis=1, how='all')
features_train_13_22 = features_train_13_22.dropna(axis=1, how='all')

In [130]:
features_train_0_4.shape

(18849, 34)

In [131]:
features_train_5_12.shape

(18849, 42)

In [132]:
features_train_13_22.shape

(18849, 46)

In [181]:
features_train = train_click.merge(mean_by_session, on=['session_id', 'level_group'], how='left').\
                             merge(features, on=['session_id', 'level_group'], how='left')


In [182]:
features_train

Unnamed: 0,session_id,fraction_of_common_click,level_group,fullscreen,hq,music,elapsed_time_over10000_ratio,elapsed_time_over20000_ratio,elapsed_time_over50000_ratio,elapsed_time_over100000_ratio,...,tunic.historicalsociety.stacks_room_elapsed_time,tunic.humanecology.frontdesk_room_elapsed_time,tunic.kohlcenter.halloffame_room_elapsed_time,tunic.library.frontdesk_room_elapsed_time,tunic.library.microfiche_room_elapsed_time,tunic.wildlife.center_room_elapsed_time,navigate_click_event_frequency,person_click_event_frequency,cutscene_click_event_frequency,object_click_event_frequency
0,20090312431273200,0.865385,0-4,0.0,0.0,1.0,0.012121,0.012121,0.000000,0.000000,...,5267.0,,26800.0,,,,0.490909,0.133333,0.169697,0.066667
1,20090312433251036,0.861538,0-4,0.0,0.0,0.0,0.014388,0.014388,0.000000,0.000000,...,,,31138.5,,,,0.352518,0.129496,0.258993,0.107914
2,20090312455206810,0.737931,0-4,1.0,1.0,1.0,0.020134,0.013423,0.006711,0.006711,...,,,34643.5,,,,0.382550,0.120805,0.328859,0.067114
3,20090313091715820,0.755952,0-4,1.0,1.0,1.0,0.011364,0.011364,0.000000,0.000000,...,,,45789.0,,,,0.454545,0.125000,0.193182,0.102273
4,20090314035813970,0.653061,0-4,0.0,0.0,1.0,0.032468,0.012987,0.006494,0.000000,...,,,69999.0,,,,0.344156,0.155844,0.201299,0.077922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56542,22090619362224080,0.831094,13-22,0.0,0.0,1.0,0.019678,0.003578,0.001789,0.001789,...,38729.0,10722.0,,99049.0,68783.0,201488.0,,,,
56543,22100212552203824,0.891700,13-22,0.0,0.0,1.0,0.009560,0.002868,0.000956,0.000956,...,100054.0,,,54769.0,43971.0,499463.0,,,,
56544,22100213081672770,0.816635,13-22,0.0,0.0,1.0,0.001742,0.001742,0.001742,0.001742,...,31865.0,,,30338.0,1330799.0,124570.0,,,,
56545,22100215460321130,0.743719,13-22,0.0,0.0,1.0,0.002203,0.000000,0.000000,0.000000,...,39673.0,26169.0,,41449.0,21022.0,166590.0,,,,


In [183]:
features_train = features_train.drop('reading_speed_range', axis=1)
features_test = features_test.drop('reading_speed_range', axis=1)

In [187]:
features_train.to_csv('../data/feature_merged/features_train.csv', index=False)
features_test.to_csv('../data/feature_merged/features_test.csv', index=False)

In [133]:
features_train_0_4.to_csv('../data/feature_merged/features_train_0-4.csv')
features_train_5_12.to_csv('../data/feature_merged/features_train_5-12.csv')
features_train_13_22.to_csv('../data/feature_merged/features_train_13-22.csv')

In [104]:
features_train_0_4.columns

Unnamed: 0,session_id,fraction_of_common_click,level_group,elapsed_time_over10000_ratio,elapsed_time_over20000_ratio,elapsed_time_over50000_ratio,elapsed_time_over100000_ratio,elapsed_time_over200000_ratio,elapsed_time_over500000_ratio,level_swapping_ratio,...,tunic.historicalsociety.basement_room_elapsed_time,tunic.historicalsociety.closet_room_elapsed_time,tunic.historicalsociety.collection_room_elapsed_time,tunic.historicalsociety.entry_room_elapsed_time,tunic.historicalsociety.stacks_room_elapsed_time,tunic.kohlcenter.halloffame_room_elapsed_time,navigate_click_event_frequency,person_click_event_frequency,cutscene_click_event_frequency,object_click_event_frequency
0,20090312431273200,0.865385,0-4,0.012121,0.012121,0.0,0.0,0.0,0.0,0.0,...,10610.0,53062.0,25565.0,45630.0,5267.0,26800.0,0.490909,0.133333,0.169697,0.066667
1,20090312433251036,0.861538,0-4,0.014388,0.014388,0.0,0.0,0.0,0.0,0.0,...,5369.0,25243.0,89645.0,46768.5,,31138.5,0.352518,0.129496,0.258993,0.107914
2,20090312455206810,0.737931,0-4,0.020134,0.013423,0.006711,0.006711,0.006711,0.0,0.0,...,10920.0,229269.5,28600.0,35175.5,,34643.5,0.38255,0.120805,0.328859,0.067114
3,20090313091715820,0.755952,0-4,0.011364,0.011364,0.0,0.0,0.0,0.0,0.0,...,10107.0,33451.0,28467.0,31690.0,,45789.0,0.454545,0.125,0.193182,0.102273
4,20090314035813970,0.653061,0-4,0.032468,0.012987,0.006494,0.0,0.0,0.0,0.0,...,22159.0,108491.0,78356.0,88859.0,,69999.0,0.344156,0.155844,0.201299,0.077922
