In [46]:
%reload_ext autoreload
%autoreload 2

In [2]:
import jupytools.syspath
jupytools.syspath.add('..')

In [3]:
import json
import catboost as cb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dataset import load, n_unique, missing_info, existing_info, Subset
from style import make_colors, create_axes_if_needed, tableau, NotebookStyle
from utils import parallel

In [4]:
trn_data, trn_target, trn_specs = load(Subset.Train)

(11341042, 11) (17690, 7) (386, 3) 

In [5]:
trn_data.sample(5)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
6469561,1bb5fbdb,e5c0db0eeafc9d1f,2019-09-10T14:51:02.128Z,"{""description"":""Great job! You did it!"",""ident...",8ffeffb1,55,3110,40621,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4436131,76babcde,880969db92dfe98a,2019-08-02T16:22:15.122Z,"{""coordinates"":{""x"":267,""y"":170,""stage_width"":...",6231f60c,33,4070,64311,Dino Dive,Game,MAGMAPEAK
8821615,f71c4741,565a615a4b5c325a,2019-09-14T02:04:44.750Z,"{""description"":""Which tub is the right size fo...",c795f94d,94,3010,119041,Scrub-A-Dub,Game,MAGMAPEAK
1745398,5e3ea25a,0d2fe8bd363a8a6f,2019-08-01T19:54:42.598Z,"{""coordinates"":{""x"":921,""y"":439,""stage_width"":...",281f09d5,6,4070,14541,Crystals Rule,Game,TREETOPCITY
4857912,0a08139c,1eb28bfc1a0bfb06,2019-09-17T02:52:40.532Z,"{""description"":""This bug is so tiny!"",""identif...",6cb38ea0,111,3010,84681,Bug Measurer (Activity),Activity,TREETOPCITY


In [6]:
trn_target.sample(5)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
16771,e865f94eae139622,f1c21eda,Bird Measurer (Assessment),0,14,0.0,0
16974,1cd93a676ff3f27f,f4fcab26,Cauldron Filler (Assessment),1,3,0.25,1
9566,ab5400a4e11d843c,858ace47,Mushroom Sorter (Assessment),1,0,1.0,3
9196,f5ac8a88ab9b3057,8015e006,Cauldron Filler (Assessment),1,0,1.0,3
1723,f17b9cd839f7f55a,16276a94,Mushroom Sorter (Assessment),1,0,1.0,3


In [7]:
trn_specs.sample(5)

Unnamed: 0,event_id,info,args
44,53c6e11a,The beat round event is triggered when the pla...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
75,08fd73f3,The beat round event is triggered when the pla...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
19,3afde5dd,The system-initiated feedback (Correct) event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
239,e4d32835,This event occurs when the player hovers the m...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
264,dcb55a27,This event is triggered when the player clicks...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."


In [32]:
class EventParser:
    def __init__(self, keys):
        self.keys = keys
    def __call__(self, json_str):
        obj = json.loads(json_str)
        obj = pd.io.json.json_normalize(obj)
        obj = obj.T[0].to_dict()
        row = {k: obj.get(k) for k in self.keys}
        return row

In [33]:
def fillna(df, column, method='mean', value=None):
    if method == 'mean':
        value = df[column].mean()
    elif method == 'mode':
        value = df[column].value_counts().index[0]
    elif method == 'const':
        assert value is not None
    else:
        raise RuntimeError(f'invalid imputing method: {method}')
    df[column] = df[column].fillna(value)
    return df

In [64]:
def extract_event_features(df, num_workers=12):
    parse_row = EventParser([
        'game_time', 'coordinates.y', 'coordinates.stage_height',
        'coordinates.stage_width', 'coordinates.x',
        'description', 'media_type', 'identifier',
        'duration', 'total_duration'])
    df = pd.DataFrame(parallel(parse_row, df.event_data, num_workers))
    df = fillna(df, 'game_time', method='mean')
    df = fillna(df, 'coordinates.x', method='mode')
    df = fillna(df, 'coordinates.y', method='mode')
    df = fillna(df, 'coordinates.stage_height', method='mode')
    df = fillna(df, 'coordinates.stage_width', method='mode')
    df = fillna(df, 'description', method='mode')
    df = fillna(df, 'media_type', method='const', value='none')
    df = fillna(df, 'identifier', method='const', value='none')
    df = fillna(df, 'duration', method='mean')
    df = fillna(df, 'total_duration', method='mean')
    return df

In [65]:
trn_event = extract_event_features(trn_data)

HBox(children=(IntProgress(value=0, max=11341042), HTML(value='')))




In [68]:
trn_event.to_feather('trn_event.feather')

In [72]:
dataset = pd.concat([trn_data.drop(columns='event_data'), trn_event], axis=1)

In [73]:
dataset

Unnamed: 0,event_id,game_session,timestamp,installation_id,event_count,event_code,game_time,title,type,world,game_time.1,coordinates.y,coordinates.stage_height,coordinates.stage_width,coordinates.x,description,media_type,identifier,duration,total_duration
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,200876.905661,207.0,762.0,1015.0,782.0,That's not enough food. Try putting a piece on...,none,none,33962.599642,3843.75213
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,200876.905661,207.0,762.0,1015.0,782.0,That's not enough food. Try putting a piece on...,none,none,33962.599642,3843.75213
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,0.000000,207.0,762.0,1015.0,782.0,That's not enough food. Try putting a piece on...,none,none,33962.599642,3843.75213
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK,53.000000,207.0,762.0,1015.0,782.0,"Let's build a sandcastle! First, fill up your ...",audio,"Dot_LetsSandcastle,Dot_FillMold,Dot_MoldShape",33962.599642,6758.00000
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK,6972.000000,207.0,762.0,1015.0,782.0,"Let's build a sandcastle! First, fill up your ...",audio,"Dot_LetsSandcastle,Dot_FillMold,Dot_MoldShape",6919.000000,3843.75213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11341037,ab3136ba,c09b94eebfdf50a6,2019-10-10T15:11:15.584Z,fffc0583,66,3110,118615,Dino Dive,Game,MAGMAPEAK,118615.000000,207.0,762.0,1015.0,782.0,Tap another dinosaur.,animation,clip_307_touch,1450.000000,3843.75213
11341038,27253bdc,6ad8ab25003ef1b0,2019-10-10T15:12:17.364Z,fffc0583,1,2000,0,Crystal Caves - Level 2,Clip,CRYSTALCAVES,200876.905661,207.0,762.0,1015.0,782.0,That's not enough food. Try putting a piece on...,none,none,33962.599642,3843.75213
11341039,27253bdc,71904dca23b982c6,2019-10-10T15:13:04.713Z,fffc0583,1,2000,0,Crystal Caves - Level 3,Clip,CRYSTALCAVES,200876.905661,207.0,762.0,1015.0,782.0,That's not enough food. Try putting a piece on...,none,none,33962.599642,3843.75213
11341040,27253bdc,2267b18f069e875a,2019-10-10T15:13:54.545Z,fffc0583,1,2000,0,Crystal Caves - Level 3,Clip,CRYSTALCAVES,200876.905661,207.0,762.0,1015.0,782.0,That's not enough food. Try putting a piece on...,none,none,33962.599642,3843.75213


In [94]:
is_assessment = dataset.title.str.contains('\(Assessment\)')

In [113]:
from collections import OrderedDict
sessions = OrderedDict()
for k, g in dataset.query('installation_id == "0006a69f"').groupby('game_session'):
    session_titles = g.title.unique().tolist()
    sessions[k] = session_titles

In [74]:
trn_target

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.000000,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.000000,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.000000,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.500000,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.000000,3
...,...,...,...,...,...,...,...
17685,c996482b11d149dd,ffc90c32,Bird Measurer (Assessment),1,0,1.000000,3
17686,b05a02b52d5c1f4c,ffd2871d,Cauldron Filler (Assessment),1,0,1.000000,3
17687,5448d652309a6324,ffeb0b1b,Cauldron Filler (Assessment),1,2,0.333333,1
17688,a6885ab824fbc32c,ffeb0b1b,Mushroom Sorter (Assessment),0,1,0.000000,0


In [119]:
[tst_data] = load(Subset.Test)

(1156414, 11) 

In [120]:
tst_data

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES
...,...,...,...,...,...,...,...,...,...,...,...
1156409,c74f40cd,46ff9d3ad2be09f2,2019-09-28T21:20:40.918Z,"{""description"":""Alright! This one is the littl...",ffe774cc,39,3121,32030,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156410,6c930e6e,46ff9d3ad2be09f2,2019-09-28T21:20:41.493Z,"{""duration"":20008,""misses"":0,""event_count"":40,...",ffe774cc,40,2030,32584,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156411,a5be6304,46ff9d3ad2be09f2,2019-09-28T21:20:45.499Z,"{""session_duration"":36607,""exit_type"":""game_co...",ffe774cc,41,2010,36607,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1156412,27253bdc,96d7dc31e822cedc,2019-09-28T21:21:05.670Z,"{""event_code"": 2000, ""event_count"": 1}",ffe774cc,1,2000,0,Tree Top City - Level 3,Clip,TREETOPCITY
