In [1]:
import os
import sys
import json
from tqdm import tqdm_notebook as tqdm
from pprint import pprint

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from matplotlib_venn import venn2
import seaborn as sns

In [2]:
# Show up to 200 rows / 200 columns when a DataFrame is rendered.
# The fully-qualified "display.*" keys are used on purpose: the short forms
# ('max_rows', 'max_columns') are matched as patterns and become ambiguous
# (OptionError) on pandas versions that also define styler.render.max_rows /
# styler.render.max_columns.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

## data load

In [3]:
# Directory holding the raw input files (CSV / pickle) that are read below.
BASE_DIR = '../mnt/inputs/origin'
# Directories holding the precomputed feature pickles for train and test.
TRN_BASE_DIR = '../mnt/inputs/features/train'
TST_BASE_DIR = '../mnt/inputs/features/test'
# Disabled alternative: point BASE_DIR at the test feature directory instead.
# BASE_DIR = '../mnt/inputs/features/test'

In [4]:
# List the raw-input directory and the train-feature directory.
# IPython substitutes $NAME with the value of the Python variable NAME before
# handing the command to the shell.
!ls $BASE_DIR
!ls $TRN_BASE_DIR

data-science-bowl-2019.zip  test.csv	  train2.csv
sample_submission.csv	    train.csv	  train_labels.csv
specs.csv		    train.pkl.gz
specs_w_event_id.csv	    train1.csv
EncodingTitles.pkl  PrevAssessAccByTitle.pkl  befTargetCntFeatures.pkl
EventCount.pkl	    PrevAssessResult.pkl      immediatelyBeforeFeatures.pkl
EventCount2.pkl     SessionTime2.pkl	      targetFeatures.pkl
GameDurMiss.pkl     TypeEventCounts.pkl       worldGameNumeriacalFeatures.pkl
KernelBasics3.pkl   UserActivityCount.pkl
PrevAssessAcc.pkl   Worldcount.pkl


In [5]:
# Event specifications (CSV) and the raw training events (gzip-compressed pickle),
# both read from BASE_DIR.
specs_df = pd.read_csv(f'{BASE_DIR}/specs_w_event_id.csv')
trn_df = pd.read_pickle(f'{BASE_DIR}/train.pkl.gz')

# Quick sanity check: dimensions first, then the first five rows.
display(trn_df.shape)
display(trn_df.head(n=5))

(11341042, 11)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [6]:
# Training labels: one row per labelled game_session (see the preview below).
trn_labels_df = pd.read_csv(filepath_or_buffer=f'{BASE_DIR}/train_labels.csv')

# Dimensions first, then the first five rows.
display(trn_labels_df.shape)
display(trn_labels_df.head(n=5))

(17690, 7)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


In [7]:
# Raw test events, read from the same directory as the training data.
tst_df = pd.read_csv(filepath_or_buffer=f'{BASE_DIR}/test.csv')
# Preview the first five rows (rendered as the cell output).
tst_df.head(n=5)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [8]:
# Sample submission file: installation_id plus an accuracy_group column.
sub_df = pd.read_csv(filepath_or_buffer=f'{BASE_DIR}/sample_submission.csv')
# Preview the first five rows (rendered as the cell output).
sub_df.head(n=5)

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3


## Take a look at train (trn) vs. test (tst)

In [9]:
ls '../mnt/inputs/features/train/'

EncodingTitles.pkl  PrevAssessAccByTitle.pkl  befTargetCntFeatures.pkl
EventCount.pkl      PrevAssessResult.pkl      immediatelyBeforeFeatures.pkl
EventCount2.pkl     SessionTime2.pkl          targetFeatures.pkl
GameDurMiss.pkl     TypeEventCounts.pkl       worldGameNumeriacalFeatures.pkl
KernelBasics3.pkl   UserActivityCount.pkl
PrevAssessAcc.pkl   Worldcount.pkl


In [16]:
# Load the KernelBasics3 feature tables for train and test.
# Paths are built from TRN_BASE_DIR / TST_BASE_DIR (defined in the config cell)
# instead of repeating the directory literals, so moving the feature folders
# only requires changing the constants. The resulting paths are unchanged.
# NOTE: read_pickle can execute arbitrary code on load; only use trusted files.
trn_kb_df = pd.read_pickle(f'{TRN_BASE_DIR}/KernelBasics3.pkl')
tst_kb_df = pd.read_pickle(f'{TST_BASE_DIR}/KernelBasics3.pkl')

In [17]:
# Load the befTargetCntFeatures tables for train and test.
# Paths are built from TRN_BASE_DIR / TST_BASE_DIR (defined in the config cell)
# instead of repeating the directory literals; the resulting paths are unchanged.
# NOTE: read_pickle can execute arbitrary code on load; only use trusted files.
trn_bef_target_cnt_df = pd.read_pickle(f'{TRN_BASE_DIR}/befTargetCntFeatures.pkl')
tst_bef_target_cnt_df = pd.read_pickle(f'{TST_BASE_DIR}/befTargetCntFeatures.pkl')

#### Use only the last truncated rows

In [18]:
# Attach the befTargetCnt feature columns to the KernelBasics3 tables.
# Left join on (installation_id, game_session): rows of the KernelBasics3 table
# without a match are kept and get NaN in the added columns.
# NOTE: this cell overwrites trn_kb_df / tst_kb_df, so re-running it without
# reloading them merges the same columns a second time.
trn_kb_df = pd.merge(
    trn_kb_df,
    trn_bef_target_cnt_df,
    how='left',
    on=['installation_id', 'game_session'],
)
tst_kb_df = pd.merge(
    tst_kb_df,
    tst_bef_target_cnt_df,
    how='left',
    on=['installation_id', 'game_session'],
)

In [22]:
# Keep a single row per game_session in the train table: order the rows by
# (game_session, f019_bef_target_cnt) ascending, then retain the last row of
# each game_session, i.e. the one sorted highest on f019_bef_target_cnt.
# NOTE(review): NaN values sort last by default, so a row with a missing
# f019_bef_target_cnt would be the one kept - confirm none are expected.
trn_kb_df = trn_kb_df.sort_values(by=['game_session', 'f019_bef_target_cnt'])
trn_kb_df = trn_kb_df.drop_duplicates(subset=['game_session'], keep='last')

#### util funcs

In [23]:
def plot_trn_vs_tst(trn_col, tst_col):
    fig, axs = plt.subplots(1, 2, figsize=(5, 10))
    
    ax = axs[0]
    ax.hist(, alpha=0.5)