In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2

In [2]:
import jupytools.syspath
# Register the parent directory with jupytools' syspath helper — presumably so
# the project modules imported in the next cell (basedir, extract_features,
# utils) resolve from the repository root; confirm against jupytools.
jupytools.syspath.add('..')

In [3]:
import csv
import os
import catboost as cb
import feather
import pandas as pd
from multiprocessing import cpu_count
from tqdm.auto import tqdm
from basedir import TEST
from extract_features import prepare, prepare_groups, baseline_features, extend_with_event_data
from utils import chunks, parallel

In [4]:
# Load the raw events, parse the timestamp column and order rows chronologically.
# Built as one chain so re-running the cell always starts from the CSV on disk.
dataset = (
    pd.read_csv(TEST)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values(by='timestamp')
)

In [5]:
# Number of non-null `game_session` values (i.e. event rows) per installation.
group_sizes = dataset.groupby('installation_id', as_index=False)['game_session'].count()

# Installations that contributed exactly one row to the dataset.
single_session = group_sizes.loc[group_sizes['game_session'] == 1, 'installation_id']

# Display the events that belong to those installations.
dataset.loc[dataset['installation_id'].isin(single_session)]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
1092631,3bfd1a65,9faa2ed7e24fb868,2019-07-29 17:38:16.338000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",f162b7a4,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
821135,f56e0afc,5870e21c1f62da24,2019-07-30 17:48:39.638000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",b0efc6f4,1,2000,0,Bird Measurer (Assessment),Assessment,TREETOPCITY
448327,7ad3efc6,497db6c34c3c70f2,2019-08-02 17:42:37.407000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",69164a28,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
718619,5b49460a,bb96ca5af4753305,2019-08-11 16:53:56.967000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",9885ddd8,1,2000,0,Chest Sorter (Assessment),Assessment,CRYSTALCAVES
172313,5b49460a,da2e207ad231eec8,2019-08-17 20:39:09.582000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",27e272e5,1,2000,0,Chest Sorter (Assessment),Assessment,CRYSTALCAVES
342565,90d848e0,e0f0cac705f51cde,2019-08-21 20:04:53.473000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",4fc92163,1,2000,0,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
355342,7ad3efc6,1f9ff2be9e1f18f8,2019-08-23 18:27:20.194000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",51bc6b81,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
456038,f56e0afc,2d49cb098633bcc4,2019-08-23 18:34:42.875000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",6d15f7d6,1,2000,0,Bird Measurer (Assessment),Assessment,TREETOPCITY
959445,90d848e0,cf6e301cee976969,2019-08-29 20:55:43.107000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",d1e3bd8c,1,2000,0,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
894122,90d848e0,2b38a684956b61f2,2019-09-09 20:59:24.506000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",c2cfee57,1,2000,0,Cauldron Filler (Assessment),Assessment,MAGMAPEAK


In [6]:
def save_installation_groups(dataset, output_dir):
    """Write every installation's rows to its own feather file.

    The frame is grouped by ``installation_id`` and each group is stored as
    ``<output_dir>/<installation_id>``.

    Args:
        dataset: DataFrame that contains an ``installation_id`` column.
        output_dir: Directory receiving the files; created when missing.

    Returns:
        List of paths written by *this call*, in group iteration order.
        Files that were already present in ``output_dir`` (for example,
        left over from an earlier run on different data) are not included.
    """
    os.makedirs(output_dir, exist_ok=True)
    groups = dataset.groupby('installation_id')
    filenames = []
    with tqdm(total=groups.ngroups) as bar:
        for key, group in groups:
            # str() keeps path building and the progress label working even
            # if the identifier column is not of string dtype.
            name = str(key)
            bar.set_description(name)
            filename = os.path.join(output_dir, name)
            # The group keeps the row labels of the parent frame; reset them
            # to a default index before serialising to feather.
            group.reset_index(drop=True).to_feather(filename)
            filenames.append(filename)
            bar.update(1)
    return filenames

In [7]:
def prepare_on_disk(items, features, buffer=16, dedup=True,
                    num_workers=cpu_count(),
                    output_dir='/tmp/prepared'):
    """Compute features batch by batch, writing each batch to a feather file.

    ``items`` is split with ``chunks(items, buffer)``. Every batch of files is
    read with ``feather.read_dataframe``, passed through
    ``extend_with_event_data`` and then handed to ``prepare_groups`` together
    with ``features``. The resulting frame is saved under a random UUID name
    inside ``output_dir``.

    Args:
        items: Paths of feather files to process.
        features: Feature specification forwarded to ``prepare_groups``.
        buffer: Chunk size passed to ``chunks``.
        dedup: When true, keep only the last row for each
            ``(game_session, installation_id)`` pair and reset the index.
        num_workers: Worker count forwarded to ``parallel`` and
            ``prepare_groups``. The default is evaluated once, when the
            function is defined.
        output_dir: Destination directory; created when missing.

    Returns:
        List of paths of the written feather files, one per batch.
    """
    import uuid

    os.makedirs(output_dir, exist_ok=True)
    batches = list(chunks(items, buffer))
    written = []
    for batch in tqdm(batches, desc='Processing files'):
        frames = parallel(feather.read_dataframe, batch, num_workers=num_workers)
        frames = parallel(extend_with_event_data, frames, num_workers=num_workers)
        batch_features = prepare_groups(frames, features, num_workers=num_workers)
        if dedup:
            batch_features = (
                batch_features
                .drop_duplicates(subset=['game_session', 'installation_id'], keep='last')
                .reset_index(drop=True)
            )
        # A random name avoids collisions between batches.
        target = os.path.join(output_dir, str(uuid.uuid4()))
        batch_features.to_feather(target)
        written.append(target)
    return written

In [8]:
# Split the dataset into one feather file per installation_id.
grouped_filenames = save_installation_groups(
    dataset=dataset,
    output_dir='/tmp/installations',
)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [9]:
# Compute the baseline features for every installation file, batch by batch.
prepared_filenames = prepare_on_disk(
    items=grouped_filenames,
    features=baseline_features,
)

HBox(children=(IntProgress(value=0, description='Processing files', max=63, style=ProgressStyle(description_wi…




In [None]:
def run_model(model_file, data_files):
    booster = cb.CatBoost()
    booster.load_model(model_file)
    for filename in data_files:
        