In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter

# local imports
from prepare import *

In [None]:
raw_train = pd.read_csv('data/train.csv')
raw_train_labels = pd.read_csv('data/train_labels.csv')
raw_test = pd.read_csv('data/test.csv')
specs = pd.read_csv('data/specs.csv')
sample = pd.read_csv('data/sample_submission.csv')

## Shorten dataframes to speed up testing

In [None]:
%%script false
## Comment out above line to run this cell

frames = [raw_train, raw_train_labels, raw_test]#, specs, sample]

for i, df in enumerate(frames):
    frames[i] = frames[i][0:round(0.1*frames[i].shape[0])]
    print(frames[i].shape)

In [None]:
train = raw_train.copy()
train_labels = raw_train_labels.copy()
test = raw_test.copy()

## Run modified remove_dead_weight function

In [None]:
train = remove_dead_weight(train, train_labels)
test = remove_dead_weight(test, train_labels, test_set=True)
assert len(set(sample.installation_id).difference(set(test.installation_id))) == 0

In [None]:
train.shape

## Process time

In [None]:
# convert timestamp to correct datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format=True)
test['timestamp'] = pd.to_datetime(test['timestamp'], infer_datetime_format=True)
assert len(set(sample.installation_id).difference(set(test.installation_id))) == 0

In [None]:
# have the ability to break datetime into year, month, day, etc with add_datepart in library

## Encode data

https://towardsdatascience.com/understanding-feature-engineering-part-2-categorical-data-f54324193e63

In [None]:
train, test = encode_col(train, test, 'title')
train, test = encode_col(train, test, 'world')
#assert 
len(set(sample.installation_id).difference(set(test.installation_id)))

In [57]:
train.shape

(7696400, 59)

## Bin event codes

In [58]:
#TBD

## Process Data

https://www.kaggle.com/ragnar123/truncated-train-ensemble

In [61]:
#compiled_train_data = process_data(train)
#compiled_test_data = process_data(test, test_set=True)

In [62]:
len(compiled_train_data)

17690

In [65]:
import joblib
#joblib.dump(compiled_train_data, 'compiled_train_data.pkl')
#joblib.dump(compiled_test_data, 'compiled_test_data.pkl')


In [66]:
compiled_train_data = joblib.load('compiled_train_data.pkl')
compiled_test_data = joblib.load('compiled_test_data.pkl')

In [67]:
compiled_train = pd.concat(compiled_train_data, axis=1).T

In [68]:
compiled_train.drop('installation_id_slice', axis=1, inplace=True)

In [69]:
compiled_test = pd.concat(compiled_test_data, axis=1).T

In [70]:
print(set(compiled_train.columns).difference(set(compiled_test.columns)))
compiled_train.shape, compiled_test.shape

set()


((17690, 69), (3614, 69))

## Add accuracy group from train_labels dataframe

In [71]:
# just verifying
train_labels.columns

Index(['game_session', 'installation_id', 'title', 'num_correct',
       'num_incorrect', 'accuracy', 'accuracy_group'],
      dtype='object')

In [72]:
compiled_train = pd.merge(compiled_train, train_labels[['installation_id','game_session','accuracy_group']], \
                  on=['installation_id','game_session'])
compiled_train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,...,avg_event_count,avg_review_incorrect_feedback,avg_review_correct_feedback,total_rounds_beat,total_movies_skipped,total_movies_watched,total_elsewhere_clicks,total_help_button_clicks,total_play_again,accuracy_group
0,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:01.344000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,56.2855,313.143,4015.89,18,1,2,94,4,0,3
1,f56e0afc,77b8ee947eb84b4e,2019-08-06 05:35:19.167000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Bird Measurer (Assessment),Assessment,...,67.083,1771.27,3103.88,23,1,2,156,4,0,0
2,3bfd1a65,6bdf9623adc94d89,2019-08-06 05:37:50.020000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,65.4517,1871.73,3103.88,23,1,2,160,4,0,3
3,3bfd1a65,9501794defd84e4d,2019-08-06 20:34:53.812000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,62.609,2427.52,2544.59,47,2,5,348,4,1,2
4,f56e0afc,a9ef3ecb3d1acc6a,2019-08-06 20:49:59.095000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Bird Measurer (Assessment),Assessment,...,62.1572,2467.7,2309.92,58,2,5,387,4,1,3


## Convert as many columns as possible to numeric

In [73]:
def numerize(df):
    for i, column in enumerate(df.columns):
        col = df.columns[i]
        df[col] = pd.to_numeric(df[col], errors='ignore')
    return df

In [74]:
# for i, column in enumerate(compiled_train.columns):
#     col = compiled_train.columns[i]
#     compiled_train[col] = pd.to_numeric(compiled_train[col], errors='ignore')

# compiled_train.info()

compiled_train = numerize(compiled_train)
compiled_test = numerize(compiled_test)


In [75]:
compiled_train.shape, compiled_test.shape

((17690, 70), (3614, 69))

## Train RF model

In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [77]:
scaler = StandardScaler()

# get numeric fields only
#X = compiled_train._get_numeric_data().drop('accuracy_group', axis=1).iloc[:,:-3]

X = compiled_train.drop('accuracy_group', axis=1)._get_numeric_data()
print(X.shape)
#print(compiled_test.shape)
X_scaled = scaler.fit_transform(X)
y = compiled_train.accuracy_group

compiled_test_scaled = scaler.transform(compiled_test._get_numeric_data())
print(compiled_test_scaled.shape)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.2, random_state=42)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

(17690, 62)
(3614, 62)




0.4861503674392312

In [78]:
#test_prediction = rf.predict(compiled_test_scaled)
#list(zip(compiled_test.installation_id, test_prediction))

In [79]:
# To create a submission:
# submission = pd.DataFrame()
# submission['installation_id'] = compiled_test.installation_id
# submission['accuracy_group'] = test_prediction
# submission.head()

In [80]:
#submission.accuracy_group.value_counts()

In [81]:
#submission.to_csv('submission_1_1_2020.csv',index=False)

## Balance classes

In [82]:
from sklearn.utils import resample

# Separate classes
df_0 = compiled_train[compiled_train.accuracy_group==0]
df_1 = compiled_train[compiled_train.accuracy_group==1]
df_2 = compiled_train[compiled_train.accuracy_group==2]
df_3 = compiled_train[compiled_train.accuracy_group==3]

# Downsample 3, 0, 1 to 2's level - n=419
resampled_dfs = [df_2]

for i in [df_0,df_1,df_3]:  
    downsampled_df = resample(i, 
                              replace=False, # sample without replacement
                              n_samples = min(compiled_train.accuracy_group.value_counts()), #  to match minority
                              random_state = 42)  # reproducibility
    resampled_dfs.append(downsampled_df)

balanced_compiled_train = pd.concat(resampled_dfs, axis=0)

balanced_compiled_train.accuracy_group.value_counts()

3    2205
2    2205
1    2205
0    2205
Name: accuracy_group, dtype: int64

In [83]:
scaler = StandardScaler()

X = balanced_compiled_train._get_numeric_data().drop('accuracy_group', axis=1)
X_scaled = scaler.fit_transform(X)
y = balanced_compiled_train.accuracy_group

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.2, random_state=42)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)



0.31462585034013607

In [84]:
test_prediction = rf.predict(compiled_test_scaled)
list(zip(compiled_test.installation_id, test_prediction))

[('0006a69f', 0),
 ('0006c192', 0),
 ('00129856', 1),
 ('001d0ed0', 0),
 ('00225f67', 0),
 ('00279ac5', 1),
 ('002db7e3', 0),
 ('003372b0', 2),
 ('004c2091', 0),
 ('00634433', 1),
 ('00667b88', 0),
 ('00691033', 0),
 ('00a0dbeb', 0),
 ('00a53963', 0),
 ('00ad158e', 1),
 ('00b9d8e6', 0),
 ('00cef781', 1),
 ('00e17272', 0),
 ('00e536bf', 1),
 ('00fa8681', 1),
 ('00fc65b6', 1),
 ('010bc1d5', 0),
 ('01120f12', 0),
 ('0153c957', 0),
 ('0155dd86', 0),
 ('015776b4', 0),
 ('01582211', 0),
 ('0160e7c5', 0),
 ('01825124', 1),
 ('01bdd720', 3),
 ('01cc53f3', 2),
 ('01d5e1f5', 2),
 ('01f71702', 2),
 ('01faa4bf', 3),
 ('022183ab', 3),
 ('0235fe9a', 2),
 ('023c4bcb', 1),
 ('02490b06', 2),
 ('026035e9', 0),
 ('026e3733', 1),
 ('027b3c4c', 3),
 ('0281f6bf', 0),
 ('02875dc2', 1),
 ('0293ae74', 1),
 ('02aa515f', 0),
 ('02b1fab4', 2),
 ('02c975bc', 0),
 ('02cdd48e', 2),
 ('02cdfffd', 2),
 ('02e76f43', 0),
 ('031af3b8', 0),
 ('0328fc8f', 0),
 ('03401e19', 3),
 ('03560e11', 1),
 ('03957682', 0),
 ('039e4ac

In [85]:
submission = pd.DataFrame()
submission['installation_id'] = compiled_test.installation_id
submission['accuracy_group'] = test_prediction
submission.head()

Unnamed: 0,installation_id,accuracy_group
3783,0006a69f,0
2220,0006c192,0
826,00129856,1
1019,001d0ed0,0
928,00225f67,0


In [86]:
submission.accuracy_group.value_counts()

0    1500
1    1130
2     585
3     399
Name: accuracy_group, dtype: int64

In [87]:
submission.to_csv('submission_01022020.csv',index=False)

## determine why installation_ids in sample submission aren't found

In [35]:
sample.installation_id.nunique(), test.installation_id.nunique()

(1000, 3614)

In [36]:
compiled_test.installation_id.nunique()

3614

In [93]:
len(set(sample.installation_id).difference(set(raw_test.installation_id)))

0

In [40]:
sample.installation_id[0]

'00abaee7'

In [46]:
set(compiled_test.installation_id)

{'68068902',
 '3d0b67e3',
 '59d9c2e8',
 'db5af314',
 'f4ca9e73',
 '2bce37dc',
 'd628cbf9',
 'f6ddbc4c',
 'f88f8adf',
 '7f894163',
 '2c183ed4',
 '6dac8a6c',
 'e8d98c5d',
 'fa5bd0d1',
 '62ddb449',
 '78dbdfef',
 '10189f14',
 'b1c3c6a9',
 '380dfa02',
 'f1b19758',
 '3009466e',
 'bf1b624c',
 'abca6f9e',
 '35fb0787',
 '9a970c94',
 '9f0b2bdf',
 'a5c1d52c',
 '2b2ba346',
 'f80821d2',
 '124beffd',
 '03560e11',
 'e2847d33',
 '8b2e60f3',
 '978fc68f',
 '2df817bf',
 'b4373bbc',
 '7e9b82f6',
 'a2691355',
 '6f7ffb2d',
 '2f44d796',
 '6edc7710',
 'e98977de',
 'a680eb5e',
 '494eec4b',
 '94d856a4',
 'cda83994',
 'b3c7e06b',
 'ed5b26f8',
 '8f2b8c82',
 '5f7418c9',
 '28351d4e',
 '4483cbc3',
 '5fa4925b',
 '16e07456',
 'b3bbe930',
 '5fa1276b',
 '6570dabb',
 '6b984505',
 '2d7762bc',
 'cf0df157',
 '6fbc5ee2',
 '6222b811',
 '9abb6144',
 '6cc7ae2e',
 '06435d15',
 'e01e9843',
 '66720918',
 '026035e9',
 '2c4f896e',
 '69392d5c',
 '35e5b972',
 '8f4c404c',
 '15b84ab7',
 '503c1a79',
 '38664000',
 '5cf8375a',
 'a07fb570',

In [None]:
raw