In [3]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from matplotlib import pyplot


In [4]:
from time import time
from tqdm import tqdm_notebook as tqdm

## Feature engineering taken from 
https://www.kaggle.com/shahules/xgboost-feature-selection-dsbowl

In [5]:
train = pd.read_csv('../data/raw/train.csv')

In [6]:
train_labels = pd.read_csv('../data/raw/train_labels.csv')

In [7]:
test = pd.read_csv('../data/raw/test.csv')

In [8]:
time_features=['month','hour','year','dayofweek','weekofyear']

In [9]:
def extract_time_features(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['month'] = df['timestamp'].dt.month
    df['hour'] = df['timestamp'].dt.hour
    df['year'] = df['timestamp'].dt.year
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    df['weekofyear'] = df['timestamp'].dt.weekofyear
    return df

In [11]:
submission = pd.read_csv('../data/raw/sample_submission.csv')

In [16]:
train = extract_time_features(train)

In [17]:
def prepare_data(df):    
    df=df.drop('timestamp',axis=1)
    #df['timestamp']=pd.to_datetime(df['timestamp'])
    #df['hour_of_day']=df['timestamp'].map(lambda x : int(x.hour))
    

    join_one=pd.get_dummies(df[['event_code','installation_id','game_session']],
                            columns=['event_code']).groupby(['installation_id','game_session'],
                                                            as_index=False,sort=False).agg(sum)

    agg={'event_count':sum,'game_time':['sum','mean'],'event_id':'count'}

    join_two=df.drop(time_features,axis=1).groupby(['installation_id','game_session']
                                                   ,as_index=False,sort=False).agg(agg)
    
    join_two.columns= [' '.join(col).strip() for col in join_two.columns.values]
    

    join_three=df[['installation_id','game_session','type','world','title']].groupby(
                ['installation_id','game_session'],as_index=False,sort=False).first()
    
    join_four=df[time_features+['installation_id','game_session']].groupby(['installation_id',
                'game_session'],as_index=False,sort=False).agg(mode)[time_features].applymap(lambda x: x.mode[0])
    
    join_one=join_one.join(join_four)
    
    join_five=(join_one.join(join_two.drop(['installation_id','game_session'],axis=1))). \
                        join(join_three.drop(['installation_id','game_session'],axis=1))
    
    return join_five



In [18]:

join_train=prepare_data(train)
cols=join_train.columns.to_list()[2:-3]
join_train[cols]=join_train[cols].astype('int16')

In [19]:
cols=join_train.columns[2:-12].to_list()
cols.append('event_id count')
cols.append('installation_id')

In [20]:
final_train=pd.merge(train_labels,join_train,on=['installation_id','game_session'],
                                         how='left').drop(['game_session'],axis=1)

In [21]:
final_train['title'] = final_train['title_x']

In [22]:
df=final_train[['event_count sum','game_time mean','game_time sum','installation_id']]. \
    groupby('installation_id',as_index=False,sort=False).agg('mean')

df_two=final_train[cols].groupby('installation_id',as_index=False,
                                 sort=False).agg('sum').drop('installation_id',axis=1)

df_three=final_train[['accuracy_group','title','type','world','installation_id']]. \
        groupby('installation_id',as_index=False,sort=False). \
        last().drop('installation_id',axis=1)

df_four=join_train[time_features+['installation_id']].groupby('installation_id',as_index=False,sort=False). \
        agg(mode)[time_features].applymap(lambda x : x.mode[0])



final_train=(df.join(df_two)).join(df_three.join(df_four)).drop('installation_id',axis=1)

In [23]:
np.unique(final_train['type'])

array(['Assessment'], dtype=object)

In [24]:
np.unique(join_train['type'])

array(['Activity', 'Assessment', 'Clip', 'Game'], dtype=object)

In [31]:
final_train

Unnamed: 0,event_count sum,game_time mean,game_time sum,event_code_2000,event_code_2010,event_code_2020,event_code_2025,event_code_2030,event_code_2035,event_code_2040,...,event_id count,accuracy_group,title,type,world,month,hour,year,dayofweek,weekofyear
0,1413.000000,7970.200000,16046.000000,5,4,6,3,5,3,0,...,244,3,Bird Measurer (Assessment),Assessment,TREETOPCITY,9,18,2019,4,36
1,9062.000000,-12144.333333,8219.666667,3,0,4,1,2,1,0,...,340,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,10,14,2019,4,41
2,820.000000,20073.000000,16502.000000,1,1,2,0,2,0,0,...,40,3,Bird Measurer (Assessment),Assessment,TREETOPCITY,8,5,2019,1,32
3,927.000000,18240.200000,-6346.800000,5,3,5,2,3,2,0,...,194,3,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,9,15,2019,5,37
4,703.000000,13623.000000,-20216.000000,1,0,1,0,0,0,0,...,37,0,Bird Measurer (Assessment),Assessment,TREETOPCITY,9,18,2019,2,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,6448.000000,7627.000000,3260.750000,4,0,6,0,2,0,0,...,395,0,Chest Sorter (Assessment),Assessment,CRYSTALCAVES,9,16,2019,3,39
3610,933.555556,7351.444444,-7905.111111,9,7,10,3,8,3,0,...,358,3,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,9,19,2019,1,39
3611,905.000000,16591.666667,-2625.666667,6,5,8,2,7,2,0,...,242,3,Bird Measurer (Assessment),Assessment,TREETOPCITY,9,21,2019,3,38
3612,820.000000,27651.000000,-8034.000000,1,1,2,0,2,0,0,...,40,3,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,8,18,2019,6,32


In [None]:
final_train.to_csv('../data/processed/processed_train.csv')

## Try using improvement as a feature

In [26]:
df = train
join_one=pd.get_dummies(df[['event_code','installation_id','game_session']],
                        columns=['event_code']).groupby(['installation_id','game_session'],
                                                        as_index=False,sort=False).agg(sum)

In [29]:
df = df.sort_values(['installation_id', 'timestamp'])

In [34]:
installation_ids = np.unique(df['installation_id'])

In [38]:
tid = installation_ids[0]

In [41]:
temp_df = df[df['installation_id'] == tid]

In [46]:
next_game_time = temp_df['game_time'].shift(-1)

In [48]:
temp_df['next_game_time'] = next_game_time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [51]:
temp_df['improvement'] = temp_df['next_game_time'] - temp_df['game_time']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [52]:
temp_df['improvement']

0           0.0
1           0.0
2          53.0
3        6919.0
4        3019.0
         ...   
1352        0.0
1353     1217.0
1354   -86619.0
1355        0.0
1356        NaN
Name: improvement, Length: 1357, dtype: float64

In [53]:
new_dfs = []

In [None]:
for installation_id in installation_ids:
    temp_df = df[df['installation_id'] == installation_id]
    next_game_time = temp_df['game_time'].shift(-1)
    temp_df['next_game_time'] = next_game_time
    temp_df['improvement'] = temp_df['next_game_time'] - temp_df['game_time']
    new_dfs.append(temp_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
new_df = pd.concat(new_dfs)

In [None]:
new_df