In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import IPython

def display(*dfs):
    """Render each positional argument, in the order given, via ``IPython.display.display``."""
    for item in dfs:
        render = IPython.display.display
        render(item)
        
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their value range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (columns are reassigned) and also returned.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with int16/32/64 and float16/32/64 columns downcast.
        Columns of any other dtype are left untouched.

    Notes
    -----
    * Range checks are inclusive, so a column containing exactly the min or max of a
      dtype (e.g. -128 or 127 for int8) still gets that dtype.
    * Float columns are chosen by value RANGE only; converting to float16/float32 can
      round values, because those types carry fewer significant digits than float64.
    * An integer column whose min/max is NaN (e.g. an empty column) is left unchanged,
      since every comparison against NaN is false.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte starting footprint reports 0% instead of NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read the competition CSVs; the %time magic reports how long the train.csv read takes.
%time df = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv', engine='c')
# labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
df_test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')

# Downcast numeric columns (the frames are modified in place and returned);
# with the default verbose=True each call prints the resulting memory footprint.
df = reduce_mem_usage(df)
# labels = reduce_mem_usage(labels)
df_test = reduce_mem_usage(df_test)
specs = reduce_mem_usage(specs)

In [None]:
df_counter_ini = pd.read_feather('/kaggle/input/temp-ds-bowl-2019/df_counter_ini')
df_counter = pd.read_feather('/kaggle/input/temp-ds-bowl-2019/df_counter')
labels_created = pd.read_feather('/kaggle/input/temp-ds-bowl-2019/labels_created_ini')

In [None]:
import gc
gc.collect()

# Labels

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(palette='deep')
sns.set_style("dark")

In [None]:
def autolabel(ratio, ax):
    """Write a percentage label at the baseline of every x-axis category of *ax*.

    Parameters
    ----------
    ratio : mapping or pandas.Series
        Percentage for each category, keyed by the category value
        (e.g. ``value_counts() / n * 100``).
    ax : object with ``get_xticklabels()`` and ``annotate()``
        Typically the matplotlib Axes returned by ``sns.countplot``; its x tick
        labels name the categories in plotting order.

    Notes
    -----
    Tick labels are text, so a label that parses as an integer is looked up as an
    ``int`` key first and as the raw text second. A category that is absent from
    *ratio* is labelled ``0.00%`` instead of raising ``KeyError``.
    """
    for x, tick in enumerate(ax.get_xticklabels()):
        text = tick.get_text()
        try:
            key = int(text)
        except ValueError:
            # Not an integer label: use the text itself as the key.
            key = text
        if key not in ratio:
            # Integer-looking labels may still be stored as strings in *ratio*.
            key = text
        value = ratio[key] if key in ratio else 0.0
        ax.annotate(f'{value:.2f}%',
                    xy=(x, 0),  # x-th category, at the bottom of the bar
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

In [None]:
ax = sns.countplot(x='accuracy_group', data=labels_created);
autolabel(labels_created.accuracy_group.value_counts()/labels_created.shape[0]*100, ax)

# Features: title, type and world

In [None]:
# temp.world = temp.world.astype('category')
# print(temp.world.cat.categories)
# temp.world = temp.world.cat.codes

In [None]:
# Sanity check: are world, title and type constant within each game_session?
# Each lambda compares a session's values with the session's first value; the trailing
# .all(axis=0) collapses the per-session flags into one boolean per column.
df_counter_ini.groupby('game_session').agg({'world': lambda x: (x == x.iloc[0]).all(),
                                            'title': lambda x: (x == x.iloc[0]).all(),
                                            'type': lambda x: (x == x.iloc[0]).all()}).all(axis=0)

In [None]:
temp = df_counter_ini.groupby('game_session', as_index=False).agg({'world': lambda x: x.iloc[0],
                                           # 'title': lambda x: x.iloc[0],
                                            'type': lambda x: x.iloc[0]})

In [None]:
def df_na(df):
    """Print whether ``df`` is completely free of missing values.

    Prints ``There aren't NaNs: True`` only when no cell of ``df`` is NaN, and
    ``There aren't NaNs: False`` as soon as at least one cell is missing.
    Returns ``None``.
    """
    # any(axis=None) reduces over the whole frame; `not` yields a plain Python bool.
    no_nans = not df.isna().any(axis=None)
    print("There aren't NaNs:", no_nans)

In [None]:
labels_created = labels_created.merge(temp, how='left', on='game_session')
#print('There are non nans:',  ~labels_created.world.isna().all())
df_na(labels_created)
labels_created.shape

In [None]:
df_group = df.groupby('game_session', as_index=False).agg({'world': lambda x: x.iloc[0],
                                            'title': lambda x: x.iloc[0],
                                            'type': lambda x: x.iloc[0]})

In [None]:
order = labels_created.world.unique()

plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
ax = sns.countplot(x='world', data=df_group , order=np.append(order, 'NONE'))
ax.set_title('Distribution of INITIAL worlds DATA')
autolabel(df_group.world.value_counts()/df_group.shape[0]*100, ax)

plt.subplot(1,2,2)
ax = sns.countplot(x='world', data=labels_created, order=order);
ax.set_title('Distribution of worlds LABELS');
autolabel(labels_created.world.value_counts()/labels_created.shape[0]*100, ax)

plt.show()


ax = sns.countplot(x='world', data=labels_created, hue='accuracy_group', order=order);
ax.set_title('Distribution of worlds LABELS');

In [None]:
# Assessment titles that occur in the labels, in order of first appearance.
order = labels_created.title.unique()
# Titles present in the session-level data but never labelled; plotted to the right of the labelled ones.
all_titles = df_group.title.unique()
leaved_cat = all_titles[~np.isin(all_titles, order)]
# Short 1-based tick labels, one per labelled title, so the axis stays readable
# and always matches the number of categories actually plotted.
short_labels = [str(i) for i in range(1, len(order) + 1)]

plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
ax = sns.countplot(x='title', data=df_group, order=np.append(order, leaved_cat))
ax.set_title('Distribution of INITIAL titles DATA')
#autolabel(df.title.value_counts()/df.shape[0]*100, ax)

plt.subplot(1,2,2)
ax = sns.countplot(x='title', data=labels_created, order=order);
ax.set_title('Distribution of titles LABELS');
# autolabel looks percentages up by the original tick text, so it must run BEFORE relabelling.
autolabel(labels_created.title.value_counts()/labels_created.shape[0]*100, ax)
ax.set_xticklabels(short_labels)

plt.show()

ax = sns.countplot(x='title', data=labels_created, hue='accuracy_group', order=order);
ax.set_title('Distribution of titles LABELS');
ax.set_xticklabels(short_labels);
# Legend for the numeric tick labels: position i (1-based) corresponds to order[i-1].
print(order)

One can see that for title 2 = 'Chest Sorter (Assessment)' most samples have *accuracy_group* = 0, even though *accuracy_group* = 3 is the majority class overall.

In [None]:
test = labels_created[labels_created.installation_id=='0006a69f']

In [None]:
title_unique = labels_created.title.unique()
# One counting aggregation per title. `y=t` binds the current title as a default argument;
# without it every lambda would read `t` late and count only the last title.
counter_title = {'title': [lambda x,y=t: np.sum(x==y) for t in title_unique]}
# Also count the rows (game sessions) per group.
counter_title.update({'game_session':'count'})
counter_title

In [None]:
labels_title = labels_created.groupby('installation_id').agg(counter_title)
labels_title.head()

In [None]:
# Last timestamp of every game_session: rows are sorted by timestamp first, so iloc[-1]
# inside each group picks the latest one.
# NOTE(review): assumes `timestamp` sorts chronologically (datetimes or ISO-formatted strings) — confirm.
temp = df_counter_ini.sort_values(by='timestamp').groupby('game_session', as_index=False)\
                     .agg({'timestamp': lambda x: x.iloc[-1]})
temp.head()

In [None]:
labels_created = labels_created.merge(temp, how='left', on='game_session')
df_na(labels_created)
labels_created.shape

In [None]:
labels_created.to_feather('labels_created')

In [None]:
counter_title = {'title': [lambda x,y=t: np.sum(x==y) for t in title_unique]}
counter_title['title'].append(lambda x: x.iloc[-1])
counter_title.update({'game_session':'count',
                      'timestamp': lambda x: x.iloc[-1]})
counter_title

In [None]:
labels_title = labels_created.sort_values(by='timestamp')\
                             .groupby('installation_id', as_index=False).agg(counter_title)
labels_title.head()

In [None]:
labels_title.columns

In [None]:
# Replace the aggregated column labels with flat, readable names. The assignment is positional,
# so this list must have one entry per existing column, in the existing column order.
# NOTE(review): the title_MS..title_CF names presumably abbreviate the assessment titles in the
# order of `title_unique`, which depends on the data — confirm against the printed order.
col = ['installation_id', 'title_MS', 'title_CS', 'title_BM', 'title_CB', 'title_CF', 'title_last',
       'n_games', 'timestamp_last']
labels_title.columns = col

labels_title.head()