In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
import IPython

def display(*dfs):
    """Render every positional argument with IPython's rich display, in order.

    Defining this at notebook level shadows the ``display`` name that IPython
    already provides; the only difference visible here is that each object is
    forwarded one at a time to ``IPython.display.display``.
    """
    for frame in dfs:
        IPython.display.display(frame)

# display(df1, df2)

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left alone.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    * Range checks are inclusive, so a column spanning exactly -128..127 is
      stored as int8 rather than int16.
    * Float columns are chosen by range only. float16 carries roughly three
      significant decimal digits, so values may be rounded by the downcast.
    * A float column whose min/max is NaN (empty or all-NaN) falls through to
      float64; an integer column with no values is left unchanged.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First (narrowest) integer type whose limits contain the column wins.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Comparisons against NaN are False, so NaN bounds end up as float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a 0/0 when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read the training event log (wall time reported by the %time line magic)
# and the accompanying train_labels table.
%time df = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv', engine='c')
labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')

In [None]:
# Downcast numeric columns of both frames; each call prints its own memory report.
df, labels = reduce_mem_usage(df), reduce_mem_usage(labels)

In [None]:
# (rows, columns) of the event log followed by (rows, columns) of the labels.
tuple(frame.shape for frame in (df, labels))

In [None]:
# Convert the timestamp column with pd.to_datetime.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Date / hour / weekday features are currently disabled:
# df['date'] = df['timestamp'].dt.date
# df['hour'] = df['timestamp'].dt.hour
# df['weekday_name'] = df['timestamp'].dt.weekday_name  # NOTE(review): newer pandas spells this .dt.day_name()
# # Same for test
# # test['timestamp'] = pd.to_datetime(test['timestamp'])
# # test['date'] = test['timestamp'].dt.date
# # test['hour'] = test['timestamp'].dt.hour
# # test['weekday_name'] = test['timestamp'].dt.weekday_name

In [None]:
import gc
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

In [None]:
# Read the test event log (not passed through reduce_mem_usage in this cell).
df_test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')

In [None]:
# (rows, columns) of the test event log.
df_test.shape

In [None]:
# Read the sample submission file and preview its first rows.
sample_subm = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
sample_subm.head()

In [None]:
# df_train = df.copy()
# df = df_test

In [None]:
# df = df_train

# Create labels

In [None]:
def create_label_all(df, labels=None):
    """Rebuild per-session attempt counts and accuracy from the raw event log.

    An event counts as an attempt when its ``event_data`` text contains
    ``correct``; the attempt is successful when the text contains
    ``"correct":true``. Only these attempts are kept:

    * ``event_code == 4100`` for titles that do not contain ``Bird Measurer``
    * ``event_code == 4110`` for titles that contain ``Bird Measurer``

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with at least the columns ``event_data``, ``event_code``,
        ``title``, ``game_session`` and ``installation_id``.
    labels : pandas.DataFrame, optional
        Reference labels with a ``game_session`` column. When given, they are
        left-joined onto the computed per-session table for comparison.

    Returns
    -------
    tuple
        ``(df_counter_ini, df_counter, df_check)`` where

        * ``df_counter_ini`` holds the selected attempt rows (one per event),
          with the original index of ``df`` in an ``index`` column and a
          ``correct`` column stored as float (1.0 / 0.0);
        * ``df_counter`` is indexed by ``game_session`` with ``correct`` (sum),
          ``event_code`` (number of attempts), ``installation_id``, ``title``
          (first value in the session) and ``accuracy`` (= correct / attempts);
        * ``df_check`` is ``df_counter`` left-merged with ``labels`` on
          ``game_session``, or ``None`` when ``labels`` is not supplied.

    Two progress lines with row counts and shapes are printed along the way.
    """
    df_ids = df.reset_index()

    # Plain substring tests; a missing event_data value counts as "no match".
    has_correct = df_ids.event_data.str.contains(
        'correct', regex=False, na=False).to_numpy(dtype=bool)
    answered_true = df_ids.event_data.str.contains(
        '"correct":true', regex=False, na=False).to_numpy(dtype=bool)

    # 1.0 / 0.0 where the event reports an outcome, NaN everywhere else.
    df_ids['correct'] = np.where(has_correct, answered_true.astype(float), np.nan)
    print('Number of samples with "correct" in event_data:',
          (int(has_correct.sum()), df_ids.shape[1]))
    print('Number of unique ids in train set:', df_ids.installation_id.unique().shape,
          'Shape of data:', df_ids.shape)

    df_counter = df_ids[has_correct]
    # Bird Measurer reports its attempts under event code 4110, everything else under 4100.
    df_counter_wt_BM = df_counter[(df_counter.event_code == 4100)
                                  & (df_counter.title.str.find('Bird Measurer') == -1)]
    df_counter_BM = df_counter[(df_counter.event_code == 4110)
                               & (df_counter.title.str.find('Bird Measurer') != -1)]

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    df_counter_ini = pd.concat([df_counter_wt_BM, df_counter_BM])
    df_counter = df_counter_ini.groupby('game_session').agg({
        'correct': 'sum',
        'event_code': 'count',
        'installation_id': lambda x: x.iloc[0],
        'title': lambda x: x.iloc[0],
    })
    df_counter['accuracy'] = df_counter.correct / df_counter.event_code

    # Stays None without reference labels so the return below never hits an unbound name.
    df_check = None
    if labels is not None:
        df_check = df_counter.merge(labels, on='game_session', how='left')

    return df_counter_ini, df_counter, df_check

# df_counter_ini, df_counter, df_check = create_label_all(df, labels)

In [None]:
# Attempt rows, per-session summary, and the summary joined with the provided labels.
df_counter_ini, df_counter, df_check = create_label_all(df, labels=labels)


In [None]:
# Store the per-session count of correct attempts as integers in both tables.
df_counter['correct'] = df_counter['correct'].astype(int)
df_check['correct'] = df_check['correct'].astype(int)

In [None]:
# Persist both tables as feather files; the index is moved into columns first.
for frame, path in ((df_counter_ini, 'df_counter_ini'), (df_counter, 'df_counter')):
    frame.reset_index().to_feather(path)

In [None]:
# import json 
# extracted_event_data = pd.io.json.json_normalize(df_ids.event_data.apply(json.loads))
# print(extracted_event_data.shape)

In [None]:
# Number of sessions in the rebuilt table minus the number of rows in train_labels.
('New lables:', len(df_counter) - len(labels))

In [None]:
# Sessions that have a reference accuracy differing from ours by more than 0.001.
accuracy_gap = (df_check.accuracy_x - df_check.accuracy_y).abs()
has_reference = df_check.accuracy_y.notna()
df_error = df_check[(accuracy_gap > 0.001) & has_reference]
print('Number of error calculated labels:', df_error.shape[0])
#df_error

In [None]:
# All rebuilt sessions belonging to one example installation.
df_counter.loc[df_counter.installation_id.eq('0006a69f')]

# Data study

## Check ids which never took assessments

In [None]:
# df holds every installation in the training log, df_counter only those with
# at least one counted assessment attempt, labels those in train_labels.
# The first two captions were previously attached to the wrong frames.
print('# unique ids in train:', df.installation_id.unique().shape)
print('# unique ids with assessment:', df_counter.installation_id.unique().shape)
print('# unique ids in train_labels:', labels.installation_id.unique().shape)

## Labels

In [None]:
# Rows with more than one correct answer: reference labels first, rebuilt table second.
labels_multi = labels.loc[labels.num_correct > 1]
counter_multi = df_counter.loc[df_counter.correct > 1]
display(labels_multi, counter_multi)

Why does our df_counter contain sessions with more than one correct answer?

In [None]:
# Encode each attempt as the text '1' or '0'; aggregating these strings with
# 'sum' is expected to join them into one outcome string per session.
attempt_flags = df_counter_ini.correct.astype(int).astype(str)
df_counter_ini['correct_str'] = attempt_flags
temp = df_counter_ini.groupby('game_session').agg({'correct_str': 'sum'})

In [None]:
# Attach the per-session outcome string to the comparison table.
df_check2 = pd.merge(df_check, temp, how='left', on='game_session')
df_check2

In [None]:
import re

In [None]:
# Sessions with at most one correct attempt that have no accuracy_group after the label join.
at_most_one_correct = df_check2.correct <= 1
missing_group = df_check2.accuracy_group.isna()
temp = df_check2[at_most_one_correct & missing_group]

# A single 0/1 digit followed only by zeros.
template = r'^[01]0*$'

temp

In [None]:
# Reverse each outcome string so the last attempt comes first. The reversed
# value is always derived from df_check2, so re-running this cell does not
# flip the strings back, and no column is assigned on a filtered slice.
temp = temp.assign(correct_str=df_check2.loc[temp.index, 'correct_str'].str[::-1])
temp[~temp.correct_str.str.contains(template, regex=True)]
#temp

In [None]:
# Only a cell's last expression is rendered, so show both results explicitly:
# the sessions whose outcome string matches the template, then the first 20 strings.
display(temp[temp.correct_str.str.contains(template, regex=True)],
        temp.correct_str[:20])

# Bring df_counter into the same layout as train_labels

In [None]:
# Sessions with fewer than two correct attempts.
df_counter.loc[df_counter.correct.lt(2)]

In [None]:
# Preview the first rows of the per-session table.
df_counter.head()

In [None]:
# Move game_session back into a column, derive the number of failed attempts,
# and switch to the num_correct / total_attempts column names.
df_counter = (
    df_counter
    .reset_index()
    .assign(num_incorrect=lambda frame: frame.event_code - frame.correct)
    .rename(columns={'correct': 'num_correct', 'event_code': 'total_attempts'})
)
df_counter

In [None]:
# Sessions in the comparison table with more than one correct attempt.
df_check.loc[df_check.correct.gt(1)]

In [None]:
# df_ids only exists inside create_label_all, so referencing it here raises a
# NameError on a fresh kernel. Rebuild the same view from df instead: the
# events of one session whose event_data reports a "correct" outcome.
session_events = df[(df.installation_id == '4d7392c0')
                    & (df.game_session == '0006663578b58ced')]
user = (
    session_events[session_events.event_data.str.contains('correct', regex=False)]
    .reset_index()  # keeps the original row label of df in an 'index' column
    .assign(correct=lambda frame: frame.event_data.str.contains('"correct":true', regex=False))
)
user

In [None]:
import json

# pd.io.json.json_normalize is deprecated since pandas 1.0 and absent from
# recent releases; use the top-level function when it exists and fall back to
# the old location on older installations.
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# One parsed dict per event; a plain list keeps the call valid on every pandas version.
extracted_event_data = json_normalize(user.event_data.apply(json.loads).tolist())
print(extracted_event_data.shape)
extracted_event_data

In [None]:
# Join the parsed event_data fields onto the session rows. With no `on=`
# argument, merge joins on every column name the two frames share.
# NOTE(review): presumably a row-by-row alignment is intended — confirm that the
# shared columns identify each event uniquely, otherwise rows may be dropped or duplicated.
user = user.merge(extracted_event_data)
user