In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import IPython

def display(*dfs):
    """Render each positional argument, in order, with IPython's rich display."""
    show = IPython.display.display
    for item in dfs:
        show(item)
        
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that spans their values.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns are tried against int8, int16, int32 and int64;
    every other listed column is tried against float16 and float32 and falls
    back to float64. The dtype limits are inclusive, so a value sitting exactly
    on a limit (e.g. 127 for int8) still fits the narrower type.

    Parameters:
        df: DataFrame whose columns are replaced in place.
        verbose: when true, print the final memory usage and the relative saving.

    Returns:
        The same ``df`` object, with the downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First candidate whose inclusive range covers [c_min, c_max] wins.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): the check is on range only; float16/float32 keep
            # fewer significant digits, so values may be rounded by the cast.
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read the raw competition tables; %time reports how long the train.csv read takes.
%time df = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv', engine='c')
# labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
df_test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')

# Downcast the numeric columns of each frame before any feature building.
df = reduce_mem_usage(df)
# labels = reduce_mem_usage(labels)
df_test = reduce_mem_usage(df_test)
specs = reduce_mem_usage(specs)

In [None]:
# df_counter = pd.read_feather('/kaggle/input/temp-ds-bowl-2019/df_counter_ini')
# df_counter = pd.read_feather('/kaggle/input/temp-ds-bowl-2019/df_counter')
# Labels are read from a feather file in a separate input dataset rather than from
# train_labels.csv (presumably produced by an earlier run -- TODO confirm provenance).
labels = pd.read_feather('/kaggle/input/temp-ds-bowl-2019/labels_created')

In [None]:
# all_users = pd.read_pickle('/kaggle/input/temp-ds-bowl-2019/all_users.pkl')
# all_users.installation_id.unique().shape, labels.installation_id.unique().shape

# test_all_users = pd.read_pickle('/kaggle/input/temp-ds-bowl-2019/test_all_users.pkl')
# test_all_users.installation_id.unique().shape, test_all_users.shape

In [None]:
import gc
# Run a garbage-collection pass now that the input frames are loaded.
gc.collect()

# Part I - Create all_users
## One user

In [None]:
#user_id, user = next(iter(df.groupby('installation_id')))
# Pick a single installation so the feature logic can be walked through by hand.
user_id = '8e44d2de' # 'f10d8174'#'ea49ea9c'#'29d1aaee' # '3f0dca37'
user = df[df.installation_id == user_id].copy()

# Sum of total_attempts over this installation's rows in `labels`; used further
# down to check the locally derived attempt flag.
totall_attempts = labels[labels.installation_id == user_id].total_attempts.sum()
print(totall_attempts)

# Number of rows `labels` holds for this installation (shown as the cell output).
totall_assessments  = labels[labels.installation_id == user_id].shape[0]
totall_assessments

In [None]:
# Parse the timestamps and order the events chronologically. The sorted frame is
# assigned back: sort_values returns a new frame, so calling it without
# assignment left `user` in file order, unlike create_all_users(), which sorts
# before grouping by game_session.
user['timestamp'] = pd.to_datetime(user['timestamp'])
user = user.sort_values(by='timestamp')
print(user.shape)

# An attempt is an Assessment event with code 4100, except for Bird Measurer,
# whose attempts are the events with code 4110.
user['attempt'] = ((user.type == 'Assessment') &
                       (((user.event_code == 4100) & (user.title != 'Bird Measurer (Assessment)')) |
                        ((user.event_code == 4110)&(user.title == 'Bird Measurer (Assessment)')) )
                   ).astype('int8')
#attempt_idx = user[user.attempt == 1].index
# The derived flag must reproduce the attempt total recorded in `labels`.
assert user.attempt.sum() == totall_attempts

# Only attempt rows can be correct; the outcome is read from the raw event_data text.
user['correct'] = 0
user.loc[user.attempt == 1, 'correct'] = user[user.attempt == 1].event_data.str.contains('"correct":true').astype('int8')
#user

In [None]:
def current_attempt_save(attempt, user_temp, session):
    """Stamp ``attempt`` with the overall accuracy history and current-session facts.

    Parameters:
        attempt: dict being built for this sample; modified in place. Must
            already hold 'time_start'.
        user_temp: accumulator read for 'total_accuracy' and
            'total_accuracy_groups'.
        session: events of the current game session; the first row supplies
            world, title and start timestamp.

    Returns:
        ``attempt`` with the mean accuracy fields (-1 when no list is stored)
        and the 'curr_attempt_*' fields added.
    """
    # Probe a copy so that a missing key on a defaultdict is not inserted into
    # the caller's accumulator.
    snapshot = user_temp.copy()

    def _mean_or_flag(key):
        history = snapshot[key]
        if isinstance(history, list):
            return np.mean(history)
        return -1

    for key in ('total_accuracy', 'total_accuracy_groups'):
        attempt[key] = _mean_or_flag(key)

    attempt['curr_attempt_world'] = session['world'].iloc[0]
    attempt['curr_attempt_title'] = session['title'].iloc[0]
    started_at = session['timestamp'].iloc[0]
    attempt['curr_attempt_start_time'] = started_at
    # time_start minus session start, i.e. non-positive for later sessions.
    attempt['curr_attempt_duration'] = attempt['time_start'] - started_at

    return attempt

def current_world_save(attempt, user_temp, world):
    """Copy everything accumulated for ``world`` into ``attempt`` under generic 'world' names.

    Parameters:
        attempt: dict being built for this sample; updated in place.
        user_temp: accumulator whose keys containing ``world`` are selected.
        world: name of the current world, used as a substring filter on keys.

    Returns:
        ``attempt`` extended with the renamed counters, the number of distinct
        titles for each '*unique*' key, mean accuracy fields (-1 when no list
        is stored) and three 'world_total_*' sums.
    """
    world_keys = [key for key in user_temp.keys() if world in str(key)]

    renamed = defaultdict(int)
    renamed.update({key.replace(world, 'world'): user_temp[key] for key in world_keys})

    # Lists of seen titles are collapsed to the number of distinct titles.
    unique_keys = [key.replace(world, 'world') for key in world_keys if 'unique' in key]
    assert len(unique_keys) <= 4
    distinct_counts = {key: len(set(renamed[key])) for key in unique_keys}
    renamed.update(distinct_counts)

    for metric in ('world_accuracy', 'world_accuracy_groups'):
        history = renamed[metric]
        renamed[metric] = np.mean(history) if isinstance(history, list) else -1

    # Sum each counter family over every session type recorded for this world.
    for suffix in ('game_time', 'event_count', 'n_game_session'):
        parts = [user_temp[key] for key in world_keys if suffix in key]
        renamed['world_total_' + suffix] = np.sum(parts)

    attempt.update(renamed)

    return attempt


def current_title_save(attempt, user_temp, title):
    """Copy everything accumulated for ``title`` into ``attempt`` under generic 'title' names.

    Parameters:
        attempt: dict being built for this sample; updated in place.
        user_temp: accumulator whose keys containing ``title`` are selected.
        title: title of the current session, used as a substring filter on keys.

    Returns:
        ``attempt`` extended with the renamed counters and the mean of the
        stored accuracy / accuracy-group lists (-1 when no list is stored).
    """
    title_keys = [key for key in user_temp.keys() if title in str(key)]

    renamed = defaultdict(int)
    renamed.update({key.replace(title, 'title'): user_temp[key] for key in title_keys})

    for metric in ('title_accuracy', 'title_accuracy_groups'):
        history = renamed[metric]
        renamed[metric] = np.mean(history) if isinstance(history, list) else -1

    attempt.update(renamed)

    return attempt

def create_sample(user_data, user_temp, session):
    """Build one sample dict describing the user's history at the given session.

    Parameters:
        user_data: running totals for the user; copied, not modified.
        user_temp: per-world / per-title accumulator, passed to the save helpers.
        session: events of the current game session.

    Returns:
        A copy of ``user_data`` extended with current-attempt, world and title
        fields, the session's n_correct / n_incorrect, and the number of
        distinct titles seen per session type.
    """
    sample = user_data.copy()

    sample = current_attempt_save(sample, user_temp, session)
    sample = current_world_save(sample, user_temp, session.world.iloc[0])
    sample = current_title_save(sample, user_temp, session.title.iloc[0])

    # Outcome of this session (the label side of the sample).
    correct_total = session.correct.sum()
    sample['n_correct'] = correct_total
    sample['n_incorrect'] = session.attempt.sum() - correct_total

    # Replace each list of seen titles by the number of distinct titles.
    for kind in ['Clip', 'Game', 'Activity', 'Assessment']:
        key = 'total_' + kind + '_unique'
        titles_seen = sample[key]
        if isinstance(titles_seen, list) and len(titles_seen) > 0:
            sample[key] = len(set(titles_seen))
        else:
            sample[key] = 0

    return sample

In [None]:
# Maps a session accuracy to its accuracy group; any other accuracy maps to group 1.
accuracy_to_group = {1: 3, 0.5: 2, 0: 0}

def get_common_data(session, user_data, user_temp, user_results, test=False):
    """Fold one game session into the user's accumulators, emitting a sample for assessments.

    params:
        session: DataFrame - one session from groupby('game_session')
        user_data: dict - running totals for the user, accumulated through all sessions
            (expected to be a defaultdict(int); updated in place)
        user_temp: dict - per-world and per-title accumulators
            (expected to be a defaultdict(int); updated in place)
        user_results: list - samples collected so far; appended to in place
        test: bool - when True a sample is created for every Assessment session,
            even one without a submitted attempt
    return:
        user_results: list - list of dictionaries, one for each sample of one user
    """
    data_type = session.type.iloc[0]
    world = session.world.iloc[0]
    template = world +'_'+ data_type
    user_temp[template+'_unique'] = user_temp.get(template+'_unique', [])
    user_data['total_' + data_type+'_unique'] = user_data.get('total_' + data_type+'_unique', [])

    if data_type == 'Assessment':
        n_attempts = session.attempt.sum()
        n_correct = session.correct.sum()
        n_incorrect = n_attempts - n_correct

        if n_attempts >= 1 or test:
            # The sample is built BEFORE this session is accumulated, so it
            # only describes the history preceding the session.
            attempt = create_sample(user_data, user_temp, session)
            user_results.append(attempt)

        #  ======= Save info for future attempts =======
        # current title - accumulate info for later sessions with the same title
        title = session.title.iloc[0]
        user_temp[title + '_n_assessments'] += 1
        user_temp[title + '_n_assess_with_attempt'] += 1 if n_attempts > 0 else 0
        user_temp[title + '_n_attempts'] += n_attempts
        user_temp[title + '_n_correct'] += n_correct
        user_temp[title + '_n_incorrect'] += n_incorrect
        user_temp[title + '_n_unfinished'] += 0  if n_attempts > 0 else 1

        if n_attempts != 0:
            accuracy = n_correct / n_attempts
            group = accuracy_to_group.get(accuracy, 1)
            # Extend the title's own accuracy history; it was previously seeded
            # from the '_accuracy_groups' list, which mixed group ids into it.
            user_temp[title + '_accuracy'] = user_temp.get(title+'_accuracy', []) +[accuracy]
            user_temp[title+'_accuracy_groups'] = user_temp.get(title+'_accuracy_groups', []) +[group]

        user_temp[title + '_event_count'] += session.shape[0]
        user_temp[title + '_game_time'] += session.game_time.max()

        # current world and overall totals - for later samples
        user_temp[world+'_n_correct'] += n_correct
        user_temp[world+'_n_incorrect'] += n_incorrect
        user_data['total_n_correct'] += n_correct
        user_data['total_n_incorrect'] += n_incorrect

        if n_correct + n_incorrect != 0:
            accuracy = n_correct/(n_correct+n_incorrect)
            group = accuracy_to_group.get(accuracy, 1)

            user_temp[world+'_accuracy'] = user_temp.get(world+'_accuracy', []) +[accuracy]
            user_temp[world+'_accuracy_groups'] = user_temp.get(world+'_accuracy_groups', []) +[group]
            user_temp['total_accuracy'] = user_temp.get('total_accuracy', []) +[accuracy]
            user_temp['total_accuracy_groups'] = user_temp.get('total_accuracy_groups', []) +[group]

    # Per-event-code counters, keyed by the event code itself.
    event_code_counts = session['event_code'].value_counts()
    for code, count in zip(event_code_counts.index, event_code_counts.values):
        user_data[code] = user_data.get(code, 0) + count

    # add common info, include attempt info for next study
    user_temp[template+'_unique'].append(session.title.iloc[0])
    user_temp[template+'_n_game_session'] += 1
    user_temp[template+'_event_count'] += session.shape[0]

    template_total = 'total_' + data_type
    user_data[template_total+'_unique'].append(session.title.iloc[0])
    user_data[template_total+'_n_game_session'] += 1
    user_data[template_total+'_event_count'] += session.shape[0]

    # Clip sessions do not contribute to any game_time counter.
    if data_type != 'Clip':
        user_temp[template+'_game_time'] += session.game_time.max()
        user_data[template_total+'_game_time'] += session.game_time.max()
        user_data['total_game_time'] += session.game_time.max()

    user_data['total_' + data_type] += 1
    user_data['total_n_game_session'] += 1
    user_data['total_event_count'] += session.shape[0]

    return user_results


In [None]:
%%time 
# Collected samples for the single-user walk-through; get_common_data appends to this list.
user_results = []

from collections import defaultdict
# defaultdict(int) lets counters be incremented without initialising their keys first.
user_data, user_temp = defaultdict(int), defaultdict(int)
user_data['installation_id'] = user.installation_id.iloc[0]
user_data['time_start'] = user.timestamp.iloc[0]

# sort=False keeps the sessions in their order of first appearance in `user`.
for i, session in user.groupby('game_session', sort=False):
    #print(user_data)
    get_common_data(session, user_data, user_temp, user_results)
    
tt = pd.DataFrame(user_results)

In [None]:
# Spot-check: title_event_count of the sample whose session started at this timestamp.
tt[tt.curr_attempt_start_time == '2019-09-12 23:56:55.085000+00:00'].title_event_count

## Total: for all users

### Info for last attempt

In [None]:
def create_all_users(df, test=False):
    """Build the sample table for every user by replaying their sessions in time order.

    Parameters:
        df: event-level DataFrame with at least installation_id, timestamp,
            type, event_code, title, event_data and game_session columns.
        test: when False, users are the installations listed in the global
            ``labels`` frame; when True, every installation found in ``df`` is
            processed and a sample is emitted for every Assessment session.

    Returns:
        DataFrame with one row per sample, users in order of first appearance.

    Raises:
        AssertionError: (carrying the installation id) when a user yields no sample.
    """
    # Choose the id source locally. The global `labels` is only read: it used to
    # be rebound to `df` for test runs, which destroyed the training labels.
    id_source = df if test else labels

    # Gather plain records and build the frame once, instead of re-concatenating
    # a growing DataFrame for every user.
    all_records = []
    for user_id in id_source.installation_id.unique():
        user = df[df.installation_id == user_id].copy()
        user['timestamp'] = pd.to_datetime(user['timestamp'])
        user.sort_values(by='timestamp', inplace=True)

        # An attempt is an Assessment event with code 4100, except for Bird
        # Measurer, whose attempts are the events with code 4110.
        user['attempt'] = ((user.type == 'Assessment') &
                           (((user.event_code == 4100) & (user.title != 'Bird Measurer (Assessment)')) |
                            ((user.event_code == 4110)&(user.title == 'Bird Measurer (Assessment)')) )
                       ).astype('int8')

        user['correct'] = 0
        user.loc[user.attempt == 1, 'correct'] = user[user.attempt == 1]\
                                                .event_data.str.contains('"correct":true').astype('int8')

        user_results = []

        user_data, user_temp = defaultdict(int), defaultdict(int)
        user_data['installation_id'] = user.installation_id.iloc[0]
        user_data['time_start'] = user.timestamp.iloc[0]

        for _, session in user.groupby('game_session', sort=False):
            get_common_data(session, user_data, user_temp, user_results, test=test)

        # Every processed user is expected to contribute at least one sample.
        assert len(user_results) > 0, user_id
        all_records.extend(user_results)

    return pd.DataFrame(all_records)

In [None]:
# Build the training samples; %time reports how long the pass over all users takes.
%time all_users = create_all_users(df)

print(all_users.shape, all_users.installation_id.unique().shape)
# Event-code counters are keyed by the code itself, so cast every column label to str.
all_users.columns = all_users.columns.astype(str)
# all_users.to_pickle('all_users.pkl')

In [None]:
# title_columns is only defined further down, in the "Features names" section,
# so derive the title feature columns here to keep this cell runnable on a fresh kernel.
title_feature_columns = [x for x in all_users.columns if 'title' in x and 'curr' not in x]
all_users[all_users.curr_attempt_start_time == '2019-09-12 23:56:55.085000+00:00'][title_feature_columns]

# Creating test data

In [None]:
# Number of distinct installations in the labels versus in the test events.
labels.installation_id.unique().shape, df_test.installation_id.unique().shape

In [None]:
# Build samples for every installation in the test events; with test=True a sample is
# created for every Assessment session, including ones without a submitted attempt.
%time test_all_users = create_all_users(df_test, test=True)

# take only last assessment
test_all_users = test_all_users.groupby('installation_id', sort=False, as_index=False)\
                            .agg([lambda x: x.iloc[-1]])
# Each column label is indexed with [0] to keep its first element (presumably the
# list-valued agg yields two-level labels -- TODO confirm on the pandas version in use).
test_all_users.columns = [x[0] for x in test_all_users.columns]
test_all_users.reset_index(inplace=True)

print(test_all_users.shape, test_all_users.installation_id.unique().shape)
# Cast every column label to str, as done for the training frame.
test_all_users.columns = test_all_users.columns.astype(str)
# test_all_users.to_pickle('test_all_users.pkl')

# Part II - preprocess train data
## Features names

In [None]:
# Partition the column names into feature families by substring.
c = test_all_users.columns
# c = all_users.columns
world_columns = [x for x in c if 'world' in x and 'curr' not in x]
title_columns = [x for x in c if 'title' in x and 'curr' not in x]
# 'curr*' columns plus a fixed list of identifiers, labels and overall totals.
current_columns = [x for x in c if 'curr' in x] + ['installation_id', 'time_start', 'n_correct', 'n_incorrect', 
                                                   'total_n_game_session', 'total_event_count', 'total_game_time',
                                                   'total_n_correct', 'total_n_incorrect', 
                                                   'total_accuracy', 'total_accuracy_groups']
total_columns = [x for x in c if 'total' in x and 'world' not in x and x not in current_columns]
# Event-code counters are the columns whose name is purely numeric.
event_columns = [x for x in c if x.isdigit()]
all_columns = world_columns+title_columns+current_columns+total_columns+event_columns
# The groups together must contain exactly as many names as there are columns.
assert c.shape[0] == len(all_columns), (c.shape[0] ,len(all_columns))

## Fill NAN

In [None]:
# Replace missing values with 0. in both frames, in place.
test_all_users.fillna(0., inplace=True)
all_users.fillna(0., inplace=True)

## Work with time

In [None]:
def preprocess_time(test_all_users):
    """Add calendar parts of the two timestamp columns and a positive attempt duration.

    Parameters:
        test_all_users: DataFrame with datetime columns 'time_start' and
            'curr_attempt_start_time' and a timedelta column
            'curr_attempt_duration'; modified in place.

    Returns:
        The same DataFrame with '<column>_month', '_day', '_hour', '_dayofweek'
        and '_quarter' added for both timestamp columns, plus
        'curr_attempt_durationseconds' and 'curr_attempt_durationhours'.
    """
    calendar_parts = ('month', 'day', 'hour', 'dayofweek', 'quarter')
    for column in ('time_start', 'curr_attempt_start_time'):
        for part in calendar_parts:
            test_all_users[column + '_' + part] = getattr(test_all_users[column].dt, part)

    # The stored duration is negated here, so a negative timedelta becomes a
    # positive number of seconds / hours.
    duration = 'curr_attempt_duration'
    test_all_users[duration + 'seconds'] = -test_all_users[duration].dt.total_seconds()
    test_all_users[duration + 'hours'] = -test_all_users[duration].dt.total_seconds() / 60 / 60

    return test_all_users
    
    
def create_means(all_users, temp, t=''):
    """Add per-session and per-event ratio columns for one counter family.

    Parameters:
        all_users: DataFrame holding '<prefix>_game_time', '<prefix>_event_count'
            and '<prefix>_n_game_session' columns; modified in place.
        temp: family name (first part of the column prefix).
        t: optional session type; when non-empty the prefix is '<temp>_<t>'.
            For 'Clip' the two game-time ratios are skipped.

    Returns:
        The same DataFrame with '<prefix>_game_time_mean', '<prefix>_event_time_mean'
        (both omitted for Clip) and '<prefix>_event_mean' added.
    """
    prefix = temp if t == '' else temp + '_' + t
    sessions = all_users[prefix + '_n_game_session']

    if t != 'Clip':
        game_time = all_users[prefix + '_game_time']
        all_users[prefix + '_game_time_mean'] = game_time / sessions
        all_users[prefix + '_event_time_mean'] = game_time / all_users[prefix + '_event_count']

    all_users[prefix + '_event_mean'] = all_users[prefix + '_event_count'] / sessions
    return all_users


def preprocess_data(all_users):
    """Derive ratio and calendar features for a sample table and zero-fill missing values.

    Parameters:
        all_users: sample DataFrame as produced by the sample builders;
            modified in place.

    Returns:
        The same DataFrame with the mean columns for every world / total
        counter family, the title-level means, the time features, and every
        remaining NaN replaced by 0.
    """
    session_kinds = ['Clip', 'Game', 'Assessment', 'Activity', 'total']
    for scope in ['world', 'total']:
        for kind in session_kinds:
            # The plain 'total' family has no type suffix; it is handled after the loop.
            if scope == 'total' and kind == 'total':
                continue
            all_users = create_means(all_users, scope, kind)

    # Title-level ratios are normalised by the number of assessments of that title.
    all_users['title_game_time_mean'] = all_users['title_game_time'] / all_users['title_n_assessments']
    all_users['title_event_time_mean'] = all_users['title_game_time'] / all_users['title_event_count']
    all_users['title_event_mean'] = all_users['title_event_count'] / all_users['title_n_assessments']

    all_users = preprocess_time(all_users)
    all_users = create_means(all_users, temp='total', t='')

    # Ratios with a zero numerator and denominator produce NaN; store them as 0.
    all_users.fillna(0., inplace=True)

    return all_users

In [None]:
# Add the derived features to both frames. NOTE(review): preprocess_data modifies its
# argument in place, so re-running this cell works on already-processed frames.
test_all_users = preprocess_data(test_all_users)
all_users = preprocess_data(all_users)

In [None]:
# Shapes of the test and train sample tables after preprocessing.
test_all_users.shape, all_users.shape

In [None]:
# Rebuild the column groups now that preprocess_data has added columns; the fixed
# list in current_columns is extended with further names compared with the earlier grouping.
c = test_all_users.columns
world_columns = [x for x in c if 'world' in x and 'curr' not in x]
title_columns = [x for x in c if 'title' in x and 'curr' not in x]
current_columns = [x for x in c if 'curr' in x] + ['installation_id', 'time_start', 'n_correct', 'n_incorrect', 
                                                   'total_n_game_session', 'total_event_count', 'total_game_time',
                                                   'total_n_correct', 'total_n_incorrect', 
                                                   'total_accuracy', 'total_accuracy_groups',
                                                   'total_game_time_mean', 'total_event_time_mean', 'total_event_mean',
                                                   'time_start_month', 'time_start_day', 'time_start_hour',
                                                   'time_start_dayofweek', 'time_start_quarter']
total_columns = [x for x in c if 'total' in x and 'world' not in x and x not in current_columns]
# Event-code counters are the columns whose name is purely numeric.
event_columns = [x for x in c if x.isdigit()]
all_columns = world_columns+title_columns+current_columns+total_columns+event_columns
# The groups together must contain exactly as many names as there are columns.
assert c.shape[0] == len(all_columns), (c.shape[0] ,len(all_columns))

In [None]:
# Persist both preprocessed sample tables to the working directory.
test_all_users.to_pickle('test_all_users.pkl')
all_users.to_pickle('all_users.pkl')

In [None]:
# Spot-check the title feature columns of the sample whose session started at this timestamp.
all_users[all_users.curr_attempt_start_time == '2019-09-12 23:56:55.085000+00:00'][title_columns]

# Try parallel


In [None]:
from joblib import Parallel, delayed
from tqdm import tqdm


In [None]:
def create_attempt_and_correct_features(df_test):
    """Add 'attempt' and 'correct' flags and parse 'timestamp' on an event-level frame.

    Parameters:
        df_test: DataFrame with type, event_code, title, event_data and
            timestamp columns; modified in place.

    Returns:
        The same DataFrame. 'attempt' (int8) is 1 for Assessment events with
        code 4100, or code 4110 for 'Bird Measurer (Assessment)'; 'correct' is 1
        for attempt rows whose event_data contains '"correct":true'.
    """
    bird_title = 'Bird Measurer (Assessment)'
    regular_submit = (df_test.event_code == 4100) & (df_test.title != bird_title)
    bird_submit = (df_test.event_code == 4110) & (df_test.title == bird_title)
    is_attempt = (df_test.type == 'Assessment') & (regular_submit | bird_submit)
    df_test['attempt'] = is_attempt.astype('int8')

    # Only attempt rows can be correct; the outcome is read from the raw event_data text.
    df_test['correct'] = 0
    attempt_rows = df_test.attempt == 1
    outcome = df_test[attempt_rows].event_data.str.contains('"correct":true')
    df_test.loc[attempt_rows, 'correct'] = outcome.astype('int8')

    df_test['timestamp'] = pd.to_datetime(df_test.timestamp)

    return df_test

In [None]:
# Add the attempt / correct flags and parsed timestamps to the test events (in place).
df_test = create_attempt_and_correct_features(df_test)

In [None]:
def create_one_user(user, test):
    """Replay one user's sessions in time order and return the samples they produce.

    Parameters:
        user: event-level DataFrame of a single installation; its timestamp
            column is parsed and the frame is sorted in place.
        test: forwarded to get_common_data.

    Returns:
        List of sample dicts collected by get_common_data.
    """
    user['timestamp'] = pd.to_datetime(user['timestamp'])
    user.sort_values(by='timestamp', inplace=True)

    # Fresh accumulators per user; the identifiers are seeded from the earliest event.
    user_data = defaultdict(
        int,
        installation_id=user.installation_id.iloc[0],
        time_start=user.timestamp.iloc[0],
    )
    user_temp = defaultdict(int)
    samples = []

    for _session_id, session in user.groupby('game_session', sort=False):
        get_common_data(session, user_data, user_temp, samples, test=test)

    return samples


def group_by_user(labels, df):
    """Yield a copy of the rows of ``df`` for each distinct installation_id in ``labels``.

    Ids are visited in order of first appearance in ``labels``; an id with no
    rows in ``df`` yields an empty frame.
    """
    for installation in labels.installation_id.unique():
        rows = df.installation_id == installation
        yield df[rows].copy()
    

def create_all_users_parallel(df, test=False):
    """Build the sample table for every user, processing users with joblib workers.

    Parameters:
        df: event-level DataFrame that already carries the 'attempt' and
            'correct' columns read by get_common_data.
        test: when False, users are the installations listed in the global
            ``labels`` frame; when True, every installation found in ``df`` is
            processed and a sample is emitted for every Assessment session.

    Returns:
        DataFrame with one row per sample, users in the order they were submitted.
    """
    # Choose the id source locally. The global `labels` is only read: it used to
    # be rebound to `df` for test runs, which destroyed the training labels.
    id_source = df if test else labels

    per_user = Parallel(n_jobs=-1, backend='threading', verbose=10)(
        delayed(create_one_user)(user, test=test)
        for user in group_by_user(id_source, df))

    # Flatten the per-user lists of sample dicts into one list of records.
    records = [sample for user_samples in per_user for sample in user_samples]

    return pd.DataFrame(records)

In [None]:
# Time the joblib-based builder on the test events; the result is kept in t2 for comparison.
%time t2 = create_all_users_parallel(df_test, test=True)

In [None]:
# Inspect one installation in the parallel result (bound to t2 in the previous cell;
# the name `t` is never assigned at notebook level).
t2[t2.installation_id == '01bc6cb6']

In [None]:
# Same installation in the sequential result, for comparison (the mask previously
# referenced the undefined name `test_all_user`).
test_all_users[test_all_users.installation_id == '01bc6cb6']