In [1]:
import os
# Move the working directory one level up so the relative 'dataset/...' paths used below resolve
# (presumably the project root — confirm). NOTE: not idempotent, re-running this cell climbs one more level.
os.chdir('../')

In [2]:
import data
import preprocess_utils.session2vec as sess2vec
import pandas as pd
import utils.sparsedf as sparsedf
import numpy as np
import scipy.sparse as sps

from sklearn.preprocessing import MultiLabelBinarizer

import tqdm.auto as tqdmauto
from tqdm import tqdm
from IPython.display import display
# Show every column when a dataframe is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Dataset variant to process; the interactive selection is disabled (see trailing comment).
mode = 'small' #menu.mode_selection()
# Folder of the preprocessed 'cluster_recurrent' data for the selected mode.
path = f'dataset/preprocessed/cluster_recurrent/{mode}'

In [4]:
# Load the train and test frames of the 'cluster_recurrent' cluster for the selected mode.
train_df = data.train_df(mode, cluster='cluster_recurrent')
test_df = data.test_df(mode, cluster='cluster_recurrent')
# One-hot accommodation attributes.
# NOTE(review): not referenced again in the visible cells (add_accomodations_features reloads them) — confirm.
attributes_df = data.accomodations_one_hot()

# NOTE(review): folder_path is not used by any visible cell — confirm whether it is still needed.
folder_path = f'dataset/preprocessed/cluster_recurrent/{mode}_vec'

Loading accomodations one-hot...


In [5]:
# Sanity check: size of both frames and the column names of the train frame.
print(train_df.shape)
print(test_df.shape)
print(train_df.columns)

(39767, 12)
(10242, 12)
Index(['user_id', 'session_id', 'timestamp', 'step', 'action_type',
       'reference', 'platform', 'city', 'device', 'current_filters',
       'impressions', 'prices'],
      dtype='object')


In [6]:
# Peek at the last 5 rows of the raw train frame.
train_df.tail(5)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
39867,ZXEXR9ETL1N9,ede97ece03e93,1541256321,16,interaction item image,2753200,PL,"Łódź, Poland",tablet,,,
39868,ZXEXR9ETL1N9,ede97ece03e93,1541256321,17,interaction item image,2753200,PL,"Łódź, Poland",tablet,,,
39869,ZXEXR9ETL1N9,ede97ece03e93,1541256321,18,interaction item image,2753200,PL,"Łódź, Poland",tablet,,,
39870,ZXEXR9ETL1N9,ede97ece03e93,1541256321,19,interaction item image,2753200,PL,"Łódź, Poland",tablet,,,
39871,ZXEXR9ETL1N9,ede97ece03e93,1541256324,20,clickout item,2753200,PL,"Łódź, Poland",tablet,,2753200|234416|2250126|5818036|3179640|906283|...,35|40|39|32|48|34|53|108|39|39|86|41|61|50|35|...


### Add impressions as new actions (`add_impressions_as_new_actions`)

In [None]:
# Replace train_df with the frame returned by the helper (presumably with the impressions expanded
# into extra action rows — confirm against sess2vec).
# NOTE: train_df is overwritten, so re-running this cell applies the transformation again.
# NOTE(review): final_new_index is not used in the visible cells.
train_df, final_new_index = sess2vec.add_impressions_as_new_actions(train_df)

In [None]:
# Check shape and columns of the train frame after the previous step (test frame check disabled).
print(train_df.shape)
#print(test_df.shape)
print(train_df.columns)

In [None]:
# Inspect the last 5 rows after the impressions step.
train_df.tail(5)

### pad/truncate sessions

In [None]:
# Target session length handed to pad_sessions (per the section title, sessions are padded/truncated to it).
MAX_SESSION_LENGTH = 70
# NOTE: train_df is overwritten; re-running this cell pads an already padded frame.
train_df = sess2vec.pad_sessions(train_df, max_session_length=MAX_SESSION_LENGTH)
# The same step for the test frame is currently disabled.
#test_df = sess2vec.pad_sessions(test_df, max_session_length=MAX_SESSION_LENGTH)

In [None]:
# Shape of the train frame after pad_sessions (test frame check disabled).
print(train_df.shape)
#print(test_df.shape)

In [None]:
# Inspect the last rows after pad_sessions.
train_df.tail()

### Get the indices of the last clickouts

In [None]:
# Rows of the last clickouts; the helper exposes the original row index in the 'orig_index' column
# (presumably one row per session — confirm against sess2vec.get_last_clickout).
train_clickouts_df = sess2vec.get_last_clickout(train_df, index_name='index', rename_index='orig_index')
# np.sort returns a sorted copy. Sorting the array returned by `.values` in place could reorder the
# 'orig_index' column of train_clickouts_df itself (or fail when that array is read-only).
train_clickouts_indices = np.sort(train_clickouts_df.orig_index.values)

### One-hot device and action type

In [7]:
# Fixed vocabularies passed as `classes` to the one-hot encoding of 'device' and 'action_type'.
# The order of each list is part of the definition and must not change.
devices_classes = ['mobile', 'desktop', 'tablet']
actions_classes = [
    "show_impression",
    "clickout item",
    "interaction item rating",
    "interaction item info",
    "interaction item image",
    "interaction item deals",
    "change of sort order",
    "filter selection",
    "search for item",
    "search for destination",
    "search for poi",
]

In [None]:
# One-hot encode 'device' using the fixed devices_classes list; train_df is replaced by the result.
print('Adding one-hot columns of device...')
train_df = sess2vec.one_hot_df_column(train_df, 'device', classes=devices_classes)
print(train_df.shape)

# Same for 'action_type' with the fixed actions_classes list.
# NOTE: train_df is overwritten at each step, so this cell is not meant to be run twice.
print('Adding one-hot columns of action_type...')
train_df = sess2vec.one_hot_df_column(train_df, 'action_type', classes=actions_classes)
print(train_df.shape)

In [None]:
# Inspect the last rows after the one-hot encoding.
train_df.tail()

In [None]:
# CSV path inside `path` that is handed to add_accomodations_features as its path_to_save.
X_train_path = os.path.join(path, 'X_train.csv')

### Add accommodation features

In [None]:
def add_accomodations_features(df, path_to_save, logic='skip', row_indices=None):
    """
    Left-join the one-hot accommodation features onto the rows of `df` whose 'reference' matches,
    delegating the chunked join to sparsedf.left_join_in_chunks, which receives `path_to_save`.

    The 'reference' column of `df` is OVERWRITTEN in place (selected rows set to NaN, remaining
    values coerced to numeric), so pass a copy if the original frame is still needed.

    Parameters:
        df:           dataframe with a 'reference' column.
        path_to_save: destination path forwarded to sparsedf.left_join_in_chunks.
        logic:        'skip'   -> the rows in `row_indices` are excluded from the join;
                      'subset' -> only the rows in `row_indices` take part in the join.
        row_indices:  index labels of `df` the logic applies to. None or empty means that every
                      row takes part in the join. Labels not present in the index are ignored.

    Returns:
        None.

    Raises:
        ValueError: if `logic` is neither 'skip' nor 'subset'.
    """
    if logic not in ('skip', 'subset'):
        raise ValueError(f"logic must be 'skip' or 'subset', got {logic!r}")

    # keep the untouched references available to the post-join hook
    join_data = dict()
    join_data['backup_reference_series'] = df.reference.values.copy()

    if row_indices is not None and len(row_indices) > 0:
        selected = df.index.isin(row_indices)
        if logic == 'skip':
            # blank the reference of the selected rows so that they do not match any accommodation
            df['reference'] = df['reference'].mask(selected)
        else:
            # keep the reference of the selected rows only; mask/where preserve the column dtype
            df['reference'] = df['reference'].where(selected)

    # non-numeric references (e.g. city names) become NaN and therefore match nothing in the join
    df['reference'] = pd.to_numeric(df['reference'], errors='coerce') #.astype('Int64')

    attributes_df = data.accomodations_one_hot()

    def post_join(chunk_df, data):
        # `data` here is the dict passed as data=join_data (it shadows the `data` module locally)
        # reset the original references
        #chunk_df.loc[:,'reference'] = data['backup_reference_series'][data['$i1']:data['$i2']]
        return chunk_df.drop('reference', axis=1)

    sparsedf.left_join_in_chunks(df, attributes_df, left_on='reference', right_on=None, right_index=True,
                                 post_join_fn=post_join, data=join_data, path_to_save=path_to_save)

In [None]:
# A copy is passed because the function overwrites the 'reference' column of the frame it receives.
# With logic='skip' the references of the last-clickout rows are set to NaN.
add_accomodations_features(train_df.copy(), X_train_path, logic='skip', row_indices=train_clickouts_indices)

In [None]:
# train_df itself is unchanged by the previous cell, since only a copy was passed.
train_df.tail()

## Reload the sparse dataframe

In [None]:
# Same file as X_train_path defined earlier.
X_path = os.path.join(path, 'X_train.csv')
# Names of the sparse columns, loaded from the 'X_sparsecols' entry stored under `path`.
X_sparsecols = sess2vec.load_sparse_columns(path, 'X_sparsecols')
# NOTE(review): cols_to_drop_in_X is not used in the visible cells — confirm.
cols_to_drop_in_X = ['user_id','session_id','step','platform','city','current_filters']

In [None]:
# Read the saved X CSV declaring the sparse columns, then index the frame by 'orig_index'.
sparse_df = sparsedf.read(X_path, sparse_cols=X_sparsecols).set_index('orig_index')

In [None]:
# Inspect the first 50 rows of the reloaded frame.
sparse_df.head(50)

In [None]:
# Paths of the saved X (features) and Y (targets) CSV files; X_path repeats the earlier definition.
X_path = os.path.join(path, 'X_train.csv')
Y_path = os.path.join(path, 'Y_train.csv')

# Names of the sparse columns of each file.
X_sparsecols = sess2vec.load_sparse_columns(path, 'X_sparsecols')
Y_sparsecols = sess2vec.load_sparse_columns(path, 'Y_sparsecols')

# Load both frames and index them by 'orig_index'.
X_train_df = sparsedf.read(X_path, sparse_cols=X_sparsecols).set_index('orig_index')
Y_train_df = sparsedf.read(Y_path, sparse_cols=Y_sparsecols).set_index('orig_index')

In [None]:
# Timing experiment: slice the first 70*10 rows, first from a single column and then from the whole frame.
# 70 equals MAX_SESSION_LENGTH, so this is presumably 10 padded sessions — confirm.
%time chunk_df = X_train_df['session_id'].iloc[0:70*10]
%time chunk_df = X_train_df.iloc[0:70*10]

# Disabled experiments (dtype cast of the sparse columns, per-session grouping).
#%time chunk_df.loc[:, X_sparsecols] = chunk_df[X_sparsecols].astype('Int8')

#%time chunk_df.groupby('session_id').apply(lambda g: g.values)

In [None]:
# Timing experiment: read a 7000-row window straight from the CSV, skipping file lines 1..14000
# (line 0, the header, is kept).
%time pd.read_csv(X_path, index_col=0, skiprows=range(1, 70*2*100+1), nrows=7000)

In [None]:
# Timing experiment: group the Y frame by session_id and take the values array of each group.
%time Y_train_df.groupby('session_id').apply(lambda g: g.values)

In [None]:
# Load the training dataset for the selected mode through the sess2vec helper
# (presumably x = inputs, y = targets — confirm against sess2vec).
x,y = sess2vec.load_training_dataset_for_regression(mode)

In [None]:
# Display the first object returned by load_training_dataset_for_regression.
x

In [None]:
# Reload the saved X frame. The previous arguments (path_to_save, features_columns) are not defined
# at notebook level and raised NameError on a fresh kernel; X_path / X_sparsecols are the
# notebook-level names for the saved file and its sparse columns.
sparse_df = sparsedf.read(X_path, sparse_cols=X_sparsecols).set_index('orig_index')

In [None]:
# Report the size of the reloaded frame.
print('Full dataframe shape:', sparse_df.shape)

In [None]:
# Inspect the last rows of the reloaded frame.
sparse_df.tail()