# Data preparation

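Let's start with data preparation:
1. Download the files from `https://huggingface.co/datasets/dllllb/age-group-prediction/tree/main` into the `data/` folder.
2. Run this notebook to preprocess the transactions and split the labelled clients into 5 folds (written to `data/fold_0` … `data/fold_4`).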

In [1]:
!ls -l data/

total 816964
drwxr-xr-x 2 jovyan jovyan      4096 Jan 16 01:04 fold_0
-rwxr-xr-x 1 jovyan jovyan     10694 Oct 30  2019 small_group_description.csv
-rwxr-xr-x 1 jovyan jovyan    115609 Oct 30  2019 test.csv
-rwxr-xr-x 1 jovyan jovyan    233306 Oct 30  2019 train_target.csv
-rwxr-xr-x 1 jovyan jovyan 333549487 Oct 30  2019 transactions_test.csv
-rwxr-xr-x 1 jovyan jovyan 499354971 Oct 30  2019 transactions_train.csv


In [2]:
import pickle

import numpy as np
import pandas as pd

from pathlib import Path

In [3]:
# Load the raw transaction logs for the test and train client sets.
# NOTE(review): the files carry an unnamed leading column, which pandas exposes as
# `Unnamed: 0` (visible in the head() previews). It is kept here because later cells
# and the pickled outputs may rely on the current schema; consider `index_col=0` if not.
df_trx_test = pd.read_csv('data/transactions_test.csv')
df_trx_train = pd.read_csv('data/transactions_train.csv')

In [4]:
# Load the target table: one `bins` label per `client_id` (see the preview of df_target).
# NOTE(review): like the transaction files, this one also yields an `Unnamed: 0` column.
df_target = pd.read_csv('data/train_target.csv')

In [5]:
df_trx_test.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,46445,3,0,19.555
1,46445,3,1,27.774
2,46445,4,0,18.114
3,46445,4,1,22.183
4,46445,5,2,45.795


In [6]:
df_trx_train.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [7]:
# Sanity check: number of distinct clients in the train and test transaction files.
df_trx_train['client_id'].unique().size, df_trx_test['client_id'].unique().size

(30000, 20000)

In [8]:
df_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [9]:
def trx_transform(df):
    """Return a copy of the transactions with derived time columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions containing a numeric ``trans_date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus:

        * ``event_time`` -- ``trans_date`` cast to float;
        * ``weekday`` -- ``trans_date % 7``, a 7-day cycle position
          (NOTE(review): not necessarily aligned to a calendar weekday,
          since the origin of ``trans_date`` is not shown here).

    The input frame is not modified, so the function can be called on the
    same raw frame any number of times without side effects.
    """
    # `assign` builds a new frame instead of writing columns into the caller's
    # object; the column order matches appending event_time, then weekday.
    return df.assign(
        event_time=df['trans_date'].astype(float),
        weekday=df['trans_date'] % 7,
    )

# Add the derived columns to both transaction sets so they share one schema
# before the preprocessor is fitted on their concatenation.
df_trx_train = trx_transform(df_trx_train)
df_trx_test = trx_transform(df_trx_test)

In [10]:
df_trx_train.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur,event_time,weekday
0,33172,6,4,71.463,6.0,6
1,33172,6,35,45.017,6.0,6
2,33172,8,11,13.887,8.0,1
3,33172,9,11,15.983,9.0,2
4,33172,10,11,21.341,10.0,3


In [11]:
from ptls.preprocessing import PandasDataPreprocessor


libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [12]:
# Preprocessor shared by every fold (it is fitted once and pickled into each fold directory).
# NOTE(review): the per-argument notes are read off the argument names; confirm the
# exact semantics against the ptls documentation.
pdp = PandasDataPreprocessor(
    # Column identifying the client each transaction belongs to.
    col_id='client_id',
    # The raw `trans_date` day number serves as the event time ...
    col_event_time='trans_date',
    # ... presumably taken as-is, with no datetime conversion -- TODO confirm.
    event_time_transformation='none',
    # Categorical features; `weekday` is the column derived in `trx_transform`.
    cols_category=['small_group', 'weekday'],
    # Presumably a frequency-based category encoding -- TODO confirm.
    category_transformation='frequency',
    cols_numerical=['amount_rur'],
    # The transformed results are handled as DataFrames (column delete, merge) later on.
    return_records=False,
)

In [13]:
# Fit on train and test transactions together so the fitted encodings cover values
# that appear in either file. No target labels are passed to the preprocessor.
# The trailing `;` suppresses the repr of the returned object in the cell output.
pdp.fit(pd.concat([df_trx_train, df_trx_test], axis=0));

In [14]:
# Apply the fitted preprocessor to each transaction set separately.
# The results are used as DataFrames keyed by `client_id` in the following cells
# (apparently one row per client, judging by the row counts printed by the fold split).
df_group_train = pdp.transform(df_trx_train)
df_group_test = pdp.transform(df_trx_test)

In [15]:
# Remove the raw `trans_date` column from the transformed frames
# (presumably redundant once the event time has been produced -- TODO confirm).
# Rebinding with `drop(..., errors='ignore')` instead of `del` keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
df_group_train = df_group_train.drop(columns=['trans_date'], errors='ignore')
df_group_test = df_group_test.drop(columns=['trans_date'], errors='ignore')

In [16]:
from sklearn.model_selection import StratifiedKFold, train_test_split

In [17]:
# Build 5 stratified folds over the labelled clients and write, per fold:
#   df_trx_pretrain.pickle  - 8000 labelled train clients (no target) + half of the test clients
#   df_seq_pretrain.pickle  - remaining labelled train clients (no target) + other half of the test clients
#   df_gbm_train.pickle     - all labelled train clients of the fold, with target columns
#   df_gbm_test.pickle      - held-out labelled clients of the fold, with target columns
#   pdp.pickle              - the fitted preprocessor (identical in every fold)

# Despite the `i_` prefix these two are DataFrames of `client_id`s, not index arrays.
# The 50/50 split of test clients uses a fixed seed and does not depend on the fold,
# so it is computed once here rather than recomputed identically on every iteration.
i_trx_pretrain_uns, i_seq_pretrain_uns = train_test_split(
    df_trx_test[['client_id']].drop_duplicates(), train_size=0.5, shuffle=True, random_state=42)

for fold_i, (i_train, i_test) in enumerate(
    StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(df_target['client_id'], df_target['bins'])):

    # `i_train` / `i_test` are positional indices into df_target.
    # Split the fold's training clients, stratified by target bin:
    # 8000 go to the `trx` pretraining set, the rest to the `seq` pretraining set.
    i_trx_pretrain, i_seq_pretrain = train_test_split(
        i_train, train_size=8000, shuffle=True, random_state=42, stratify=df_target.iloc[i_train]['bins'])

    # Pretraining sets: merge on client ids only, so no target column is attached.
    df_trx_pretrain = pd.concat([
        df_group_train.merge(df_target.iloc[i_trx_pretrain][['client_id']], on='client_id'),
        df_group_test.merge(i_trx_pretrain_uns, on='client_id'),
    ], axis=0)

    df_seq_pretrain = pd.concat([
        df_group_train.merge(df_target.iloc[i_seq_pretrain][['client_id']], on='client_id'),
        df_group_test.merge(i_seq_pretrain_uns, on='client_id'),
    ], axis=0)

    # Supervised sets: merge with the full df_target rows so the target columns come along.
    # The two training parts are concatenated in this order to keep the row order stable.
    df_gbm_train = pd.concat([
        df_group_train.merge(df_target.iloc[i_trx_pretrain], on='client_id'),
        df_group_train.merge(df_target.iloc[i_seq_pretrain], on='client_id'),
    ], axis=0)

    df_gbm_test = df_group_train.merge(df_target.iloc[i_test], on='client_id')

    print(fold_i, [len(df) for df in [df_trx_pretrain, df_seq_pretrain, df_gbm_train, df_gbm_test]])

    # `parents=True` lets the cell succeed even if `data/` itself is missing.
    fold_dir = Path(f'data/fold_{fold_i}/')
    fold_dir.mkdir(parents=True, exist_ok=True)

    df_trx_pretrain.to_pickle(fold_dir / 'df_trx_pretrain.pickle')
    df_seq_pretrain.to_pickle(fold_dir / 'df_seq_pretrain.pickle')
    df_gbm_train.to_pickle(fold_dir / 'df_gbm_train.pickle')
    df_gbm_test.to_pickle(fold_dir / 'df_gbm_test.pickle')

    # Store the fitted preprocessor next to the data it produced.
    with open(fold_dir / 'pdp.pickle', 'wb') as f:
        pickle.dump(pdp, f)

print('Done')

0 [18000, 26000, 24000, 6000]
1 [18000, 26000, 24000, 6000]
2 [18000, 26000, 24000, 6000]
3 [18000, 26000, 24000, 6000]
4 [18000, 26000, 24000, 6000]
Done
