## Setup

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import copy
import pickle
import sys
import os

import pytorch_lightning as pl

import logging

sys.path.insert(0, 'C:\\Users\\peter\\anaconda3\\envs\\cv\\Scripts\\ptls-glove')

from functools import partial
from ptls.nn import RnnSeqEncoder, TrxEncoder
from ptls.nn.trx_encoder.trx_encoders_custom import TrxEncoderGlove, TrxEncoderCat, TrxEncoderTrans
from ptls.nn.trx_encoder import GloveEmbedding
from ptls.preprocessing.baseline_discretizer import KDiscretizer, SingleTreeDiscretizer
from ptls.preprocessing.deeptlf.src import DeepTLF
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule, TestModule
from ptls.frames.coles.metric import BatchRecallTopK
from ptls.preprocessing.deeptlf import DeepTLFDisc

import ptls
import torch
from torch import nn
from ptls.preprocessing import PandasDataPreprocessor
from sklearn.model_selection import train_test_split
from ptls.data_load.datasets import inference_data_loader
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

2025-02-10 13:20:22.768836: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-10 13:20:22.784580: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739182822.803827    9665 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739182822.809530    9665 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 13:20:22.829849: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
def get_df_trx(df_train, df_test):
    COL_EVENT_TIME = 'TRDATETIME'
    df_train = df_train.drop(['PERIOD', 'target_flag', 'target_sum'], axis=1)
    df_test = df_test.drop(['PERIOD'], axis=1)

    df = pd.concat([df_train, df_test], ignore_index=True)

    # event_time mapping
    df['_et_day'] = pd.to_datetime(df[COL_EVENT_TIME].str[:7], format='%d%b%y').astype(np.int64) // 10**9
    df['_et_time'] = (pd.to_datetime(df[COL_EVENT_TIME].str[8:], format='%H:%M:%S').astype(np.int64) // 10**9)

    df['event_time'] = df['_et_day'] + df['_et_time']
    df = df.drop(['_et_day', '_et_time', COL_EVENT_TIME], axis=1)

    df.columns = df.columns.str.lower()
    df['amount'] = df['amount'].astype(float)

    return df

def get_df_target(df_train):
    df_target = df_train.groupby("cl_id")["target_flag"].first().astype({"target_flag": 'int'})

    return pd.DataFrame(df_target)

def split_target(df):
    df['seq_len'] =  np.ceil(np.log(df['seq_len'] / np.log(4))).astype('string')
    df['hash'] = (pd.Series(df.index.astype(str) + "42").apply(hash) / (2 ** 32) + 0.5).values
    df['fold_id'] = df.groupby(["target_flag", 'seq_len'])['hash'].rank(method='first', ascending=True) % 6
    
    df = df.drop(columns = ['hash', 'seq_len'])
    return df

def split_fold(fold_id, df_target, df_trx):
    folds_path = Path(conf_pp.folds_path)
    preproc = PysparkDataPreprocessor(
        col_id=conf_pp.col_client_id,
        col_event_time='event_time',
        event_time_transformation='none',
        cols_category=["mcc", "channel_type", "currency", "trx_category"],
        cols_numerical=['amount'],
        cols_last_item=[conf_pp.col_target],
    )
    df_train_trx = df_trx.merge(df_target, how='left', on=conf_pp.col_client_id).loc[lambda x: x['fold_id'].fillna(-1) != fold_id].drop('fold_id', axis=1)
    df_test_trx = df_trx.merge(df_target.loc[lambda x: x['fold_id'] == fold_id].drop('fold_id', axis=1), how='inner', on=conf_pp.col_client_id)
    df_train_data = preproc.fit_transform(df_train_trx)
    df_test_data = preproc.transform(df_test_trx)
    file_name_train = get_file_name_train(fold_id)
    file_name_test = get_file_name_test(fold_id)
    df_train_data.to_parquet(folds_path / file_name_train, mode='overwrite')
    df_test_data.to_parquet(folds_path / file_name_test, mode='overwrite')
    with open(folds_path / f'preproc_{fold_id}.p', 'wb') as f:
        pickle.dump(preproc, f)
    logger.info(f'Preprocessor[{fold_id}].category_dictionary_sizes={preproc.get_category_dictionary_sizes()}')

    for df_name in [file_name_train, file_name_test]:
        df = pd.read_parquet(folds_path / df_name)
        logger.info(f'{df_name:30} {df.shape}')

    for df_name in [file_name_train, file_name_test]:
        df = pd.read_parquet(folds_path / df_name)
        r_counts = df[conf_pp.col_target].value_counts(dropna=False).to_dict()
        cnt_str = ', '.join([f'{r_counts.get(0, 0):5d} - 0 class', f'{r_counts.get(1, 0):5d} - 1 class'])
        logger.info(f'{df_name:30} {cnt_str}')

In [2]:
import os
import pandas as pd

data_path = 'data/rosbank'

train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))

FileNotFoundError: [Errno 2] No such file or directory: 'data/rosbank/train.csv'

In [2]:
df_trx = get_df_trx(train, test)
df_trx = df_trx.fillna("unknown_type")
df_target = get_df_target(train)

NameError: name 'get_df_trx' is not defined

In [6]:
df_trx = df_trx.sort_values(by=["cl_id", "event_time"], ascending=True)

In [7]:
source_data = df_trx.rename({"cl_id" : "client_id", "event_time" : "tr_datetime"})

In [8]:
source_data.rename(columns = {"cl_id" : "client_id", "event_time" : "tr_datatime"}, inplace=True)

In [9]:
source_data

Unnamed: 0,client_id,mcc,channel_type,currency,amount,trx_category,tr_datatime
1,0,6011,unknown_type,810,20000.00,DEPOSIT,-701177753
0,0,5200,unknown_type,810,5023.00,POS,-700444800
3,0,5411,unknown_type,810,2031.00,POS,-700444800
4,0,6012,unknown_type,810,36562.00,C2C_OUT,-700137936
2,0,5921,unknown_type,810,767.00,POS,-696556800
...,...,...,...,...,...,...,...
1008299,10216,5411,type1,810,132.13,POS,-712886400
1008293,10216,5691,type1,810,2090.00,POS,-712540800
1008296,10216,5411,type1,810,1259.31,POS,-712540800
1008285,10216,5691,type1,810,1185.00,POS,-712454400


In [5]:
data_path = 'data/kaggle_nir_card_trans'

data = pd.read_csv(os.path.join(data_path, 'credit_card_transactions.csv'))

## Data preproccessing

In [2]:
def prepare_data_age_bins_scenario():
    data_path = '../data/age_bins'

    source_data = pd.read_csv(os.path.join(data_path, 'transactions_train.csv'))

    df_params = {
        "numeric_cols" : ["amount_rur"],
        "cat_cols" : ["small_group"],
        "date_col" : "trans_date",
        "cat_unique" : [],
        "id_col" : "client_id"
    }

    for f in df_params["cat_cols"] + [df_params["date_col"]]:
        df_params["cat_unique"].append(source_data[f].unique().shape[0])

    return source_data, df_params

def prepare_data_gender_scenario():
    data_path = '../data/gender'

    source_data = pd.read_csv(os.path.join(data_path, 'transactions.csv'))
    source_data = source_data.drop(columns=["term_id"]).rename(columns={'customer_id' : 'client_id'})

    source_data.tr_datetime = [int(i.split()[0]) for i in source_data.tr_datetime.values]

    df_params = {
        "numeric_cols" : ["amount"],
        "cat_cols" : ["mcc_code", "tr_type"],
        "cat_unique" : [],
        "date_col" : "tr_datetime",
        "id_col" : "client_id"
    }

    for f in df_params["cat_cols"] + [df_params["date_col"]]:
        df_params["cat_unique"].append(source_data[f].unique().shape[0])
    
    return source_data, df_params

In [3]:
#gender
TARGET_NAME = 'gender'
source_data, df_params = prepare_data_gender_scenario()
#targets = pd.read_csv(os.path.join('../data/gender', 'gender_train.csv')).drop(columns=['Unnamed: 0'])
targets = pd.read_csv(os.path.join('../data/gender', 'gender_train.csv')).rename(columns={'customer_id' : 'client_id'})
targets = source_data[['client_id']].drop_duplicates().merge(targets, on='client_id', how='left').fillna(2)
source_data.head(2)

Unnamed: 0,client_id,tr_datetime,mcc_code,tr_type,amount
0,39026145,0,4814,1030,-2245.92
1,39026145,1,6011,7010,56147.89


In [None]:
TARGET_NAME = 'bins'
source_data, df_params = prepare_data_age_bins_scenario()
targets = pd.read_csv(os.path.join('../data/age_bins', 'train_target.csv'))
source_data.head(2)

In [30]:
source_data.shape, targets.shape

((26450577, 4), (30000, 2))

In [4]:
EMBED_SIZE = 16

df_params = {'numeric_cols': ['amount'],
             'cat_cols': ['mcc_code', 'tr_type'],
             'cat_unique': [184, 77, 457],
             'date_col': 'tr_datetime',
             'id_col': 'client_id'}

#best discretizers

disc_quant100 = KDiscretizer(
    f_names = df_params['numeric_cols'],
    k_bins = 100,
    d_type = 'quantile'
)

deeptlf_params9 = DeepTLFDisc({"n_est" : 9,
                  "min_freq" : 2,
                  "features" : df_params['numeric_cols'] + df_params['cat_cols'],
                  "features_to_split" : df_params['numeric_cols'],
                  })

disc_st20 = SingleTreeDiscretizer(
    f_names = df_params['numeric_cols'], 
    target_name = TARGET_NAME, 
    target_type = 'classification', 
    k_bins = [20],
)

glove_config1= {"alpha" : 0.75, 
                "x_max" : 100, 
                "embedding_size" : EMBED_SIZE, 
                "num_epochs_train" : 50}

glove_config2= {"alpha" : 0.75, 
                "x_max" : 15000, 
                "embedding_size" : EMBED_SIZE, 
                "num_epochs_train" : 50}

glove_config3= {"alpha" : 1.0, 
                "x_max" : 20000, 
                "embedding_size" : EMBED_SIZE, 
                "num_epochs_train" : 50}

experiments = [
    {
        'name' : 'glove_emb_cfg2_no_num_gender',
        'glove_config' : copy.deepcopy(glove_config2),
        'disc' : None
    },
    {
        'name' : 'glove_emb_cfg1_ST20_gender',
        'glove_config' : copy.deepcopy(glove_config1),
        'disc' : copy.deepcopy(disc_st20)
    },
    {
        'name' : 'glove_emb_cfg2_quant100_gender',
        'glove_config' : copy.deepcopy(glove_config2),
        'disc' : copy.deepcopy(disc_quant100)
    },
    {
        'name' : 'glove_emb_cfg2_deeptlf9_gender',
        'glove_config' : copy.deepcopy(glove_config2),
        'disc' : copy.deepcopy(deeptlf_params9)
    },
]

In [8]:
for exp in experiments:
    print(exp['name'])
    data = copy.deepcopy(source_data)
    disc = exp['disc']
    if disc is not None:
        if "ST" not in exp['name']:
            disc.fit(data)
            data = disc.transform(data, to_embeds=False)
        else:
            disc.fit(data.sample(int(1e+5), random_state=42).merge(targets, on=df_params['id_col'], how='left'))
            data = disc.transform(data, to_embeds=False)
        print(f"{exp['name']}: DATA DISCRETIZED")
        
        embedded_feats = df_params['numeric_cols'] + df_params['cat_cols'] + [df_params["date_col"]]
    else:
        embedded_feats = df_params['cat_cols'] + [df_params["date_col"]]
        
    print(embedded_feats)
    glove_embedding = GloveEmbedding(
            feature_names=embedded_feats,
            calculate_cooccur=True,
            embedding_folder=f'../glove_embeddings/{exp["name"]}',
            glove_params=exp['glove_config']
        )
    glove_embedding.fit(data)
    print(f"{exp['name']}: GLOVE EMB CALCULATED")

glove_emb_cfg1_ST20_gender


amount:   0%|          | 0/15 [00:00<?, ?it/s]

glove_emb_cfg1_ST20_gender: DATA DISCRETIZED
['amount', 'mcc_code', 'tr_type', 'tr_datetime']
train started


  2%|█▋                                                                                 | 1/50 [00:03<02:37,  3.21s/it]

Epoch 0: loss = 22958.36617102751


  4%|███▎                                                                               | 2/50 [00:06<02:33,  3.19s/it]

Epoch 1: loss = 10523.992249534625


  6%|████▉                                                                              | 3/50 [00:09<02:29,  3.18s/it]

Epoch 2: loss = 6326.358097757712


  8%|██████▋                                                                            | 4/50 [00:12<02:26,  3.17s/it]

Epoch 3: loss = 4155.217152508508


 10%|████████▎                                                                          | 5/50 [00:15<02:23,  3.20s/it]

Epoch 4: loss = 2911.1233751609507


 12%|█████████▉                                                                         | 6/50 [00:19<02:20,  3.19s/it]

Epoch 5: loss = 2151.796983737031


 14%|███████████▌                                                                       | 7/50 [00:22<02:17,  3.19s/it]

Epoch 6: loss = 1662.06335705988


 16%|█████████████▎                                                                     | 8/50 [00:25<02:13,  3.19s/it]

Epoch 7: loss = 1331.679476435582


 18%|██████████████▉                                                                    | 9/50 [00:28<02:10,  3.18s/it]

Epoch 8: loss = 1099.344674281207


 20%|████████████████▍                                                                 | 10/50 [00:31<02:06,  3.17s/it]

Epoch 9: loss = 930.5155555404872


 22%|██████████████████                                                                | 11/50 [00:35<02:03,  3.18s/it]

Epoch 10: loss = 804.3756171798061


 24%|███████████████████▋                                                              | 12/50 [00:38<02:00,  3.18s/it]

Epoch 11: loss = 707.6770620252612


 26%|█████████████████████▎                                                            | 13/50 [00:41<01:57,  3.18s/it]

Epoch 12: loss = 632.0498216359333


 28%|██████████████████████▉                                                           | 14/50 [00:44<01:55,  3.21s/it]

Epoch 13: loss = 571.7643885448082


 30%|████████████████████████▌                                                         | 15/50 [00:47<01:52,  3.22s/it]

Epoch 14: loss = 522.9468107612936


 32%|██████████████████████████▏                                                       | 16/50 [00:51<01:48,  3.20s/it]

Epoch 15: loss = 482.76049719272015


 34%|███████████████████████████▉                                                      | 17/50 [00:54<01:45,  3.19s/it]

Epoch 16: loss = 449.2795993789903


 36%|█████████████████████████████▌                                                    | 18/50 [00:57<01:41,  3.19s/it]

Epoch 17: loss = 420.96700789220654


 38%|███████████████████████████████▏                                                  | 19/50 [01:00<01:38,  3.19s/it]

Epoch 18: loss = 396.84408070957585


 40%|████████████████████████████████▊                                                 | 20/50 [01:03<01:35,  3.18s/it]

Epoch 19: loss = 376.0810565249416


 42%|██████████████████████████████████▍                                               | 21/50 [01:06<01:32,  3.17s/it]

Epoch 20: loss = 357.9397175855667


 44%|████████████████████████████████████                                              | 22/50 [01:10<01:29,  3.18s/it]

Epoch 21: loss = 342.06457926039604


 46%|█████████████████████████████████████▋                                            | 23/50 [01:13<01:25,  3.18s/it]

Epoch 22: loss = 328.0447257811901


 48%|███████████████████████████████████████▎                                          | 24/50 [01:16<01:23,  3.20s/it]

Epoch 23: loss = 315.59777687199636


 50%|█████████████████████████████████████████                                         | 25/50 [01:19<01:20,  3.20s/it]

Epoch 24: loss = 304.4683704580725


 52%|██████████████████████████████████████████▋                                       | 26/50 [01:22<01:16,  3.19s/it]

Epoch 25: loss = 294.51397111372734


 54%|████████████████████████████████████████████▎                                     | 27/50 [01:26<01:13,  3.18s/it]

Epoch 26: loss = 285.3670911847229


 56%|█████████████████████████████████████████████▉                                    | 28/50 [01:29<01:10,  3.19s/it]

Epoch 27: loss = 277.10890855861186


 58%|███████████████████████████████████████████████▌                                  | 29/50 [01:32<01:06,  3.18s/it]

Epoch 28: loss = 269.6145964197185


 60%|█████████████████████████████████████████████████▏                                | 30/50 [01:35<01:03,  3.18s/it]

Epoch 29: loss = 262.731207288144


 62%|██████████████████████████████████████████████████▊                               | 31/50 [01:38<01:00,  3.20s/it]

Epoch 30: loss = 256.4084936716876


 64%|████████████████████████████████████████████████████▍                             | 32/50 [01:42<00:57,  3.20s/it]

Epoch 31: loss = 250.5793704160927


 66%|██████████████████████████████████████████████████████                            | 33/50 [01:45<00:54,  3.20s/it]

Epoch 32: loss = 245.20702105860875


 68%|███████████████████████████████████████████████████████▊                          | 34/50 [01:48<00:51,  3.21s/it]

Epoch 33: loss = 240.23507430795718


 70%|█████████████████████████████████████████████████████████▍                        | 35/50 [01:51<00:48,  3.21s/it]

Epoch 34: loss = 235.52437218793486


 72%|███████████████████████████████████████████████████████████                       | 36/50 [01:54<00:44,  3.20s/it]

Epoch 35: loss = 231.16890927956098


 74%|████████████████████████████████████████████████████████████▋                     | 37/50 [01:58<00:41,  3.19s/it]

Epoch 36: loss = 227.1286360787513


 76%|██████████████████████████████████████████████████████████████▎                   | 38/50 [02:01<00:38,  3.19s/it]

Epoch 37: loss = 223.3137454245851


 78%|███████████████████████████████████████████████████████████████▉                  | 39/50 [02:04<00:35,  3.21s/it]

Epoch 38: loss = 219.72741762142638


 80%|█████████████████████████████████████████████████████████████████▌                | 40/50 [02:07<00:32,  3.20s/it]

Epoch 39: loss = 216.40419161298777


 82%|███████████████████████████████████████████████████████████████████▏              | 41/50 [02:10<00:28,  3.21s/it]

Epoch 40: loss = 213.20171208749252


 84%|████████████████████████████████████████████████████████████████████▉             | 42/50 [02:14<00:25,  3.19s/it]

Epoch 41: loss = 210.2063249060743


 86%|██████████████████████████████████████████████████████████████████████▌           | 43/50 [02:17<00:22,  3.19s/it]

Epoch 42: loss = 207.38154598091728


 88%|████████████████████████████████████████████████████████████████████████▏         | 44/50 [02:20<00:19,  3.20s/it]

Epoch 43: loss = 204.70033109617006


 90%|█████████████████████████████████████████████████████████████████████████▊        | 45/50 [02:23<00:15,  3.19s/it]

Epoch 44: loss = 202.14609255922528


 92%|███████████████████████████████████████████████████████████████████████████▍      | 46/50 [02:26<00:12,  3.19s/it]

Epoch 45: loss = 199.7573057390693


 94%|█████████████████████████████████████████████████████████████████████████████     | 47/50 [02:30<00:09,  3.20s/it]

Epoch 46: loss = 197.4514770673403


 96%|██████████████████████████████████████████████████████████████████████████████▋   | 48/50 [02:33<00:06,  3.19s/it]

Epoch 47: loss = 195.2645889965052


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 49/50 [02:36<00:03,  3.19s/it]

Epoch 48: loss = 193.17191697960183


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [02:39<00:00,  3.19s/it]

Epoch 49: loss = 191.1738770240357
glove_emb_cfg1_ST20_gender: GLOVE EMB CALCULATED
glove_emb_cfg2_quant100_gender





glove_emb_cfg2_quant100_gender: DATA DISCRETIZED
['amount', 'mcc_code', 'tr_type', 'tr_datetime']
train started


  2%|█▋                                                                                 | 1/50 [00:05<04:16,  5.24s/it]

Epoch 0: loss = 4988.602693868986


  4%|███▎                                                                               | 2/50 [00:10<04:05,  5.12s/it]

Epoch 1: loss = 3002.113253057503


  6%|████▉                                                                              | 3/50 [00:15<04:00,  5.11s/it]

Epoch 2: loss = 2191.828488380358


  8%|██████▋                                                                            | 4/50 [00:20<03:54,  5.09s/it]

Epoch 3: loss = 1681.6202367533415


 10%|████████▎                                                                          | 5/50 [00:25<03:49,  5.11s/it]

Epoch 4: loss = 1328.1425988794183


 12%|█████████▉                                                                         | 6/50 [00:30<03:44,  5.09s/it]

Epoch 5: loss = 1071.3452951271925


 14%|███████████▌                                                                       | 7/50 [00:35<03:39,  5.10s/it]

Epoch 6: loss = 879.8085724224208


 16%|█████████████▎                                                                     | 8/50 [00:40<03:34,  5.10s/it]

Epoch 7: loss = 733.9020935287026


 18%|██████████████▉                                                                    | 9/50 [00:45<03:29,  5.10s/it]

Epoch 8: loss = 620.791485133929


 20%|████████████████▍                                                                 | 10/50 [00:51<03:23,  5.10s/it]

Epoch 9: loss = 531.9559749750656


 22%|██████████████████                                                                | 11/50 [00:56<03:19,  5.11s/it]

Epoch 10: loss = 461.22405230351615


 24%|███████████████████▋                                                              | 12/50 [01:01<03:14,  5.12s/it]

Epoch 11: loss = 404.292967590949


 26%|█████████████████████▎                                                            | 13/50 [01:06<03:09,  5.12s/it]

Epoch 12: loss = 357.96212009985834


 28%|██████████████████████▉                                                           | 14/50 [01:11<03:03,  5.10s/it]

Epoch 13: loss = 319.87098032515894


 30%|████████████████████████▌                                                         | 15/50 [01:16<02:59,  5.12s/it]

Epoch 14: loss = 288.2350899832037


 32%|██████████████████████████▏                                                       | 16/50 [01:21<02:53,  5.11s/it]

Epoch 15: loss = 261.7261420605278


 34%|███████████████████████████▉                                                      | 17/50 [01:26<02:48,  5.11s/it]

Epoch 16: loss = 239.30395875145513


 36%|█████████████████████████████▌                                                    | 18/50 [01:31<02:43,  5.10s/it]

Epoch 17: loss = 220.18571168009387


 38%|███████████████████████████████▏                                                  | 19/50 [01:37<02:38,  5.12s/it]

Epoch 18: loss = 203.7410010006235


 40%|████████████████████████████████▊                                                 | 20/50 [01:42<02:33,  5.11s/it]

Epoch 19: loss = 189.49613678167455


 42%|██████████████████████████████████▍                                               | 21/50 [01:47<02:28,  5.12s/it]

Epoch 20: loss = 177.03796832403268


 44%|████████████████████████████████████                                              | 22/50 [01:52<02:23,  5.11s/it]

Epoch 21: loss = 166.0883758602466


 46%|█████████████████████████████████████▋                                            | 23/50 [01:57<02:18,  5.12s/it]

Epoch 22: loss = 156.39864032782856


 48%|███████████████████████████████████████▎                                          | 24/50 [02:02<02:12,  5.12s/it]

Epoch 23: loss = 147.77516266025614


 50%|█████████████████████████████████████████                                         | 25/50 [02:07<02:08,  5.13s/it]

Epoch 24: loss = 140.0465795553


 52%|██████████████████████████████████████████▋                                       | 26/50 [02:12<02:02,  5.12s/it]

Epoch 25: loss = 133.08524580761437


 54%|████████████████████████████████████████████▎                                     | 27/50 [02:18<01:58,  5.13s/it]

Epoch 26: loss = 126.78990073601096


 56%|█████████████████████████████████████████████▉                                    | 28/50 [02:23<01:52,  5.12s/it]

Epoch 27: loss = 121.05337987605486


 58%|███████████████████████████████████████████████▌                                  | 29/50 [02:28<01:47,  5.13s/it]

Epoch 28: loss = 115.8232476679292


 60%|█████████████████████████████████████████████████▏                                | 30/50 [02:33<01:42,  5.12s/it]

Epoch 29: loss = 111.0273441995998


 62%|██████████████████████████████████████████████████▊                               | 31/50 [02:38<01:37,  5.13s/it]

Epoch 30: loss = 106.61023585554418


 64%|████████████████████████████████████████████████████▍                             | 32/50 [02:43<01:32,  5.13s/it]

Epoch 31: loss = 102.53386953644939


 66%|██████████████████████████████████████████████████████                            | 33/50 [02:48<01:27,  5.14s/it]

Epoch 32: loss = 98.75930086718571


 68%|███████████████████████████████████████████████████████▊                          | 34/50 [02:54<01:22,  5.13s/it]

Epoch 33: loss = 95.25660058692775


 70%|█████████████████████████████████████████████████████████▍                        | 35/50 [02:59<01:17,  5.15s/it]

Epoch 34: loss = 91.99160826118758


 72%|███████████████████████████████████████████████████████████                       | 36/50 [03:04<01:11,  5.13s/it]

Epoch 35: loss = 88.94017296336227


 74%|████████████████████████████████████████████████████████████▋                     | 37/50 [03:09<01:06,  5.14s/it]

Epoch 36: loss = 86.08750939302861


 76%|██████████████████████████████████████████████████████████████▎                   | 38/50 [03:14<01:01,  5.12s/it]

Epoch 37: loss = 83.41477473246844


 78%|███████████████████████████████████████████████████████████████▉                  | 39/50 [03:19<00:56,  5.14s/it]

Epoch 38: loss = 80.90776361238458


 80%|█████████████████████████████████████████████████████████████████▌                | 40/50 [03:24<00:51,  5.16s/it]

Epoch 39: loss = 78.53480202365606


 82%|███████████████████████████████████████████████████████████████████▏              | 41/50 [03:30<00:47,  5.26s/it]

Epoch 40: loss = 76.30391139261158


 84%|████████████████████████████████████████████████████████████████████▉             | 42/50 [03:35<00:42,  5.31s/it]

Epoch 41: loss = 74.19668775207836


 86%|██████████████████████████████████████████████████████████████████████▌           | 43/50 [03:41<00:37,  5.35s/it]

Epoch 42: loss = 72.20612779252674


 88%|████████████████████████████████████████████████████████████████████████▏         | 44/50 [03:46<00:32,  5.37s/it]

Epoch 43: loss = 70.31483757896109


 90%|█████████████████████████████████████████████████████████████████████████▊        | 45/50 [03:52<00:27,  5.40s/it]

Epoch 44: loss = 68.52550300586952


 92%|███████████████████████████████████████████████████████████████████████████▍      | 46/50 [03:57<00:21,  5.40s/it]

Epoch 45: loss = 66.82276656628916


 94%|█████████████████████████████████████████████████████████████████████████████     | 47/50 [04:02<00:16,  5.41s/it]

Epoch 46: loss = 65.20931358399126


 96%|██████████████████████████████████████████████████████████████████████████████▋   | 48/50 [04:08<00:10,  5.41s/it]

Epoch 47: loss = 63.66732880364436


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 49/50 [04:13<00:05,  5.42s/it]

Epoch 48: loss = 62.19962322595992


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:19<00:00,  5.19s/it]

Epoch 49: loss = 60.798676678265366
glove_emb_cfg2_quant100_gender: GLOVE EMB CALCULATED
glove_emb_cfg2_deeptlf9_gender





Counter({'mcc_code': 150, 'tr_type': 143, 'amount': 64})
glove_emb_cfg2_deeptlf9_gender: DATA DISCRETIZED
['amount', 'mcc_code', 'tr_type', 'tr_datetime']
train started


  2%|█▋                                                                                 | 1/50 [00:03<02:44,  3.35s/it]

Epoch 0: loss = 3805.5229135961567


  4%|███▎                                                                               | 2/50 [00:06<02:39,  3.33s/it]

Epoch 1: loss = 2070.0608300871563


  6%|████▉                                                                              | 3/50 [00:10<02:36,  3.34s/it]

Epoch 2: loss = 1394.059788785447


  8%|██████▋                                                                            | 4/50 [00:13<02:32,  3.31s/it]

Epoch 3: loss = 995.9911619094213


 10%|████████▎                                                                          | 5/50 [00:16<02:28,  3.30s/it]

Epoch 4: loss = 743.4208096079205


 12%|█████████▉                                                                         | 6/50 [00:19<02:25,  3.31s/it]

Epoch 5: loss = 575.7754895339234


 14%|███████████▌                                                                       | 7/50 [00:23<02:21,  3.30s/it]

Epoch 6: loss = 457.03808982283346


 16%|█████████████▎                                                                     | 8/50 [00:26<02:18,  3.30s/it]

Epoch 7: loss = 373.14108765893184


 18%|██████████████▉                                                                    | 9/50 [00:29<02:15,  3.30s/it]

Epoch 8: loss = 311.14780113005725


 20%|████████████████▍                                                                 | 10/50 [00:33<02:11,  3.29s/it]

Epoch 9: loss = 264.19424469428907


 22%|██████████████████                                                                | 11/50 [00:36<02:15,  3.47s/it]

Epoch 10: loss = 227.91685134512934


 24%|███████████████████▋                                                              | 12/50 [00:40<02:09,  3.42s/it]

Epoch 11: loss = 199.2736963011205


 26%|█████████████████████▎                                                            | 13/50 [00:43<02:06,  3.43s/it]

Epoch 12: loss = 176.3528754936025


 28%|██████████████████████▉                                                           | 14/50 [00:47<02:04,  3.45s/it]

Epoch 13: loss = 157.67765002853872


 30%|████████████████████████▌                                                         | 15/50 [00:50<02:01,  3.47s/it]

Epoch 14: loss = 142.33947260546464


 32%|██████████████████████████▏                                                       | 16/50 [00:54<01:58,  3.47s/it]

Epoch 15: loss = 129.5610024880736


 34%|███████████████████████████▉                                                      | 17/50 [00:57<01:55,  3.49s/it]

Epoch 16: loss = 118.80312965296947


 36%|█████████████████████████████▌                                                    | 18/50 [01:01<01:51,  3.50s/it]

Epoch 17: loss = 109.60810828377537


 38%|███████████████████████████████▏                                                  | 19/50 [01:04<01:48,  3.51s/it]

Epoch 18: loss = 101.72626528011403


 40%|████████████████████████████████▊                                                 | 20/50 [01:08<01:45,  3.53s/it]

Epoch 19: loss = 94.909748976637


 42%|██████████████████████████████████▍                                               | 21/50 [01:11<01:43,  3.57s/it]

Epoch 20: loss = 88.95940377753588


 44%|████████████████████████████████████                                              | 22/50 [01:15<01:39,  3.55s/it]

Epoch 21: loss = 83.682228362428


 46%|█████████████████████████████████████▋                                            | 23/50 [01:18<01:35,  3.53s/it]

Epoch 22: loss = 79.07220952006809


 48%|███████████████████████████████████████▎                                          | 24/50 [01:22<01:31,  3.53s/it]

Epoch 23: loss = 74.88029885659478


 50%|█████████████████████████████████████████                                         | 25/50 [01:26<01:28,  3.53s/it]

Epoch 24: loss = 71.1572345026142


 52%|██████████████████████████████████████████▋                                       | 26/50 [01:29<01:24,  3.52s/it]

Epoch 25: loss = 67.79030240418481


 54%|████████████████████████████████████████████▎                                     | 27/50 [01:33<01:20,  3.51s/it]

Epoch 26: loss = 64.76422863921741


 56%|█████████████████████████████████████████████▉                                    | 28/50 [01:36<01:17,  3.51s/it]

Epoch 27: loss = 61.978349947781524


 58%|███████████████████████████████████████████████▌                                  | 29/50 [01:40<01:13,  3.50s/it]

Epoch 28: loss = 59.408912517560736


 60%|█████████████████████████████████████████████████▏                                | 30/50 [01:43<01:10,  3.52s/it]

Epoch 29: loss = 57.066086564771865


 62%|██████████████████████████████████████████████████▊                               | 31/50 [01:47<01:06,  3.52s/it]

Epoch 30: loss = 54.92260112644076


 64%|████████████████████████████████████████████████████▍                             | 32/50 [01:50<01:03,  3.50s/it]

Epoch 31: loss = 52.918234829356926


 66%|██████████████████████████████████████████████████████                            | 33/50 [01:54<00:59,  3.50s/it]

Epoch 32: loss = 51.069981166306356


 68%|███████████████████████████████████████████████████████▊                          | 34/50 [01:57<00:56,  3.50s/it]

Epoch 33: loss = 49.37788416193112


 70%|█████████████████████████████████████████████████████████▍                        | 35/50 [02:01<00:52,  3.50s/it]

Epoch 34: loss = 47.743530474292456


 72%|███████████████████████████████████████████████████████████                       | 36/50 [02:04<00:48,  3.50s/it]

Epoch 35: loss = 46.23500146924682


 74%|████████████████████████████████████████████████████████████▋                     | 37/50 [02:08<00:46,  3.55s/it]

Epoch 36: loss = 44.85954385084393


 76%|██████████████████████████████████████████████████████████████▎                   | 38/50 [02:11<00:42,  3.55s/it]

Epoch 37: loss = 43.50157410035174


 78%|███████████████████████████████████████████████████████████████▉                  | 39/50 [02:15<00:38,  3.53s/it]

Epoch 38: loss = 42.247939066506135


 80%|█████████████████████████████████████████████████████████████████▌                | 40/50 [02:18<00:35,  3.52s/it]

Epoch 39: loss = 41.07341047558742


 82%|███████████████████████████████████████████████████████████████████▏              | 41/50 [02:22<00:31,  3.51s/it]

Epoch 40: loss = 39.96463606713913


 84%|████████████████████████████████████████████████████████████████████▉             | 42/50 [02:25<00:28,  3.52s/it]

Epoch 41: loss = 38.915193159406456


 86%|██████████████████████████████████████████████████████████████████████▌           | 43/50 [02:29<00:24,  3.51s/it]

Epoch 42: loss = 37.91654958247047


 88%|████████████████████████████████████████████████████████████████████████▏         | 44/50 [02:32<00:21,  3.51s/it]

Epoch 43: loss = 36.966020939030926


 90%|█████████████████████████████████████████████████████████████████████████▊        | 45/50 [02:36<00:17,  3.52s/it]

Epoch 44: loss = 36.081807552991314


 92%|███████████████████████████████████████████████████████████████████████████▍      | 46/50 [02:39<00:14,  3.51s/it]

Epoch 45: loss = 35.210645702852766


 94%|█████████████████████████████████████████████████████████████████████████████     | 47/50 [02:43<00:10,  3.52s/it]

Epoch 46: loss = 34.4003160272037


 96%|██████████████████████████████████████████████████████████████████████████████▋   | 48/50 [02:46<00:07,  3.51s/it]

Epoch 47: loss = 33.61998963153888


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 49/50 [02:50<00:03,  3.51s/it]

Epoch 48: loss = 32.88045614265761


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [02:53<00:00,  3.48s/it]

Epoch 49: loss = 32.1688981010407
glove_emb_cfg2_deeptlf9_gender: GLOVE EMB CALCULATED





In [15]:
glove_embedding.load()

In [9]:
data = glove_embedding.tokenize_data(data)

In [4]:
EMBED_SIZE = 16

df_params = {'numeric_cols': ['amount'],
             'cat_cols': ['mcc_code', 'tr_type'],
             'cat_unique': [184, 77, 457],
             'date_col': 'tr_datetime',
             'id_col': 'client_id'
    }

disc_quant30 = KDiscretizer(
    f_names = df_params['numeric_cols'],
    k_bins = 30,
    d_type = 'quantile',
    emb_sz = EMBED_SIZE
)

disc_quant100 = KDiscretizer(
    f_names = df_params['numeric_cols'],
    k_bins = 100,
    d_type = 'quantile'
)

disc_st20 = SingleTreeDiscretizer(
    f_names = df_params['numeric_cols'], 
    target_name = TARGET_NAME, 
    target_type = 'classification', 
    k_bins = [20],
)

disc_st30 = SingleTreeDiscretizer(
    f_names = df_params['numeric_cols'], 
    target_name = TARGET_NAME, 
    target_type = 'classification', 
    k_bins = [30],
    emb_sz = EMBED_SIZE
)



deeptlf_params6_ne = DeepTLFDisc({"n_est" : 6,
                  "min_freq" : 2,
                  "features" : df_params['numeric_cols'] + df_params['cat_cols'],
                  "features_to_split" : df_params['numeric_cols'],
                  "emb_size" : EMBED_SIZE})

deeptlf_params9 = DeepTLFDisc({"n_est" : 9,
                  "min_freq" : 2,
                  "features" : df_params['numeric_cols'] + df_params['cat_cols'],
                  "features_to_split" : df_params['numeric_cols'],
                  })


experiments = [
     {'name' : 'st_num_emb_dist_common_emb_sum', 
     'agg_type' : 'sum', 
     'disc' : copy.deepcopy(disc_st30),
     'nemb' : True
    },
    {'name' : 'quant_num_emb_dist_common_emb_cat', 
     'agg_type' : 'cat', 
     'disc' : copy.deepcopy(disc_quant30),
     'nemb' : True
    },
    {'name' : 'st_num_emb_dist_common_emb_mean', 
     'agg_type' : 'mean', 
     'disc' : copy.deepcopy(disc_st30),
     'nemb' : True
    },
    {'name' : 'quant_100_dist_common_emb_cat', 
     'agg_type' : 'cat', 
     'disc' : copy.deepcopy(disc_quant100),
     'nemb' : False
    },
    {'name' : 'quant_100_dist_common_emb_sum', 
     'agg_type' : 'sum', 
     'disc' : copy.deepcopy(disc_quant100),
     'nemb' : False
    },
    {'name' : 'quant_100_dist_common_emb_mean', 
     'agg_type' : 'mean', 
     'disc' : copy.deepcopy(disc_quant100),
     'nemb' : False
    },
    {'name' : 'st_20_dist_common_emb_cat', 
     'agg_type' : 'cat', 
     'disc' : copy.deepcopy(disc_st20),
     'nemb' : False
    },
    {'name' : 'st_20_dist_common_emb_sum', 
     'agg_type' : 'sum', 
     'disc' : copy.deepcopy(disc_st20),
     'nemb' : False
    },
    {'name' : 'st_20_dist_common_emb_mean', 
     'agg_type' : 'mean', 
     'disc' : copy.deepcopy(disc_st20),
     'nemb' : False
    },
     {'name' : 'deeptlf9_disc_common_emb_cat',
     'agg_type' : 'cat',
     'disc' : copy.deepcopy(deeptlf_params9),
     'nemb' : False
    },
    {'name' : 'deeptlf9_disc_common_emb_sum',
     'agg_type' : 'sum',
     'disc' : copy.deepcopy(deeptlf_params9),
     'nemb' : False
    },
    {'name' : 'deeptlf9_disc_common_emb_mean',
     'agg_type' : 'mean',
     'disc' : copy.deepcopy(deeptlf_params9),
     'nemb' : False
    },
    {'name' : 'deeptlf6_num_emb_disc_common_emb_cat',
     'agg_type' : 'cat',
     'disc' : copy.deepcopy(deeptlf_params6_ne),
     'nemb' : True
    },
    {'name' : 'deeptlf6_num_emb_disc_common_emb_sum',
     'agg_type' : 'sum',
     'disc' : copy.deepcopy(deeptlf_params6_ne),
     'nemb' : True
    },
    {'name' : 'deeptlf6_num_emb_disc_common_emb_mean',
     'agg_type' : 'mean',
     'disc' : copy.deepcopy(deeptlf_params6_ne),
     'nemb' : True
    }
]

In [None]:
disc_quant100 = KDiscretizer(
    f_names = ['amount_rur'],
    k_bins = 100,
    d_type = 'quantile'
)

deeptlf_params9 = DeepTLFDisc({"n_est" : 9,
                  "min_freq" : 2,
                  "features" : df_params['numeric_cols'] + df_params['cat_cols'],
                  "features_to_split" : df_params['numeric_cols'],
                  })

experiments = [
    {
     'name' : 'trx_tran_orig_no_disc',
     'algo' : 'orig',
     'agg_type' : None,
     'nsep' : False,
    },
    {
     'name' : 'trx_tran_orig_deeptlf9',
     'algo' : 'orig',
     'agg_type' : None,
     'nsep' : False,
     'disc' : copy.deepcopy(deeptlf_params9),
     'nemb' : False
    },
    {
     'name' : 'trx_tran_classic_deeptlf9_cat',
     'algo' : 'classic',
     'agg_type' : 'cat',
     'nsep' : False,
     'disc' : copy.deepcopy(deeptlf_params9),
     'nemb' : False
    },
    {
     'name' : 'trx_tran_classic_quant100_sum',
     'algo' : 'classic',
     'agg_type' : 'sum',
     'nsep' : False,
     'disc' : copy.deepcopy(disc_quant100),
     'nemb' : False
    },
    {
     'name' : 'trx_tran_orig_quant100',
     'algo' : 'orig',
     'agg_type' : None,
     'nsep' : False,
     'disc' : copy.deepcopy(disc_quant100),
     'nemb' : False
    },
    {
     'name' : 'trx_tran_classic_quant100_cat',
     'algo' : 'classic',
     'agg_type' : 'cat',
     'nsep' : False,
     'disc' : copy.deepcopy(disc_quant100),
     'nemb' : False
    },
      {
     'name' : 'trx_tran_classic_deeptlf9_sum',
     'algo' : 'classic',
     'agg_type' : 'sum',
     'nsep' : False,
     'disc' : copy.deepcopy(deeptlf_params9),
     'nemb' : False
    },
    #  {
    #  'name' : 'trx_tran_classic_deeptlf9_mean',
    #  'algo' : 'classic',
    #  'agg_type' : 'mean',
    #  'nsep' : False,
    #  'disc' : copy.deepcopy(deeptlf_params9),
    #  'nemb' : False
    # },
    # {
    #  'name' : 'trx_tran_classic_quant100_mean',
    #  'algo' : 'classic',
    #  'agg_type' : 'mean',
    #  'nsep' : False,
    #  'disc' : copy.deepcopy(disc_quant100),
    #  'nemb' : False
    # },
    {
     'name' : 'trx_tran_classic_no_disc_cat',
     'algo' : 'classic',
     'agg_type' : 'cat',
     'nsep' : True,
    }
]

### Experiments with sole numeric features discretization and common embeds

In [5]:
EMBED_SIZE = 16
HIDDEN_SIZE = 256

def get_basic_model_encoder(df_params):
    embeddings=dict()
    for i, f in enumerate(df_params["cat_cols"] + [df_params["date_col"]]):
        embeddings[f] = {'in' : df_params["cat_unique"][i], 'out' : EMBED_SIZE}
    
    trx_encoder_params = dict(
        embeddings_noise=0.003,
        numeric_values=dict([(fe, 'identity') for fe in df_params['numeric_cols']]),
        embeddings=embeddings
    )
    
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        hidden_size=256,
        type='gru',
    )
    
    return seq_encoder

def get_cat_encoder(df_params, agg_type, num_emb_flag=False):
    embeddings=dict()
    for i, f in enumerate(df_params["cat_cols"] + [df_params["date_col"]]):
        embeddings[f] = {'in' : df_params["cat_unique"][i], 'out' : EMBED_SIZE}

    trx_encoder_params = dict(
        embeddings=embeddings,
        embeddings_noise=0.003,
        agg_type=agg_type,
        numeric_separate=num_emb_flag,
        numeric_features=df_params['numeric_cols']
    )
    
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoderCat(**trx_encoder_params),
        hidden_size=HIDDEN_SIZE,
        type='gru',
    )
    return seq_encoder

def get_trans_encoder(df_params, agg_type, algo, numeric_separate=False):
    embeddings=dict()

    trx_encoder_params = dict(
        feature_names=df_params['cat_cols'] + [df_params["date_col"]], 
        in_emb_sizes=df_params["cat_unique"],
        out_emb_size=EMBED_SIZE,
        agg_type=agg_type,
        numeric_separate=numeric_separate,
        numeric_features=df_params['numeric_cols']
    )
    
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoderTrans(**trx_encoder_params),
        hidden_size=HIDDEN_SIZE,
        type='gru',
    )
    return seq_encoder

def get_glove_encoder(df_params, exp, glove_embedding):
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoderGlove(glove_embedding, agg_type=exp['agg_type'], numeric_separate=exp['nsep']),
        hidden_size=256,
        type='gru',
    )
    return seq_encoder

In [6]:
experiments_glove = []
for e in experiments:
    for agg_t in ['cat', "sum", 'mean']:
        if agg_t != 'cat' and e['disc'] is None:
            continue
        experiments_glove.append(copy.deepcopy(e))
        experiments_glove[-1]['agg_type'] = agg_t
        experiments_glove[-1]['nsep'] = (experiments_glove[-1]['disc'] is None)
        experiments_glove[-1]['name'] += f'_{agg_t}'

In [7]:
experiments_glove

[{'name': 'glove_emb_cfg2_no_num_gender_cat',
  'glove_config': {'alpha': 0.75,
   'x_max': 15000,
   'embedding_size': 16,
   'num_epochs_train': 50},
  'disc': None,
  'agg_type': 'cat',
  'nsep': True},
 {'name': 'glove_emb_cfg1_ST20_gender_cat',
  'glove_config': {'alpha': 0.75,
   'x_max': 100,
   'embedding_size': 16,
   'num_epochs_train': 50},
  'disc': <ptls.preprocessing.baseline_discretizer.single_tree_discretizer.SingleTreeDiscretizer at 0x1b778062c20>,
  'agg_type': 'cat',
  'nsep': False},
 {'name': 'glove_emb_cfg1_ST20_gender_sum',
  'glove_config': {'alpha': 0.75,
   'x_max': 100,
   'embedding_size': 16,
   'num_epochs_train': 50},
  'disc': <ptls.preprocessing.baseline_discretizer.single_tree_discretizer.SingleTreeDiscretizer at 0x1b778061a80>,
  'agg_type': 'sum',
  'nsep': False},
 {'name': 'glove_emb_cfg1_ST20_gender_mean',
  'glove_config': {'alpha': 0.75,
   'x_max': 100,
   'embedding_size': 16,
   'num_epochs_train': 50},
  'disc': <ptls.preprocessing.baseline_

In [11]:
for exp in experiments:
    df_params = {'numeric_cols': ['amount'],
             'cat_cols': ['mcc_code', 'tr_type'],
             'cat_unique': [184, 77, 457],
             'date_col': 'tr_datetime',
             'id_col': 'client_id'
    }
    data = copy.deepcopy(source_data)
    if 'disc' in exp:
        disc = exp['disc']
        if exp['name'][:2] != "st" and ('ST' not in exp['name']) and (disc is not None):
            disc.fit(data)
            data = disc.transform(data, to_embeds=exp['nemb'] if 'nemb' in exp else False)
        elif (disc is not None):
            disc.fit(data.sample(int(1e+5), random_state=42).merge(targets, on=df_params['id_col'], how='left'))
            data = disc.transform(data, to_embeds=exp['nemb'] if 'nemb' in exp else False)
        print(f"{exp['name']}: DATA DISCRETIZED")

    if 'nemb' in exp and not exp['nemb']:
        df_params['cat_cols'] =  df_params['numeric_cols'] + df_params['cat_cols']
        df_params["cat_unique"] = (exp['disc'].k_bins if (type(exp['disc'].k_bins) is list) else [exp['disc'].k_bins] * len(df_params['numeric_cols'])) + df_params["cat_unique"]
        df_params['numeric_cols'] = []
    else:
        nn = []
        for fn in df_params['numeric_cols']:
            nn += [fn + '_val', fn + '_pos']
    
    if 'glove_config' in exp:
        if not exp['nsep']:
            embedded_feats = df_params['numeric_cols'] + df_params['cat_cols'] + [df_params["date_col"]]
        else:
            embedded_feats = df_params['cat_cols'] + [df_params["date_col"]]
        folder_nm = f'../glove_embeddings/{exp["name"]}'[:-4] if exp['agg_type'] != 'mean' else f'../glove_embeddings/{exp["name"]}'[:-5]
        glove_embedding = GloveEmbedding(
            feature_names=embedded_feats,
            calculate_cooccur=False,
            embedding_folder=folder_nm,
            glove_params=exp['glove_config']
        )
        glove_embedding.load()
        data = glove_embedding.tokenize_data(data)

    preprocessor = PandasDataPreprocessor(
        col_id=df_params['id_col'],
        col_event_time=df_params['date_col'],
        event_time_transformation='none',
        category_transformation = 'none' if ('glove_config' in exp) else 'frequency',
        cols_category=df_params['cat_cols'],
        cols_numerical= nn if ('nemb' in exp and exp['nemb']) else df_params['numeric_cols'] ,
        return_records=True,
    )
#+ (df_params['numeric_cols'] if ('glove_config' in exp) else [])
#if ('nsep' in exp and exp['nsep']) else []    
    dataset = preprocessor.fit_transform(data)

    dataset = sorted(dataset, key=lambda x: x[df_params['id_col']])

    with open(f"../coles_prep_datasets/{exp['name']}_dataset_gender.pkl", "wb") as fl:
        pickle.dump(dataset , fl)

    # with open(f"../coles_prep_datasets/{exp['name']}_dataset.pkl", "rb") as fl:
    #     dataset = pickle.load(fl)
    
    train, test = train_test_split(dataset, test_size=0.2, random_state=42)

    train, val = train_test_split(dataset, test_size=0.1, random_state=42)


    print(f"{exp['name']}: DATA PREPROCESSED AND SAVED")

    del dataset, data

    train_dl = PtlsDataModule(
        train_data = ColesDataset(
                MemoryMapDataset(
                    data=train,
                    i_filters=[
                        SeqLenFilter(min_seq_len=25),
                    ],
                ),
                splitter=SampleSlices(
                    split_count=5,
                    cnt_min=25,
                    cnt_max=200,
                ),
            ),
        train_num_workers=1,
        train_batch_size=256,
        valid_data = ColesDataset(
                MemoryMapDataset(
                    data=val,
                    i_filters=[
                        SeqLenFilter(min_seq_len=25),
                    ],
                ),
                splitter=SampleSlices(
                    split_count=5,
                    cnt_min=25,
                    cnt_max=200,
                ),
            ),
        valid_num_workers=1,
        valid_batch_size=256,
    )

    #seq_encoder = get_cat_encoder(df_params, agg_type=exp['agg_type'], num_emb_flag=exp['nemb'])
    #seq_encoder = get_trans_encoder(df_params, agg_type=exp['agg_type'], algo=exp['algo'], numeric_separate=exp['nsep'])
    #seq_encoder = get_glove_encoder(df_params, exp, glove_embedding)
    seq_encoder = get_basic_model_encoder(df_params)

    model = CoLESModule(
        seq_encoder=seq_encoder,
        optimizer_partial=partial(torch.optim.Adam, lr=0.001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
    )

    early_stopping = EarlyStopping(f'valid/{model.metric_name}', mode='max', patience=2, min_delta=0.01)
    
    trainer = pl.Trainer(
        max_epochs=25,
        accelerator="gpu",
        callbacks = [early_stopping],
        enable_progress_bar=True,
        enable_model_summary=False,
        logger=False
    )

    print(f"{exp['name']}: TRAIN STARTS")

    trainer.fit(model, train_dl)
    print(trainer.logged_metrics)

    torch.save(seq_encoder.state_dict(), f"../models_coles/{exp['name']}_gender.pt")

    print(f"{exp['name']}: TRAIN ENDS, MODEL SAVED")

    del train, test

basic_model_emb_16_gender: DATA PREPROCESSED AND SAVED


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


basic_model_emb_16_gender: TRAIN STARTS


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

{'loss': tensor(131.8917), 'seq_len': tensor(101.8275), 'valid/recall_top_k': tensor(0.8515)}
basic_model_emb_16_gender: TRAIN ENDS, MODEL SAVED


### Get cosine-dist metric values on train/test datasets

In [13]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
res_metrics = []
N_TRIALS = 5

tasks = {'boosting' : LGBMClassifier(), 'r_forest' : RandomForestClassifier()}

def cuda_memory_clear():
    import gc
    gc.collect()
    torch.cuda.empty_cache()

def get_train_test_age_bins_scenario(df_params, train_embeds, test_embeds):
    data_path = "../data/age_bins"
    
    df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))
    df_target = df_target.set_index(df_params["id_col"])
    df_target.rename(columns={"bins": "target"}, inplace=True)
    
    train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
    train_df[df_params["id_col"]] = [x[df_params["id_col"]] for x in train]
    train_df = train_df.merge(df_target, how='left', on=df_params["id_col"])
    
    test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
    test_df[df_params["id_col"]] = [x[df_params["id_col"]] for x in test]
    test_df = test_df.merge(df_target, how='left', on=df_params["id_col"])
    return train_df, test_df

def get_train_test_gender_scenario(df_params, train_embeds, test_embeds):
    data_path = "../data/gender"

    df_target = pd.read_csv(os.path.join('../data/gender', 'gender_train1.csv')).drop(columns=['Unnamed: 0'])
    df_target = df_target.set_index(df_params["id_col"])
    df_target.rename(columns={"gender": "target"}, inplace=True)
    
    train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
    train_df[df_params["id_col"]] = [x[df_params["id_col"]] for x in train]
    train_df = train_df.merge(df_target, how='left', on=df_params["id_col"])
    
    test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
    test_df[df_params["id_col"]] = [x[df_params["id_col"]] for x in test]
    test_df = test_df.merge(df_target, how='left', on=df_params["id_col"])
    train_df = train_df.fillna(2)
    test_df = test_df.fillna(2)
    return train_df, test_df

for exp in experiments:
    print(exp['name'])
    data = copy.deepcopy(source_data)
    if 'disc' in exp:
        if exp['name'][:2] != "st" and ('ST' not in exp['name']) and (exp['disc'] is not None):
            exp['disc'].fit(data)
        elif (exp['disc'] is not None):
            exp['disc'].fit(data.sample(int(1e+5), random_state=42).merge(targets, on=df_params['id_col'], how='left'))
    df_params = {'numeric_cols': ['amount'],
             'cat_cols': ['mcc_code', 'tr_type'],
             'cat_unique': [184, 77, 457],
             'date_col': 'tr_datetime',
             'id_col': 'client_id'
    }
    if 'nemb' in exp and not exp['nemb']:
        df_params['cat_cols'] =  df_params['numeric_cols'] + df_params['cat_cols']
        df_params["cat_unique"] = (exp['disc'].k_bins if (type(exp['disc'].k_bins) is list) else [exp['disc'].k_bins] * len(df_params['numeric_cols'])) + df_params["cat_unique"]
        df_params['numeric_cols'] = []
    with open(f"../coles_prep_datasets/{exp['name']}_dataset_gender.pkl", "rb") as fl:
        dataset = pickle.load(fl)

    train, test = train_test_split(dataset, test_size=0.2, random_state=42)

    if 'glove_config' in exp:
        if not exp['nsep']:
            embedded_feats = df_params['numeric_cols'] + df_params['cat_cols'] + [df_params["date_col"]]
        else:
            embedded_feats = df_params['cat_cols'] + [df_params["date_col"]]
        folder_nm = f'../glove_embeddings/{exp["name"]}'[:-4] if exp['agg_type'] != 'mean' else f'../glove_embeddings/{exp["name"]}'[:-5]
        glove_embedding = GloveEmbedding(
            feature_names=embedded_feats,
            calculate_cooccur=False,
            embedding_folder=folder_nm,
            glove_params=exp['glove_config']
        )
        glove_embedding.load()
    
    #seq_encoder = get_cat_encoder(df_params, agg_type=exp['agg_type'], num_emb_flag=exp['nemb'])
    #seq_encoder = get_trans_encoder(df_params, agg_type=exp['agg_type'], algo=exp['algo'], numeric_separate=exp['nsep'])
    #seq_encoder = get_glove_encoder(df_params, exp, glove_embedding)

    # if exp['name'] != 'basic_model_emb_16':
    #     seq_encoder = get_trans_encoder(df_params, agg_type=exp['agg_type'], algo=exp['algo'], numeric_separate=exp['nsep'])
    #     mx = torch.load(f"../models_coles/{exp['name']}_emb.pt")
    #     for fn in df_params['cat_cols'] + [df_params["date_col"]] + df_params['numeric_cols'] :
    #         if f'trx_encoder.embeddings.emb_dict.{fn}.weight' in mx:
    #             mx.pop(f'trx_encoder.embeddings.emb_dict.{fn}.weight')
    #     seq_encoder.load_state_dict(mx)
    # else:
    seq_encoder = get_basic_model_encoder(df_params)
    seq_encoder.load_state_dict(torch.load(f"../models_coles/{exp['name']}_gender.pt"))

    metric = BatchRecallTopK(4)

    train_embeds = None
    test_embeds = None
    #('train', train)
    for ds_name, ds in [('test', test)]:
        dl = PtlsDataModule(
            test_data = ColesDataset(
                    MemoryMapDataset(
                        data=ds,
                        i_filters=[
                            SeqLenFilter(min_seq_len=25),
                        ],
                    ),
                    splitter=SampleSlices(
                        split_count=5,
                        cnt_min=25,
                        cnt_max=200,
                    ),
                ),
            train_num_workers=1,
            train_batch_size=256,
        )

        module = TestModule(
            model = seq_encoder,
            metrics = {"recall_top_k" : metric}
        )
    
        predictor = pl.Trainer(
                accelerator="gpu",
                enable_model_summary=False,
                logger=False
        )

        predictor.predict(module, dl)

        ds_metrics = module.get_metrics()

        for m in ds_metrics:
            res_metrics.append([exp['name'], ds_name, m, ds_metrics[m]])

    coles_model = CoLESModule(
            seq_encoder=seq_encoder,
    )

    inference_runner = pl.Trainer(
        accelerator="gpu",
        enable_progress_bar=True,
        enable_model_summary=False,
        logger=False
    )

    with torch.no_grad():
        cuda_memory_clear()
        train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
        train_embeds = torch.vstack(inference_runner.predict(coles_model, train_dl))
        cuda_memory_clear()
        test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
        test_embeds = torch.vstack(inference_runner.predict(coles_model, test_dl))

    train_df, test_df = get_train_test_age_bins_scenario(df_params, train_embeds, test_embeds)

    for task_name, clf in tasks.items():
        sc_test = 0
        sc_train = 0
        num = 0
        for i in tqdm(range(N_TRIALS)):
            embed_columns = [x for x in train_df.columns if x.startswith('embed')]
            x_train, y_train = train_df[embed_columns], train_df['target']
            x_test, y_test = test_df[embed_columns], test_df['target']
            
            clf.fit(x_train, y_train)
            sc_test += clf.score(x_test, y_test)
            #sc_train += clf.score(x_train, y_train)
            num += 1

        #res_metrics.append([exp['name'], 'train', f"acc_{task_name}", sc_train/num])
        res_metrics.append([exp['name'], 'test', f"acc_{task_name}", sc_test/num])

    report = pd.DataFrame(res_metrics, columns = ['exp_name', 'dataset', 'metric', 'value'])
    report.to_csv('../coles_report/experiments/report_gender_baseline.csv')

basic_model_emb_16_gender


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 256
[LightGBM] [Info] Start training from score -1.161020
[LightGBM] [Info] Start training from score -1.394999
[LightGBM] [Info] Start training from score -0.823256
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 256
[LightGBM] [Info] Start training from score -1.161020
[LightGBM] [Info] Start training from score -1.394999
[LightGBM] [Info] Start training from score -0.823256
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009507

  0%|          | 0/5 [00:00<?, ?it/s]