## Data load

In [1]:
import os

if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/di-datasets/age-prediction-nti-sbebank-2019.zip
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    ! mv age-prediction-nti-sbebank-2019.zip data/

## Setup

In [2]:
%load_ext autoreload
%autoreload 2

import logging
import pytorch_lightning as pl
import warnings

# Keep the cell outputs readable: ignore every Python warning and only let
# pytorch_lightning log records of level ERROR or above through.
warnings.simplefilter('ignore')
pl_logger = logging.getLogger("pytorch_lightning")
pl_logger.setLevel(logging.ERROR)

## Data preprocessing

In [3]:
import os
import pandas as pd

# Directory with the dataset CSV files; the target file is read from it later on.
data_path = 'data/'

# Raw transaction log, one row per transaction.
transactions_csv = os.path.join(data_path, 'transactions_train.csv')
source_data = pd.read_csv(transactions_csv)
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [4]:
from ptls.data_preprocessing import PandasDataPreprocessor

# Column roles handed to the ptls preprocessor. `trans_date` is listed both as
# the event-time column and as a categorical feature.
preprocessing_options = {
    'col_id': 'client_id',
    'cols_event_time': 'trans_date',
    'time_transformation': 'float',
    'cols_category': ["trans_date", "small_group"],
    'cols_log_norm': ["amount_rur"],
    'print_dataset_info': False,
}

preprocessor = PandasDataPreprocessor(**preprocessing_options)

In [5]:
%%time

# Fit the preprocessor on the raw transactions and transform them in one call.
# The result is split into train/test below and is later iterated as records
# that expose a 'client_id' key.
dataset = preprocessor.fit_transform(source_data)

CPU times: user 1min 6s, sys: 12.9 s, total: 1min 19s
Wall time: 1min 19s


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the preprocessed records for evaluation; the fixed seed keeps
# the split reproducible between runs.
train, test = train_test_split(dataset, random_state=42, test_size=0.2)

print(f"{len(train)} {len(test)}")

24000 6000


## Inference 

### load SequenceEncoder obtained from `coles-emb.ipynb`

In [7]:
import torch
from ptls.seq_encoder import SequenceEncoder
from ptls.models import Head
from ptls.lightning_modules.emb_module import EmbModule

# Rebuild the model so that the saved state dict can be loaded into it.
# NOTE(review): these settings presumably have to match the ones used for
# training in `coles-emb.ipynb` — confirm against that notebook.
seq_encoder = SequenceEncoder(
    category_features=preprocessor.get_category_sizes(),
    numeric_features=["amount_rur"],
    trx_embedding_noize=0.003
)

head = Head(input_size=seq_encoder.embedding_size, use_norm_encoder=True)

model = EmbModule(seq_encoder=seq_encoder, head=head)

# map_location='cpu' lets a checkpoint that was saved from a GPU be restored on
# a CPU-only machine; the Trainer created for inference picks the device later.
state_dict = torch.load('coles-emb.pt', map_location='cpu')
model.load_state_dict(state_dict)

# Switch to evaluation mode before inference (the cell displays the model repr).
model.eval()

### embedding inference

In [10]:
from ptls.data_load.data_module.emb_data_module import inference_data_loader


# Use a single GPU when CUDA is available, otherwise run inference on the CPU.
use_gpu = torch.cuda.is_available()
trainer = pl.Trainer(gpus=int(use_gpu))

# trainer.predict returns one tensor per batch; stack them into one 2-D tensor.
train_dl = inference_data_loader(train, batch_size=256, num_workers=0)
train_batches = trainer.predict(model, train_dl)
train_embeds = torch.vstack(train_batches)

test_dl = inference_data_loader(test, batch_size=256, num_workers=0)
test_batches = trainer.predict(model, test_dl)
test_embeds = torch.vstack(test_batches)

train_embeds.shape, test_embeds.shape

In [11]:
# join target and embeddings

def embeddings_to_frame(embeds, records, targets):
    """Build a frame of embedding columns, client ids and targets.

    Parameters
    ----------
    embeds : torch.Tensor
        2-D tensor with one row of embedding values per record.
    records : sequence of mappings
        Records in the same order as the rows of `embeds`; each exposes
        a 'client_id' key.
    targets : pandas.DataFrame
        Target table addressable by 'client_id' (index level or column).

    Returns
    -------
    pandas.DataFrame
        Columns `embed_0 … embed_{n-1}`, then `client_id`, then the columns of
        `targets`. The join is a left join, so clients without a target row
        are kept with missing target values.
    """
    # Convert explicitly instead of relying on how pandas ingests torch tensors.
    values = embeds.detach().cpu().numpy()
    columns = [f'embed_{i}' for i in range(values.shape[1])]
    frame = pd.DataFrame(data=values, columns=columns)
    frame['client_id'] = [record['client_id'] for record in records]
    return frame.merge(targets, how='left', on='client_id')


# Target table indexed by client, with the label column renamed to 'target'.
df_target = (
    pd.read_csv(os.path.join(data_path, 'train_target.csv'))
    .set_index('client_id')
    .rename(columns={"bins": "target"})
)

train_df = embeddings_to_frame(train_embeds, train, df_target)
test_df = embeddings_to_frame(test_embeds, test, df_target)

print(train_df.shape, test_df.shape)
train_df.head(2)

(24000, 514) (6000, 514)


Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_504,embed_505,embed_506,embed_507,embed_508,embed_509,embed_510,embed_511,client_id,target
0,0.373101,-0.213911,-0.189349,-0.12308,-0.31747,-0.002377,-0.175975,0.102246,-0.009308,0.110643,...,0.176085,-0.815983,0.257683,0.064436,0.270341,0.127847,-0.024828,0.242617,36253,1
1,0.368686,-0.1877,-0.201657,-0.122744,-0.235919,0.000645,0.125182,0.503544,0.062359,-0.01596,...,0.261096,-0.922538,0.693982,0.103815,0.406128,0.129606,-0.048214,0.174769,396,2


In [12]:
import numpy as np


def to_embedding_frame(features):
    """Pack every row of `features` into one list stored in an 'embeddings' column."""
    return pd.DataFrame({'embeddings': features.values.tolist()})


# Columns that are not model inputs.
non_feature_columns = ['client_id', 'target']

# Training part: labels, one-column-per-dimension features, and the same
# features packed as a single list-valued column.
y_train = train_df['target'].values
X_train = train_df.drop(columns=non_feature_columns)
X_train_emb = to_embedding_frame(X_train)

# Validation part, built the same way from the held-out frame.
y_val = test_df['target'].values
X_val = test_df.drop(columns=non_feature_columns)
X_val_emb = to_embedding_frame(X_val)

## Let's use the obtained embeddings for CatBoost training

In [25]:
#!pip install catboost

from catboost import CatBoostClassifier, metrics

### With CatBoost embedding_features

In [16]:
# Classifier that receives the whole embedding as one list-valued column,
# declared to CatBoost through `embedding_features`.
# NOTE(review): depth is 5 here but 4 for the plain-columns model further down —
# confirm the difference is intentional when comparing the two scores.
emb_model_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=5,
    embedding_features=['embeddings'],
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42,
    logging_level='Silent',
)

CatBoostModel_emb = CatBoostClassifier(**emb_model_params)

In [17]:
%%time

# Train on the list-valued embedding column and track the metrics on the
# held-out part; plot=True renders the interactive training chart.
CatBoostModel_emb.fit(
    X=X_train_emb,
    y=y_train,
    eval_set=(X_val_emb, y_val),
    plot=True,
    # logging_level='Verbose',  # you can uncomment this for text output
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 1min 33s, sys: 6.11 s, total: 1min 39s
Wall time: 50.9 s


<catboost.core.CatBoostClassifier at 0x7f288f57e340>

In [19]:
# Best recorded value of every tracked metric (Accuracy, MultiClass) on the
# learn and validation sets for the embedding_features model.
CatBoostModel_emb.get_best_score()

{'learn': {'Accuracy': 0.606125, 'MultiClass': 0.8918884607626317},
 'validation': {'Accuracy': 0.5925, 'MultiClass': 0.9238751802205147}}

### Without CatBoost embedding_features

In [20]:
# Baseline classifier that sees every embedding dimension as an ordinary
# numeric column (no `embedding_features`).
# NOTE(review): depth is 4 here but 5 for the embedding_features model above —
# confirm the difference is intentional when comparing the two scores.
plain_model_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=4,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42,
    logging_level='Silent',
)

CatBoostModel = CatBoostClassifier(**plain_model_params)

In [21]:
%%time

# Train on the one-column-per-dimension features and track the metrics on the
# held-out part; plot=True renders the interactive training chart.
CatBoostModel.fit(
    X=X_train,
    y=y_train,
    eval_set=(X_val, y_val),
    plot=True,
    # logging_level='Verbose',  # you can uncomment this for text output
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 5min 9s, sys: 9.23 s, total: 5min 18s
Wall time: 17.9 s


<catboost.core.CatBoostClassifier at 0x7f288f594970>

In [22]:
# Best recorded value of every tracked metric (Accuracy, MultiClass) on the
# learn and validation sets for the plain-columns model.
CatBoostModel.get_best_score()

{'learn': {'Accuracy': 0.6721666666666667, 'MultiClass': 0.7912943257854652},
 'validation': {'Accuracy': 0.617, 'MultiClass': 0.8795840934570158}}

In [23]:
# Accuracy of the final fitted model on the held-out part.
# NOTE(review): this can be slightly lower than the best validation Accuracy
# reported by get_best_score() — presumably because use_best_model keeps the
# iteration that is best by the loss rather than by Accuracy; confirm in the
# CatBoost docs.
CatBoostModel.score(X_val, y_val)

0.6135