In [1]:
# Stretch the notebook container to the full browser width.
# `IPython.core.display` is a deprecated import location for these names and
# emits a DeprecationWarning; `IPython.display` is the public module.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


In [2]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Add the parent directory to the import path
# (presumably where the `dltranz` package used below lives — verify).
sys.path.append('..')

import logging
import pytorch_lightning as pl
import warnings

# Keep cell output short: ignore all Python warnings and only let
# error-level messages through the "pytorch_lightning" logger.
warnings.filterwarnings('ignore')
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

## Data load

In [3]:
! mkdir ../../data
! curl -OL https://storage.googleapis.com/di-datasets/age-prediction-nti-sbebank-2019.zip
! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d ../../data
! mv age-prediction-nti-sbebank-2019.zip ../../data/

mkdir: cannot create directory ‘../../data’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  239M  100  239M    0     0   143M      0  0:00:01  0:00:01 --:--:--  143M
Archive:  age-prediction-nti-sbebank-2019.zip
  inflating: ../../data/test.csv     
  inflating: ../../data/small_group_description.csv  
  inflating: ../../data/train_target.csv  
  inflating: ../../data/transactions_train.csv  
  inflating: ../../data/transactions_test.csv  


## Data Preprocessing

In [4]:
import os
import pandas as pd

# Folder that holds the extracted CSV files.
data_path = '../../data/'

# Read the training transactions and preview the first two rows.
transactions_csv = os.path.join(data_path, 'transactions_train.csv')
source_data = pd.read_csv(transactions_csv)
source_data.head(n=2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [5]:
from dltranz.data_preprocessing import PandasDataPreprocessor

# Preprocessing settings for the transaction table, collected in one place.
# NOTE(review): 'trans_date' is passed both as the event-time column and as a
# category column — confirm this is intended.
preprocessor_settings = dict(
    col_id='client_id',
    cols_event_time='trans_date',
    time_transformation='float',
    cols_category=['trans_date', 'small_group'],
    cols_log_norm=['amount_rur'],
    print_dataset_info=False,
)
preprocessor = PandasDataPreprocessor(**preprocessor_settings)

In [6]:
%%time

# Fit the preprocessor on the raw transactions and produce the transformed dataset.
# This is the slow step of the preprocessing section (see the timing output below).
dataset = preprocessor.fit_transform(source_data)

CPU times: user 1min 5s, sys: 11.7 s, total: 1min 17s
Wall time: 1min 16s


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the preprocessed records; the fixed random_state makes the
# split repeatable between runs.
train, test = train_test_split(dataset, random_state=42, test_size=0.2)

split_sizes = (len(train), len(test))
print(*split_sizes)

24000 6000


## Embedding training

Model training in our framework is organised with the pytorch-lightning (pl) framework.
The key parts of neural network training in pl are:

    * model (pl.LightningModule)
    * data_module (pl.LightningDataModule)
    * trainer (pl.Trainer)
    
For further details, see https://www.pytorchlightning.ai/

### Model

In [8]:
from dltranz.seq_encoder import SequenceEncoder
from dltranz.models import Head
from dltranz.lightning_modules.emb_module import EmbModule

# The category feature sizes are taken from the fitted preprocessor.
category_sizes = preprocessor.get_category_sizes()
seq_encoder = SequenceEncoder(
    category_features=category_sizes,
    numeric_features=['amount_rur'],
    trx_embedding_noize=0.003,
)

# The head's input width must match the encoder's embedding size.
head = Head(use_norm_encoder=True, input_size=seq_encoder.embedding_size)

# Lightning module that ties the encoder and the head together.
model = EmbModule(head=head, seq_encoder=seq_encoder)

### Data module

In [9]:
from dltranz.data_load.data_module.emb_data_module import EmbeddingTrainDataModule

# Sub-sequence sampling settings.
# NOTE(review): exact semantics are defined by EmbeddingTrainDataModule — confirm there.
split_settings = dict(
    seq_split_strategy='SampleSlices',
    split_count=5,
    split_cnt_min=25,
    split_cnt_max=200,
)

# Worker count and batch size for the train and validation loaders.
loader_settings = dict(
    train_num_workers=16,
    train_batch_size=256,
    valid_num_workers=16,
    valid_batch_size=256,
)

dm = EmbeddingTrainDataModule(
    dataset=train,
    pl_module=model,
    min_seq_len=25,
    category_names=model.seq_encoder.category_names,
    category_max_size=model.seq_encoder.category_max_size,
    **split_settings,
    **loader_settings,
)

### Trainer

In [10]:
import torch
import pytorch_lightning as pl

import logging

# Use one GPU when CUDA is available, otherwise none (CPU training).
gpu_count = int(torch.cuda.is_available())

# Optional: pass progress_bar_refresh_rate=0 to hide the progress bar, and
# attach a NullHandler to the "lightning" logger to silence its messages.
trainer = pl.Trainer(max_epochs=150, gpus=gpu_count)

### Training 

In [19]:
%%time

# Train the embedding model with the data module configured above
# (the trainer was created with max_epochs=150).
trainer.fit(model, dm)

## Inference 

In [12]:
# embedding inference

from dltranz.inference import get_embeddings

# Keyword arguments shared by the train and test inference calls.
encoder_spec = dict(
    category_names=model.seq_encoder.category_names,
    category_max_size=model.seq_encoder.category_max_size,
)

train_embeds = get_embeddings(data=train, model=model, **encoder_spec)
test_embeds = get_embeddings(data=test, model=model, **encoder_spec)

# Show both shapes as a quick sanity check.
train_embeds.shape, test_embeds.shape

24000it [00:02, 9276.94it/s]
6000it [00:00, 7929.94it/s]                    
                                               

((24000, 512), (6000, 512))

In [136]:
# join target and embeddings

def _embeddings_with_target(embeds, records, targets):
    """Return a frame of embedding columns plus `client_id` and the joined target.

    embeds  -- 2-D array of embeddings, one row per record.
    records -- sequence of records exposing a 'client_id' key; assumed to be in
               the same order as the rows of `embeds`.
    targets -- frame keyed by `client_id` that holds the `target` column.
    """
    frame = pd.DataFrame(data=embeds, columns=[f'embed_{i}' for i in range(embeds.shape[1])])
    frame['client_id'] = [rec['client_id'] for rec in records]
    # Left join: every embedding row is kept even if its client has no target.
    return frame.merge(targets, how='left', on='client_id')


# Load the labels once; the chained rename avoids mutating the frame in place,
# so the cell gives the same result when re-run.
df_target = (
    pd.read_csv(os.path.join(data_path, 'train_target.csv'))
    .set_index('client_id')
    .rename(columns={"bins": "target"})
)

train_df = _embeddings_with_target(train_embeds, train, df_target)
test_df = _embeddings_with_target(test_embeds, test, df_target)

print(train_df.shape, test_df.shape)
train_df.head(2)

(24000, 514) (6000, 514)


Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_504,embed_505,embed_506,embed_507,embed_508,embed_509,embed_510,embed_511,client_id,target
0,-0.005807,-0.058029,0.662589,-0.025395,0.015006,-0.143082,0.273978,0.720892,-0.781646,0.013621,...,-0.03687,-0.012004,0.325783,-0.001054,-0.053529,0.030138,0.209631,0.055678,36253,1
1,0.040785,-0.04951,0.936986,0.13848,0.02306,-0.143601,0.259793,0.226826,-0.850043,-0.00422,...,-0.112304,-0.028837,0.018069,-0.026397,0.075928,0.003776,0.336074,-0.008147,396,2


In [137]:
def _as_embedding_column(features):
    """Pack each row of `features` into one list-valued 'embeddings' column.

    Built directly with pandas: the previous version went through `np.arange`,
    but numpy is never imported in this notebook, so it raised NameError on a
    fresh kernel.
    """
    return pd.DataFrame({'embeddings': features.values.tolist()})


# Training split: labels, per-dimension embedding columns, and the same
# embeddings packed into a single column.
y_train = train_df['target'].values
X_train = train_df.drop(['client_id', 'target'], axis=1)
X_train_emb = _as_embedding_column(X_train)

# Validation split, prepared the same way.
y_val = test_df['target'].values
X_val = test_df.drop(['client_id', 'target'], axis=1)
X_val_emb = _as_embedding_column(X_val)

## Let's use the obtained embeddings for CatBoost training

In [145]:
# Uncomment to install CatBoost into the kernel's environment if it is missing:
# %pip install catboost

In [155]:
from catboost import CatBoostClassifier, metrics

### With CatBoost embedding_features

In [156]:
# Classifier that receives the single 'embeddings' column, declared as a
# CatBoost embedding feature.
emb_model_params = dict(
    iterations=500,
    learning_rate=0.1,
    use_best_model=True,
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent',
    embedding_features=['embeddings'],
)
CatBoostModel_emb = CatBoostClassifier(**emb_model_params)

In [157]:
%%time

# Fit on the packed 'embeddings' column; the held-out split is passed as the
# evaluation set (the model was created with use_best_model=True).
CatBoostModel_emb.fit(
    X_train_emb, y_train,
    eval_set=(X_val_emb, y_val),
#     logging_level='Verbose',  # you can uncomment this for text output
)

CPU times: user 1min 13s, sys: 3.26 s, total: 1min 16s
Wall time: 52.1 s


<catboost.core.CatBoostClassifier at 0x7f33eee71d00>

In [159]:
# Best metric values recorded during fitting, for the learn and validation sets.
CatBoostModel_emb.get_best_score()

{'learn': {'Accuracy': 0.653, 'MultiClass': 0.8087743073729691},
 'validation': {'Accuracy': 0.6185, 'MultiClass': 0.8762496658268969}}

### Without CatBoost embedding_features

In [160]:
# Baseline classifier with the same hyper-parameters but no embedding_features:
# each embedding dimension is fed in as an ordinary column.
plain_model_params = dict(
    iterations=500,
    learning_rate=0.1,
    use_best_model=True,
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent',
)
CatBoostModel = CatBoostClassifier(**plain_model_params)

In [161]:
%%time

# Fit on the per-dimension embedding columns; the held-out split is passed as
# the evaluation set (the model was created with use_best_model=True).
CatBoostModel.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
#     logging_level='Verbose',  # you can uncomment this for text output
)

CPU times: user 7min 23s, sys: 9.58 s, total: 7min 32s
Wall time: 28.6 s


<catboost.core.CatBoostClassifier at 0x7f33eee60910>

In [165]:
# Best metric values recorded during fitting, for the learn and validation sets.
CatBoostModel.get_best_score()

{'learn': {'Accuracy': 0.7834166666666667, 'MultiClass': 0.621036204100321},
 'validation': {'Accuracy': 0.6308333333333334,
  'MultiClass': 0.8416157837933733}}

In [164]:
# Score of the fitted model on the validation split (presumably accuracy — confirm).
# NOTE(review): this is slightly below the best validation Accuracy reported by
# get_best_score(); presumably the kept iteration is chosen by the loss rather
# than by Accuracy — confirm.
CatBoostModel.score(X_val, y_val)

0.6265