In [1]:
# Widen the notebook container so cells use the full browser width.
# `IPython.display` is the public import location for these names; importing
# them from `IPython.core.display` is deprecated and triggers a warning.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import HTML, display


In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

import logging
import pytorch_lightning as pl
import warnings

warnings.filterwarnings('ignore')
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

## Data load

In [3]:
! mkdir ../../data
! curl -OL https://storage.googleapis.com/di-datasets/age-prediction-nti-sbebank-2019.zip
! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d ../../data
! mv age-prediction-nti-sbebank-2019.zip ../../data/

mkdir: cannot create directory ‘../../data’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  239M  100  239M    0     0   198M      0  0:00:01  0:00:01 --:--:--  197M
Archive:  age-prediction-nti-sbebank-2019.zip
  inflating: ../../data/test.csv     
  inflating: ../../data/small_group_description.csv  
  inflating: ../../data/train_target.csv  
  inflating: ../../data/transactions_train.csv  
  inflating: ../../data/transactions_test.csv  


## Data Preprocessing

In [4]:
import os
import pandas as pd

# Directory the download cell unpacked the CSV files into.
data_path = '../../data/'
transactions_path = os.path.join(data_path, 'transactions_train.csv')

# Raw training transactions; preview the first two rows.
source_data = pd.read_csv(transactions_path)
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [5]:
from dltranz.data_preprocessing import PandasDataPreprocessor

# Column roles for the preprocessor. Note that `trans_date` is passed both as
# the event-time column and as one of the categorical columns.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    cols_event_time='trans_date',
    time_transformation='float',
    cols_category=['trans_date', 'small_group'],
    cols_log_norm=['amount_rur'],
    print_dataset_info=False,
)

In [15]:
# Split data into train and finetuning parts.
# The import is placed here because this cell is the first user of
# `train_test_split`; without it the cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

# NOTE(review): `source_data` appears to hold one row per transaction, so this
# splits individual rows rather than whole clients — confirm this is intended.
metric_learn_data, finetune_data = train_test_split(source_data, test_size=0.5, random_state=42)

In [17]:
%%time

# pickle is only needed by the optional save step that is commented out below.
import pickle

# Fit on the metric-learning half only; the object returned by fit() is what
# this cell and the fine-tuning section call transform() on.
preproc_fitted = preprocessor.fit(metric_learn_data)

# Optional: persist the fitted preprocessor for reuse (uncomment to enable).
# Save preprocessor:
# with open('preproc_fitted.pickle', 'wb') as handle:
#     pickle.dump(preproc_fitted, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# Transformed metric-learning data; split into train/test in the next cell.
dataset = preproc_fitted.transform(metric_learn_data)


CPU times: user 42.7 s, sys: 8.74 s, total: 51.4 s
Wall time: 51.3 s


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the preprocessed dataset; the seed is fixed so the split is repeatable.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)

# Report the size of each part.
print(f"{len(train)} {len(test)}")

24000 6000


## Embedding training

Model training in our framework is organised via the pytorch-lightning (pl) framework.
The key parts of neural network training in pl are:

    * model (pl.LightningModule)
    * data_module (pl.LightningDataModule)
    * trainer (pl.Trainer)
    
For further details, see https://www.pytorchlightning.ai/

### Model

In [58]:
from dltranz.seq_encoder import SequenceEncoder
from dltranz.models import Head
from dltranz.lightning_modules.emb_module import EmbModule

# Sequence encoder: categorical feature sizes are taken from the preprocessor,
# `amount_rur` is the only numeric feature.
# NOTE(review): this reads from `preprocessor`, not `preproc_fitted`; the two are
# interchangeable only if fit() returns the same object — confirm.
seq_encoder = SequenceEncoder(
    category_features=preprocessor.get_category_sizes(),
    numeric_features=["amount_rur"],
    trx_embedding_noize=0.003
)

# Head sized to match the encoder's embedding size.
head = Head(input_size=seq_encoder.embedding_size, use_norm_encoder=True)

# Lightning module combining the encoder and the head; this is what the trainer fits.
model = EmbModule(seq_encoder=seq_encoder, head=head)

### Data module

In [62]:
from dltranz.data_load.data_module.emb_data_module import EmbeddingTrainDataModule

# Data module that feeds the embedding model during training.
dm = EmbeddingTrainDataModule(
    # data and the model it is prepared for
    dataset=train,
    pl_module=model,
    category_names=model.seq_encoder.category_names,
    category_max_size=model.seq_encoder.category_max_size,
    # sequence length filter and sequence-splitting parameters
    min_seq_len=25,
    seq_split_strategy='SampleSlices',
    split_count=5,
    split_cnt_min=25,
    split_cnt_max=200,
    # loader parameters for the train and validation parts
    train_batch_size=256,
    train_num_workers=16,
    valid_batch_size=256,
    valid_num_workers=16,
)

### Trainer

In [63]:
import logging

import pytorch_lightning as pl
import torch

# Optional: uncomment to detach the "lightning" logger from the notebook output.
# logging.getLogger("lightning").addHandler(logging.NullHandler())
# logging.getLogger("lightning").propagate = False

trainer = pl.Trainer(
    # progress_bar_refresh_rate=0,
    max_epochs=10,
    # 1 when CUDA is available, otherwise 0
    gpus=int(torch.cuda.is_available()),
)

### Training 

In [None]:
%%time

# Fit the embedding model using the data module defined above; %%time reports the cost.
trainer.fit(model, dm)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

## FineTuning

In [37]:
from pyhocon import ConfigFactory
from dltranz.seq_to_target import SequenceToTarget


class SeqToTargetDemo(SequenceToTarget):
    """SequenceToTarget preconfigured for the fine-tuning demo.

    Builds the configuration dictionary from a few keyword arguments, converts
    it with ``ConfigFactory.from_dict`` and passes it, together with the
    encoder, to the parent constructor.

    Args:
        seq_encoder: encoder forwarded unchanged to ``SequenceToTarget``.
        encoder_lr: value stored under ``pretrained.lr``.
        in_features: feature count used by the BatchNorm1d and Linear head layers.
        out_features: output size of the Linear head layer.
        head_lr: value stored under ``train.lr``.
        weight_decay: value stored under ``train.weight_decay``.
        lr_step_size: value stored under ``lr_scheduler.step_size``.
        lr_step_gamma: value stored under ``lr_scheduler.step_gamma``.
    """

    def __init__(self,
                 seq_encoder = None,
                 encoder_lr: float = 0.0001,
                 in_features: int = 256,
                 out_features: int = 1,
                 head_lr: float = 0.005,
                 weight_decay: float = 0.0,
                 lr_step_size: int = 1,
                 lr_step_gamma: float = 0.60):
        # Settings for the pretrained encoder.
        pretrained_conf = {
            'pl_module_class': 'dltranz.lightning_modules.coles_module.CoLESModule',
            'lr': encoder_lr,
        }

        # Head: BatchNorm1d -> Linear -> Sigmoid -> Squeeze.
        head_layers = [
            ['BatchNorm1d', {'num_features': in_features}],
            ['Linear', {"in_features": in_features, "out_features": out_features}],
            ['Sigmoid', {}],
            ['Squeeze', {}],
        ]

        # Training settings for the head.
        train_conf = {
            'random_neg': 'false',
            'loss': 'bce',
            'lr': head_lr,
            'weight_decay': weight_decay,
        }

        scheduler_conf = {
            'step_size': lr_step_size,
            'step_gamma': lr_step_gamma,
        }

        # Top-level keys are assembled in the same order as before.
        params = {
            'score_metric': ['auroc', 'accuracy'],
            'encoder_type': 'pretrained',
            'pretrained': pretrained_conf,
            'head_layers': head_layers,
            'train': train_conf,
            'lr_scheduler': scheduler_conf,
        }
        super().__init__(ConfigFactory.from_dict(params), seq_encoder)


# Reuse the sequence encoder of the embedding model as the downstream encoder.
pretrained_encoder = model.seq_encoder

# The head input size follows the encoder's embedding size.
downstream_model = SeqToTargetDemo(
    seq_encoder=pretrained_encoder,
    encoder_lr=0.0001,
    in_features=pretrained_encoder.embedding_size,
    out_features=1,
    head_lr=0.05,
    weight_decay=0.0,
    lr_step_size=1,
    lr_step_gamma=0.60,
)


In [39]:
# Apply the already-fitted preprocessor to the fine-tuning half of the data.
finetune_dataset = preproc_fitted.transform(finetune_data)

# Same data-module settings as for embedding training, but bound to the
# downstream model and the fine-tuning dataset.
finetune_dm = EmbeddingTrainDataModule(
    # data and the model it is prepared for
    dataset=finetune_dataset,
    pl_module=downstream_model,
    category_names=model.seq_encoder.category_names,
    category_max_size=model.seq_encoder.category_max_size,
    # sequence length filter and sequence-splitting parameters
    min_seq_len=25,
    seq_split_strategy='SampleSlices',
    split_count=5,
    split_cnt_min=25,
    split_cnt_max=200,
    # loader parameters for the train and validation parts
    train_batch_size=256,
    train_num_workers=16,
    valid_batch_size=256,
    valid_num_workers=16,
)

In [65]:
# trainer.fit(downstream_model, finetune_dm)