In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

## Data load

In [3]:
! mkdir ../../data
! curl -OL https://storage.googleapis.com/di-datasets/age-prediction-nti-sbebank-2019.zip
! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d ../../data
! mv age-prediction-nti-sbebank-2019.zip ../../data/

## Data Preproccessing

In [4]:
import os
import pandas as pd

data_path = '../../data/'

source_data = pd.read_csv(os.path.join(data_path, 'transactions_train.csv'))
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [5]:
df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))
df_target = df_target.set_index('client_id')

df_target.head(2)

Unnamed: 0_level_0,bins
client_id,Unnamed: 1_level_1
24662,2
1046,0


In [6]:
import logging
logging.basicConfig(level=logging.INFO, format='%(funcName)-20s   : %(message)s')

In [7]:
from dltranz.data_preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    cols_event_time='trans_date',
    time_transformation='float',
    cols_category=["trans_date", "small_group"],
    cols_log_norm=["amount_rur"],
    print_dataset_info=False,
)

In [8]:
%%time

dataset = preprocessor.fit_transform(source_data)

_td_float              : To-float time transformation
transform              : Feature collection in progress ...
transform              : Prepared features for 30000 clients


CPU times: user 49.4 s, sys: 3.31 s, total: 52.7 s
Wall time: 52.7 s


In [9]:
from dltranz.data_preprocessing.util import update_with_target

dataset = update_with_target(dataset, df_target, 'client_id', 'bins')

In [10]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

train, valid = train_test_split(train, test_size=0.25, random_state=42)

print(len(train), len(valid), len(test))

18000 6000 6000


## Embedding training

Model training in our framework organised via pytorch-lightning (pl) framework.
The key parts of neural networks training in pl are: 

    * model (pl.LightningModule)
    * data_module (pl.LightningDataModule)
    * pl.trainer (pl.trainer)
    
For futher details check https://www.pytorchlightning.ai/

### model 

In [11]:
from dltranz.seq_encoder import SequenceEncoder

seq_encoder = SequenceEncoder(
    category_features=preprocessor.get_category_sizes(),
    numeric_features=["amount_rur"],
    trx_embedding_noize=0.003
)

In [12]:
from dltranz.models import Head

head = Head(input_size=seq_encoder.embedding_size, use_norm_encoder=True)

In [13]:
from dltranz.lightning_modules.emb_module import EmbModule

model = EmbModule(seq_encoder=seq_encoder, head=head)

  stream(template_mgs % msg_args)


### data module

In [14]:
from dltranz.data_load.data_module.emb_data_module import EmbeddingTrainDataModule

dm = EmbeddingTrainDataModule(
    train=train,
    valid=valid,
    pl_module=model,
    min_seq_len=25,
    seq_split_strategy='SampleSlices',
    split_count=5,
    split_cnt_min=25,
    split_cnt_max=200,
    train_num_workers=16,
    train_batch_size=256,
    valid_num_workers=16,
    valid_batch_size=256
)

### trainer

In [15]:
import torch
import pytorch_lightning as pl

trainer = pl.Trainer(
    max_epochs=150,
    gpus=1 if torch.cuda.is_available() else 0
)

set_distributed_mode   : GPU available: True, used: True
set_distributed_mode   : TPU available: False, using: 0 TPU cores


### training 

In [16]:
%%time

trainer.fit(model, dm)

0it [00:00, ?it/s]

setup_map              : Loaded 18000 for train


0it [00:00, ?it/s]

setup_map              : Loaded 6000 for valid
set_nvidia_flags       : LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
get_lr_scheduler       : StepLR lr_scheduler used
summarize              : 
  | Name               | Type             | Params
--------------------------------------------------------
0 | _seq_encoder       | SequenceEncoder  | 855 K 
1 | _validation_metric | BatchRecallTopPL | 0     
2 | _head              | Head             | 0     
--------------------------------------------------------
855 K     Trainable params
0         Non-trainable params
855 K     Total params
3.420     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

CPU times: user 1h 11min 35s, sys: 12min 1s, total: 1h 23min 36s
Wall time: 1h 24min 42s


### inference 

In [17]:
from dltranz.train import score_model

def get_embeds(data):
    dl = dm.create_inference_dataloader(data)
    embeds, _ = score_model(model, dl)

    df = pd.DataFrame(data=embeds, columns = [f'embed_{i}' for i in range(embeds.shape[1])])
    df['client_id'] = [x['client_id'] for x in data]
    df['target'] = [x['target'] for x in data]

    return df

train_df = get_embeds(train)
valid_df = get_embeds(valid)
test_df = get_embeds(test)

print(train_df.shape, valid_df.shape, test_df.shape)

train_df.head()

0it [00:00, ?it/s]

                                                                                

0it [00:00, ?it/s]

                                                                                

0it [00:00, ?it/s]

                                                                                

(18000, 514) (6000, 514) (6000, 514)




Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_504,embed_505,embed_506,embed_507,embed_508,embed_509,embed_510,embed_511,client_id,target
0,-0.251097,-0.000656,-0.135804,0.8133,0.97386,0.051594,-0.042974,-0.072253,0.010358,0.117536,...,0.05539,-0.113052,-0.272813,-0.075393,-0.1163,-0.036536,-0.041959,0.019408,21913,0
1,-0.342959,0.056122,-0.012851,0.421687,0.961612,-0.052675,-0.007343,-0.066303,-0.166137,-0.01835,...,0.03146,-0.151436,0.127264,-0.488471,-0.004294,-0.046501,-0.04817,0.050471,20915,2
2,-0.116982,0.044243,-0.052426,0.829302,0.449566,0.086275,-0.380041,-0.066621,0.122122,-0.011187,...,0.020102,-0.069431,0.20143,0.020184,-0.015894,0.080722,-0.016586,0.008831,43439,1
3,-0.466537,0.076832,0.088506,-0.283267,0.998537,-0.140224,0.086542,-0.168265,0.695276,0.157042,...,0.092097,-0.155758,-0.05142,-0.013904,-0.01402,-0.04217,-0.089601,0.098638,33681,0
4,-0.285025,0.003682,-0.035685,-0.126177,0.978815,-0.013662,-0.007316,-0.251961,0.269508,0.003988,...,0.016449,-0.104859,-0.018788,0.19816,-0.035927,-0.004006,-0.032437,0.014226,43806,3


In [18]:
# Obtained embeddings can be used as features for model training
# For example:

from sklearn.linear_model import LogisticRegression

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_valid, y_valid = valid_df[embed_columns], valid_df['target']

model = LogisticRegression()
model.fit(x_train, y_train)
model.score(x_valid, y_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6066666666666667