## Data load

## Setup

In [2]:
%load_ext autoreload
%autoreload 2

# import logging
import torch
import pytorch_lightning as pl
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

## Data preprocessing

In [3]:
import os
import pandas as pd

# Gzipped CSV of raw transactions, read straight from the dataset URL.
transactions_url = 'https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true'
source_data = pd.read_csv(transactions_url, compression='gzip')

# Preview the first two rows.
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [4]:
# Load pretrained preprocessor
import pickle
from ptls.preprocessing import PandasDataPreprocessor

# NOTE(review): unpickling can execute arbitrary code, so only load a
# 'preprocessor.p' that comes from a trusted source.
with open('preprocessor.p', mode='rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

In [5]:
%%time

# Apply the unpickled preprocessor to the raw transactions frame.
# The result is iterated as records carrying a 'client_id' key in later cells.
dataset = preprocessor.transform(source_data)

CPU times: user 34.9 s, sys: 7.15 s, total: 42.1 s
Wall time: 42 s


In [6]:
# Order the records by client id (presumably so the seeded split that follows
# does not depend on the preprocessor's output order — TODO confirm).
dataset = list(dataset)
dataset.sort(key=lambda record: record['client_id'])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
train, test = train_test_split(dataset, **split_options)

# Sizes of the two splits.
print(*(len(split) for split in (train, test)))

24000 6000


## Inference 

### Load the SequenceEncoder obtained from `coles-emb.ipynb`

In [8]:
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.supervised import SequenceToTarget

# Architecture of the transaction encoder. load_state_dict() below copies the
# saved weights into this module, so these sizes must match the checkpoint.
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 16},
        'small_group': {'in': 250, 'out': 16},
    },
)

# GRU sequence encoder on top of the transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

# map_location='cpu' lets a checkpoint that was saved from GPU tensors be
# deserialized on a machine without CUDA; the Trainer used for inference
# decides separately which device the model runs on.
seq_encoder.load_state_dict(torch.load('coles-emb.pt', map_location='cpu'))

# Wrap the encoder for prediction and switch to evaluation mode.
# The trailing ';' suppresses the module repr in the cell output.
model = SequenceToTarget(seq_encoder)
model.eval();

### Embedding inference

In [9]:
from ptls.data_load.datasets import inference_data_loader

# One GPU when CUDA is visible to torch, otherwise none.
gpu_count = 1 if torch.cuda.is_available() else 0
trainer = pl.Trainer(gpus=gpu_count)

# Both splits go through identically configured loaders.
loader_options = dict(num_workers=0, batch_size=256)

# Stack the predicted outputs for the train split into one 2-D tensor.
train_dl = inference_data_loader(train, **loader_options)
train_batches = trainer.predict(model, train_dl)
train_embeds = torch.vstack(train_batches)

# Same for the held-out split.
test_dl = inference_data_loader(test, **loader_options)
test_batches = trainer.predict(model, test_dl)
test_embeds = torch.vstack(test_batches)

train_embeds.shape, test_embeds.shape

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

(torch.Size([24000, 256]), torch.Size([6000, 256]))

In [10]:
# Join the target label onto each split's embeddings.

# Target table keyed by client_id; the label column 'bins' is exposed as 'target'.
df_target = (
    pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
    .set_index('client_id')
    .rename(columns={"bins": "target"})
)


def _embeddings_with_target(embeds, records):
    """Build a frame of embed_<i> columns plus client_id and target.

    embeds: 2-D tensor of embeddings, one row per entry of `records`.
    records: the split (records with a 'client_id' key) that `embeds` was
        computed from, in the same order.
    """
    frame = pd.DataFrame(
        data=embeds,
        columns=[f'embed_{i}' for i in range(embeds.shape[1])],
    )
    frame['client_id'] = [record['client_id'] for record in records]
    # A left merge keeps the rows in embedding order; a client that is absent
    # from df_target ends up with a NaN target.
    return frame.merge(df_target, how='left', on='client_id')


train_df = _embeddings_with_target(train_embeds, train)
test_df = _embeddings_with_target(test_embeds, test)

print(train_df.shape, test_df.shape)
train_df.head(2)

(24000, 258) (6000, 258)


Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_248,embed_249,embed_250,embed_251,embed_252,embed_253,embed_254,embed_255,client_id,target
0,0.369912,0.107698,-0.078325,0.028368,-0.463864,-0.297325,0.182668,0.282931,0.335743,-0.470002,...,0.024539,0.325287,0.6609,-0.180033,-0.276484,-0.288909,-0.240146,0.042354,36253,1
1,0.33384,-0.006155,-0.031773,-0.203356,-0.670753,-0.421903,0.141004,0.272937,0.444377,-0.550712,...,-0.064095,0.020957,0.169336,-0.377122,-0.250504,-0.083562,-0.05057,0.01287,396,2


In [11]:
import numpy as np


def _as_embedding_column(features):
    """Pack each row of `features` into a list stored under one 'embeddings' column."""
    return pd.DataFrame({'embeddings': features.values.tolist()})


# Training split: labels, flat embed_<i> feature columns, and the packed form
# (one list-valued 'embeddings' column) used with CatBoost's embedding_features.
y_train = train_df['target'].values
X_train = train_df.drop(columns=['client_id', 'target'])
X_train_emb = _as_embedding_column(X_train)

# NOTE(review): the evaluation set is built from the `test` split. It is also
# passed as eval_set to models created with use_best_model=True, so the
# reported validation scores may be optimistic — confirm this is acceptable.
y_val = test_df['target'].values
X_val = test_df.drop(columns=['client_id', 'target'])
X_val_emb = _as_embedding_column(X_val)

## Let's use the obtained embeddings for CatBoost training

In [12]:
#!pip install catboost

from catboost import CatBoostClassifier, metrics

### With CatBoost embedding_features

In [13]:
# Settings for the classifier that consumes the list-valued 'embeddings'
# column through CatBoost's embedding_features.
# NOTE(review): depth is 5 here but 4 for the model trained on flat columns —
# confirm the difference is intended.
emb_model_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=5,
    random_seed=42,
    use_best_model=True,
    custom_metric=[metrics.Accuracy()],
    embedding_features=['embeddings'],
    logging_level='Silent',
)
CatBoostModel_emb = CatBoostClassifier(**emb_model_params)

In [14]:
%%time

# Train on the packed embeddings, evaluating on (X_val_emb, y_val).
# plot=True draws the interactive metric chart; add logging_level='Verbose'
# to the call for text output instead.
CatBoostModel_emb.fit(
    X=X_train_emb,
    y=y_train,
    eval_set=(X_val_emb, y_val),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 1min 7s, sys: 3.85 s, total: 1min 11s
Wall time: 35 s


<catboost.core.CatBoostClassifier at 0x7f2e3ac79940>

In [15]:
# Best metric values recorded during training, grouped by 'learn' and 'validation'.
CatBoostModel_emb.get_best_score()

{'learn': {'Accuracy': 0.616875, 'MultiClass': 0.8707710171950899},
 'validation': {'Accuracy': 0.5991666666666666,
  'MultiClass': 0.9140885900754535}}

### Without CatBoost embedding_features

In [16]:
# Settings for the baseline classifier trained on the flat embed_<i> columns
# (no embedding_features).
# NOTE(review): depth is 4 here but 5 for the embedding_features model —
# confirm the difference is intended.
plain_model_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=4,
    random_seed=42,
    use_best_model=True,
    custom_metric=[metrics.Accuracy()],
    logging_level='Silent',
)
CatBoostModel = CatBoostClassifier(**plain_model_params)

In [17]:
%%time

# Train on the flat embedding columns, evaluating on (X_val, y_val).
# plot=True draws the interactive metric chart; add logging_level='Verbose'
# to the call for text output instead.
CatBoostModel.fit(
    X=X_train,
    y=y_train,
    eval_set=(X_val, y_val),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 2min 30s, sys: 4.72 s, total: 2min 35s
Wall time: 7.99 s


<catboost.core.CatBoostClassifier at 0x7f2e24311100>

In [18]:
# Best metric values recorded during training, grouped by 'learn' and 'validation'.
CatBoostModel.get_best_score()

{'learn': {'Accuracy': 0.6730833333333334, 'MultiClass': 0.7880862797825388},
 'validation': {'Accuracy': 0.6115, 'MultiClass': 0.8752625132255518}}

In [19]:
# Score of the fitted baseline model on the evaluation split.
# NOTE(review): presumably accuracy; it need not equal the best 'Accuracy'
# from get_best_score() — confirm against the CatBoost docs.
CatBoostModel.score(X_val, y_val)

0.6093333333333333