In [1]:
import os
import pandas as pd
from ptls.nn.trx_encoder.glove_embedding import GloveEmbedding
import numpy as np
from functools import partial
from ptls.nn import RnnSeqEncoder, TrxEncoder
from ptls.nn.trx_encoder.trx_encoder_tlf import TrxEncoderTLF
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
import ptls
import torch
import pytorch_lightning as pl
import pickle as pkl
from ptls.frames.coles.losses import ContrastiveLoss

import logging

import ptls
from ptls.preprocessing.deeptlf.src import DeepTLF, TreeDrivenEncoder



## Datasets

#### #1 Age bins

In [2]:
data_path = 'data/age_bins'

# Column layout of the age-bins transactions table.
df_params = dict(
    features=["amount_rur", "small_group"],
    cat_cols=["small_group"],
    numeric_cols=["amount_rur"],
    cat_unique=[],  # filled in once the transactions are loaded
    date_col="trans_date",
    id_col="client_id",
)

In [3]:
source_data = pd.read_csv(os.path.join(data_path, 'transactions_train.csv'))

# Distinct-value count for every categorical column, followed by the date column.
# The list is assigned rather than appended to, so re-running this cell cannot
# grow `cat_unique` (df_params itself is created in a different cell).
# dropna=False counts a missing value as one distinct value.
df_params["cat_unique"] = [
    source_data[col].nunique(dropna=False)
    for col in df_params["cat_cols"] + [df_params["date_col"]]
]

#### #2 Gender

In [2]:
data_path = 'data/gender'

# Load the gender transactions and discard the terminal id column.
transactions_file = os.path.join(data_path, 'transactions.csv')
source_data = pd.read_csv(transactions_file).drop(columns=["term_id"])

# Keep only the first whitespace-separated token of each timestamp, as an int.
source_data["tr_datetime"] = [
    int(stamp.split()[0]) for stamp in source_data["tr_datetime"].values
]

# Column layout of the gender transactions table.
df_params = dict(
    features=["mcc_code", "tr_type", "amount"],
    numeric_cols=["amount"],
    cat_cols=["mcc_code", "tr_type"],
    cat_unique=[],
    date_col="tr_datetime",
    id_col="customer_id",
)

# Distinct-value count per categorical column, then for the date column.
for col in [*df_params["cat_cols"], df_params["date_col"]]:
    df_params["cat_unique"].append(len(source_data[col].unique()))

#### #3 rosbank2 (TODO: loading cell not added yet)

## DeepTLF Encoding

#### Encoding

In [4]:
# Hyper-parameters forwarded unchanged to the DeepTLF constructor.
params = dict(
    n_est=20,
    max_depth=6,
    xgb_lr=0.01,
    min_freq=5,
)

tree_encoder = DeepTLF(**params)

# Fit on the configured feature columns only; the value returned by fit() is
# kept so that its length can be inspected below.
feature_frame = source_data[df_params["features"]]
split_conditions = tree_encoder.fit(feature_frame)
len(split_conditions)
#encoded_data = tree_encoder.transform(source_data)

1960

In [5]:
# Moved into the TLF trx encoder; the steps below are kept for reference only.

# df_params["features"] = [f"ef_{i}" for i in range(encoded_data.shape[1])]
# encoded_data = pd.DataFrame(encoded_data, columns=df_params["features"])
# encoded_data[df_params['id_col']] = source_data[df_params['id_col']]
# encoded_data[df_params['date_col']] = source_data[df_params['date_col']]
# del source_data

#### Preprocessing

In [5]:
from sklearn.model_selection import train_test_split

# Turn the flat transaction table into one record per client.
# NOTE(review): every entry of df_params["features"] -- including the
# categorical ones -- is passed as numerical here; presumably intentional
# because the tree encoder consumes raw values. Confirm.
preprocessor = PandasDataPreprocessor(
    col_id=df_params['id_col'],
    col_event_time=df_params['date_col'],
    event_time_transformation='none',
    cols_numerical=df_params["features"],
    return_records=True,
)

dataset = preprocessor.fit_transform(source_data)
# Order the records by client id before the seeded split (presumably so the
# split does not depend on the order fit_transform happens to return).
dataset = sorted(dataset, key=lambda x: x[df_params['id_col']])

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

# Release the large intermediates; only the two splits are used from here on.
del dataset, source_data

# Kept as the final expression so the notebook actually displays the sizes
# (in the middle of the cell the value was silently discarded).
len(train), len(test)

In [27]:
with open("data/age_bins/train_encoded.pkl", "wb") as f:
    pkl.dump(train, f, protocol=pkl.HIGHEST_PROTOCOL)
with open("data/age_bins/test_encoded.pkl", "wb") as f:
    pkl.dump(test, f, protocol=pkl.HIGHEST_PROTOCOL)

In [3]:
with open("data/age_bins/train_encoded.pkl", "rb") as f:
    train = pkl.load(f)
with open("data/age_bins/test_encoded.pkl", "rb") as f:
    test = pkl.load(f)

## CoLES training

In [6]:
#Basic trx encoder

# One embedding spec per categorical column plus the date column; the input
# size comes from the entry of `cat_unique` at the same position, the output
# size is fixed at 16.
embedded_cols = df_params["cat_cols"] + [df_params["date_col"]]
embeddings = {
    col: {'in': df_params["cat_unique"][pos], 'out': 16}
    for pos, col in enumerate(embedded_cols)
}

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={col: 'identity' for col in df_params['numeric_cols']},
    embeddings=embeddings,
)

# GRU sequence encoder on top of the plain transaction encoder.
basic_trx_encoder = TrxEncoder(**trx_encoder_params)
seq_encoder = RnnSeqEncoder(
    trx_encoder=basic_trx_encoder,
    hidden_size=256,
    type='gru',
)

amount_rur


In [6]:
#TLF trx encoder

# Transaction encoder built around the fitted tree encoder; it replaces any
# `seq_encoder` defined by an earlier cell.
tlf_trx_encoder = TrxEncoderTLF(
    encoder=tree_encoder,
    feature_names=df_params["features"],
)
seq_encoder = RnnSeqEncoder(
    trx_encoder=tlf_trx_encoder,
    hidden_size=256,
    type='gru',
)

In [8]:
# Factories handed to the module: Adam at lr=0.001 and a StepLR schedule that
# multiplies the learning rate by 0.9 every 30 scheduler steps.
adam_factory = partial(torch.optim.Adam, lr=0.001)
step_lr_factory = partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=adam_factory,
    lr_scheduler_partial=step_lr_factory,
)

In [9]:
# Training records wrapped with a sequence-length filter (min_seq_len=25).
coles_records = MemoryMapDataset(
    data=train,
    i_filters=[SeqLenFilter(min_seq_len=25)],
)

# Sub-sequence sampler used by the CoLES dataset.
slice_sampler = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)

train_dl = PtlsDataModule(
    train_data=ColesDataset(coles_records, splitter=slice_sampler),
    train_num_workers=1,
    train_batch_size=128,
)

trainer = pl.Trainer(
    enable_progress_bar=True,
    accelerator="cuda",
    #devices=1,
    max_epochs=12,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
# Report which logger version this run writes to before training starts.
print(f'logger.version = {trainer.logger.version}')
# Train the CoLES module on the training data module.
trainer.fit(model, train_dl)
# Show the metrics the trainer logged during fitting.
print(trainer.logged_metrics)

logger.version = 195


  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 1.5 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
5.924     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=12` reached.


{'loss': tensor(68.1145), 'seq_len': tensor(100.2608)}


In [12]:
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the encoder weights.
os.makedirs("models", exist_ok=True)
torch.save(seq_encoder.state_dict(), "models/coles-tlf1650-gen.pt")

In [7]:
# map_location="cpu" lets a checkpoint that holds CUDA tensors load on a
# machine without a GPU; load_state_dict then copies the values into the
# encoder's existing parameters.
encoder_weights = torch.load("models/coles-tlf1800-age.pt", map_location="cpu")
seq_encoder.load_state_dict(encoder_weights)

# Re-wrap the encoder with the restored weights in a fresh CoLES module.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

## Testing embeddings via different models

In [10]:
from ptls.data_load.datasets import inference_data_loader

# Embed the training split: predict batch by batch, then stack the batches.
train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_batches = trainer.predict(model, train_dl)
train_embeds = torch.vstack(train_batches)

# Same procedure for the held-out split.
test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_batches = trainer.predict(model, test_dl)
test_embeds = torch.vstack(test_batches)

train_embeds.shape, test_embeds.shape

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

(torch.Size([24000, 256]), torch.Size([6000, 256]))

In [11]:
def _embeddings_with_target(embeds, records, df_target, id_col):
    """Build a frame of embedding columns plus the id column and merged target.

    Args:
        embeds: 2-D array-like (or torch tensor) with one row per record.
        records: sequence of dict-like records, aligned row by row with
            ``embeds``; each must contain ``id_col``.
        df_target: target frame indexed by ``id_col``.
        id_col: name of the client id column.

    Returns:
        DataFrame with ``embed_<i>`` columns, ``id_col`` and the columns of
        ``df_target`` (left join, so the row order of ``embeds`` is kept).
    """
    # Hand pandas a plain array instead of a torch tensor.
    if hasattr(embeds, "detach"):
        embeds = embeds.detach().cpu().numpy()
    frame = pd.DataFrame(data=embeds, columns=[f'embed_{i}' for i in range(embeds.shape[1])])
    frame[id_col] = [rec[id_col] for rec in records]
    return frame.merge(df_target, how='left', on=id_col)


def get_train_test_age_bins_scenario(df_params, train_embeds, test_embeds,
                                     train_records=None, test_records=None,
                                     data_path="data/age_bins"):
    """Attach the age-bin target to the train/test embeddings.

    Args:
        df_params: dataset config; only ``df_params["id_col"]`` is read.
        train_embeds, test_embeds: embedding matrices, one row per record.
        train_records, test_records: records the embeddings were computed
            from. When omitted, the notebook-level ``train`` / ``test``
            variables are used, which keeps existing three-argument calls working.
        data_path: directory containing ``train_target.csv``.

    Returns:
        ``(train_df, test_df)`` with ``embed_*``, id and ``target`` columns.
    """
    id_col = df_params["id_col"]
    # Fall back to the notebook globals only when no records were passed in.
    train_records = train if train_records is None else train_records
    test_records = test if test_records is None else test_records

    df_target = (
        pd.read_csv(os.path.join(data_path, 'train_target.csv'))
        .set_index(id_col)
        .rename(columns={"bins": "target"})
    )

    train_df = _embeddings_with_target(train_embeds, train_records, df_target, id_col)
    test_df = _embeddings_with_target(test_embeds, test_records, df_target, id_col)
    return train_df, test_df

def get_train_test_gender_scenario(df_params, train_embeds, test_embeds,
                                   train_records=None, test_records=None,
                                   data_path="data/gender"):
    """Attach the gender target to the train/test embeddings.

    Args:
        df_params: dataset config; only ``df_params["id_col"]`` is read.
        train_embeds, test_embeds: embedding matrices, one row per record.
        train_records, test_records: records the embeddings were computed
            from. When omitted, the notebook-level ``train`` / ``test``
            variables are used, which keeps existing three-argument calls working.
        data_path: directory containing ``gender_train.csv``.

    Returns:
        ``(train_df, test_df)`` with ``embed_*``, id and ``target`` columns.
        Every missing value in the result (e.g. the target of a client absent
        from the target file) is replaced by 2.
    """
    id_col = df_params["id_col"]
    # Fall back to the notebook globals only when no records were passed in.
    train_records = train if train_records is None else train_records
    test_records = test if test_records is None else test_records

    df_target = (
        pd.read_csv(os.path.join(data_path, 'gender_train.csv'))
        .set_index(id_col)
        .rename(columns={"gender": "target"})
    )

    train_df = _embeddings_with_target(train_embeds, train_records, df_target, id_col)
    test_df = _embeddings_with_target(test_embeds, test_records, df_target, id_col)
    # fillna applies to the whole frame, exactly as before; 2 serves as the
    # label for clients without a known gender.
    train_df = train_df.fillna(2)
    test_df = test_df.fillna(2)
    return train_df, test_df

In [12]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

In [13]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

train_df, test_df = get_train_test_age_bins_scenario(df_params, train_embeds, test_embeds)

#train_df, test_df = get_train_test_gender_scenario(df_params, train_embeds, test_embeds)

# Features are the embedding columns; the label is the merged `target` column.
embed_columns = [name for name in train_df.columns if name.startswith('embed')]

x_train = train_df[embed_columns]
y_train = train_df['target']

x_test = test_df[embed_columns]
y_test = test_df['target']

In [14]:
from ptls.frames.coles.sampling_strategies import HardNegativePairSelector

In [None]:
# Evaluate the contrastive loss directly on the test embeddings and labels.
cl = ContrastiveLoss(margin=0.5, sampling_strategy=HardNegativePairSelector(neg_count=5))
# Convert both inputs through NumPy: building a tensor straight from a pandas
# Series goes through label-based indexing and depends on the Series index.
cl(torch.tensor(x_test.values), torch.tensor(y_test.to_numpy()))

In [41]:
# Scratch arithmetic. NOTE(review): 6000 matches the number of test embeddings
# shown above; presumably this is a per-sample average of a summed value --
# confirm the source of 79563808 or remove this cell.
79563808/6000

13260.634666666667

In [None]:
#79563808 main

#### Random forest classifier

In [17]:
sc = 0
num_iters = 3

# Sum the test score of several independently seeded forests. A distinct,
# fixed random_state per repeat keeps the variation between fits while making
# the reported average reproducible across notebook runs.
for i in range(num_iters):
    clf = RandomForestClassifier(random_state=i)
    clf.fit(x_train, y_train)
    sc += clf.score(x_test, y_test)

In [18]:
# Average of the test scores accumulated over the num_iters random-forest fits.
sc/num_iters

0.4481111111111111

#### Gradient boosting classifier (LightGBM)

In [19]:
# Gradient-boosted trees on the embeddings, default hyper-parameters.
clf = lgb.LGBMClassifier()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the conventional order avoids mistakes if the metric is
# later swapped for an asymmetric one.
accuracy_score(y_test, y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64973
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 256
[LightGBM] [Info] Start training from score -1.161020
[LightGBM] [Info] Start training from score -1.394999
[LightGBM] [Info] Start training from score -0.823256


0.43633333333333335

#### KNeighbors classifier

In [20]:
# 5-nearest-neighbours baseline on the embeddings; fit() returns the estimator
# itself, so construction and fitting can be chained.
neigh = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)

# Score on the held-out split.
neigh.score(x_test, y_test)

0.396