In [1]:
import pickle

import numpy as np
import pandas as pd
import torch

from pathlib import Path
from functools import partial
import datetime

from ptls.preprocessing import PandasDataPreprocessor
from ptls.nn.normalization import L2NormEncoder

from sklearn.model_selection import StratifiedKFold, train_test_split

import ptls.data_load
import ptls.data_load.datasets
import ptls.frames
import ptls.frames.coles
import ptls.frames.inference_module
import ptls.nn

import pytorch_lightning as pl
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [2]:
# ---- Run configuration -------------------------------------------------------
MODEL_NAME = 'multi_coles_static_adaptive_idx_7'  # tag written to the results log
EMBED_COEF = 0.05  # forwarded as `embed_coef` when the multi-CoLES module is built
fold_i = 4         # selects the idx_data/fold_{fold_i}/ directory
gpu_n = 0          # device index handed to pl.Trainer(gpus=[...])

# ---- Per-fold artefacts ------------------------------------------------------
# The four frames share one naming scheme, so they are read in a single pass
# (same files, same order as reading them one by one).
(df_trx_pretrain,
 df_seq_pretrain,
 df_gbm_train,
 df_gbm_test) = (
    pd.read_pickle(f'idx_data/fold_{fold_i}/{frame_name}.pickle')
    for frame_name in ('df_trx_pretrain', 'df_seq_pretrain', 'df_gbm_train', 'df_gbm_test')
)

# NOTE: unpickling executes arbitrary code - only load files produced by this project.
with open(f'idx_data/fold_{fold_i}/pdp.pickle', 'rb') as f:
    pdp = pickle.load(f)

In [3]:
# Hold out 5% of the pretraining sequences for validation (fixed seed => same split every run).
df_seq_pretrain_train, df_seq_pretrain_valid = train_test_split(
    df_seq_pretrain, test_size=0.05, shuffle=True, random_state=42)


coles_data_module = ptls.frames.PtlsDataModule(
    # Training set: the train part of the split plus every record of df_trx_pretrain.
    train_data=ptls.frames.coles.ColesDataset(
        data=ptls.data_load.datasets.MemoryMapDataset(
            df_seq_pretrain_train.to_dict(orient='records') +
            df_trx_pretrain.to_dict(orient='records')
        ),
        splitter=ptls.frames.coles.split_strategy.SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    # Validation set: the held-out part of the split.
    # Fix: this used to be built from `df_seq_pretrain_train`, so the hold-out was never
    # used and the validation metric was computed on sequences seen during training.
    valid_data=ptls.frames.coles.ColesDataset(
        data=ptls.data_load.datasets.MemoryMapDataset(
            df_seq_pretrain_valid.to_dict(orient='records')),
        splitter=ptls.frames.coles.split_strategy.SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=100,
        ),
    ),
    train_batch_size=64,
    train_num_workers=4,
    valid_batch_size=650,
)

In [4]:
# Disabled cell: the single-encoder CoLESModule definition below is wrapped in a string
# literal, so none of it executes - the notebook only echoes the string as the cell output.
# Compared with the encoder built later in this notebook it has no 'index_0' embedding and
# uses input_size=33 instead of 41.
# NOTE(review): presumably kept as a reference for how the first-stage encoder was
# configured - confirm, or delete the cell.
'''
pl_coles_module = ptls.frames.coles.CoLESModule(
    validation_metric=ptls.frames.coles.metric.BatchRecallTopK(
        K=4,
        metric='cosine',
    ),
    seq_encoder=ptls.nn.RnnSeqEncoder(
        trx_encoder=ptls.nn.TrxEncoder(norm_embeddings=False,
                                       embeddings_noise=0.003,
                                       embeddings={
                                               'weekday': {'in': 10, 'out': 8},
                                               'small_group': {'in': 250, 'out': 16},
                                               'event_time': {'in': 800, 'out': 8},
                                           },
                                       numeric_values={ 
                                               'amount_rur': 'log',
                                           },
                                       ),
        
        input_size=33,
        type='gru',
        hidden_size=400,
        is_reduce_sequence=True,
    ),
    head=ptls.nn.Head(use_norm_encoder=True),
    loss=ptls.frames.coles.losses.ContrastiveLoss(
        margin=1.,
        sampling_strategy=ptls.frames.coles.sampling_strategies.HardNegativePairSelector(
          neg_count=5,
        ),
    ),
    optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9025)
)
'''

"\npl_coles_module = ptls.frames.coles.CoLESModule(\n    validation_metric=ptls.frames.coles.metric.BatchRecallTopK(\n        K=4,\n        metric='cosine',\n    ),\n    seq_encoder=ptls.nn.RnnSeqEncoder(\n        trx_encoder=ptls.nn.TrxEncoder(norm_embeddings=False,\n                                       embeddings_noise=0.003,\n                                       embeddings={\n                                               'weekday': {'in': 10, 'out': 8},\n                                               'small_group': {'in': 250, 'out': 16},\n                                               'event_time': {'in': 800, 'out': 8},\n                                           },\n                                       numeric_values={ \n                                               'amount_rur': 'log',\n                                           },\n                                       ),\n        \n        input_size=33,\n        type='gru',\n        hidden_size=400,\n        is_reduc

In [5]:
# Disabled cell: first-stage training and checkpoint saving, wrapped in a string literal so
# nothing here runs (the string is merely echoed as the cell output).
# Note that the file name built below uses MODEL_NAME, which differs from the
# 'coles_first_model_idx_7' name that the following cells wait for.
# NOTE(review): the awaited checkpoint is therefore presumably written by another
# notebook/run - confirm.
'''
trainer = pl.Trainer(
    gpus=[gpu_n],
    max_epochs=150,
    enable_checkpointing=False,
    enable_progress_bar=True,
    gradient_clip_val=0.5,
    gradient_clip_algorithm="value",
    track_grad_norm = 2,
)

pretrain_logger_version = trainer.logger.version
print(f'pretrain_logger_version = {pretrain_logger_version}')
trainer.fit(pl_coles_module, coles_data_module)

first_model_name = '_'.join(['first_model', MODEL_NAME, str(fold_i)]) + '.pth'
torch.save(pl_coles_module.seq_encoder.state_dict(), first_model_name)
'''

'\ntrainer = pl.Trainer(\n    gpus=[gpu_n],\n    max_epochs=150,\n    enable_checkpointing=False,\n    enable_progress_bar=True,\n    gradient_clip_val=0.5,\n    gradient_clip_algorithm="value",\n    track_grad_norm = 2,\n)\n\npretrain_logger_version = trainer.logger.version\nprint(f\'pretrain_logger_version = {pretrain_logger_version}\')\ntrainer.fit(pl_coles_module, coles_data_module)\n\nfirst_model_name = \'_\'.join([\'first_model\', MODEL_NAME, str(fold_i)]) + \'.pth\'\ntorch.save(pl_coles_module.seq_encoder.state_dict(), first_model_name)\n'

In [6]:
# File name of the first encoder's saved weights for this fold; later passed to the
# multi-CoLES module via `trained_encoders`.
first_model_name = '{}_{}_{}.pth'.format('first_model', 'coles_first_model_idx_7', fold_i)

In [7]:
import os
import time

# Block until the first-stage checkpoint shows up in the current working directory,
# re-checking once a minute. There is no timeout: interrupt the kernel to abort.
while True:
    if first_model_name in os.listdir():
        break
    time.sleep(60)

In [8]:
class ResNet(torch.nn.Module):
    """Residual block of width ``h``.

    Two consecutive ``Linear(h, h) -> BatchNorm1d(h) -> ReLU`` stages whose output is
    added to the block input (identity skip connection).
    """

    def __init__(self, h):
        super().__init__()
        stages = []
        for _ in range(2):
            # Same layer order as writing the six layers out by hand, so the
            # ``net.<idx>`` state-dict keys are unchanged.
            stages += [torch.nn.Linear(h, h), torch.nn.BatchNorm1d(h), torch.nn.ReLU()]
        self.net = torch.nn.Sequential(*stages)

    def forward(self, inp):
        """Return ``net(inp) + inp``; ``inp`` must have ``h`` features on its last axis."""
        transformed = self.net(inp)
        return transformed + inp


class ClfDisc(torch.nn.Module):
    """Two-branch discriminator scoring a pair of embeddings by negative distance.

    Each input goes through its own branch: ``Linear -> BatchNorm1d -> ReLU``, three
    ``ResNet`` blocks and a final ``L2NormEncoder``. The score is minus the Euclidean
    distance between the two branch outputs.

    Args:
        inp1: feature size of ``domain_a`` (input of branch ``a``).
        inp2: feature size of ``domain_b`` (input of branch ``b``).
        h: hidden width shared by both branches.
    """

    def __init__(self, inp1=400, inp2=400, h=512):
        super().__init__()
        self.a = self._make_branch(inp1, h)
        # Fix: branch ``b`` was previously built with ``inp1``, which left ``inp2`` unused
        # and made the module fail whenever the two inputs had different widths.
        self.b = self._make_branch(inp2, h)

    @staticmethod
    def _make_branch(in_features, h):
        """Build one projection branch mapping ``in_features`` -> ``h``."""
        return torch.nn.Sequential(
            torch.nn.Linear(in_features, h), torch.nn.BatchNorm1d(h), torch.nn.ReLU(),
            ResNet(h), ResNet(h), ResNet(h), L2NormEncoder())

    def forward(self, domain_a, domain_b):
        """Return ``-||a(domain_a) - b(domain_b)||_2`` with the last axis kept (size 1)."""
        a = self.a(domain_a)
        b = self.b(domain_b)
        return -torch.sqrt(((a - b) ** 2).sum(axis=-1, keepdims=True))


class IdentityDisc(torch.nn.Module):
    """Parameter-free "discriminator": scaled squared distance between two embeddings.

    Args:
        inp: feature size of the compared embeddings.
        h: reference width; the score is multiplied by ``k = h / inp``.

    NOTE(review): unlike ``ClfDisc`` this returns a non-negated *squared* distance -
    confirm that the differing sign/scale convention is intended.
    """

    def __init__(self, inp=400, h=512):
        super().__init__()
        # Fix: honour the constructor arguments (they were ignored in favour of the
        # hard-coded 400 / 512, which are still the defaults).
        self.inp = inp
        self.h = h
        self.k = self.h / self.inp
        # Dummy scalar parameter: contributes ``0 * lame`` to the output, so the module
        # still exposes one parameter (presumably so an optimizer can be created for it).
        self.register_parameter('lame', torch.nn.Parameter(torch.tensor(0.)))

    def forward(self, domain_a, domain_b):
        """Return ``k * sum((domain_a - domain_b)**2)`` over the last axis (kept, size 1)."""
        return ((domain_a - domain_b)**2).sum(axis=-1, keepdims=True) * self.k + 0 * self.lame

In [9]:
# ---- Losses ------------------------------------------------------------------
coles_loss = ptls.frames.coles.losses.ContrastiveLoss(
    margin=1.,
    sampling_strategy=ptls.frames.coles.sampling_strategies.HardNegativePairSelector(
        neg_count=5,
    ),
)
club_loss = ptls.frames.coles.losses.CLUBLoss(
    log_var=-2,
    emb_coef=1,
    prob_coef=1.,
)

# ---- Discriminator -----------------------------------------------------------
# Alternative kept for reference (plain MLP instead of ClfDisc):
#   torch.nn.Sequential(torch.nn.Linear(400, 512), torch.nn.BatchNorm1d(512), torch.nn.ReLU(),
#                       torch.nn.Linear(512, 512), torch.nn.BatchNorm1d(512), torch.nn.ReLU(),
#                       torch.nn.Linear(512, 400), ptls.nn.L2NormEncoder())
discriminator_model = ClfDisc()

# ---- Sequence encoder --------------------------------------------------------
# input_size=41 matches the TrxEncoder output: 8 + 16 + 8 + 8 embedding dims
# plus the single numeric feature.
seq_encoder = ptls.nn.RnnSeqEncoder(
    trx_encoder=ptls.nn.TrxEncoder(
        norm_embeddings=False,
        embeddings_noise=0.003,
        embeddings={
            'weekday': {'in': 10, 'out': 8},
            'small_group': {'in': 250, 'out': 16},
            'event_time': {'in': 800, 'out': 8},
            'index_0': {'in': 7, 'out': 8},
            # 'index_1': {'in': 47, 'out': 16},
            # 'index_2': {'in': 157, 'out': 16},
        },
        numeric_values={'amount_rur': 'log'},
    ),
    input_size=41,
    type='gru',
    hidden_size=400,
    is_reduce_sequence=True,
)

# ---- Lightning module --------------------------------------------------------
pl_coles_module = ptls.frames.coles.MultiCoLESModule(
    # model parts
    seq_encoder=seq_encoder,
    head=ptls.nn.Head(use_norm_encoder=True),
    discriminator=discriminator_model,
    trained_encoders=[first_model_name],
    # objectives
    loss=coles_loss,
    discriminator_loss=club_loss,
    coles_coef=1.,
    embed_coef=EMBED_COEF,
    # optimisation
    optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
    d_optimizer_partial=partial(torch.optim.Adam, lr=0.01),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9025),
    g_step_every=1,
    # remaining MultiCoLESModule hyper-parameters (semantics defined by that class)
    ema_alpha=0.01,
    gamma_max=0.97,
    gamma_min=0.85,
    delta_coef=0.0001,
    delta_up_coef=10,
)

In [10]:
# Train the multi-CoLES module on a single GPU; checkpoints are not written.
trainer = pl.Trainer(
    max_epochs=30,  # 150,
    gpus=[gpu_n],
    track_grad_norm=2,
    enable_progress_bar=True,
    enable_checkpointing=False,
)

# Read the logger version up front; it is written to the results log further down.
pretrain_logger_version = trainer.logger.version
print(f'pretrain_logger_version = {pretrain_logger_version}')

trainer.fit(pl_coles_module, coles_data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


pretrain_logger_version = 168


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name                    | Type            | Params
------------------------------------------------------------
0 | _loss                   | ContrastiveLoss | 0     
1 | _seq_encoder            | RnnSeqEncoder   | 542 K 
2 | _validation_metric      | BatchRecallTopK | 0     
3 | discriminator_loss      | CLUBLoss        | 0     
4 | trained_models          | ModuleList      | 542 K 
5 | discriminator           | ClfDisc         | 3.6 M 
6 | reference_discriminator | ClfDisc         | 3.6 M 
7 | _head                   | Head            | 0     
------------------------------------------------------------
7.7 M     Trainable params
542 K     Non-trainable params
8.2 M     Total params
32.955    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [11]:
def _make_inference_loader(df):
    """Wrap the records of ``df`` in an unshuffled DataLoader for embedding inference."""
    dataset = ptls.data_load.datasets.MemoryMapDataset(
        df.to_dict(orient='records'),
        i_filters=[
            ptls.data_load.iterable_processing.ISeqLenLimit(max_seq_len=2000),
        ],
    )
    return torch.utils.data.DataLoader(
        dataset=dataset,
        collate_fn=ptls.data_load.utils.collate_feature_dict,
        shuffle=False,
        batch_size=1000,
        num_workers=12,
    )


def _run_inference(loader):
    """Run ``inf_model`` over ``loader`` with a fresh Trainer; returns the per-batch outputs."""
    inference_trainer = pl.Trainer(gpus=[gpu_n], enable_progress_bar=False, logger=None)
    return inference_trainer.predict(inf_model, loader)


inference_dl_gbm_train = _make_inference_loader(df_gbm_train)
inference_dl_gbm_test = _make_inference_loader(df_gbm_test)

# pandas_output=True makes every predicted batch a DataFrame; embeddings are named 'emb'.
inf_model = ptls.frames.inference_module.InferenceModule(
    model=pl_coles_module, pandas_output=True, model_out_name='emb')

predict_gbm_train = _run_inference(inference_dl_gbm_train)
predict_gbm_test = _run_inference(inference_dl_gbm_test)

# Stack the per-batch frames and index them by client.
predict_gbm_train = pd.concat(predict_gbm_train, axis=0).set_index('client_id')
predict_gbm_test = pd.concat(predict_gbm_test, axis=0).set_index('client_id')

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


In [12]:
# Downstream evaluation: fit LightGBM on the embeddings with 5 different seeds and
# append one tab-separated accuracy line per seed to the results log.

# Hyper-parameters shared by every seed. The sklearn-style names (colsample_bytree,
# reg_alpha, reg_lambda, min_child_samples) are set to None next to their LightGBM
# aliases - presumably to avoid alias-conflict warnings; confirm.
gbm_params = {
    'n_estimators': 1000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_error',
    'learning_rate': 0.02,
    'subsample': 0.75,
    'subsample_freq': 1,
    'feature_fraction': 0.75,
    'colsample_bytree': None,
    'max_depth': 12,
    'lambda_l1': 1,
    'reg_alpha': None,
    'lambda_l2': 1,
    'reg_lambda': None,
    'min_data_in_leaf': 50,
    'min_child_samples': None,
    'num_leaves': 50,
    'n_jobs': 4,
}

# Loop-invariant feature / target frames, built once instead of on every iteration.
gbm_x_train = predict_gbm_train.drop(columns='bins')
gbm_y_train = predict_gbm_train['bins']
gbm_x_test = predict_gbm_test.drop(columns='bins')
gbm_y_test = predict_gbm_test['bins']

for gbm_i in range(5):
    gbm_seed = 42 + gbm_i
    gbm_model = LGBMClassifier(**gbm_params, random_state=gbm_seed)
    gbm_model.fit(gbm_x_train, gbm_y_train)

    # accuracy_score's signature is (y_true, y_pred); the arguments used to be swapped
    # (harmless for accuracy, but wrong for any asymmetric metric dropped in later).
    acc = accuracy_score(gbm_y_test, gbm_model.predict(gbm_x_test))

    with open('my_results_idx.log', 'at') as f:
        print('\t'.join([
            MODEL_NAME,
            f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S}',
            f'{fold_i}',
            'accuracy',
            f'{acc:.4f}',
            f'{pretrain_logger_version}',
            'gbm_seed',
            f'{gbm_seed}'
        ]), file=f)
    print(acc)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.165010 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 204000
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 800
[LightGBM] [Info] Start training from score -1.394999
[LightGBM] [Info] Start training from score -1.386794
[LightGBM] [Info] Start training from score -1.378326
[LightGBM] [Info] Start training from score -1.385128
0.606
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.168721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 204000
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 800
[LightGBM] [Info] Start training from score -1.394999
[LightGBM] [Info] Start training from score -1.386794
[LightGBM] [Info] Start training from score -1.378326
[LightGBM] [Info] Start training