# Baseline model

Use `AggFeatureSeqEncoder` to build hand-crafted aggregate features from the event sequences.
Estimate the quality of these features on the downstream task.

In [1]:
import pickle

import numpy as np
import pandas as pd

from pathlib import Path

In [2]:
from ptls.preprocessing import PandasDataPreprocessor


libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [3]:
from sklearn.model_selection import StratifiedKFold, train_test_split

In [4]:
# Fold to evaluate. Change this value to run the baseline on another fold.
fold_i = 4
fold_dir = f'data/fold_{fold_i}'

# Per-fold frames stored as pickles under data/fold_<fold_i>/.
df_trx_pretrain = pd.read_pickle(f'{fold_dir}/df_trx_pretrain.pickle')
df_seq_pretrain = pd.read_pickle(f'{fold_dir}/df_seq_pretrain.pickle')
df_gbm_train = pd.read_pickle(f'{fold_dir}/df_gbm_train.pickle')
df_gbm_test = pd.read_pickle(f'{fold_dir}/df_gbm_test.pickle')

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(f'{fold_dir}/pdp.pickle', 'rb') as f:
    pdp = pickle.load(f)

In [5]:
import ptls.data_load
import ptls.data_load.datasets
import ptls.frames
import ptls.frames.coles
import ptls.frames.inference_module
import ptls.nn

In [6]:
import torch

In [7]:
from functools import partial

In [8]:
# Aggregate-feature encoder over two categorical fields and one numeric field.
# The `embeddings` / `numeric_values` specs are passed through unchanged to
# AggFeatureSeqEncoder; see the ptls documentation for their exact meaning.
agg_seq_encoder = ptls.nn.AggFeatureSeqEncoder(
    embeddings={
        'weekday': {'in': 10},
        'small_group': {'in': 250},
    },
    numeric_values={
        'amount_rur': 'identity',
    },
    was_logified=False,
)

# The encoder is wrapped in a CoLESModule; later cells only read
# `pl_agg_module.seq_encoder` back out of it.
pl_agg_module = ptls.frames.coles.CoLESModule(seq_encoder=agg_seq_encoder)

In [9]:
import pytorch_lightning as pl

In [10]:
def make_inference_dataloader(df, max_seq_len=2000, batch_size=1000, num_workers=12):
    """Build a non-shuffling DataLoader over the records of `df` for inference.

    Args:
        df: pandas DataFrame; it is converted with `to_dict(orient='records')`
            and wrapped in a ptls `MemoryMapDataset`.
        max_seq_len: value passed to the `ISeqLenLimit` filter.
        batch_size: number of records per batch.
        num_workers: number of DataLoader worker processes.

    Returns:
        A `torch.utils.data.DataLoader` that collates batches with
        `ptls.data_load.utils.collate_feature_dict`.
    """
    dataset = ptls.data_load.datasets.MemoryMapDataset(
        df.to_dict(orient='records'),
        i_filters=[
            ptls.data_load.iterable_processing.ISeqLenLimit(max_seq_len=max_seq_len),
        ],
    )
    return torch.utils.data.DataLoader(
        dataset=dataset,
        collate_fn=ptls.data_load.utils.collate_feature_dict,
        shuffle=False,  # keep record order stable for inference
        batch_size=batch_size,
        num_workers=num_workers,
    )


# Train and test loaders share the same configuration; only the input frame differs.
inference_dl_gbm_train = make_inference_dataloader(df_gbm_train)
inference_dl_gbm_test = make_inference_dataloader(df_gbm_test)

In [11]:
# Inference wrapper around the aggregate-feature encoder taken from `pl_agg_module`.
# `pandas_output=True` is relied on downstream, where the predictions are
# combined with `pd.concat`.
inf_model = ptls.frames.inference_module.InferenceModule(
    model=pl_agg_module.seq_encoder,
    pandas_output=True,
    model_out_name='emb',
)

In [12]:
# Run the inference module over the train loader on a single GPU.
# The result is an iterable of per-batch outputs that is concatenated later.
# NOTE(review): the `gpus=` argument is tied to the installed
# pytorch_lightning version — confirm before upgrading.
predict_gbm_train = (
    pl.Trainer(gpus=1, enable_progress_bar=False, logger=None)
    .predict(inf_model, inference_dl_gbm_train)
)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [13]:
# Run the inference module over the test loader on a single GPU.
# The result is an iterable of per-batch outputs that is concatenated later.
# NOTE(review): the `gpus=` argument is tied to the installed
# pytorch_lightning version — confirm before upgrading.
predict_gbm_test = (
    pl.Trainer(gpus=1, enable_progress_bar=False, logger=None)
    .predict(inf_model, inference_dl_gbm_test)
)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [14]:
# Stack the per-batch prediction frames row-wise into one frame.
# The name is rebound here, so this cell is meant to run once per predict call.
predict_gbm_train = pd.concat(predict_gbm_train, axis='index')

In [15]:
# Stack the per-batch prediction frames row-wise into one frame.
# The name is rebound here, so this cell is meant to run once per predict call.
predict_gbm_test = pd.concat(predict_gbm_test, axis='index')

In [16]:
# Index both prediction frames by `client_id`. Reassign rather than mutate with
# `inplace=True`, so each statement has an explicit result and can be chained.
predict_gbm_train = predict_gbm_train.set_index('client_id')
predict_gbm_test = predict_gbm_test.set_index('client_id')

In [17]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [18]:
# Hyperparameters for the downstream LightGBM classifier (4-class target).
# Several settings use LightGBM's native names (`feature_fraction`,
# `lambda_l1`, `lambda_l2`, `min_data_in_leaf`) while the matching
# scikit-learn-style names are explicitly set to None.
# NOTE(review): presumably the None values keep the sklearn-style defaults from
# competing with the native names — confirm against LightGBM's alias handling.
gbm_params = {
    'n_estimators': 1000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_error',
    'learning_rate': 0.02,
    'subsample': 0.75,
    'subsample_freq': 1,
    'feature_fraction': 0.75,
    'colsample_bytree': None,
    'max_depth': 12,
    'lambda_l1': 1,
    'reg_alpha': None,
    'lambda_l2': 1,
    'reg_lambda': None,
    'min_data_in_leaf': 50,
    'min_child_samples': None,
    'num_leaves': 50,
    'random_state': 42,
    'n_jobs': 4,
}

gbm_model = LGBMClassifier(**gbm_params)

In [19]:
# Every column except 'bins' is a feature; 'bins' is the classification target.
train_features = predict_gbm_train.drop(columns='bins')
gbm_model.fit(train_features, predict_gbm_train['bins'])

In [20]:
# Hold-out accuracy of the GBM on the test fold.
# scikit-learn's signature is accuracy_score(y_true, y_pred); the arguments are
# passed by keyword so the true labels and the predictions cannot be swapped.
acc = accuracy_score(
    y_true=predict_gbm_test['bins'],
    y_pred=gbm_model.predict(predict_gbm_test.drop(columns='bins')),
)
acc

0.629

In [21]:
import datetime

In [22]:
# Value written to the `pretrain_logger_version` column of results.log for this run.
# NOTE(review): -1 presumably marks "no pretraining run" for this baseline — confirm.
pretrain_logger_version = -1

In [23]:
# Append one tab-separated record for this run to the shared results log:
# model name, timestamp, fold, metric name, metric value, pretrain logger version.
with open('results.log', 'at') as f:
    log_fields = [
        '02_agg_baseline',
        f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S}',
        f'{fold_i}',
        'accuracy',
        f'{acc:.4f}',
        f'{pretrain_logger_version}',
    ]
    f.write('\t'.join(log_fields) + '\n')

In [25]:
# results.log has no header row; the column names follow the field order used
# when a record is appended to the log.
result_columns = [
    'model',
    'time',
    'fold_i',
    'metric',
    'value',
    'pretrain_logger_version',
]
df_res = pd.read_csv('results.log', sep='\t', header=None, names=result_columns)

# Show the most recent records.
df_res.tail(10)

Unnamed: 0,model,time,fold_i,metric,value,pretrain_logger_version
0,02_coles_baseline,2024-01-17 01:55:47,0,accuracy,0.6245,8
1,02_agg_baseline,2024-01-17 08:54:59,0,accuracy,0.6297,-1
2,02_agg_baseline,2024-01-17 08:57:20,1,accuracy,0.635,-1
3,02_agg_baseline,2024-01-17 09:00:16,2,accuracy,0.6362,-1
4,02_agg_baseline,2024-01-17 09:03:30,3,accuracy,0.6325,-1
5,02_agg_baseline,2024-01-17 09:07:43,4,accuracy,0.629,-1


In [30]:
# Per-model summary of the logged metric values: mean, standard deviation and
# the sorted list of individual values.
(
    df_res
    .groupby('model')['value']
    .agg(['mean', 'std', lambda x: sorted(x)])
)

Unnamed: 0_level_0,mean,std,<lambda_0>
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
02_agg_baseline,0.63248,0.003163,"[0.629, 0.6297, 0.6325, 0.635, 0.6362]"
02_coles_baseline,0.6245,,[0.6245]
