# Age prediction task

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import pickle
import os
import sys
from pathlib import Path

import ptls
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import pytorch_lightning as pl

from sklearn.metrics import accuracy_score
from datetime import datetime

from tqdm import tqdm
from functools import partial

In [3]:
import json

def save_json(obj, path):
    with open(path, 'w') as file:
        json.dump(obj, file)

def load_json(path):
    with open(path, 'r') as file:
        return json.load(file)
    
import pickle

def save_pkl(obj, path):
    with open(path, 'wb') as file:
        pickle.dump(obj, file)

def load_pkl(path):
    with open(path, 'rb') as file:
        return pickle.load(file)

In [4]:
os.getcwd()

'/ptls-experiments/pytorch-lifestream/tutorials/ensemble_learning'

## Data Acquisition & Preparation

In [5]:
EXPERIMENT_DIR = 'age_pred'
!mkdir {EXPERIMENT_DIR}

EXPERIMENTS = [
    'raw',
    'composite_1',
    'composite_2'
]
BATCH_SIZE = 64
data_dir = '../../../scenario_age_pred/notebooks/data'


mkdir: cannot create directory ‘age_pred’: File exists


In [16]:
def define_coles():
    
    pl_coles_module = ptls.frames.coles.CoLESModule(
        validation_metric=ptls.frames.coles.metric.BatchRecallTopK(
            K=4,
            metric='cosine',
        ),
        seq_encoder=torch.nn.Sequential(
            ptls.nn.TrxEncoder(
                norm_embeddings=False,
                embeddings_noise=0.003,
                use_batch_norm=False,
                embeddings={
                    'weekday': {'in': 10, 'out': 8},
                    'small_group': {'in': 250, 'out': 16},
                    'event_time': {'in': 800, 'out': 8},
                },
                numeric_values={ 
                    # 'amount_rur': 'identity',
                    # 'amount_rur': 'log',
                    # 'amount_rur': LogScaler(*get_norm(df_seq_pretrain_train)),
                },
            ),
            ptls.nn.RnnEncoder(
                input_size=32,
                type='gru',
                hidden_size=128,
                is_reduce_sequence=True,
            ),
        ),
        head=ptls.nn.Head(input_size=128, use_norm_encoder=True, hidden_layers_sizes=[256, 256]),
        optimizer_partial=partial(torch.optim.Adam, lr=0.001, weight_decay=0.0),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9025)
    )
    return pl_coles_module

In [17]:
# from fedcore.api.utils.data import get_compression_input
# from fedcore.api.main import FedCore
from ptls.frames.coles.losses.contrastive_loss import ContrastiveLoss
from ptls.frames.coles.sampling_strategies.hard_negative_pair_selector import HardNegativePairSelector
from ptls.data_load.datasets import MemoryMapDataset
from ptls.fedcore_compression.fc_utils import fedcore_fit, extract_loss, get_experimental_setup

def run_training(data_dir, define_model, fedcore_setup, save_dir, n_cls=2, folds=range(5)):
    log_path = os.path.join(save_dir, 'exp.json')

    res = []
    for fold_i in tqdm(folds, desc='fold'):
        exp_res = {'fold': fold_i} 

        # data loading
        df_trx_pretrain = pd.read_pickle(f'{data_dir}/fold_{fold_i}/df_trx_pretrain.pickle')
        df_seq_pretrain = pd.read_pickle(f'{data_dir}/fold_{fold_i}/df_seq_pretrain.pickle')

        
        df_seq_pretrain_train, df_seq_pretrain_valid = train_test_split(
            df_seq_pretrain, test_size=0.5, shuffle=True, random_state=42)
        
        coles_data_module = ptls.frames.PtlsDataModule(
        train_data=ptls.frames.coles.ColesDataset(
            data=MemoryMapDataset(
                df_seq_pretrain_train.to_dict(orient='records') + 
                df_trx_pretrain.to_dict(orient='records')
            ),
            splitter=ptls.frames.coles.split_strategy.SampleSlices(
                split_count=5,
                cnt_min=25,
                cnt_max=200,
            ),
        ),
        valid_data=ptls.frames.coles.ColesDataset(
            data=MemoryMapDataset(
                df_seq_pretrain_valid.to_dict(orient='records')),
            splitter=ptls.frames.coles.split_strategy.SampleSlices(
                split_count=5,
                cnt_min=25,
                cnt_max=100,
            ),
        ),
        train_batch_size=BATCH_SIZE,
        train_num_workers=12,
        valid_batch_size=BATCH_SIZE,
        valid_num_workers=12,
        )

        # model initialization
        model = define_model()
        
        training_time_0 = datetime.now().timestamp()
        exp_setup = get_experimental_setup(fedcore_setup)[0]
        exp_setup['need_evo_opt'] = False
        exp_setup['need_fedot_pretrain'] = False
        exp_setup['distributed_compression'] = False
        fedcore_compressor = fedcore_fit(model, coles_data_module, 
                                         exp_setup, 
                                         extract_loss(model),
                                         n_cls=n_cls)
        training_time_1 = datetime.now().timestamp()
        exp_res['training_time_0'] = training_time_0
        exp_res['training_time_1'] = training_time_1

        save_json(res, log_path)

        del input_data
        del model
    return fedcore_compressor

## Without compression

In [None]:
save_path = '/ptls-experiments/compression_experiments/age_pred/raw'
fcomp = run_training(data_dir, 
                define_coles,
                fedcore_setup = 'raw',
    save_dir=save_path,
    folds=[0, 1, 2, 3, 4]
)

## Composite #1

In [None]:
save_path = '/ptls-experiments/compression_experiments/age_pred/composite_1'
fcomp_1 = run_training(data_dir, 
                define_coles,
                fedcore_setup='composite_1',
    save_dir=save_path,
    folds=[0, 1, 2, 3, 4]
)

## Composite #2

In [None]:
save_path = '/ptls-experiments/compression_experiments/age_pred/composite_2'
fcomp_1 = run_training(data_dir, 
                define_coles,
                fedcore_setup='composite_2',
    save_dir=save_path,
    folds=[0, 1, 2, 3, 4]
)

# Inference evaluation

In [None]:
from ptls.frames.inference_module import InferenceModule
from sklearn.decomposition import PCA

def eval_metric_evolution(models_dir, data_dir, save_path, folds=[0, 1, 2, 3, 4]):
    for file in tqdm(os.listdir(models_dir), 'file'):
        if not file.endswith('.pth'): continue
        path = Path(models_dir, file)
        model = torch.load(path)
        evaluate_embs(model, file, data_dir, save_path, folds)


def evaluate_embs(coles, MODEL_NAME, DATA_DIR, save_path, folds=[0, 1, 2, 3, 4]):
    BATCH_SIZE = 256
    for fold_i in tqdm(folds, desc='Fold'):
        df_gbm_train = pd.read_pickle(f'{DATA_DIR}/fold_{fold_i}/df_gbm_train.pickle')
        if 'event_time' in df_gbm_train.columns:
            df_gbm_train['trans_date'] = df_gbm_train['event_time']
        df_gbm_test = pd.read_pickle(f'{DATA_DIR}/fold_{fold_i}/df_gbm_test.pickle')


        if 'event_time' in df_gbm_test.columns:
            df_gbm_test['trans_date'] = df_gbm_test['event_time']

        print('Inference started')

        inference_dl_gbm_train = torch.utils.data.DataLoader(
            dataset=ptls.data_load.datasets.memory_dataset.MemoryMapDataset(
                df_gbm_train.to_dict(orient='records'),
                i_filters=[
                    ptls.data_load.iterable_processing.ISeqLenLimit(max_seq_len=2000), 
                ],
            ),
            collate_fn=ptls.data_load.utils.collate_feature_dict,
            shuffle=False,
            batch_size=BATCH_SIZE,
            num_workers=12,
        )

        inference_dl_gbm_test = torch.utils.data.DataLoader(
            dataset=ptls.data_load.datasets.MemoryMapDataset(
                df_gbm_test.to_dict(orient='records'),
                i_filters=[
                    ptls.data_load.iterable_processing.ISeqLenLimit(max_seq_len=2000), 
                ],
            ),
            collate_fn=ptls.data_load.utils.collate_feature_dict,
            shuffle=False,
            batch_size=BATCH_SIZE,
            num_workers=12,
        )

        inf_model = InferenceModule(
            model=coles.seq_encoder, pandas_output=True, model_out_name='emb')

        predict_gbm_train = pl.Trainer(enable_progress_bar=False, logger=None)\
        .predict(inf_model, inference_dl_gbm_train)

        predict_gbm_test = pl.Trainer(enable_progress_bar=False, logger=None)\
        .predict(inf_model, inference_dl_gbm_test)

        predict_gbm_train = pd.concat(predict_gbm_train, axis=0)

        predict_gbm_test = pd.concat(predict_gbm_test, axis=0)

        predict_gbm_train.set_index('client_id', inplace=True)
        predict_gbm_test.set_index('client_id', inplace=True)

        gbm_model = LGBMClassifier(**{
            'n_estimators': 1000,
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class': 4,
            'metric': 'multi_error',
            'learning_rate': 0.02,
            'subsample': 0.75,
            'subsample_freq': 1,
            'feature_fraction': 0.75,
            'colsample_bytree': None,
            'max_depth': 12,
            'lambda_l1': 1,
            'reg_alpha': None,
            'lambda_l2': 1,
            'reg_lambda': None,
            'min_data_in_leaf': 50,
            'min_child_samples': None,
            'num_leaves': 50,
            'random_state': 42,
            'n_jobs': 4,
        })
        print('GBM started')
        pca = PCA(0.95)
        gbm_model.fit(pca.fit_transform(predict_gbm_train.drop(columns='bins')), predict_gbm_train['bins'])

        acc = accuracy_score(
            gbm_model.predict(pca.transform(predict_gbm_test.drop(columns='bins'))), 
            predict_gbm_test['bins'],
        )

        with open(save_path, 'at') as f:
            print('\t'.join([
                MODEL_NAME,
                f'{datetime.now():%Y-%m-%d %H:%M:%S}',
                f'{fold_i}',
                'accuracy',
                f'{acc:.4f}',
        ]), file=f)

## Non Composite

In [None]:
eval_metric_evolution(
    '/ptls-experiments/compression_experiments/age_pred/raw/checkpoints',
    data_dir,
    '/ptls-experiments/compression_experiments/age_pred/raw/results.txt',
    [0, 1, 2, 3, 4]
)

## Composite #1

In [None]:
eval_metric_evolution(
    '/ptls-experiments/compression_experiments/age_pred/composite_1/checkpoints',
    data_dir,
    '/ptls-experiments/compression_experiments/age_pred/composite_1/results.txt',
    [0, 1, 2, 3, 4]
)

## Composite #2

In [None]:
eval_metric_evolution(
    '/ptls-experiments/compression_experiments/age_pred/composite_2/checkpoints',
    data_dir,
    '/ptls-experiments/compression_experiments/age_pred/composite_2/results.txt',
    [0]
)