In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


In [2]:
%cd ../

/home/nikita/ML/work_repo/vtb_competition/vtb_data_fusion_contest


In [3]:
from glob import glob

import numpy as np
import pandas as pd
import torch

import pytorch_lightning as pl

from pyhocon import ConfigFactory

In [4]:
FOLD_ID = 1

fold_id_test = FOLD_ID

folds_count = len(glob('data/train_matching_*.csv'))
folds_count

6

In [5]:
# fold_id_valid = np.random.choice([i for i in range(folds_count) if i != fold_id_test], size=1)[0]
fold_id_valid = (fold_id_test + 1) % folds_count
fold_id_valid

2

In [6]:
df_matching_train = pd.concat([pd.read_csv(f'data/train_matching_{i}.csv')
                              for i in range(folds_count) 
                              if i not in (fold_id_test, fold_id_valid)])
df_matching_valid = pd.read_csv(f'data/train_matching_{fold_id_valid}.csv')
df_matching_test = pd.read_csv(f'data/train_matching_{fold_id_test}.csv')

[len(df) for df in [df_matching_train, df_matching_valid, df_matching_test]]

[11721, 2930, 2930]

In [7]:
def click_types(df):
    df['cat_id'] = df['cat_id'].astype(str)
    return df[['user_id', 'timestamp', 'cat_id']]

def click_pivot(df):
    clickstream_embed = df.pivot_table(index = 'user_id', 
                            values=['timestamp'],
                            columns=['cat_id'],
                            aggfunc=['count']).fillna(0)
    clickstream_embed.columns = [f'{str(i[0])}-{str(i[2])}' for i in clickstream_embed.columns]
    clickstream_embed.loc['0'] = np.empty(len(clickstream_embed.columns))

    dtype_clickstream = list()
    for x in clickstream_embed.dtypes.tolist():
        if x=='int64':
            dtype_clickstream.append('int16')
        elif(x=='float64'):
            dtype_clickstream.append('float32')
        else:
            dtype_clickstream.append('object')

    dtype_clickstream = dict(zip(clickstream_embed.columns.tolist(), dtype_clickstream))
    clickstream_embed = clickstream_embed.astype(dtype_clickstream)
    clickstream_embed.reset_index(drop=False, inplace=True)
    return clickstream_embed

In [8]:
%%time

df_click = pd.read_csv(f'data/clickstream.csv')
df_click = click_types(df_click)
df_click = click_pivot(df_click)

CPU times: user 1min 46s, sys: 20 s, total: 2min 6s
Wall time: 2min 13s


  clickstream_embed.reset_index(drop=False, inplace=True)


In [9]:
df_click_train = df_click[lambda x: x['user_id'].isin(df_matching_train['rtk'].values)]
df_click_valid = df_click[lambda x: x['user_id'].isin(df_matching_valid['rtk'].values)]
df_click_test = df_click[lambda x: x['user_id'].isin(df_matching_test['rtk'].values)]

print(df_click_train.shape, df_click_valid.shape, df_click_test.shape)

(9782, 403) (2446, 403) (2446, 403)


In [10]:
def trx_types(df):
    df['mcc_code'] = df['mcc_code'].astype(str)
    df['currency_rk'] = df['currency_rk'].astype(str)
    df['event_time'] = pd.to_datetime(df['transaction_dttm']).astype(int) / 1e9
    return df[['user_id', 'event_time', 'mcc_code', 'currency_rk', 'transaction_amt']]

def trx_pivot(df):
    bankclient_embed = df.pivot_table(index = 'user_id', 
                        values=['transaction_amt'],
                        columns=['mcc_code'],
                        aggfunc=['sum','mean', 'count']).fillna(0)
    bankclient_embed.columns = [f'{str(i[0])}-{str(i[2])}' for i in bankclient_embed.columns]
    
    dtype_bankclient = list()
    for x in bankclient_embed.dtypes.tolist():
        if x=='int64':
            dtype_bankclient.append('int16')
        elif(x=='float64'):
            dtype_bankclient.append('float32')
        else:
            dtype_bankclient.append('object')
    
    dtype_bankclient = dict(zip(bankclient_embed.columns.tolist(), dtype_bankclient))
    bankclient_embed = bankclient_embed.astype(dtype_bankclient)
    bankclient_embed.reset_index(drop=False, inplace=True)
    return bankclient_embed

In [11]:
%%time

df_trx = pd.read_csv(f'data/transactions.csv')
df_trx = trx_types(df_trx)
df_trx = trx_pivot(df_trx)

df_trx_train = df_trx[lambda x: x['user_id'].isin(df_matching_train['bank'].values)]
df_trx_valid = df_trx[lambda x: x['user_id'].isin(df_matching_valid['bank'].values)]
df_trx_test = df_trx[lambda x: x['user_id'].isin(df_matching_test['bank'].values)]

print(df_trx_train.shape, df_trx_valid.shape, df_trx_test.shape)

  bankclient_embed.reset_index(drop=False, inplace=True)


(11721, 1159) (2930, 1159) (2930, 1159)
CPU times: user 31.9 s, sys: 3.82 s, total: 35.7 s
Wall time: 36.9 s


In [12]:
def prepare_dataset(df_matching, df_trx, df_click, neg_count=19):
    positive = pd.merge(df_matching, df_trx, left_on='bank', right_on='user_id').drop(columns='user_id')
    positive = pd.merge(positive, df_click, left_on='rtk', right_on='user_id').drop(columns='user_id')
    positive['target'] = 1

    rtks = df_click['user_id'].unique()

    negative = pd.DataFrame(data=df_trx['user_id'].values, columns=['bank'])
    negative['rtk'] = negative['bank'].apply(lambda x: np.random.choice(rtks, size=neg_count))
    negative = negative.explode('rtk')

    negative = pd.merge(negative, df_trx, left_on='bank', right_on='user_id').drop(columns='user_id')
    negative = pd.merge(negative, df_click, left_on='rtk', right_on='user_id').drop(columns='user_id')
    negative['target'] = 0

    dataset = pd.concat([positive, negative]).sample(frac=1)

    return dataset


train = prepare_dataset(df_matching_train, df_trx_train, df_click_train)
valid = prepare_dataset(df_matching_valid, df_trx_valid, df_click_valid)

train.shape, valid.shape

((234420, 1563), (58600, 1563))

In [13]:
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

drop_columns = ['bank', 'rtk', 'target']
TARGET = 'target'
CAT_FEATURES = []

params = dict(
    objective='binary',
    metric='auc',
    n_estimators=5000,
    boosting_type='gbdt',
    learning_rate=0.01,
    subsample=0.75,
    subsample_freq=1,
    feature_fraction=0.75,
    max_depth=8,
    lambda_l1=0.5,
    lambda_l2=0.5,
    num_leaves=128,
    random_state=42,
    verbose=-1
)

train_data = lgb.Dataset(train.drop(columns=drop_columns),
                                 label=train[TARGET],
                                 categorical_feature=CAT_FEATURES)

valid_data = lgb.Dataset(valid.drop(columns=drop_columns),
                         label=valid[TARGET],
                         categorical_feature=CAT_FEATURES)

lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data], verbose_eval=50)
preds = lgb_model.predict(valid.drop(columns=drop_columns))

metric_value = roc_auc_score(valid[TARGET], preds)
round(metric_value, 4)



[50]	training's auc: 0.816927	valid_1's auc: 0.689003
[100]	training's auc: 0.841256	valid_1's auc: 0.699088
[150]	training's auc: 0.860464	valid_1's auc: 0.706953
[200]	training's auc: 0.877574	valid_1's auc: 0.715476
[250]	training's auc: 0.890523	valid_1's auc: 0.721447
[300]	training's auc: 0.903704	valid_1's auc: 0.725865
[350]	training's auc: 0.914664	valid_1's auc: 0.729821
[400]	training's auc: 0.925555	valid_1's auc: 0.733962
[450]	training's auc: 0.933285	valid_1's auc: 0.737244
[500]	training's auc: 0.941992	valid_1's auc: 0.740424
[550]	training's auc: 0.948728	valid_1's auc: 0.74348
[600]	training's auc: 0.955663	valid_1's auc: 0.746621
[650]	training's auc: 0.960148	valid_1's auc: 0.749322
[700]	training's auc: 0.964726	valid_1's auc: 0.751973
[750]	training's auc: 0.968581	valid_1's auc: 0.755053
[800]	training's auc: 0.971697	valid_1's auc: 0.757019
[850]	training's auc: 0.974734	valid_1's auc: 0.759195
[900]	training's auc: 0.977509	valid_1's auc: 0.761886
[950]	traini

0.7882

In [14]:
df_trx_valid.set_index('user_id', inplace=True)
df_click_valid.set_index('user_id', inplace=True)
df_matching_valid.set_index('bank', inplace=True)

In [15]:
def inference(df_trx, df_click, model, model_features, batch_size=200):

    list_of_rtk = list(df_click.index.unique())
    list_of_bank = list(df_trx.index.unique())

    submission = pd.DataFrame(list_of_bank, columns=['bank'])
    submission['rtk'] = submission['bank'].apply(lambda x: list_of_rtk)

    num_of_batches = int((len(list_of_bank))/batch_size)+1
    submission_ready = []

    for i in range(num_of_batches):
        bank_ids = list_of_bank[(i*batch_size):((i+1)*batch_size)]
        if len(bank_ids) != 0:
            part_of_submit = submission[submission['bank'].isin(bank_ids)].explode('rtk')
            part_of_submit = part_of_submit.merge(df_trx, how='left', left_on='bank', right_index=True
                                        ).merge(df_click, how='left', left_on='rtk', right_index=True).fillna(0)

            for feature in model_features:
                if feature not in part_of_submit.columns:
                    part_of_submit[feature] = 0

            part_of_submit['predicts'] = model.predict(part_of_submit[model_features])
            part_of_submit = part_of_submit[['bank', 'rtk', 'predicts']]

            part_of_submit = part_of_submit.sort_values(by=['bank', 'predicts'], ascending=False).reset_index(drop=True)
            part_of_submit = part_of_submit.pivot_table(index='bank', values='rtk', aggfunc=list)
            part_of_submit['rtk'] = part_of_submit['rtk'].apply(lambda x: x[:100])
            part_of_submit['bank'] = part_of_submit.index
            part_of_submit = part_of_submit[['bank', 'rtk']]
            submission_ready.extend(part_of_submit.values)

    submission_final = np.array(submission_ready, dtype=object)
    return submission_final

In [16]:
%%time

preds = inference(df_trx_valid, df_click_valid, lgb_model, lgb_model.feature_name())

CPU times: user 1h 33min 16s, sys: 39 s, total: 1h 33min 55s
Wall time: 13min 11s


In [17]:
%%time

def score(preds, matching):
    pre = 0.0
    mrr = 0.0

    for ix_bank, pred in preds:
        match = matching.loc[ix_bank]['rtk']
        d = match in pred
        if d:
            pre += 1
            mrr += 1 / (1 + pred.index(match))

    pre /= len(preds)
    mrr /= len(preds)
    r1 = 2 * pre * mrr / (pre + mrr)

    return pre, mrr, r1

pre, mrr, r1 = score(preds, df_matching_valid)

print(f'pre: {pre:.3f}, mrr: {mrr:.3f}, r1: {r1:.3f}')

pre: 0.360, mrr: 0.175, r1: 0.235
CPU times: user 102 ms, sys: 8 µs, total: 102 ms
Wall time: 101 ms


In [20]:
import pickle

with open('lgb_submit/lgb_model.p', 'wb') as f:
    pickle.dump(lgb_model, f)