In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as ltb

import warnings
warnings.filterwarnings("ignore")

import os
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import random
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import os
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``torch.manual_seed`` and ``torch.cuda.manual_seed``), writes the seed to
    the ``PYTHONHASHSEED`` environment variable and switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual seeding calls are independent of each other.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything()

'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.


In [7]:
### TRAIN ###
# Transaction and geo embeddings are joined on the client id plus all four targets;
# dialog embeddings are attached afterwards by client id only.
train_trx = pd.read_parquet("../embeddings/train.parquet")
train_geo = pd.read_parquet("../embeddings/train_geo.parquet")
train_dial = pd.read_parquet("../embeddings/dialog_embs_m_train.parquet").drop(columns=['event_time'])

df_train = (
    train_trx
    .merge(train_geo, on=['client_id', 'target_1', 'target_2', 'target_3', 'target_4'], how='outer')
    .fillna(0)  # rows present in only one of the two tables get zeros for the other side
    .merge(train_dial, on=['client_id'], how='left')
    .fillna(0)  # clients without a dialog row get zeros for the dialog columns
)


### TEST ###
test_trx = pd.read_parquet("../embeddings/not_only_trx.parquet")
test_geo = pd.read_parquet("../embeddings/test_geo.parquet")
# Dialog ids are cut at the first '_' and the rows are then averaged per client.
test_dial = (
    pd.read_parquet("../embeddings/dialog_embs_m_test.parquet")
    .drop(columns=['event_time'])
    .assign(client_id=lambda frame: frame['client_id'].apply(lambda raw_id: raw_id.split('_')[0]))
    .groupby('client_id')
    .agg('mean')
)

# NOTE(review): the trx/geo join here is an inner merge while the train side uses an
# outer merge — confirm that dropping test clients without geo rows is intended.
df_test = (
    test_trx
    .merge(test_geo, on='client_id')
    .merge(test_dial, on='client_id', how='left')
    .fillna(0)
)

In [9]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [12]:
df_train.loc[0].client_id

'000006265d27d1166ed67506682be7380007a5bead4362f0a9795f7d97fb08e3_month=1'

In [18]:
# Settings for the entry keyed "0" in nn_params.
# NOTE(review): the "0"/"1" keys presumably map, in order, to the algorithms listed in
# use_algos ('denselight_tuned', 'autoint_tuned') — confirm against the LightAutoML docs.
_nn_cfg_0 = {
    "bs": 1024,
    "tuning_params": {
        "max_tuning_iter": 5,
        "max_tuning_time": 100,
        "fit_on_holdout": True,
    },
    "freeze_defaults": True,
    "n_epochs": 20,
    "nn_params": {
        "dnn_activation": "relu",
    },
}

# Settings for the entry keyed "1": same batch size, tuning budget and epoch count,
# plus an explicit learning rate.
_nn_cfg_1 = {
    "bs": 1024,
    "tuning_params": {
        "max_tuning_iter": 5,
        "max_tuning_time": 100,
        "fit_on_holdout": True,
    },
    "lr": 1e-3,
    "freeze_defaults": True,
    "n_epochs": 20,
}

# Binary-classification preset restricted to the two tuned neural-network algorithms.
automl_production = TabularAutoML(
    task=Task("binary", metric="auc", loss="logloss"),
    reader_params={"n_jobs": 4, "random_state": 42, "advanced_roles": False},
    debug=True,
    general_params={"use_algos": [["denselight_tuned", "autoint_tuned"]]},
    nn_params={"0": _nn_cfg_0, "1": _nn_cfg_1},
)

In [19]:
def fit(train_embeddings, targets=('target_1', 'target_2', 'target_3', 'target_4'), base_model=None):
    """Train one independent binary model per target column.

    For every target, all *other* target columns are removed from the frame so
    they cannot leak into the features, and a private deep copy of the template
    model is fitted with that target. The template itself is never fitted, so
    each returned entry keeps its own trained state.

    Args:
        train_embeddings: DataFrame holding the feature columns and every
            column named in ``targets``.
        targets: Names of the target columns, processed in this order.
        base_model: Unfitted model template exposing
            ``fit_predict(data, roles=..., verbose=...)``. Defaults to the
            notebook-level ``automl_production`` preset.

    Returns:
        dict mapping each target name to the model fitted for it.
    """
    # Local import so this cell stays runnable without touching the import cell.
    import copy

    template = automl_production if base_model is None else base_model
    clfs = dict()
    for number, target in enumerate(targets, start=1):
        print(f'predict target:{number}')
        other_targets = [name for name in targets if name != target]
        # A fresh copy per target: re-fitting one shared object would leave every
        # dictionary entry pointing at the model trained on the last target.
        clf = copy.deepcopy(template)
        # NOTE(review): any id column (e.g. client_id) in train_embeddings is still
        # passed to the model as a feature, as before — confirm this is intended.
        clf.fit_predict(
            train_embeddings.drop(columns=other_targets),
            roles={'target': target},
            verbose=3,
        )
        clfs[target] = clf

    return clfs

In [None]:
models = fit(df_train)

predict target:1
[00:16:15] Stdout logging level is INFO3.
[00:16:15] Task: binary

[00:16:15] Start automl preset with listed constraints:
[00:16:15] - time: 3600.00 seconds
[00:16:15] - CPU: 4 cores
[00:16:15] - memory: 16 GB

[00:16:15] [1mTrain data shape: (8830335, 1282)[0m

[00:17:47] Layer [1m1[0m train process start. Time left 3507.78 secs
[00:33:49] Start hyperparameters optimization for [1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_denselight_tuned_0[0m ... Time budget is 100.00 secs


In [6]:
class Downstream:
    """Fit one LightAutoML binary model per target and score a test set.

    Workflow (``run``): read the training parquet, fit a separate copy of the
    model template for every target, read the test parquet, predict each
    target and write the resulting score table to ``result_path`` as CSV.

    Attributes:
        train_path: Parquet file with features, the id column and all targets.
        test_path: Parquet file with features and the id column.
        col_id: Name of the id column; never used as a feature.
        all_targets: Names of the target columns.
        params: Stored as given; no method of this class reads it.
        result_path: Destination of the CSV written by ``run``.
        model: Unfitted ``TabularAutoML`` template that ``fit`` deep-copies.
        drop_feat: All target columns plus the id column.
    """

    def __init__(
        self,
        train_path,
        test_path,
        params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path
        self.col_id = col_id
        self.all_targets = targets
        self.params = params
        self.result_path = result_path
        # Misspelled attribute name kept as an alias for code that already reads it.
        self.result_apth = result_path
        # Unfitted template only: fit() copies it so each target gets its own model.
        # NOTE(review): the nested 'dnn_params' keys are passed through unchanged —
        # confirm they match the LightAutoML version in use.
        self.model = TabularAutoML(
            task=Task('binary', metric='auc', loss='logloss'),
            reader_params={'n_jobs': 4, 'cv': 2, 'random_state': 42, 'advanced_roles': False},
            debug=True,
            general_params={"use_algos": [['denselight', 'autoint']]},
            nn_params={
                "0": {
                    "bs": 1024,
                    "freeze_defaults": True,
                    "n_epochs": 30,
                    'dnn_params': {
                        'hidden_units': ((512, 0.2, True), (256, 0.2, True)),
                        'dnn_activation': 'relu',
                    }
                },
                '1': {
                    "bs": 1024,
                    'lr': 1e-3,
                    "freeze_defaults": True,
                    "n_epochs": 30
                }
            }
        )
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):
        """Fit an independent model for every target.

        For each target the id column and all other targets are removed from
        the training frame, so the model sees the same feature set that
        ``get_scores`` later supplies.

        Returns:
            dict mapping each target name to its fitted model.
        """
        # Local import so the class does not depend on edits to the import cell.
        import copy

        train_embeddings = pd.read_parquet(self.train_path)
        clfs = dict()
        for col_target in self.all_targets:
            not_features = [col for col in self.drop_feat if col != col_target]
            # Copy the template: fitting one shared object repeatedly would leave
            # every entry referring to the model of the last target.
            clf = copy.deepcopy(self.model)
            clf.fit_predict(
                train_embeddings.drop(columns=not_features, errors='ignore'),
                roles={'target': col_target},
                verbose=3,
            )
            clfs[col_target] = clf

        return clfs

    def get_scores(self, clfs):
        """Predict every target for the de-duplicated test clients.

        Args:
            clfs: dict of fitted models keyed by target name, as returned by
                ``fit``.

        Returns:
            DataFrame with the id column followed by one score column per
            target.
        """
        test_embeddings_curr = pd.read_parquet(self.test_path).drop_duplicates(self.col_id)
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        scores = pd.DataFrame({self.col_id: test_embeddings_curr[self.col_id]})

        for col_target in self.all_targets:
            # TabularAutoML has no predict_proba; predict() returns a dataset whose
            # .data array holds the positive-class score in its first column.
            prediction = clfs[col_target].predict(X_test)
            scores[col_target] = prediction.data[:, 0]
        return scores

    def run(self):
        """Fit all models, score the test set and save the scores as CSV.

        Returns:
            The score DataFrame that was written to ``result_path``.
        """
        clfs = self.fit()
        scores = self.get_scores(clfs)
        # NOTE(review): the DataFrame index is written as the first CSV column,
        # as before — confirm the consumer of this file expects that.
        scores.to_csv(self.result_path)
        return scores

In [None]:
# Instantiate the per-target training/scoring pipeline.
# NOTE(review): `params` is not defined in any cell visible in this notebook; unless it
# already exists in the kernel this call raises NameError on a fresh run — confirm.
# NOTE(review): result_path is an absolute Kaggle directory while both input paths are
# relative to ../embeddings — confirm the output location for this environment.
dw = Downstream(
    train_path="../embeddings/geo_trx_train.parquet",
    test_path="../embeddings/geo_not_only_trx.parquet",
    params=params,
    result_path='/kaggle/working/submission.csv'
)

In [None]:
scores = dw.run()