In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import lightgbm as ltb

from sklearn.metrics import roc_auc_score
import os
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"

  from .autonotebook import tqdm as notebook_tqdm

libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


# Train

In [None]:
train_geo = pd.read_parquet("embeddings/train_geo.parquet")
train_trx = pd.read_parquet("agg_encoder_train.parquet")
train_dial = pd.read_parquet("embeddings/dialog_embs_m_train.parquet")

# Test

In [None]:
test_geo = pd.read_parquet("embeddings/geo_not_only_trx.parquet")
test_trx = pd.read_parquet("agg_encoder_not_only_trx.parquet")
test_dial = pd.read_parquet("embeddings/dialog_embs_m_test.parquet")

In [4]:
test_dial['month'] = test_dial['client_id'].str.extract(r'_month=(\d+)$')[0].astype(np.int8)
test_dial['client_id'] = test_dial['client_id'].str.extract(r'(.+)_month')[0]
test_dial = test_dial.drop(columns='month')

In [5]:
aggregation_functions = {f'dialog_emb_{i+1}': 'mean' for i in range(len(test_dial.columns)-2)}
test_dial = test_dial.groupby(['client_id']).agg(aggregation_functions).reset_index()

# Merge Train

In [6]:
full_train = train_trx.merge(train_geo.drop(columns=['target_1', 'target_2', 'target_3', 'target_4']), on='client_id', how='left')
# full_train.fillna(0, inplace=True)

In [7]:
full_train = full_train.merge(train_dial.drop(columns=['event_time']), on='client_id', how='left')
full_train.fillna(0, inplace=True)

# Merge Test

In [9]:
full_test = test_trx.merge(test_geo, on='client_id', how='left')
# full_test.drop(columns=['target_1', 'target_2', 'target_3', 'target_4'], inplace=True)

In [10]:
full_test = full_test.merge(test_dial, on='client_id', how='left')
full_test.fillna(0, inplace=True)

# Add target features for train

In [11]:
X_train = full_train.copy()
X_test = full_test.copy()

In [13]:
import gc

#del unnecessary
del train_geo
del train_trx
del train_dial

del test_geo
del test_trx
del test_dial

gc.collect()

19

In [14]:
X_train[['client', 'month']] = X_train['client_id'].str.split('_month=', expand = True)
X_train['client_id'] = X_train['client'] + '_month=' + X_train['month'].astype(str)
X_train['month'] = pd.to_numeric(X_train['month'])

In [15]:
X_train.sort_values(by=['client', 'month'], inplace=True)

X_train['any_target'] = X_train[['target_1', 'target_2', 'target_3', 'target_4']].any(axis=1).astype(int)

for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    X_train[f'{target}_count'] = X_train.groupby('client')[target].cumsum() - X_train[target]

100%|██████████| 5/5 [00:07<00:00,  1.44s/it]


In [16]:
last_seen = X_train[['client', 'month']].copy()
for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
    last_seen[target] = X_train.groupby('client')[target].transform(lambda x: x.where(x == 1).ffill().shift().fillna(0))

100%|██████████| 5/5 [30:08<00:00, 361.60s/it]


In [18]:
# for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
#     X_train[f'last_{target}_month'] = last_seen.groupby('client')['month'].transform(lambda x: x.where(X_train[target] == 1).ffill().shift().fillna(0))


In [19]:
# for target in tqdm(['target_1', 'target_2', 'target_3', 'target_4', 'any_target']):
#     X_train[f'{target}_months_ago'] = X_train['month'] - X_train[f'last_{target}_month']

# X_train.drop(columns=['last_target_1_month', 'last_target_2_month', 'last_target_3_month', 'last_target_4_month', 'last_any_target_month', 'client', 'month'], inplace=True)

In [23]:
X_train.to_parquet("embeddings/X_train_lastTry.parquet", index=False, engine="pyarrow", compression="snappy")

# Add target features for test

In [72]:
test_target_b = pd.read_parquet("test_target_b.parquet")

In [73]:
test_target_b['month'] = pd.to_numeric(pd.to_datetime(test_target_b['mon']).dt.month)

In [74]:
test_target_b['month'].value_counts()

5     143831
4     143831
10    143831
8     143831
6     143831
2     143831
3     143831
9     143831
7     143831
11     95776
12     49107
Name: month, dtype: int64

In [75]:
# test_target_b['client'] = test_target_b['client_id'] + '_month=' + test_target_b['month'].astype(str)

In [76]:
test_target_b.sort_values(by=['client_id', 'month'], inplace=True)

test_target_b['any_target'] = test_target_b[['target_1', 'target_2', 'target_3', 'target_4']].any(axis=1).astype(int)

for target in ['target_1', 'target_2', 'target_3', 'target_4', 'any_target']:
    test_target_b[f'{target}_count'] = test_target_b.groupby('client_id')[target].cumsum() - test_target_b[target]

In [77]:
# last_seen = test_target_b[['client_id', 'month']].copy()
# for target in ['target_1', 'target_2', 'target_3', 'target_4', 'any_target']:
#     last_seen[target] = test_target_b.groupby('client_id')[target].transform(lambda x: x.where(x == 1).ffill().shift().fillna(0))

In [78]:
# for target in ['target_1', 'target_2', 'target_3', 'target_4', 'any_target']:
#     test_target_b[f'last_{target}_month'] = last_seen.groupby('client_id')['month'].transform(lambda x: x.where(df[target] == 1).ffill().shift().fillna(0))


In [79]:
# for target in ['target_1', 'target_2', 'target_3', 'target_4', 'any_target']:
#     test_target_b[f'{target}_months_ago'] = test_target_b['month'] - test_target_b[f'last_{target}_month']

# test_target_b.drop(columns=['last_target_1_month', 'last_target_2_month', 'last_target_3_month', 'last_target_4_month', 'last_any_target_month'], inplace=True)

In [80]:
test_target_b = test_target_b.sort_values(by = 'month', ascending = False)

In [83]:
test_target_b = test_target_b.drop_duplicates(subset = 'client_id', keep = 'first')

In [84]:
test_target_b['month'].value_counts()

12    48877
10    46086
11    45525
Name: month, dtype: int64

In [88]:
X_test = full_test.copy()

In [89]:
X_test = X_test.merge(test_target_b.drop(columns=['target_1', 'target_2', 'target_3', 'target_4']), on='client_id', how='left')

In [91]:
X_test.to_parquet("embeddings/X_test_lastTry.parquet", index=False, engine="pyarrow", compression="snappy")

# Downstream

In [2]:
X_train = pd.read_parquet('embeddings/X_train_lastTry.parquet')
X_test = pd.read_parquet('embeddings/X_test_lastTry.parquet')

In [3]:
import catboost as cb

In [4]:
class Downstream:
    def __init__(
        self,
        train_path,
        test_path,
        params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        self.params = params
        self.result_path = result_path
        self.drop_feat = list(self.all_targets) + [self.col_id] + ['month'] + ['client']

    def fit(self):

#         train_embeddings =  self.train_path/ #pd.read_parquet(self.train_path)
        X_tr = self.train_path.drop(columns=self.drop_feat).copy()

        clfs1 = dict()
#         clfs2 = dict()
#         clfs3 = dict()
        clfs4 = dict()
        clfs5 = dict()
        clfs6 = dict()
        
        for col_target in tqdm(self.all_targets):
            
#             clf2 = ltb.LGBMClassifier(**self.params, random_state = 69)
#             clf3 = ltb.LGBMClassifier(**self.params, random_state = 228)
            print('init clf4')
            clf4 = cb.CatBoostClassifier(learning_rate = 0.03, iterations = 1700, class_weights = [1, 9], verbose=100, random_state = 42, task_type = 'GPU')
            print('init clf5')
            clf5 = cb.CatBoostClassifier(learning_rate = 0.03, iterations = 1700, class_weights = [1, 9], verbose=100, random_state = 69, task_type = 'GPU')
            print('init clf6')
            clf6 = cb.CatBoostClassifier(learning_rate = 0.03, iterations = 1700, class_weights = [1, 9], verbose=100, random_state = 228, task_type = 'GPU')
            print('init clf1')
            clf1 = ltb.LGBMClassifier(**self.params, random_state = 42)
            y_train = self.train_path[col_target].copy()
            
            clf1.fit(X_tr, y_train)
#             clf2.fit(X_tr, y_train)
#             clf3.fit(X_tr, y_train)
            clf4.fit(X_tr, y_train)
            clf5.fit(X_tr, y_train)
            clf6.fit(X_tr, y_train)
            
            print(f'Model fitted, target: {col_target}')
            clfs1[col_target] = clf1
#             clfs2[col_target] = clf2
#             clfs3[col_target] = clf3
            clfs4[col_target] = clf4
            clfs5[col_target] = clf5
            clfs6[col_target] = clf6

#         return clfs1, clfs2,clfs3, clfs4,clfs5, clfs6
        return clfs1, clfs4, clfs5, clfs6

    def get_scores(
        self,
        clfs1,
#         clfs2,
#         clfs3,
        clfs4,
        clfs5,
        clfs6
    ):
        scores = pd.DataFrame([])

#         test_embeddings_curr = self.test_path.drop_duplicates('client_id') #pd.read_parquet(self.test_path).drop_duplicates('client_id')
        X_test = (self.test_path.drop_duplicates('client_id')).drop(columns=[self.col_id]).copy()
        ids = X_test[self.col_id]
        scores[self.col_id] = ids

        for col_target in self.all_targets:
            clf1 = clfs1[col_target]
#             clf2 = clfs2[col_target]
#             clf3 = clfs3[col_target]
            clf4 = clfs4[col_target]
            clf5 = clfs5[col_target]
            clf6 = clfs6[col_target]
            score1 = clf1.predict_proba(X_test)[:, 1]
#             score2 = clf2.predict_proba(X_test)[:, 1]
#             score3 = clf3.predict_proba(X_test)[:, 1]
            score4 = clf4.predict_proba(X_test)[:, 1]
            score5 = clf5.predict_proba(X_test)[:, 1]
            score6 = clf6.predict_proba(X_test)[:, 1]
            scores[col_target] = 0.7 * (score1) + (score4 + score5 + score6) / 3 * 0.3

        return scores

    def run(self):
        clfs1, clfs4,clfs5, clfs6 = self.fit()
        scores = self.get_scores(clfs1, clfs4,clfs5, clfs6)

        scores.to_csv(self.result_path)

        return scores
    
params = {
    "n_estimators": 500,
      "boosting_type": "gbdt",
      "objective": "binary",
      "subsample": 0.5,
      "subsample_freq": 1,
      "learning_rate": 0.02,
      "feature_fraction": 0.75,
      "max_depth": 6,
      "lambda_l1": 1,
      "lambda_l2": 1,
      "min_data_in_leaf": 50,
      "n_jobs": 8,
}


In [None]:
dw = Downstream(
    train_path=X_train,
    test_path=X_test,
    params=params,
    result_path='last_try_submit.csv'
)

scores = dw.run()
scores

  0%|          | 0/4 [00:00<?, ?it/s]

init clf4
init clf5
init clf6
init clf1
0:	learn: 0.5539614	total: 171ms	remaining: 4m 49s
100:	learn: 0.0387087	total: 7.45s	remaining: 1m 57s
200:	learn: 0.0381086	total: 14.6s	remaining: 1m 48s
300:	learn: 0.0378231	total: 21.4s	remaining: 1m 39s
400:	learn: 0.0376240	total: 28.1s	remaining: 1m 31s
500:	learn: 0.0374714	total: 35s	remaining: 1m 23s
600:	learn: 0.0373475	total: 41.8s	remaining: 1m 16s
700:	learn: 0.0372266	total: 48.8s	remaining: 1m 9s
800:	learn: 0.0371224	total: 55.5s	remaining: 1m 2s
900:	learn: 0.0370316	total: 1m 2s	remaining: 55.2s
1000:	learn: 0.0369373	total: 1m 9s	remaining: 48.2s
1100:	learn: 0.0368478	total: 1m 15s	remaining: 41.2s
1200:	learn: 0.0367603	total: 1m 22s	remaining: 34.3s
1300:	learn: 0.0366786	total: 1m 29s	remaining: 27.4s
