In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import lightgbm as ltb

from sklearn.metrics import roc_auc_score

import catboost as cb

import os
# Runtime configuration for the kernel: thread cap and GPU selection.
# NOTE(review): these variables are assigned after numpy/lightgbm/catboost were
# imported above; whether those libraries still honour them at this point
# depends on when each one reads its environment — confirm, or move these
# assignments ahead of the heavy imports.
os.environ["OMP_NUM_THREADS"] = "4"
# Presumably makes CUDA device indices follow PCI bus order so that the ids
# listed below refer to fixed physical cards — TODO confirm on the target host.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the train and test tables from parquet files under embeddings/.
# Both frames are later handed to Downstream as in-memory DataFrames.
X_train = pd.read_parquet('embeddings/X_train_lastTry.parquet')
X_test = pd.read_parquet('embeddings/X_test_lastTry.parquet')

In [3]:
class Downstream:
    """Fit a LightGBM + CatBoost ensemble per target and blend the predictions.

    For every target column, three LightGBM and three CatBoost classifiers
    (differing only in random seed) are trained on the same feature matrix.
    The final score for a target is a weighted blend of the mean LightGBM
    positive-class probability and the mean CatBoost positive-class
    probability.

    Parameters
    ----------
    train_path : pandas.DataFrame
        Training table. Despite the name, this is used as an in-memory frame
        (``.drop`` / column indexing), not as a file path. It must contain
        every column listed in ``drop_feat`` plus the feature columns.
    test_path : pandas.DataFrame
        Test table containing ``col_id`` and the feature columns.
    params : dict
        Keyword arguments forwarded to every ``LGBMClassifier``.
    result_path : str
        CSV path that :meth:`run` writes the blended scores to.
    col_id : str
        Name of the client identifier column.
    targets : sequence of str
        Binary target columns; one ensemble is trained per entry.
    """

    # Seeds of the three LightGBM and three CatBoost models trained per target.
    # get_scores() takes exactly three dicts of each kind, so keep three seeds.
    LGBM_SEEDS = (42, 69, 228)
    CATBOOST_SEEDS = (42, 69, 228)
    # Blend weights for the mean LightGBM / mean CatBoost probability.
    LGBM_WEIGHT = 0.7
    CATBOOST_WEIGHT = 0.3

    def __init__(
        self,
        train_path,
        test_path,
        params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        self.params = params
        self.result_path = result_path
        # Columns that are never used as model features.
        self.drop_feat = list(self.all_targets) + [self.col_id] + ['month'] + ['client']

    def _build_models(self):
        """Return six unfitted classifiers ordered clf1..clf6 (LightGBM first)."""
        catboost_models = []
        # CatBoost models are instantiated first so the progress messages keep
        # their historical order (clf4, clf5, clf6, then clf1).
        for slot, seed in enumerate(self.CATBOOST_SEEDS, start=4):
            print(f'init clf{slot}')
            catboost_models.append(
                cb.CatBoostClassifier(
                    learning_rate=0.03,
                    iterations=1700,
                    class_weights=[1, 9],
                    verbose=100,
                    random_state=seed,
                    task_type='GPU',
                )
            )
        print('init clf1')
        lgbm_models = [
            ltb.LGBMClassifier(**self.params, random_state=seed)
            for seed in self.LGBM_SEEDS
        ]
        return lgbm_models + catboost_models

    @staticmethod
    def _mean_positive_proba(clf_dicts, col_target, features):
        """Average the positive-class probability over several fitted models.

        ``clf_dicts`` is a sequence of ``{target: fitted classifier}`` dicts;
        the classifier stored under ``col_target`` in each one is queried.
        """
        total = sum(
            clfs[col_target].predict_proba(features)[:, 1] for clfs in clf_dicts
        )
        return total / len(clf_dicts)

    def fit(self):
        """Train all six models for every target.

        Returns
        -------
        tuple of dict
            Six ``{target: fitted classifier}`` dicts in the order
            ``clfs1..clfs3`` (LightGBM) followed by ``clfs4..clfs6`` (CatBoost).

        Raises
        ------
        KeyError
            If the training frame lacks any column listed in ``drop_feat``.
        """
        # DataFrame.drop already returns a new frame, so no extra copy is needed.
        X_tr = self.train_path.drop(columns=self.drop_feat)

        n_models = len(self.LGBM_SEEDS) + len(self.CATBOOST_SEEDS)
        fitted = [dict() for _ in range(n_models)]

        for col_target in tqdm(self.all_targets):
            models = self._build_models()
            y_train = self.train_path[col_target]

            for clf in models:
                clf.fit(X_tr, y_train)

            print(f'Model fitted, target: {col_target}')
            for store, clf in zip(fitted, models):
                store[col_target] = clf

        return tuple(fitted)

    def get_scores(
        self,
        clfs1,
        clfs2,
        clfs3,
        clfs4,
        clfs5,
        clfs6
    ):
        """Score the de-duplicated test clients with the fitted ensembles.

        Parameters
        ----------
        clfs1, clfs2, clfs3 : dict
            ``{target: fitted LightGBM classifier}``.
        clfs4, clfs5, clfs6 : dict
            ``{target: fitted CatBoost classifier}``.

        Returns
        -------
        pandas.DataFrame
            One row per unique ``col_id`` with the id column followed by one
            blended probability column per target.
        """
        deduped = self.test_path.drop_duplicates(self.col_id)
        # Capture the ids BEFORE removing the id column from the features.
        ids = deduped[self.col_id]
        # Remove every non-feature column that training removed; the test
        # frame may legitimately lack targets/'month'/'client', hence 'ignore'.
        X_test = deduped.drop(columns=self.drop_feat, errors='ignore')

        scores = pd.DataFrame({self.col_id: ids})

        lgbm_dicts = (clfs1, clfs2, clfs3)
        catboost_dicts = (clfs4, clfs5, clfs6)

        for col_target in self.all_targets:
            lgbm_mean = self._mean_positive_proba(lgbm_dicts, col_target, X_test)
            catboost_mean = self._mean_positive_proba(catboost_dicts, col_target, X_test)
            scores[col_target] = (
                self.LGBM_WEIGHT * lgbm_mean + self.CATBOOST_WEIGHT * catboost_mean
            )

        return scores

    def run(self):
        """Fit every model, score the test set, save the scores to CSV, return them."""
        clfs1, clfs2, clfs3, clfs4, clfs5, clfs6 = self.fit()
        scores = self.get_scores(clfs1, clfs2, clfs3, clfs4, clfs5, clfs6)

        # The frame's index is written as the first CSV column (pandas default).
        scores.to_csv(self.result_path)

        return scores
    
# Hyperparameters forwarded (via **params) to every LGBMClassifier that
# Downstream builds; the random seed is supplied separately per model.
params = dict(
    n_estimators=500,
    boosting_type="gbdt",
    objective="binary",
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.02,
    feature_fraction=0.75,
    max_depth=6,
    lambda_l1=1,
    lambda_l2=1,
    min_data_in_leaf=50,
    n_jobs=8,
)


In [None]:
# Build the pipeline from the frames loaded earlier in the notebook; run()
# fits the models, scores the test set and writes the scores to result_path.
dw = Downstream(
    train_path=X_train,
    test_path=X_test,
    params=params,
    result_path='cb_lgbm_last_try_submit.csv'
)

scores = dw.run()
# Bare expression so the notebook renders the resulting score table.
scores

  0%|          | 0/4 [00:00<?, ?it/s]

init clf4
init clf5
init clf6
init clf1




0:	learn: 0.5537613	total: 134ms	remaining: 3m 47s
100:	learn: 0.0387185	total: 6.17s	remaining: 1m 37s
200:	learn: 0.0381290	total: 12.2s	remaining: 1m 30s
300:	learn: 0.0378412	total: 18.1s	remaining: 1m 24s
400:	learn: 0.0376356	total: 24s	remaining: 1m 17s
500:	learn: 0.0374824	total: 29.8s	remaining: 1m 11s
600:	learn: 0.0373549	total: 35.8s	remaining: 1m 5s
700:	learn: 0.0372390	total: 41.6s	remaining: 59.4s
800:	learn: 0.0371285	total: 47.6s	remaining: 53.4s
900:	learn: 0.0370395	total: 53.4s	remaining: 47.4s
1000:	learn: 0.0369504	total: 59.2s	remaining: 41.4s
1100:	learn: 0.0368596	total: 1m 5s	remaining: 35.4s
1200:	learn: 0.0367781	total: 1m 10s	remaining: 29.5s
1300:	learn: 0.0366951	total: 1m 16s	remaining: 23.6s
1400:	learn: 0.0366143	total: 1m 22s	remaining: 17.6s
1500:	learn: 0.0365394	total: 1m 28s	remaining: 11.7s
1600:	learn: 0.0364638	total: 1m 34s	remaining: 5.83s
1699:	learn: 0.0363974	total: 1m 40s	remaining: 0us




0:	learn: 0.5544849	total: 74.9ms	remaining: 2m 7s
100:	learn: 0.0387160	total: 6.54s	remaining: 1m 43s
200:	learn: 0.0381113	total: 12.8s	remaining: 1m 35s
300:	learn: 0.0378274	total: 19s	remaining: 1m 28s
400:	learn: 0.0376351	total: 25.2s	remaining: 1m 21s
500:	learn: 0.0374925	total: 31.3s	remaining: 1m 14s
600:	learn: 0.0373628	total: 37.5s	remaining: 1m 8s
700:	learn: 0.0372422	total: 43.8s	remaining: 1m 2s
800:	learn: 0.0371403	total: 50s	remaining: 56.1s
900:	learn: 0.0370427	total: 56.2s	remaining: 49.9s
1000:	learn: 0.0369455	total: 1m 2s	remaining: 43.5s
1100:	learn: 0.0368619	total: 1m 8s	remaining: 37.3s
1200:	learn: 0.0367804	total: 1m 14s	remaining: 31s
1300:	learn: 0.0366969	total: 1m 20s	remaining: 24.8s
1400:	learn: 0.0366149	total: 1m 26s	remaining: 18.5s
1500:	learn: 0.0365379	total: 1m 32s	remaining: 12.3s
1600:	learn: 0.0364611	total: 1m 39s	remaining: 6.13s
1699:	learn: 0.0363866	total: 1m 45s	remaining: 0us




0:	learn: 0.5546015	total: 87ms	remaining: 2m 27s
100:	learn: 0.0387322	total: 6.38s	remaining: 1m 40s
200:	learn: 0.0381117	total: 12.6s	remaining: 1m 33s
300:	learn: 0.0378354	total: 18.9s	remaining: 1m 28s
400:	learn: 0.0376372	total: 25.2s	remaining: 1m 21s
500:	learn: 0.0374821	total: 31.5s	remaining: 1m 15s
600:	learn: 0.0373613	total: 37.7s	remaining: 1m 8s
700:	learn: 0.0372391	total: 43.9s	remaining: 1m 2s
800:	learn: 0.0371354	total: 50.1s	remaining: 56.3s
900:	learn: 0.0370363	total: 56.4s	remaining: 50s
1000:	learn: 0.0369482	total: 1m 2s	remaining: 43.8s
1100:	learn: 0.0368610	total: 1m 8s	remaining: 37.5s
1200:	learn: 0.0367762	total: 1m 15s	remaining: 31.2s
1300:	learn: 0.0366957	total: 1m 21s	remaining: 24.9s
1400:	learn: 0.0366154	total: 1m 27s	remaining: 18.7s
1500:	learn: 0.0365385	total: 1m 33s	remaining: 12.4s
1600:	learn: 0.0364592	total: 1m 39s	remaining: 6.17s
1699:	learn: 0.0363879	total: 1m 45s	remaining: 0us


 25%|██▌       | 1/4 [35:01<1:45:03, 2101.08s/it]

Model fitted, target: target_1
init clf4
init clf5
init clf6
init clf1
