In [2]:
import pandas as pd
import numpy as np
import os
import sys
import polars as pl
import json
from joblib import Parallel, delayed
import deepchem
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Configure Polars 
cfg = pl.Config()
cfg.set_tbl_rows(20)
cfg.set_tbl_cols(50)
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold, GroupShuffleSplit, ShuffleSplit
import gc
import random
from functools import partial
import catboost
from catboost import FeaturesData, Pool, CatBoost, CatBoostClassifier, CatBoostRegressor, CatBoostRanker
import time
from sklearn.metrics import average_precision_score

In [3]:
NUM_TRAIN = 400_000_000
SEED = 42
FROM_GLOBAL_POOL = False
# POS_WEIGHT = 1 / 0.015779035993603314
POS_WEIGHT = 10
POS_WEIGHT

10

In [3]:
cv_train_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/cv/v1/train.csv').with_row_index('_index').collect()
print(cv_train_df.shape)
cv_train_df.head(5)

(66710159, 3)


_index,index,label
u32,i64,i64
0,363110,0
1,363118,0
2,363119,0
3,363122,0
4,363123,0


In [4]:
cv_val_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/cv/v1/val.csv').with_row_index('_index').collect()
print(cv_val_df.shape)
cv_val_df.head(5)

(31705451, 4)


_index,index,label,subset
u32,i64,i64,i64
0,0,0,1
1,1,0,1
2,2,0,1
3,3,0,1
4,4,0,1


In [5]:
sampled_val_idxs = cv_val_df.filter(pl.col('subset').is_in([0,2,3]))['index'].to_numpy()
sampled_val_labels = cv_val_df.filter(pl.col('subset').is_in([0,2,3]))['label'].to_numpy()
sampled_val_idxs.shape, sampled_val_labels.shape

((299837,), (299837,))

In [6]:
if NUM_TRAIN < cv_train_df.shape[0]:
    pos_idxs = cv_train_df.filter(pl.col('label') > 0)['_index'].to_numpy()
    neg_idxs = cv_train_df.filter(pl.col('label') == 0)['_index'].to_numpy()
    print('Pos/Neg num:', pos_idxs.shape, neg_idxs.shape)
    np.random.seed(SEED)
    sampled_neg_idxs = np.random.choice(neg_idxs, NUM_TRAIN - len(pos_idxs), replace = False)
    _sampled_idxs = np.concatenate([pos_idxs, sampled_neg_idxs], axis = 0)
else:
    _sampled_idxs = cv_train_df['_index'].to_numpy()
_sampled_idxs.shape

(66710159,)

In [7]:
if FROM_GLOBAL_POOL:
    sampled_train_idxs = cv_train_df[_sampled_idxs, 'index'].to_numpy()
    sampled_train_labels = cv_train_df[_sampled_idxs, 'label'].to_numpy()
else:
    sampled_train_idxs = _sampled_idxs
    sampled_train_labels = cv_train_df['label'].to_numpy()
    
# get some statistic
print('NUM SAMPLED TRAIN:', sampled_train_idxs.shape)
(cv_train_df[_sampled_idxs, 'label']!=0).mean()

NUM SAMPLED TRAIN: (66710159,)


0.015779035993603314

In [9]:
del cv_train_df, cv_val_df
gc.collect()

NameError: name 'cv_train_df' is not defined

In [10]:
if FROM_GLOBAL_POOL:
    train_ds = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/train.tsv'
    train_cd = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/train_num.cd'
else:
    train_ds = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/_train_66M.tsv'
    train_cd = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/train_num.cd'

global_pool = Pool(
    train_ds,
    column_description=train_cd,
    has_header = False,
    thread_count = -1,
)
global_pool.num_row(), global_pool.num_col()

(66710159, 256)

In [11]:
train_pool = global_pool.slice(sampled_train_idxs)
gc.collect()
print(train_pool.num_row(), train_pool.num_col())

66710159 256


In [12]:
# val_pool = global_pool.slice(sampled_val_idxs)
# gc.collect()
# print(val_pool.num_row(), val_pool.num_col())

# 5M

In [13]:
# train_ds = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6/_train_5M.tsv'
# train_cd = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6/train_ctr.cd'

# train_pool = Pool(
#     train_ds,
#     column_description=train_cd,
#     has_header = False,
#     thread_count = -1,
# )
# print(train_pool.num_row(), train_pool.num_col())

In [5]:
val_ds = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/_val_300k.tsv'
val_cd = '/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/train_num.cd'

val_pool = Pool(
    val_ds,
    column_description=val_cd,
    has_header = False,
    thread_count = -1,
)
print(val_pool.num_row(), val_pool.num_col())

299837 256


# End 5M

In [6]:
gc.collect()

514

In [16]:
# # DEL GLOBAL POOL IF NEEDED
# del global_pool
# gc.collect()

- 3 gpus, 0.95 -> ok
- 3 gpus, 0.8 -> ok
- 3 gpus, 0.5 -> not enough
- 3 gpus (1 with 10 GB already), 0.3 -> die
- 4 gpus (1 with 10 GB), 0.95 -> not enough (Error: not enough memory for learning)
- 4 gpus (1 with 10 GB already), 0.8 -> die

In [None]:
# ori_train_labels = train_pool.get_label()
# ori_val_labels = val_pool.get_label()

In [21]:
(sampled_train_labels > 0).mean(), (sampled_val_labels > 0).mean()

(0.015779035993603314, 0.01364741509553524)

In [47]:
# train_pool.set_weight((sampled_train_labels > 0).astype(np.float32) * (POS_WEIGHT - 1) + 1) 
# val_pool.set_weight((sampled_val_labels > 0).astype(np.float32) * (POS_WEIGHT - 1) + 1)

<catboost.core.Pool at 0x7fc25c3ee500>

In [53]:
train_pool.set_weight(np.ones_like(sampled_train_labels)) 
val_pool.set_weight(np.ones_like(sampled_val_labels))

<catboost.core.Pool at 0x7fc25c3ee500>

In [54]:
%%time
lr = None
model = CatBoostClassifier(task_type='GPU',
                           devices='1:2:3',
#                            loss_function = 'MultiCrossEntropy',
                           loss_function = 'MultiLogloss',
                           n_estimators = 4000,
                           learning_rate = lr,
                           random_state = 42,
#                            depth = 12,
#                            auto_class_weights='Balanced',
#                            scale_pos_weight = 1.2,
#                            class_weights=[2.0, 1.0],
                           # max_ctr_complexity = None,
#                            eval_metrics = 
#                            used_ram_limit = '180GB',
#                            gpu_ram_part = 0.3,
#                            max_ctr_complexity = 1,
#                            gpu_cat_features_storage = 'CpuPinnedMemory',
                          )
print(model.get_params())

model.fit(
    train_pool,
    verbose=100,
    eval_set=val_pool,
    plot = True
)

print('ALL PARAMS:\n', model.get_all_params())

SAVE_PATH = f'../runs/ckpts/model_256_full_weight=63_lr={lr}.cbm'
model.save_model(SAVE_PATH)

{'loss_function': 'MultiLogloss', 'task_type': 'GPU', 'devices': '1:2:3', 'n_estimators': 4000, 'random_state': 42}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.015603
0:	learn: 0.6490300	test: 0.6489748	best: 0.6489748 (0)	total: 946ms	remaining: 1h 3m 2s
100:	learn: 0.0281296	test: 0.0290355	best: 0.0290355 (100)	total: 1m 37s	remaining: 1h 2m 41s
200:	learn: 0.0234446	test: 0.0253229	best: 0.0253229 (200)	total: 2m 56s	remaining: 55m 37s
300:	learn: 0.0220581	test: 0.0247110	best: 0.0247110 (300)	total: 4m 23s	remaining: 53m 56s
400:	learn: 0.0212358	test: 0.0243995	best: 0.0243995 (400)	total: 5m 46s	remaining: 51m 49s
500:	learn: 0.0206168	test: 0.0241788	best: 0.0241788 (500)	total: 7m 9s	remaining: 49m 59s
600:	learn: 0.0200504	test: 0.0240080	best: 0.0240080 (600)	total: 8m 32s	remaining: 48m 17s
700:	learn: 0.0196320	test: 0.0238578	best: 0.0238578 (700)	total: 9m 51s	remaining: 46m 24s
800:	learn: 0.0192623	test: 0.0236834	best: 0.0236834 (800)	total: 11m 12s	remaining: 44m 45s
900:	learn: 0.0189568	test: 0.0235566	best: 0.0235566 (900)	total: 12m 30s	remaining: 43m
1000:	learn: 0.0187096	test: 0.0234779	best: 

In [4]:
model = CatBoostClassifier()
# model.load_model(SAVE_PATH)
model.load_model(f'../runs/ckpts/model_256_full_weight=63_lr={None}.cbm')
model

<catboost.core.CatBoostClassifier at 0x7fb79c278220>

In [32]:
# metrics = model.eval_metrics(val_pool,
#              metrics = ['F1', 'Accuracy'],
#              ntree_start=0,
#              ntree_end=0,
#              eval_period=1,
#              thread_count=-1,
#              )

In [33]:
# pd.Series(metrics['F1:class=2']).hist()

In [7]:
preds = model.predict_proba(val_pool, verbose = 100)
preds.shape

(299837, 3)

In [8]:
labels = val_pool.get_label().astype(np.uint8)
labels.shape

(299837, 3)

In [9]:
for i in range(3):
    print(average_precision_score(labels[:, i], preds[:, i], pos_label=1, average='micro'))

average_precision_score(labels.reshape(-1), preds.reshape(-1), pos_label=1, average='micro')

0.24414941127764045
0.11894557382816726
0.36578464871741667


0.27468052236519097

# Result

- 256 + CTR + LR=0.1: 0.2851591290053086, 0.03271806002, 3996 (12 min + 60 GB)
- 256 + NUM + LR=0.1: 0.28140081176626164, 0.03188184721 (3min32s + 10 GB)
- 256 + NUM + LR=0.2 + 40M: 0.2569072157627742, 0.0228263346
- 2048 + CTR + LR=0.2: 0.31463215756548685 (5mins + 90 GB)
- 2048 + NUM + LR=0.2: 0.31528349357809027 (4min7s + 53 GB)

# Test

In [None]:
# test_pool = Pool('/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/test.tsv',
#                   column_description='/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6_packbits/test_num.cd',
#                   has_header = False,
#                   thread_count = -1,
#                  )
# test_pool.num_row(), test_pool.num_col()

test_pool = Pool('/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6/test.tsv',
                  column_description='/home/dangnh36/datasets/competitions/leash_belka/processed/datasets/catboost/ecfp6/test_ctr.cd',
                  has_header = False,
                  thread_count = -1,
                 )
test_pool.num_row(), test_pool.num_col()

In [None]:
test_preds = model.predict_proba(test_pool, verbose = 100)
print('Mean preds:', test_preds.mean(axis = 0))
test_preds.shape

In [None]:
test_df = pl.scan_csv('/home/dangnh36/datasets/competitions/leash_belka/processed/test_v3.csv').collect()
test_ids = test_df[['id_BRD4', 'id_HSA', 'id_sEH']].to_numpy()
assert(test_preds.shape==test_ids.shape)

test_ids = test_ids.reshape(-1)
test_preds = test_preds.reshape(-1)
test_ids.shape, test_preds.shape

In [None]:
mask = test_ids != 0 
assert(mask.sum()==1674896)

submit_df = pd.DataFrame({
    'id':test_ids[mask],
    'binds':test_preds[mask],
})
display(submit_df)
submit_df.to_csv(f'../runs/submission.csv',index=False)