In [2]:
import pandas as pd
import numpy as np

from scipy import sparse
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

In [3]:
import warnings

warnings.filterwarnings('ignore')

In [1]:
import joblib

In [4]:
# Feature matrices are stored as scipy sparse .npz files.
X_train = sparse.load_npz('../sparse/train_matrix.npz')
X_test = sparse.load_npz('../sparse/test_matrix.npz')

# The label file has no header row: every line is read as data into a single
# column named 'label'.
y_train = pd.read_csv('../sparse/train_labels.csv', header=None, names=['label'])

### Вперед, тяжелая артиллерия

Обучим на 5 стратифицированных фолдах LGBMClassifier, на лидерборд зашлем усредненное предсказание по пяти фолдам, а также оценим OOF-предсказания


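UPD: В итоге обучал на 10 фолдах — это дало более высокое качество усредненной модели. Чтобы воспроизвести этот результат, нужно поставить `n_splits=10` в ячейке ниже и удалить закэшированный `folds.pkl`, иначе будут загружены старые фолды.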

In [19]:
# Reuse the cached fold indices when they exist, so that every run is validated
# on exactly the same splits.
# NOTE: the cache takes precedence over the settings below. After changing
# n_splits / random_state, delete 'folds.pkl', otherwise the old folds are loaded.
try:
    folds_idx = joblib.load('folds.pkl')
except FileNotFoundError:
    # Only a missing cache triggers a rebuild; any other failure (corrupted
    # file, interrupt, ...) is left to propagate instead of being hidden.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Materialise the generator into a list of (train_idx, val_idx) pairs so it
    # can be iterated more than once and pickled.
    folds_idx = list(folds.split(X_train, y=y_train['label']))

    joblib.dump(folds_idx, 'folds.pkl')

['folds.pkl']

In [8]:
# Hyper-parameters handed to lgb.train for every fold.
params = dict(
    # row subsampling
    subsample_freq=2,
    subsample_for_bin=100,
    subsample=0.7,
    # class balance and regularisation
    scale_pos_weight=1,
    reg_lambda=0.2,
    reg_alpha=7,
    # task
    objective='binary',
    # tree shape and split constraints
    num_leaves=50,
    min_split_gain=2.0,
    min_child_weight=3,
    min_child_samples=100,
    # metric reported on the validation sets during training
    metric='auc',
    max_depth=20,
    max_bin=100,
    learning_rate=0.07,
    random_state=44,
    # column subsampling
    colsample_bytree=0.7,
)

In [24]:
# K-fold training: one LightGBM booster per fold. Each booster scores its own
# held-out rows (out-of-fold predictions) and the whole test set; the per-fold
# test predictions are collected in `test_preds` to be averaged afterwards.
test_preds = []                          # one array of test predictions per fold
oof_preds = np.zeros(X_train.shape[0])   # out-of-fold prediction for every training row
auc_scores = []                          # validation AUC of each fold

# The CSR conversion does not depend on the fold, so do it once instead of
# twice per iteration.
X_train_csr = X_train.tocsr()

# With 180 rounds and a patience of 150, early stopping can only trigger when
# the best validation score is reached within the first 30 rounds.
num_boost_round = 180
early_stopping_rounds = 150

for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    train_x, train_y = X_train_csr[train_idx], y_train['label'].iloc[train_idx]
    valid_x, valid_y = X_train_csr[valid_idx], y_train['label'].iloc[valid_idx]

    d_train = lgb.Dataset(train_x, label=train_y)
    d_valid = lgb.Dataset(valid_x, label=valid_y)
    # NOTE(review): the `early_stopping_rounds=` / `verbose_eval=` keyword
    # arguments were removed from lgb.train in LightGBM 4.0 in favour of the
    # lgb.early_stopping / lgb.log_evaluation callbacks - confirm the installed
    # version before re-running this cell.
    clf = lgb.train(params,
                    d_train,
                    valid_sets=[d_train, d_valid],
                    valid_names=['train','valid'],
                    num_boost_round=num_boost_round,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=10,
                   )

    # Score the held-out rows once and reuse the result for both the OOF
    # vector and the fold AUC.
    valid_pred = clf.predict(valid_x, num_iteration=clf.best_iteration)
    oof_preds[valid_idx] = valid_pred
    y_pred = clf.predict(X_test, num_iteration=clf.best_iteration)
    test_preds.append(y_pred)
    auc_scores.append(roc_auc_score(valid_y, valid_pred))

    print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_scores[-1]))

# Summary: mean and spread of the per-fold AUCs plus the AUC of the pooled
# out-of-fold predictions.
print("\n", f"Mean AUC:{np.mean(auc_scores)}",
      f"std: {np.std(auc_scores)}",
      f"oof roc-auc:{roc_auc_score(y_train['label'], oof_preds)}")


Training until validation scores don't improve for 150 rounds.
[10]	train's auc: 0.672681	valid's auc: 0.650973
[20]	train's auc: 0.689228	valid's auc: 0.660458
[30]	train's auc: 0.701825	valid's auc: 0.665654
[40]	train's auc: 0.71094	valid's auc: 0.669977
[50]	train's auc: 0.718763	valid's auc: 0.673379
[60]	train's auc: 0.726207	valid's auc: 0.676089
[70]	train's auc: 0.732621	valid's auc: 0.677882
[80]	train's auc: 0.738363	valid's auc: 0.678979
[90]	train's auc: 0.744435	valid's auc: 0.679686
[100]	train's auc: 0.749665	valid's auc: 0.68083
[110]	train's auc: 0.75505	valid's auc: 0.681007
[120]	train's auc: 0.759438	valid's auc: 0.681167
[130]	train's auc: 0.763765	valid's auc: 0.681458
[140]	train's auc: 0.76809	valid's auc: 0.681773
[150]	train's auc: 0.77196	valid's auc: 0.681511
[160]	train's auc: 0.775686	valid's auc: 0.681235
[170]	train's auc: 0.779461	valid's auc: 0.68134
[180]	train's auc: 0.783093	valid's auc: 0.681158
Did not meet early stopping. Best iteration is:
[180

In [25]:
# ROC-AUC of the pooled out-of-fold predictions over the whole training set;
# reused later as part of the submission file name.
oof_roc_auc = roc_auc_score(y_true=y_train['label'], y_score=oof_preds)

In [26]:
# Average the per-fold test predictions element-wise. `test_preds` holds one
# equally long prediction array per fold, so the mean over axis 0 gives one
# value per test row without a Python-level loop over every row.
final_preds = np.mean(test_preds, axis=0)

In [27]:
# Original (gzip-compressed, tab-separated) test table, indexed by cuid.
cuids_test = pd.read_csv(
    '../mlboot_dataset/mlboot_test.tsv.gz',
    sep='\t',
    compression='gzip',
    index_col='cuid',
)

In [28]:
# Header-less list of test cuids, read into the index of an otherwise empty frame.
# presumably the rows are in the same order as the rows of X_test - verify
# against the code that produced the sparse matrices.
sub = pd.read_csv(
    '../sparse/test_cuid.csv',
    header=None,
    names=['cuid'],
    index_col='cuid',
)

In [29]:
# Attach the averaged predictions to the test ids, then put the rows in the
# order of the original test file.
# The assignment is positional, so it is only correct while `sub` is still in
# the order it was loaded in. After the reindex below, running this cell again
# would silently pair predictions with the wrong cuid, hence the guard.
if 'pred' in sub.columns:
    raise RuntimeError(
        "`sub` already has predictions attached; reload it before re-running this cell"
    )
sub = sub.assign(pred=final_preds).reindex(cuids_test.index)

# reindex fills ids that are absent from `sub` with NaN; fail loudly instead of
# writing a submission with missing values.
if sub['pred'].isna().any():
    raise ValueError("some test cuids have no prediction after reindexing")

In [30]:
# Write only the prediction column (no index); the OOF ROC-AUC is embedded in
# the file name so that submissions can be told apart.
# NOTE(review): whether a header line is written depends on the installed
# pandas version's default for Series.to_csv - confirm the expected format.
submission_path = f'../output/lgbm_oof_predict_{oof_roc_auc}.csv'
sub['pred'].to_csv(submission_path, index=False)

### На лидерборде получили:
```
П: 0.7403242
Ф: 0.7399504
```

Забыл, что по дефолту 100 итераций. На 100 итерациях было:

`Mean AUC:0.6797945265258807 std: 0.0012320261389598974 oof roc-auc:0.6797624341701467`


На 10 фолдах качество вышло:
    
` Mean AUC:0.6817596224649585 std: 0.004425747820454361 oof roc-auc:0.6817420676253266 `