In [14]:
import pandas as pd
import numpy as np

from scipy import sparse
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgbm

In [2]:
import warnings

# Install a global "ignore" filter: no warning of any category is shown
# for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [3]:
import joblib

### Здесь используем датасет с численными фичами

In [5]:
# Sparse feature matrices for both splits, stored as SciPy .npz files.
X_test = sparse.load_npz('concat_features_test.npz')
X_train = sparse.load_npz('concat_features_train.npz')

# Training targets: the CSV is read without a header row and exposed as a
# single column named `label`.
y_train = pd.read_csv(
    '../sparse/train_labels.csv',
    names=['label'],
)

Обучим LightGBM (через `lgbm.train`) на 10 стратифицированных фолдах; на лидерборд отправим предсказание, усреднённое по десяти фолдам, а также оценим OOF-предсказания.

In [6]:
# Load the cached list of (train_idx, val_idx) pairs if it exists, so the
# same split is reused between runs.
# NOTE(review): joblib.load unpickles the file -- only use a cache you trust.
try:
    folds_idx = joblib.load('10folds.pkl')
except FileNotFoundError:
    # Only a missing cache triggers regeneration; any other failure
    # (interrupt, corrupted file, missing import) is surfaced instead of
    # being silently swallowed.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # Materialise the generator: each item is a (train_idx, val_idx) tuple.
    folds_idx = list(folds.split(X_train, y=y_train['label']))

    # Persist the split for subsequent runs.
    joblib.dump(folds_idx, '10folds.pkl')

In [11]:
# Hyper-parameters passed to lgbm.train (binary objective, AUC metric).
# NOTE(review): the training log saved in this notebook runs to iteration
# 300 even though the train call passes num_boost_round=150 -- presumably
# `n_estimators` here takes precedence over that argument; confirm.
params = dict(
    subsample_freq=2,
    subsample_for_bin=100,
    subsample=0.7,
    scale_pos_weight=1,
    reg_lambda=0.2,
    reg_alpha=7,
    objective='binary',
    num_leaves=50,
    min_split_gain=2.0,
    n_estimators=300,
    min_child_weight=3,
    min_child_samples=100,
    metric='auc',
    max_depth=20,
    max_bin=100,
    learning_rate=0.075,
    random_state=44,
    colsample_bytree=0.7,
)

In [12]:
# Boosting-round budget and early-stopping patience handed to lgbm.train.
# NOTE(review): `params` also sets n_estimators=300 and the saved log runs to
# iteration 300, so this num_boost_round value appears not to take effect --
# confirm.
num_boost_round, early_stopping_rounds = 150, 120

In [15]:
# Cross-validated training: fit one booster per fold, collect its
# out-of-fold (OOF) predictions for the held-out rows and its predictions for
# the whole test set, and track the per-fold validation ROC-AUC.
test_preds = []                           # one test-set prediction vector per fold
oof_preds = np.zeros(X_train.shape[0])    # OOF prediction for every training row
final_preds = np.zeros(X_test.shape[0])   # placeholder; reassigned to the fold average in a later cell
auc_scores = []                           # validation ROC-AUC of each fold

# The CSR conversion does not depend on the fold, so do it once instead of
# twice per iteration.
X_train_csr = X_train.tocsr()

for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    train_x, train_y = X_train_csr[train_idx], y_train['label'].iloc[train_idx]
    valid_x, valid_y = X_train_csr[valid_idx], y_train['label'].iloc[valid_idx]

    d_train = lgbm.Dataset(train_x, label=train_y)
    d_valid = lgbm.Dataset(valid_x, label=valid_y)
    # Both sets are monitored so the log shows train and validation AUC
    # side by side every 10 rounds.
    clf = lgbm.train(params,
                     d_train,
                     valid_sets=[d_train, d_valid],
                     valid_names=['train', 'valid'],
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10,
                     )

    # Predict with the iteration count the booster reports as best.
    oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
    y_pred = clf.predict(X_test, num_iteration=clf.best_iteration)
    test_preds.append(y_pred)
    auc_scores.append(roc_auc_score(valid_y, oof_preds[valid_idx]))

    print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_scores[-1]))

# Summary: mean/std of the per-fold AUCs, plus one AUC over the pooled OOF
# predictions.
print("\n", f"Mean AUC:{np.mean(auc_scores)}", 
      f"std: {np.std(auc_scores)}",
      f"oof roc-auc:{roc_auc_score(y_train['label'], oof_preds)}")


Training until validation scores don't improve for 120 rounds.
[10]	train's auc: 0.666943	valid's auc: 0.649582
[20]	train's auc: 0.686407	valid's auc: 0.661549
[30]	train's auc: 0.700047	valid's auc: 0.670057
[40]	train's auc: 0.709968	valid's auc: 0.674843
[50]	train's auc: 0.717732	valid's auc: 0.677267
[60]	train's auc: 0.725048	valid's auc: 0.679026
[70]	train's auc: 0.731635	valid's auc: 0.679821
[80]	train's auc: 0.737534	valid's auc: 0.681846
[90]	train's auc: 0.742981	valid's auc: 0.681744
[100]	train's auc: 0.748309	valid's auc: 0.68184
[110]	train's auc: 0.753222	valid's auc: 0.681761
[120]	train's auc: 0.757892	valid's auc: 0.682486
[130]	train's auc: 0.762159	valid's auc: 0.682458
[140]	train's auc: 0.7669	valid's auc: 0.682419
[150]	train's auc: 0.77087	valid's auc: 0.682013
[160]	train's auc: 0.77471	valid's auc: 0.681376
[170]	train's auc: 0.778033	valid's auc: 0.681426
[180]	train's auc: 0.781499	valid's auc: 0.680746
[190]	train's auc: 0.785092	valid's auc: 0.68138
[2

[140]	train's auc: 0.76548	valid's auc: 0.689492
[150]	train's auc: 0.769462	valid's auc: 0.6897
[160]	train's auc: 0.773477	valid's auc: 0.689972
[170]	train's auc: 0.77728	valid's auc: 0.689888
[180]	train's auc: 0.780597	valid's auc: 0.6897
[190]	train's auc: 0.784094	valid's auc: 0.689747
[200]	train's auc: 0.787427	valid's auc: 0.689741
[210]	train's auc: 0.790495	valid's auc: 0.689745
[220]	train's auc: 0.793315	valid's auc: 0.690006
[230]	train's auc: 0.79599	valid's auc: 0.690358
[240]	train's auc: 0.798713	valid's auc: 0.690072
[250]	train's auc: 0.801143	valid's auc: 0.689969
[260]	train's auc: 0.803553	valid's auc: 0.689375
[270]	train's auc: 0.805998	valid's auc: 0.689269
[280]	train's auc: 0.808176	valid's auc: 0.688843
[290]	train's auc: 0.810135	valid's auc: 0.68876
[300]	train's auc: 0.81232	valid's auc: 0.68874
Did not meet early stopping. Best iteration is:
[300]	train's auc: 0.81232	valid's auc: 0.68874
Fold  6 AUC : 0.688740
Training until validation scores don't im

In [16]:
# ROC-AUC over the pooled out-of-fold predictions; the value is embedded in
# the submission file name further down.
oof_roc_auc = roc_auc_score(
    y_true=y_train['label'],
    y_score=oof_preds,
)

In [17]:
# Average the per-fold test predictions element-wise. `test_preds` holds one
# prediction vector per fold, so the mean over axis 0 yields one value per
# test row. Vectorised replacement for a per-row Python map/lambda; the
# result is a NumPy array rather than a list.
final_preds = np.mean(test_preds, axis=0)

In [18]:
# Gzipped, tab-separated test table indexed by `cuid`; its index is used
# below to order the submission rows.
cuids_test = pd.read_csv(
    '../mlboot_dataset/mlboot_test.tsv.gz',
    sep='\t',
    compression='gzip',
    index_col='cuid',
)

In [19]:
# Submission skeleton: a frame with no data columns yet, indexed by the
# `cuid` values read from a CSV that has no header row.
sub = pd.read_csv(
    '../sparse/test_cuid.csv',
    header=None,
    names=['cuid'],
    index_col='cuid',
)

In [20]:
# Attach the averaged predictions positionally, then reorder the rows to
# follow the cuid order of the test table.
# NOTE(review): re-running this cell without re-reading `sub` would assign
# predictions onto the already reordered index -- confirm run order.
sub = sub.assign(pred=final_preds).reindex(cuids_test.index)

In [21]:
# Write the predictions as a bare single-column file: no index, no header.
# header=False is spelled out because the default for Series.to_csv differs
# between pandas versions, and this file is read back below with
# `names=[...]`, which treats every line as data.
sub['pred'].to_csv(
    f'../output/lgbm_oof_predict_{oof_roc_auc}.csv',
    index=False,
    header=False,
)

### На лидерборде получили:
```
П: 0.7449432
Ф: 0.7429291
```

## Сблендим с логрегрессией с коэффициентами 0.9 (LightGBM) и 0.1 (логрегрессия):

In [22]:
# Logistic-regression baseline predictions: a CSV with no header row, loaded
# as a single `label` column.
logreg = pd.read_csv(
    '../output/baseline_c002_sag.csv',
    header=None,
    names=['label'],
)

In [23]:
lgbm = pd.read_csv('../output/lgbm_oof_predict_0.6823876227541152.csv', names=['label'])

In [24]:
pred = lgbm['label'] * 0.9 + logreg['label'] * 0.1

In [30]:
# Write the blended predictions as a bare single-column file. header=False
# is explicit so the output has no header row regardless of the pandas
# version's default for Series.to_csv, matching the header-less files read
# above.
pred.to_csv(
    '../output/0.9lgbm10folds_0.6823876+0.1logreg.csv',
    index=False,
    header=False,
)

## На лидерборде:
    
```
П: 0.7453606
4 / 15
Ф: 0.7449608
4 / 15
```

![yeah](meme.png)


# Итого: 
## За три вечера получен вполне годный и стойкий к переобучению на лидерборде результат, который ещё можно улучшать