In [2]:
import pandas as pd
import numpy as np

from scipy import sparse
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

In [3]:
import warnings

warnings.filterwarnings('ignore')

In [4]:
import joblib

In [7]:
X_train = sparse.load_npz('../sparse/train_matrix.npz')
y_train = pd.read_csv('../sparse/train_labels.csv', names=['label'])

X_test = sparse.load_npz('../sparse/test_matrix.npz')

### Вперед, тяжелая артилерия

Обучим на 5 стратифицированных фолдах LGBMClassifier, на лидерборд зашлем усредненное предсказание по пяти фолдам, а также оценим OOF-предсказания

In [30]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds_idx = [
    (train_idx, val_idx) 
    for train_idx, val_idx in folds.split(X_train, y=y_train['label'])
]

In [5]:
def get_model():
    return lgb.LGBMClassifier(n_jobs=-1, **params)

In [28]:
params = {'subsample_freq': 2,
          'subsample_for_bin': 100, 
          'subsample': 0.7, 
          'scale_pos_weight': 1, 
          'reg_lambda': 0.2, 
          'reg_alpha': 7, 
          'objective': 'binary', 
          'num_leaves': 50, 
          'min_split_gain': 2.0, 
          'min_child_weight': 3,
          'min_child_samples': 100, 
          'metric': 'auc', 
          'max_depth': 20, 
          'max_bin': 100, 
          'learning_rate': 0.07, 
          'seed': 44,
          'colsample_bytree': 0.7}

In [31]:
test_preds = []
oof_preds = np.zeros(X_train.shape[0])
final_preds = np.zeros(X_test.shape[0])
auc_scores = []
    
for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    train_x, train_y = X_train.tocsr()[train_idx], y_train['label'].iloc[train_idx]
    valid_x, valid_y = X_train.tocsr()[valid_idx], y_train['label'].iloc[valid_idx]
    
    clf = get_model()
    
    clf.fit(train_x, train_y, 
            eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric='auc', verbose=10, early_stopping_rounds=150)
    
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    y_pred = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1]
    test_preds.append(y_pred)
    auc_scores.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_scores[-1]))
    
print("\n", f"Mean AUC:{np.mean(auc_scores)}", 
      f"std: {np.std(auc_scores)}",
      f"oof roc-auc:{roc_auc_score(y_train['label'], oof_preds)}")


Training until validation scores don't improve for 150 rounds.
[10]	valid_0's auc: 0.672776	valid_1's auc: 0.650973
[20]	valid_0's auc: 0.68916	valid_1's auc: 0.660458
[30]	valid_0's auc: 0.701482	valid_1's auc: 0.665654
[40]	valid_0's auc: 0.710494	valid_1's auc: 0.669977
[50]	valid_0's auc: 0.718215	valid_1's auc: 0.673379
[60]	valid_0's auc: 0.725417	valid_1's auc: 0.676089
[70]	valid_0's auc: 0.731621	valid_1's auc: 0.677882
[80]	valid_0's auc: 0.737099	valid_1's auc: 0.678979
[90]	valid_0's auc: 0.743027	valid_1's auc: 0.679686
[100]	valid_0's auc: 0.748059	valid_1's auc: 0.68083
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.748059	valid_1's auc: 0.68083
Fold  1 AUC : 0.680830
Training until validation scores don't improve for 150 rounds.
[10]	valid_0's auc: 0.67636	valid_1's auc: 0.648564
[20]	valid_0's auc: 0.689198	valid_1's auc: 0.657616
[30]	valid_0's auc: 0.700561	valid_1's auc: 0.66384
[40]	valid_0's auc: 0.70925	valid_1's auc: 0.667733
[50]	valid_0

In [73]:
final_preds = list(map(lambda x: np.mean(x), zip(*test_preds)))

In [74]:
cuids_test = pd.read_csv('../mlboot_dataset/mlboot_test.tsv.gz', delimiter='\t', compression='gzip', index_col='cuid')

In [75]:
sub = pd.read_csv('../sparse/test_cuid.csv', names=['cuid'], index_col='cuid')

In [76]:
sub['pred'] = final_preds
sub = sub.reindex(cuids_test.index)

In [78]:
sub['pred'].to_csv('../output/lgbm_oof_predict.csv', index=False)

### На лидерборде получили:
```
П: 0.7391013
Ф: 0.7384176
```