In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
train = pd.read_csv('train.csv',index_col=0)
test = pd.read_csv('test.csv',index_col=0)

In [3]:
# feature augmentation based on lgbm feature importance
gen_feats = [(1, 5),
             (0, 1),
             (2, 1),
             (1, 3),
             (2, 5),
             (0, 2),
             (0, 5),
             (4, 1),
             (5, 4),
             (3, 5),
             (0, 4)]
for i in range(6):
    for j in range(6):
        if (i,j) in gen_feats:
            train['gen_{}_{}'.format(i,j)] = train.iloc[:,i]/train.iloc[:,j]
            test['gen_{}_{}'.format(i,j)] = test.iloc[:,i]/test.iloc[:,j]

In [4]:
features = [col for col in train.columns if col != 'label']
num_folds = 10
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=5001)

In [5]:
# from bayesian optimization
params = {
    'boosting_type': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc',
    'device_type': 'cpu',
    'random_state': 5001,
    'verbose': -1,
    'learning_rate': 0.0266903682740756,
    'n_estimators': 130,
    'num_leaves': 57,
    'max_depth': -1,
    'subsample': 0.6042971686279393,
    'subsample_freq': 6,
    'colsample_bytree': 0.5656296878622898,
    'reg_lambda': 3.8101081958287514e-05,
    'reg_alpha': 0.0465945527577266,
    'min_child_samples': 2,
    'min_child_weight': 0.06250203857075702,
    }

In [6]:
# 10 fold cv
roc_score = 0
acc = 0
test_preds = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[features], train['label'])):
    train_x, train_y = train[features].iloc[train_idx], train['label'].iloc[train_idx]
    valid_x, valid_y = train[features].iloc[valid_idx], train['label'].iloc[valid_idx]
    
    clf = lgb.LGBMClassifier(**params)
    clf.fit(train_x,train_y)
    pred = clf.predict(valid_x)
    roc_score += roc_auc_score(valid_y, pred)/num_folds
    acc += accuracy_score(valid_y, pred)/num_folds
    test_pred = clf.predict(test[features])
    test_pred = pd.DataFrame({'pred_{}'.format(n_fold): test_pred})
    test_preds.append(test_pred)
print(roc_score)
print(acc)

0.9333333333333333
0.9541666666666666


In [7]:
output_df = pd.concat(test_preds,axis=1)
output_df['label'] = output_df.mean(axis=1)
output_df['label'] = (output_df['label'] > 0.5).astype(np.int8)
output_df = output_df[['label']]
preds = pd.read_csv('sample_submission.csv')
preds['label'] = output_df['label']
preds.to_csv('submission_3.csv',index=False)