In [69]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgbm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV

In [10]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [82]:
kf = StratifiedKFold(10, shuffle=True)

splits = kf.split(train, train['label'])

train_accs = []
test_accs = []

for train_idx, test_idx in splits:
    train_data, train_label = train.iloc[train_idx, 1:-1], train.iloc[train_idx, -1]
    test_data, test_label = train.iloc[test_idx, 1:-1], train.iloc[test_idx, -1]
    
    model = lgbm.LGBMClassifier(n_estimators=500, scale_pos_weight=(train_label==0).sum()/train_label.sum())
    model.fit(train_data, train_label)
    
    train_pred = model.predict(train_data)
    test_pred = model.predict(test_data)
    
    print(classification_report(train_label, train_pred))
    print(classification_report(test_label, test_pred))
    
    train_accs.append(accuracy_score(train_label, train_pred))
    test_accs.append(accuracy_score(test_label, test_pred))
    
    print("="*150)
    
print("Mean Train Acc:", np.mean(train_accs))
print("Mean Test Acc:", np.mean(test_accs))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       1.00      1.00      1.00        26

    accuracy                           1.00        78
   macro avg       1.00      1.00      1.00        78
weighted avg       1.00      1.00      1.00        78

              precision    recall  f1-score   support

           0       0.83      0.83      0.83         6
           1       0.67      0.67      0.67         3

    accuracy                           0.78         9
   macro avg       0.75      0.75      0.75         9
weighted avg       0.78      0.78      0.78         9

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       1.00      1.00      1.00        26

    accuracy                           1.00        78
   macro avg       1.00      1.00      1.00        78
weighted avg       1.00      1.00      1.00        78

              preci

In [159]:
param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [-1, 3, 5, 7,],
    "subsample": [1, .8, .6],
    "subsample_ratio": [0, .2, .4],
    "colsample_bytree": [1, .8, .6],
    "reg_alpha": [0, .5, .1],
    "reg_lambda": [0, .5, .1],
    "scale_pos_weight": [1, 2, 3]
}

In [146]:
def refit(params):
    model = lgbm.LGBMClassifier(**params)
    model.fit(train.iloc[:, 1:-1], train['label'])
    return model

In [160]:
grid_search = GridSearchCV(lgbm.LGBMClassifier(), param_grid, scoring=['f1', 'accuracy'], n_jobs=4, cv=7, verbose=4, refit='accuracy')

In [161]:
grid_search.fit(train[['MO HLADR+ MFI (cells/ul)', 'Neu CD64+MFI (cells/ul)',
       'CD3+T (cells/ul)', 'CD8+T (cells/ul)', 'CD4+T (cells/ul)',
       'NK (cells/ul)', 'CD19+ (cells/ul)', 'CD45+ (cells/ul)', 'Age',
       'Sex 0M1F', 'Mono CD64+MFI (cells/ul)']], train['label'])

Fitting 7 folds for each of 8748 candidates, totalling 61236 fits


GridSearchCV(cv=7, estimator=LGBMClassifier(), n_jobs=4,
             param_grid={'colsample_bytree': [1, 0.8, 0.6],
                         'max_depth': [-1, 3, 5, 7],
                         'n_estimators': [100, 300, 500],
                         'reg_alpha': [0, 0.5, 0.1],
                         'reg_lambda': [0, 0.5, 0.1],
                         'scale_pos_weight': [1, 2, 3],
                         'subsample': [1, 0.8, 0.6],
                         'subsample_ratio': [0, 0.2, 0.4]},
             refit='accuracy', scoring=['f1', 'accuracy'], verbose=4)

In [170]:
grid_search.best_score_

0.8727106227106228

In [163]:
grid_search.best_params_

{'colsample_bytree': 0.8,
 'max_depth': -1,
 'n_estimators': 100,
 'reg_alpha': 0.5,
 'reg_lambda': 0,
 'scale_pos_weight': 1,
 'subsample': 1,
 'subsample_ratio': 0}

In [167]:
model = grid_search.best_estimator_

In [168]:
pred = model.predict(test.iloc[:, 1:])

In [169]:
submission = pd.Series(pred).reset_index()
submission.columns = ['id', 'label']
submission.to_csv('submission.csv', index=None)

In [157]:
submission

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0
