In [18]:
import os

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, confusion_matrix

In [4]:
print('Loading data...')
# Both splits are read the same way: no header row, tab as the separator.
df_train, df_test = (
    pd.read_csv(csv_path, header=None, sep='\t')
    for csv_path in ('data/binary.train', 'data/binary.test')
)

Loading data...


In [6]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,0.869,-0.635,0.226,0.327,-0.69,0.754,-0.249,-1.092,0.0,...,-0.01,-0.046,3.102,1.354,0.98,0.978,0.92,0.722,0.989,0.877
1,1,0.908,0.329,0.359,1.498,-0.313,1.096,-0.558,-1.588,2.173,...,-1.139,-0.001,0.0,0.302,0.833,0.986,0.978,0.78,0.992,0.798
2,1,0.799,1.471,-1.636,0.454,0.426,1.105,1.282,1.382,0.0,...,1.129,0.9,0.0,0.91,1.108,0.986,0.951,0.803,0.866,0.78
3,0,1.344,-0.877,0.936,1.992,0.882,1.786,-1.647,-0.942,0.0,...,-0.678,-1.36,0.0,0.947,1.029,0.999,0.728,0.869,1.027,0.958
4,1,1.105,0.321,1.522,0.883,-1.205,0.681,-1.07,-0.922,0.0,...,-0.374,0.113,0.0,0.756,1.361,0.987,0.838,1.133,0.872,0.808


In [7]:
# Column 0 is used as the target; all remaining columns become features.
y_train, y_test = df_train[0], df_test[0]
X_train = df_train.drop(labels=[0], axis='columns')
X_test = df_test.drop(labels=[0], axis='columns')

In [8]:
# Wrap both splits in LightGBM Dataset objects; the evaluation set is tied
# to the training set through `reference`.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [10]:
# Training configuration handed to lgb.train as a plain dict.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss'},
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [12]:
print('Starting training...')
# Train for at most 200 boosting rounds, stopping early once the validation
# metric has not improved for 5 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `lgb.train` was removed in LightGBM 4.0,
# while the `lgb.early_stopping` callback is available in older releases too.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training...
[1]	valid_0's binary_logloss: 0.68426
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's binary_logloss: 0.677503
[3]	valid_0's binary_logloss: 0.669053
[4]	valid_0's binary_logloss: 0.661109
[5]	valid_0's binary_logloss: 0.652785
[6]	valid_0's binary_logloss: 0.647685
[7]	valid_0's binary_logloss: 0.642569
[8]	valid_0's binary_logloss: 0.636967
[9]	valid_0's binary_logloss: 0.631087
[10]	valid_0's binary_logloss: 0.625984
[11]	valid_0's binary_logloss: 0.620339
[12]	valid_0's binary_logloss: 0.615836
[13]	valid_0's binary_logloss: 0.611066
[14]	valid_0's binary_logloss: 0.608525
[15]	valid_0's binary_logloss: 0.605393
[16]	valid_0's binary_logloss: 0.602314
[17]	valid_0's binary_logloss: 0.597492
[18]	valid_0's binary_logloss: 0.593435
[19]	valid_0's binary_logloss: 0.59029
[20]	valid_0's binary_logloss: 0.586578
[21]	valid_0's binary_logloss: 0.583909
[22]	valid_0's binary_logloss: 0.581561
[23]	valid_0's binary_logloss: 0.579587
[24]	vali

In [20]:
print('Starting predicting...')
# Score the test features using the booster's recorded best iteration.
best_round = gbm.best_iteration
y_pred_score = gbm.predict(X_test, num_iteration=best_round)

Starting predicting...


In [21]:
# Evaluate the test-set predictions: classification report, confusion
# matrix, accuracy and ROC AUC.
threshold = 0.5  # scores strictly above this value are labelled 1

# Wrap the scores in a one-column frame aligned with X_test. The frame is
# built from whatever y_pred_score currently holds (the raw output of
# gbm.predict on the first run, this same frame on a re-run), so the cell can
# be executed more than once without failing.
y_pred_score = pd.DataFrame(data=y_pred_score, index=X_test.index)
y_pred_score.columns = ['score']

# Vectorised thresholding; the result is kept as a plain list of 0/1 ints.
y_pred_class = (y_pred_score['score'] > threshold).astype('int64').tolist()
y_pred = pd.DataFrame(data=y_pred_class, index=X_test.index)
y_pred.columns = ['label']

print(classification_report(y_test, y_pred))
auc_score = roc_auc_score(y_test, y_pred_score['score'])
conf_matrix = confusion_matrix(y_test, y_pred)
# Correct predictions lie on the diagonal of the confusion matrix.
accuracy = float(conf_matrix.trace()) / conf_matrix.sum()

print("Confusion matrix:")
print(conf_matrix)
print("accuracy of the model is:" + str(accuracy))
print("AUC Score is:" + str(auc_score))


             precision    recall  f1-score   support

          0       0.72      0.78      0.75       228
          1       0.80      0.75      0.77       272

avg / total       0.77      0.76      0.76       500

Confusion matrix:
[[178  50]
 [ 69 203]]
accuracy of the model is:0.762
AUC Score is:0.8388480392156864


In [22]:
print('Saving model...')
# Save the model to file. The target directory is created first so that a
# fresh checkout without `saved_model/` does not fail on write.
os.makedirs('saved_model', exist_ok=True)
gbm.save_model('saved_model/model.txt')


Saving model...
