In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import pickle


def get_metrics(model, axis, preprocessed_test=None):
    """Print the ROC AUC of ``model`` on the training and test splits.

    The splits are not passed in: the function reads the notebook-level
    globals ``x_train``, ``y_train``, ``x_test`` and ``y_test``, so the cell
    that creates them must have been run first.

    Parameters
    ----------
    model : object exposing ``predict_proba(features)``
        Fitted classifier; the result is indexed as ``[:, axis]``.
    axis : int
        Column of the ``predict_proba`` output that is passed to
        ``roc_auc_score`` as the score.
    preprocessed_test : optional
        Never read by the function; kept so existing calls keep working.

    Returns
    -------
    None
        The scores are printed as ``TRAINING: <auc>`` then ``TEST: <auc>``.
    """
    splits = (("TRAINING", x_train, y_train), ("TEST", x_test, y_test))
    for label, features, targets in splits:
        prob_preds = model.predict_proba(features)
        performance = roc_auc_score(targets, prob_preds[:, axis])
        # Single-argument print(...) prints the same text on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print(label + ": " + str(performance))

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the feature matrix and the labels from their .npy files.
x, y = np.load("data/x_normalized.npy"), np.load("data/y.npy")

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# LightGBM classifier with DART boosting, fitted on the training split.
# NOTE(review): LightGBM documents the DART dropout rate as `drop_rate`
# (alias `rate_drop`); `dropout` does not appear to be a recognised parameter,
# so the 0.7 below may be ignored -- confirm. Left untouched here so the fitted
# model stays the same.
model = lgb.LGBMClassifier(
    boosting_type='dart',
    learning_rate=0.1,
    num_leaves=500,
    reg_alpha=0,
    min_child_samples=5,
    min_child_weight=1e-05,
    n_estimators=1000,
    reg_lambda=100,
    max_depth=15,
    dropout=0.7,
)
model.fit(x_train, y_train, verbose=False)

# ROC AUC from column 1 of predict_proba: training split first, then the
# held-out split. Single-argument print(...) prints the same on Python 2 and 3
# (the bare print statement is a SyntaxError on Python 3).
y_pred = model.predict_proba(x_train)[:, 1]
performance = roc_auc_score(y_train, y_pred)
print(performance)

y_pred = model.predict_proba(x_test)[:, 1]
performance = roc_auc_score(y_test, y_pred)
print(performance)

0.9881389025222007
0.7976760596588771


In [4]:
from xgboost import XGBClassifier

# XGBoost classifier with every hyperparameter spelled out, fitted on the
# training split.
model1 = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=0.6,
    gamma=1,
    learning_rate=0.05,
    max_delta_step=0,
    max_depth=10,
    min_child_weight=10,
    missing=None,
    n_estimators=200,
    n_jobs=1,
    nthread=1,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=0.8,
)
model1.fit(x_train, y_train)

# ROC AUC from column 1 of predict_proba: training split first, then the
# held-out split. Single-argument print(...) prints the same on Python 2 and 3
# (the bare print statement is a SyntaxError on Python 3).
y_pred = model1.predict_proba(x_train)[:, 1]
performance = roc_auc_score(y_train, y_pred)
print(performance)

y_pred = model1.predict_proba(x_test)[:, 1]
performance = roc_auc_score(y_test, y_pred)
print(performance)

0.9104873405681265
0.7978625901336096


In [71]:
# Column 1 of predict_proba for the two boosted models on the training split.
# NOTE(review): the names look swapped -- in the visible cells `model` is the
# lgb.LGBMClassifier and `model1` the XGBClassifier, so `xgboost_predictions`
# holds LightGBM scores and `lgb_predictions` holds XGBoost scores. The same
# naming is used for the test split and the pickle files, so rename everything
# together (not just this cell) to keep the stacked column order consistent.
xgboost_predictions = model.predict_proba(x_train)[:, 1]
lgb_predictions = model1.predict_proba(x_train)[:, 1]

In [72]:
# Display the score vector currently bound to this name.
xgboost_predictions

array([0.14159794, 0.42136143, 0.08245563, ..., 0.19948283, 0.62061479,
       0.05624542])

In [73]:
# Display the score vector currently bound to this name.
lgb_predictions

array([0.2386743 , 0.38075927, 0.12050674, ..., 0.24197845, 0.59291583,
       0.09094482], dtype=float32)

In [74]:
# Turn each score vector into an (n, 1) column so the vectors can later be
# joined side by side with np.concatenate(..., axis=1).
xgboost_predictions = xgboost_predictions.reshape(-1,1)
lgb_predictions = lgb_predictions.reshape(-1, 1)

In [75]:
# Scores from `model3` on the training split: column 1 of predict_proba,
# shaped as an (n, 1) column.
# NOTE(review): `model3` is only created in cells that appear further down this
# notebook, so this cell raises NameError on a fresh Restart & Run All.
catboost_predictions = model3.predict_proba(x_train)[:, 1].reshape(-1, 1)



In [76]:
# Check that the scores form a single column with one row per sample.
catboost_predictions.shape

(51733, 1)

In [77]:
# Join the three (n, 1) score columns side by side into one training matrix,
# then show its shape as the cell output.
stacked_columns = (xgboost_predictions, lgb_predictions, catboost_predictions)
new_x_train = np.concatenate(stacked_columns, axis=1)
new_x_train.shape

(51733, 3)

In [78]:
# Column 1 of predict_proba for the two boosted models on the held-out split.
# This rebinds the names that previously held the training-split scores.
# NOTE(review): the names look swapped -- in the visible cells `model` is the
# lgb.LGBMClassifier and `model1` the XGBClassifier. The training-split cell
# uses the same naming, so rename both together to keep column order aligned.
xgboost_predictions = model.predict_proba(x_test)[:, 1]
lgb_predictions = model1.predict_proba(x_test)[:, 1]

In [79]:
# Shape every base-model score vector for the held-out split as an (n, 1)
# column; the `model3` column is computed and reshaped in one expression.
xgboost_predictions = xgboost_predictions.reshape(-1, 1)
lgb_predictions = lgb_predictions.reshape(-1, 1)
catboost_predictions = model3.predict_proba(x_test)[:, 1].reshape(-1, 1)

In [80]:
# Join the three (n, 1) score columns side by side, in the same column order
# as new_x_train.
new_x_test = np.concatenate((xgboost_predictions, lgb_predictions, catboost_predictions), axis=1)

In [65]:
# Recompute the `model3` column for the held-out split and display it.
# NOTE(review): this repeats the catboost lines of the In [79] cell; one of the
# two copies is probably redundant.
catboost_predictions = model3.predict_proba(x_test)[:, 1].reshape(-1, 1)
catboost_predictions

array([[0.57816954],
       [0.16246426],
       [0.68261432],
       ...,
       [0.0514024 ],
       [0.27861476],
       [0.27675975]])

In [66]:
# Join the three (n, 1) score columns side by side for the held-out split.
# NOTE(review): identical to the In [80] cell; one copy is probably redundant.
new_x_test = np.concatenate((xgboost_predictions, lgb_predictions, catboost_predictions), axis=1)

In [67]:
# Check the row count and that there is one column per base model.
new_x_test.shape

(12934, 3)

In [81]:
# Persist the stacked score matrices and the matching labels under hehe/.
for out_path, values in (
    ("hehe/new_x_train_v2.npy", new_x_train),
    ("hehe/y_train.npy", y_train),
    ("hehe/new_x_test_v2.npy", new_x_test),
    ("hehe/y_test.npy", y_test),
):
    np.save(out_path, values)

In [17]:
# Pickle the two boosted models, one file each.
# NOTE(review): file names and contents look swapped -- in the visible cells
# `model` is the lgb.LGBMClassifier and `model1` the XGBClassifier, yet `model`
# is written to best_xgboost.pkl and `model1` to best_lgb.pkl. The prediction
# cells use the same swapped naming, so rename everything together rather than
# only here.
for pkl_filename, estimator in (
    ("hehe/best_xgboost.pkl", model),
    ("hehe/best_lgb.pkl", model1),
):
    with open(pkl_filename, 'wb') as handle:
        pickle.dump(estimator, handle)


In [82]:
# Pickle whichever estimator is currently bound to `model3`.
# NOTE(review): `model3` is rebound several times in this notebook, so the
# saved object depends on which of those cells ran last.
pkl_filename = "hehe/best_catboost.pkl"
with open(pkl_filename, 'wb') as handle:
    pickle.dump(model3, handle)

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
# Random forest fitted on the training split. fit() is the last expression, so
# its return value is shown as the cell output.
model2 = RandomForestClassifier(
    n_estimators=5000,
    max_depth=60,
    min_samples_leaf=70,
    random_state=0,
)
model2.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=60, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=70, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [35]:
# ROC AUC of the random forest from column 1 of predict_proba: training split
# first, then the held-out split. Single-argument print(...) prints the same on
# Python 2 and 3 (the bare print statement is a SyntaxError on Python 3).
y_pred = model2.predict_proba(x_train)[:, 1]
performance = roc_auc_score(y_train, y_pred)
print(performance)

y_pred = model2.predict_proba(x_test)[:, 1]
performance = roc_auc_score(y_test, y_pred)
print(performance)

0.7959627031784424
0.7718811668689002


array([0.59382353, 0.20053138, 0.41059164, ..., 0.07868636, 0.2675133 ,
       0.27681837])

In [38]:
from catboost import CatBoostClassifier

# CatBoost with only the seed and verbosity set, fitted on the training split.
# fit() is the last expression, so its return value is shown as the cell output.
model3 = CatBoostClassifier(random_seed=42, verbose=False)
model3.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x1a23a4ba50>

In [40]:
# ROC AUC of `model3` from column 1 of predict_proba: training split first,
# then the held-out split. Single-argument print(...) prints the same on
# Python 2 and 3 (the bare print statement is a SyntaxError on Python 3).
y_pred = model3.predict_proba(x_train)[:, 1]
performance = roc_auc_score(y_train, y_pred)
print(performance)

y_pred = model3.predict_proba(x_test)[:, 1]
performance = roc_auc_score(y_test, y_pred)
print(performance)

0.8498191097054603
0.7945903224839722


In [98]:
# CatBoost with hand-set hyperparameters, fitted on the training split only.
# This rebinds `model3`, replacing whatever estimator the name held before.
# Disabled options kept for reference: od_type='Iter', od_wait=20
model3 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.07,
    depth=8,
    eval_metric='AUC',
    bagging_temperature=0.2,
    random_seed=42,
)
model3.fit(x_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x1a2376db90>

In [99]:
# ROC AUC of `model3` from column 1 of predict_proba: training split first,
# then the held-out split. Single-argument print(...) prints the same on
# Python 2 and 3 (the bare print statement is a SyntaxError on Python 3).
y_pred = model3.predict_proba(x_train)[:, 1]
performance = roc_auc_score(y_train, y_pred)
print(performance)

y_pred = model3.predict_proba(x_test)[:, 1]
performance = roc_auc_score(y_test, y_pred)
print(performance)

0.8916588220926331
0.7966545951543388


In [100]:
# Reload the complete feature matrix and labels, then fit the same CatBoost
# configuration on every row (no hold-out).
# NOTE(review): this rebinds `x`, `y` and `model3`. x_test was split from the
# same file, so any AUC computed on x_test after this cell is measured on rows
# the model was trained on.
# Disabled options kept for reference: od_type='Iter', od_wait=20
x, y = np.load("data/x_normalized.npy"), np.load("data/y.npy")
model3 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.07,
    depth=8,
    eval_metric='AUC',
    bagging_temperature=0.2,
    random_seed=42,
)
model3.fit(x, y, verbose=False)

<catboost.core.CatBoostClassifier at 0x1a23641090>

In [101]:
# Score the features in data/x_test_normalized.npy with `model3` and write the
# result to catboost_2.csv as (id, target) rows.
x_true_test = np.load("data/x_test_normalized.npy")

new_predictions = model3.predict_proba(x_true_test)[:, 1]

# Scores below 0 or above 1 are replaced by values just inside the interval;
# scores already within [0, 1] are left as they are.
new_predictions[new_predictions < 0] = 0.0000001
new_predictions[new_predictions > 1] = 0.9999999

# The id of a row is its position in x_true_test; np.arange builds the same
# 0..n-1 sequence as the former Python-level enumerate loop.
ids = np.arange(len(x_true_test))

df = pd.DataFrame()
df['id'] = ids
df['target'] = new_predictions
df.to_csv("catboost_2.csv", index=False)

# Single-argument print(...) prints the same on Python 2 and 3 (the bare print
# statement is a SyntaxError on Python 3).
print(df.head())

   id    target
0   0  0.285417
1   1  0.053161
2   2  0.130045
3   3  0.372295
4   4  0.202752
