In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
from time import time
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# Load the raw training table from the project-relative raw_data/ folder.
df_train = pd.read_csv('raw_data/train.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df_train.head(5)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [4]:
# Count rows per target class to check the class balance.
df_train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [5]:
# Total number of missing values across every column of the frame.
df_train.isna().sum().sum()

0

In [6]:
# Use ID_code as the row index so it is not treated as a feature downstream.
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# (after the first run ID_code is no longer a column).
if 'ID_code' in df_train.columns:
    df_train.set_index('ID_code', inplace=True)

In [7]:
def classification_metrics(y_true, y_pred, y_pred_prob):
    """Print binary-classification diagnostics and return the headline scores.

    Prints, in order: the accuracy as a percentage, the sklearn
    classification report, the confusion matrix and the ROC AUC.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard label predictions; used for accuracy, the classification
        report and the confusion matrix.
    y_pred_prob : array-like
        Scores handed to ``roc_auc_score`` together with ``y_true``.

    Returns
    -------
    tuple
        ``(auc, accuracy)``, both rounded to three decimals.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

    # Accuracy is rounded first, so the percentage printed below is derived
    # from the 3-decimal value rather than from the raw score.
    rounded_accuracy = round(accuracy_score(y_true, y_pred), 3)
    print("Accuracy - test set: %.2f%%" % (rounded_accuracy * 100.0))

    # Per-class precision / recall / f1 table.
    print(classification_report(y_true, y_pred))

    # Confusion matrix of true labels against predicted labels.
    print(confusion_matrix(y_true, y_pred))

    # Ranking quality of the scores.
    rounded_auc = round(roc_auc_score(y_true, y_pred_prob), 3)
    print('AUC {0}'.format(rounded_auc))

    return rounded_auc, rounded_accuracy

In [41]:
def big_catboost(df_train, label_column):
    """Cross-validate a CatBoost classifier and persist models plus OOF scores.

    Runs a 5-fold stratified split (shuffled, ``random_state=7``). For each
    fold a ``CatBoostClassifier`` is fitted with the held-out fold as
    ``eval_set`` and ``use_best_model=True``, the fitted model is saved under
    ``model_weights/catboost_weights/`` and the fold metrics are printed via
    ``classification_metrics``. Afterwards the overall out-of-fold AUC is
    printed and the out-of-fold scores are written to
    ``oof_preds/santander_oof_preds_big_catboost_train.csv``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column. The frame's index values are
        written out as the ``ID_code`` column of the CSV.
    label_column : str
        Name of the label column in ``df_train``.

    Returns
    -------
    None
    """
    # Separate the label from the feature matrix.
    labels = df_train[label_column]
    features = df_train.drop(label_column, axis=1)

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # One row per sample, one column per class probability.
    fold_probas = np.zeros((len(features), 2))

    for fold_no, (fit_idx, hold_idx) in enumerate(splitter.split(features, labels)):
        print("VAL %s" % fold_no)
        X_fit, y_fit = features.iloc[fit_idx], labels.iloc[fit_idx]
        X_hold, y_hold = features.iloc[hold_idx], labels.iloc[hold_idx]

        booster = CatBoostClassifier(verbose=50)
        booster.fit(X_fit, y_fit, eval_set=(X_hold, y_hold), use_best_model=True)

        fold_probas[hold_idx] = booster.predict_proba(X_hold)
        booster.save_model('model_weights/catboost_weights/santader_catboost_model_{0}.dump'.format(fold_no))

        # Per-fold report: hard labels for the report, column 1 for the AUC.
        classification_metrics(y_hold, booster.predict(X_hold), [p[1] for p in fold_probas[hold_idx]])

    # Keep only the second probability column for the pooled score and the CSV.
    positive_scores = list(fold_probas[:, 1])
    auc = roc_auc_score(labels, positive_scores)
    print("CV_AUC: {}".format(auc))

    # Persist the out-of-fold scores keyed by the original index values.
    oof_pred_df = pd.DataFrame({
        'ID_code': pd.Series(features.index.tolist()),
        'target': pd.Series(positive_scores),
    })
    oof_pred_df.to_csv('oof_preds/santander_oof_preds_big_catboost_train.csv', index=False)

In [8]:
def big_rf(df_train, label_column):
    """Cross-validate a random forest and persist models plus OOF scores.

    Runs a 5-fold stratified split (shuffled, ``random_state=7``). For each
    fold a 100-tree ``RandomForestClassifier`` is fitted, pickled under
    ``model_weights/rf_weights/`` and evaluated on the held-out fold via
    ``classification_metrics``. Afterwards the overall out-of-fold AUC is
    printed and the out-of-fold scores are written to
    ``oof_preds/santander_oof_preds_big_rf_train.csv``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column. The frame's index values are
        written out as the ``ID_code`` column of the CSV.
    label_column : str
        Name of the label column in ``df_train``.

    Returns
    -------
    None
    """
    # LABEL SEPARATION
    y_train = df_train[label_column]
    features = df_train.drop(label_column, axis=1)

    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # One row per sample, one column per class probability.
    oof_preds = np.zeros((len(features), 2))

    for fold, (train_idx, valid_idx) in enumerate(cv.split(features, y_train)):
        print("VAL %s" % fold)
        X_train = features.iloc[train_idx]
        Y_train = y_train.iloc[train_idx]
        X_valid = features.iloc[valid_idx]
        Y_valid = y_train.iloc[valid_idx]

        model = RandomForestClassifier(n_estimators=100, verbose=2)
        model.fit(X_train, Y_train)
        oof_preds[valid_idx] = model.predict_proba(X_valid)

        # Use a context manager so the file handle is flushed and closed
        # deterministically instead of being left open until garbage collection.
        filename = f'model_weights/rf_weights/santander_rf_model_{fold}.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE: hard labels for the report, column 1 for the AUC.
        classification_metrics(Y_valid, model.predict(X_valid), oof_preds[valid_idx, 1])

    # Keep only the second probability column for the pooled score and the CSV.
    positive_scores = oof_preds[:, 1]
    auc = roc_auc_score(y_train, positive_scores)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame(columns=['ID_code', 'target'])
    oof_pred_df['ID_code'] = pd.Series(features.index.tolist())
    oof_pred_df['target'] = pd.Series(positive_scores)
    oof_pred_df.to_csv('oof_preds/santander_oof_preds_big_rf_train.csv', index=False)

In [9]:
def big_ext(df_train, label_column):
    """Cross-validate an extra-trees classifier and persist models plus OOF scores.

    Runs a 5-fold stratified split (shuffled, ``random_state=7``). For each
    fold a 100-tree ``ExtraTreesClassifier`` is fitted, pickled under
    ``model_weights/ext_weights/`` and evaluated on the held-out fold via
    ``classification_metrics``. Afterwards the overall out-of-fold AUC is
    printed and the out-of-fold scores are written to
    ``oof_preds/oof_preds_big_ext_train.csv``.

    ``ExtraTreesClassifier``, ``roc_auc_score`` and ``pickle`` come from the
    notebook's import cell, like every other dependency of this function.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column. The frame's index values are
        written out as the ``ID_code`` column of the CSV.
    label_column : str
        Name of the label column in ``df_train``.

    Returns
    -------
    None
    """
    # LABEL SEPARATION
    y_train = df_train[label_column]
    features = df_train.drop(label_column, axis=1)

    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # One row per sample, one column per class probability.
    oof_preds = np.zeros((len(features), 2))

    for fold, (train_idx, valid_idx) in enumerate(cv.split(features, y_train)):
        print("VAL %s" % fold)
        X_train = features.iloc[train_idx]
        Y_train = y_train.iloc[train_idx]
        X_valid = features.iloc[valid_idx]
        Y_valid = y_train.iloc[valid_idx]

        model = ExtraTreesClassifier(n_estimators=100, verbose=2)
        model.fit(X_train, Y_train)
        oof_preds[valid_idx] = model.predict_proba(X_valid)

        # Use a context manager so the file handle is flushed and closed
        # deterministically instead of being left open until garbage collection.
        filename = f'model_weights/ext_weights/ext_model_{fold}.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE: hard labels for the report, column 1 for the AUC.
        classification_metrics(Y_valid, model.predict(X_valid), oof_preds[valid_idx, 1])

    # Keep only the second probability column for the pooled score and the CSV.
    positive_scores = oof_preds[:, 1]
    auc = roc_auc_score(y_train, positive_scores)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame(columns=['ID_code', 'target'])
    oof_pred_df['ID_code'] = pd.Series(features.index.tolist())
    oof_pred_df['target'] = pd.Series(positive_scores)
    oof_pred_df.to_csv('oof_preds/oof_preds_big_ext_train.csv', index=False)

In [10]:
def big_xgboost(df_train, label_column):
    """Cross-validate an XGBoost classifier and persist models plus OOF scores.

    Runs a 5-fold stratified split (shuffled, ``random_state=7``). For each
    fold an ``xgb.XGBClassifier`` (``binary:logistic`` objective) is fitted
    directly on the pandas fold, pickled under ``model_weights/xgb_weights/``
    and evaluated on the held-out fold via ``classification_metrics``.
    Afterwards the overall out-of-fold AUC is printed and the out-of-fold
    scores are written to ``oof_preds/oof_preds_big_xgboost_train.csv``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column. The frame's index values are
        written out as the ``ID_code`` column of the CSV.
    label_column : str
        Name of the label column in ``df_train``.

    Returns
    -------
    None
    """
    # LABEL SEPARATION
    y_train = df_train[label_column]
    features = df_train.drop(label_column, axis=1)

    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # One row per sample, one column per class probability.
    oof_preds = np.zeros((len(features), 2))

    for fold, (train_idx, valid_idx) in enumerate(cv.split(features, y_train)):
        print("VAL %s" % fold)
        X_train = features.iloc[train_idx]
        Y_train = y_train.iloc[train_idx]
        X_valid = features.iloc[valid_idx]
        Y_valid = y_train.iloc[valid_idx]

        # No xgb.DMatrix is built here: the sklearn-style estimator is fitted
        # on the DataFrame itself, so a DMatrix would only be an unused copy
        # of the training fold.
        model = xgb.XGBClassifier(objective='binary:logistic', verbosity=1)
        model.fit(X_train, Y_train)
        oof_preds[valid_idx] = model.predict_proba(X_valid)

        # Use a context manager so the file handle is flushed and closed
        # deterministically instead of being left open until garbage collection.
        filename = f'model_weights/xgb_weights/santander_xgboost_model_{fold}.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE: hard labels for the report, column 1 for the AUC.
        classification_metrics(Y_valid, model.predict(X_valid), oof_preds[valid_idx, 1])

    # Keep only the second probability column for the pooled score and the CSV.
    positive_scores = oof_preds[:, 1]
    auc = roc_auc_score(y_train, positive_scores)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame(columns=['ID_code', 'target'])
    oof_pred_df['ID_code'] = pd.Series(features.index.tolist())
    oof_pred_df['target'] = pd.Series(positive_scores)
    oof_pred_df.to_csv('oof_preds/oof_preds_big_xgboost_train.csv', index=False)

In [11]:
def big_lgbm(dfbig_ext, label_column):
    """Cross-validate a LightGBM classifier and persist models plus OOF scores.

    Runs a 5-fold stratified split (shuffled, ``random_state=7``). For each
    fold an ``lgb.LGBMClassifier`` (``binary`` objective) is fitted directly
    on the pandas fold, pickled under ``model_weights/lgbm_weights/`` and
    evaluated on the held-out fold via ``classification_metrics``. Afterwards
    the overall out-of-fold AUC is printed and the out-of-fold scores are
    written to ``oof_preds/oof_preds_big_lgbmboost_train.csv``.

    Parameters
    ----------
    dfbig_ext : pandas.DataFrame
        Feature columns plus the label column. The frame's index values are
        written out as the ``ID_code`` column of the CSV. The parameter name
        is kept as-is so the signature does not change; the body now reads
        from this argument rather than from a ``df_train`` local that was
        never bound.
    label_column : str
        Name of the label column in ``dfbig_ext``.

    Returns
    -------
    None
    """
    # LABEL SEPARATION
    # Read from the argument. The previous body read and reassigned a name
    # `df_train` that was local to this function yet never initialised, so
    # every call failed with UnboundLocalError before any training happened.
    y_train = dfbig_ext[label_column]
    features = dfbig_ext.drop(label_column, axis=1)

    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # One row per sample, one column per class probability.
    oof_preds = np.zeros((len(features), 2))

    for fold, (train_idx, valid_idx) in enumerate(cv.split(features, y_train)):
        print("VAL %s" % fold)
        X_train = features.iloc[train_idx]
        Y_train = y_train.iloc[train_idx]
        X_valid = features.iloc[valid_idx]
        Y_valid = y_train.iloc[valid_idx]

        # No lgb.Dataset is built here: the sklearn-style estimator is fitted
        # on the DataFrame itself, so a Dataset would only be an unused object.
        model = lgb.LGBMClassifier(objective='binary')
        model.fit(X_train, Y_train)
        oof_preds[valid_idx] = model.predict_proba(X_valid)

        # Use a context manager so the file handle is flushed and closed
        # deterministically instead of being left open until garbage collection.
        filename = f'model_weights/lgbm_weights/lgbmboost_model_{fold}.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE: hard labels for the report, column 1 for the AUC.
        classification_metrics(Y_valid, model.predict(X_valid), oof_preds[valid_idx, 1])

    # Keep only the second probability column for the pooled score and the CSV.
    positive_scores = oof_preds[:, 1]
    auc = roc_auc_score(y_train, positive_scores)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame(columns=['ID_code', 'target'])
    oof_pred_df['ID_code'] = pd.Series(features.index.tolist())
    oof_pred_df['target'] = pd.Series(positive_scores)
    oof_pred_df.to_csv('oof_preds/oof_preds_big_lgbmboost_train.csv', index=False)

In [46]:
# Run the CatBoost cross-validation on the full training frame and print the
# elapsed wall-clock time in seconds.
start = time()
big_catboost(df_train, 'target')
print(time() - start)

VAL 0
Learning rate set to 0.132949
0:	learn: 0.5866057	test: 0.5865827	best: 0.5865827 (0)	total: 511ms	remaining: 8m 30s
50:	learn: 0.2614047	test: 0.2669151	best: 0.2669151 (50)	total: 18.6s	remaining: 5m 46s
100:	learn: 0.2364212	test: 0.2453022	best: 0.2453022 (100)	total: 37.8s	remaining: 5m 36s
150:	learn: 0.2213842	test: 0.2340841	best: 0.2340841 (150)	total: 56.2s	remaining: 5m 16s
200:	learn: 0.2107474	test: 0.2271728	best: 0.2271728 (200)	total: 1m 13s	remaining: 4m 52s
250:	learn: 0.2028682	test: 0.2224886	best: 0.2224886 (250)	total: 1m 40s	remaining: 4m 59s
300:	learn: 0.1965891	test: 0.2191642	best: 0.2191642 (300)	total: 1m 58s	remaining: 4m 36s
350:	learn: 0.1910515	test: 0.2166981	best: 0.2166981 (350)	total: 2m 17s	remaining: 4m 13s
400:	learn: 0.1862341	test: 0.2148819	best: 0.2148819 (400)	total: 2m 36s	remaining: 3m 53s
450:	learn: 0.1819643	test: 0.2135118	best: 0.2135118 (450)	total: 2m 57s	remaining: 3m 36s
500:	learn: 0.1780296	test: 0.2123780	best: 0.2123780 

In [47]:
# Run the random-forest cross-validation on the full training frame and print
# the elapsed wall-clock time in seconds.
start = time()
big_rf(df_train, 'target')
print(time() - start)

VAL 0


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 13.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.0s finished


Accuracy - test set: 90.00%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35981
           1       1.00      0.00      0.00      4020

    accuracy                           0.90     40001
   macro avg       0.95      0.50      0.47     40001
weighted avg       0.91      0.90      0.85     40001

[[35981     0]
 [ 4018     2]]
AUC 0.816
VAL 1


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.6s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 14.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.5s finished


Accuracy - test set: 90.00%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35981
           1       1.00      0.00      0.00      4020

    accuracy                           0.90     40001
   macro avg       0.95      0.50      0.47     40001
weighted avg       0.91      0.90      0.85     40001

[[35981     0]
 [ 4018     2]]
AUC 0.821
VAL 2


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 14.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.6s finished
  'precision', 'predicted', average, warn_for)


Accuracy - test set: 90.00%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35980
           1       0.00      0.00      0.00      4020

    accuracy                           0.90     40000
   macro avg       0.45      0.50      0.47     40000
weighted avg       0.81      0.90      0.85     40000

[[35980     0]
 [ 4020     0]]
AUC 0.818
VAL 3


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 13.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.6s finished


Accuracy - test set: 90.00%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35980
           1       1.00      0.00      0.00      4019

    accuracy                           0.90     39999
   macro avg       0.95      0.50      0.47     39999
weighted avg       0.91      0.90      0.85     39999

[[35980     0]
 [ 4018     1]]
AUC 0.825
VAL 4


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 13.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.6s finished


Accuracy - test set: 90.00%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35980
           1       1.00      0.00      0.00      4019

    accuracy                           0.90     39999
   macro avg       0.95      0.50      0.47     39999
weighted avg       0.91      0.90      0.85     39999

[[35980     0]
 [ 4015     4]]
AUC 0.825
CV_AUC: 0.8210471890037844
4149.188729047775


In [None]:
# Run the extra-trees cross-validation on the full training frame and print
# the elapsed wall-clock time in seconds.
start = time()
big_ext(df_train, 'target')
print(time() - start)

VAL 0


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  4.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.3s finished
  'precision', 'predicted', average, warn_for)


Accuracy - test set: 90.00%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35981
           1       0.00      0.00      0.00      4020

    accuracy                           0.90     40001
   macro avg       0.45      0.50      0.47     40001
weighted avg       0.81      0.90      0.85     40001

[[35981     0]
 [ 4020     0]]
AUC 0.796
VAL 1


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  4.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.9s finished
  'precision', 'predicted', average, warn_for)


Accuracy - test set: 90.00%
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     35981
           1       0.00      0.00      0.00      4020

    accuracy                           0.90     40001
   macro avg       0.45      0.50      0.47     40001
weighted avg       0.81      0.90      0.85     40001

[[35981     0]
 [ 4020     0]]
AUC 0.804
VAL 2


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


In [None]:
# Run the XGBoost cross-validation on the full training frame and print the
# elapsed wall-clock time in seconds.
start = time()
big_xgboost(df_train, 'target')
print(time() - start)

In [None]:
# Run the LightGBM cross-validation on the full training frame and print the
# elapsed wall-clock time in seconds.
start = time()
big_lgbm(df_train, 'target')
print(time() - start)