In [85]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
from time import time
import lightgbm as lgb
import pickle
from sklearn.metrics import roc_auc_score

In [86]:
# Load the raw dataset; the path is relative to the notebook's working directory.
raw_df = pd.read_csv('raw_data/adult.csv')

In [87]:
# Preview the first 10 rows to inspect column names and raw value formats.
raw_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [88]:
# (rows, columns) of the raw data.
print(raw_df.shape)

(48842, 15)


In [89]:
# Count NaN values per column.
# NOTE(review): the row preview shows '?' placeholders (e.g. in workclass and
# occupation); isna() does not count those as missing — confirm whether they
# should be treated as missing values.
raw_df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [90]:
# Encode the target: 0 for '<=50K', 1 for any other value.
# The dtype guard makes this cell safe to re-run: once the column has been
# converted to integers, calling .strip() on its values would raise
# AttributeError, so the conversion is skipped when the column is already numeric.
if not pd.api.types.is_numeric_dtype(raw_df['income']):
    raw_df['income'] = raw_df['income'].apply(lambda x: 0 if x.strip() == '<=50K' else 1)

In [91]:
# List all column names of the raw frame.
raw_df.columns.tolist()

['age',
 'workclass',
 'fnlwgt',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [92]:
# Columns treated as categorical: passed to big_catboost as its cat_columns
# argument and one-hot encoded with pd.get_dummies for the other models.
# NOTE(review): 'educational-num' and 'hours-per-week' hold integer values in
# the row preview; confirm that treating them as categorical is intended.
CATEGORICAL_COLUMNS = [
 'workclass',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'hours-per-week',
 'native-country']

In [93]:
def get_categorical_indices(X_train, cat_columns):
    """Return the positional indices of ``cat_columns`` within ``X_train``.

    Names in ``cat_columns`` that are not columns of ``X_train`` are skipped.
    The result follows the order of ``cat_columns``.
    """
    positions = []
    for name in cat_columns:
        if name in X_train:
            positions.append(X_train.columns.get_loc(name))
    return positions

In [94]:
def classification_metrics(y_true, y_pred, y_pred_prob):
    """Print standard binary-classification metrics and return (auc, accuracy).

    Prints, in order: accuracy as a percentage, the sklearn classification
    report, the confusion matrix and the ROC AUC.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted class labels.
    y_pred_prob : scores passed to ``roc_auc_score`` (callers in this notebook
        pass the positive-class probability).

    Returns
    -------
    tuple
        ``(auc, accuracy)``, both rounded to 3 decimals.
    """
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
        roc_auc_score,
    )

    # Accuracy is rounded first, so the printed percentage carries one
    # significant decimal even though two are displayed.
    acc_rounded = round(accuracy_score(y_true, y_pred), 3)
    print("Accuracy - test set: %.2f%%" % (acc_rounded * 100.0))

    # Per-class precision / recall / f1
    print(classification_report(y_true, y_pred))

    # Confusion matrix
    print(confusion_matrix(y_true, y_pred))

    # ROC AUC from the scores
    auc_rounded = round(roc_auc_score(y_true, y_pred_prob), 3)
    print('AUC {0}'.format(auc_rounded))

    return auc_rounded, acc_rounded

In [95]:
def big_catboost(df_train, label_column, cat_columns):
    """Train CatBoost with 5-fold stratified CV and save out-of-fold predictions.

    For every fold a fresh ``CatBoostClassifier`` is fitted on the training
    split, saved under ``models_weights/catboost_weights/`` and evaluated on
    the validation split via ``classification_metrics``. After all folds the
    overall out-of-fold AUC is printed and the positive-class probabilities
    are written to ``oof_preds/oof_preds_big_catboost_train.csv``.

    NOTE(review): the validation split is also passed as ``eval_set`` with
    ``use_best_model=True``, so the iteration count is selected on the same
    data the fold metrics are computed on — the reported scores may be
    optimistic. Confirm this is acceptable.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column.
    label_column : str
        Name of the target column in ``df_train``.
    cat_columns : list of str
        Columns handed to CatBoost as categorical features; names that are
        not present in the frame are ignored.

    Returns
    -------
    None
    """
    import os
    from sklearn.metrics import roc_auc_score

    model_path_template = 'models_weights/catboost_weights/catboost_model_{0}.dump'
    oof_path = 'oof_preds/oof_preds_big_catboost_train.csv'
    # Create the output directories up front: without them the first save
    # fails only after a full fold has already been trained.
    os.makedirs(os.path.dirname(model_path_template), exist_ok=True)
    os.makedirs(os.path.dirname(oof_path), exist_ok=True)

    # LABEL SEPARATION
    y_train = df_train[label_column]
    df_train = df_train.drop(label_column, axis=1)

    # Column positions are identical for every fold, so resolve them once.
    cat_indices = get_categorical_indices(df_train, cat_columns)

    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # RUN
    # One row per sample, one column per class; each row is filled by the
    # fold in which that sample is held out.
    oof_preds = np.zeros((len(df_train), 2))

    for c, (train, valid) in enumerate(cv.split(df_train, y_train)):
        print("VAL %s" % c)
        X_train = df_train.iloc[train]
        Y_train = y_train.iloc[train]
        X_valid = df_train.iloc[valid]
        Y_valid = y_train.iloc[valid]

        model = CatBoostClassifier(verbose=100)

        model.fit(X_train, Y_train, eval_set=(X_valid, Y_valid), use_best_model=True,
                  cat_features=cat_indices)
        oof_preds[valid] = model.predict_proba(X_valid)
        model.save_model(model_path_template.format(c))

        # EVALUATION PHASE
        classification_metrics(Y_valid, model.predict(X_valid), oof_preds[valid, 1])

    # Keep only the positive-class probability.
    positive_probs = oof_preds[:, 1]
    auc = roc_auc_score(y_train, positive_probs)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame({'ID_code': df_train.index.tolist(),
                                'target': positive_probs})
    oof_pred_df.to_csv(oof_path, index=False)

In [96]:
# Run the CatBoost cross-validation on the raw (non one-hot) frame and print
# the elapsed wall-clock time in seconds.
start = time()
big_catboost(raw_df, 'income', CATEGORICAL_COLUMNS)
print('Catboost time', time() - start)

VAL 0
Learning rate set to 0.114656
0:	learn: 0.5834288	test: 0.5822951	best: 0.5822951 (0)	total: 101ms	remaining: 1m 40s
100:	learn: 0.2775209	test: 0.2856788	best: 0.2856788 (100)	total: 9.63s	remaining: 1m 25s
200:	learn: 0.2681615	test: 0.2792384	best: 0.2792196 (198)	total: 19.8s	remaining: 1m 18s
300:	learn: 0.2635871	test: 0.2773754	best: 0.2773754 (300)	total: 30.6s	remaining: 1m 10s
400:	learn: 0.2604416	test: 0.2764249	best: 0.2764195 (396)	total: 41s	remaining: 1m 1s
500:	learn: 0.2584080	test: 0.2760876	best: 0.2760838 (475)	total: 51.5s	remaining: 51.3s
600:	learn: 0.2563432	test: 0.2758985	best: 0.2758945 (598)	total: 1m 1s	remaining: 41.1s
700:	learn: 0.2545192	test: 0.2756325	best: 0.2756144 (657)	total: 1m 13s	remaining: 31.5s
800:	learn: 0.2527578	test: 0.2754641	best: 0.2754480 (792)	total: 1m 30s	remaining: 22.4s
900:	learn: 0.2511441	test: 0.2753642	best: 0.2753101 (853)	total: 1m 49s	remaining: 12s
999:	learn: 0.2495482	test: 0.2753047	best: 0.2752931 (996)	total

KeyboardInterrupt: 

In [None]:
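# NOTE: presumably output pasted from an earlier, completed run of big_catboost
# (the execution above ended in KeyboardInterrupt); the last line is seconds.
# Accuracy - test set: 87.70%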
#               precision    recall  f1-score   support

#            0       0.90      0.94      0.92      7431
#            1       0.79      0.66      0.72      2337

#     accuracy                           0.88      9768
#    macro avg       0.84      0.80      0.82      9768
# weighted avg       0.87      0.88      0.87      9768

# [[7009  422]
#  [ 784 1553]]
# AUC 0.931
# CV_AUC: 0.9301693419797552
# Catboost time 1449.4976069927216

In [28]:
# One-hot encode the columns listed in CATEGORICAL_COLUMNS; all other columns
# (including the 'income' label) are carried over unchanged.
df_dummies = pd.get_dummies(raw_df, columns=CATEGORICAL_COLUMNS)

In [29]:
# (rows, columns) after one-hot encoding.
print(df_dummies.shape)

(48842, 219)


In [81]:
def big_rf(df_train, label_column):
    """Train a random forest with 5-fold stratified CV and save OOF predictions.

    For every fold a fresh ``RandomForestClassifier`` (100 trees) is fitted on
    the training split, pickled under ``models_weights/rf_weights/`` and
    evaluated on the validation split via ``classification_metrics``. After
    all folds the overall out-of-fold AUC is printed and the positive-class
    probabilities are written to ``oof_preds/oof_preds_big_rf_train.csv``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column; features must already be
        numeric (the notebook passes the one-hot encoded frame).
    label_column : str
        Name of the target column in ``df_train``.

    Returns
    -------
    None
    """
    import os
    import pickle
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score

    # Same seed for the splitter and the estimator so repeated runs give the
    # same folds AND the same forests.
    seed = 7

    oof_path = 'oof_preds/oof_preds_big_rf_train.csv'
    # Create the output directories before training so a missing folder does
    # not surface only after a model has been fitted.
    os.makedirs('models_weights/rf_weights', exist_ok=True)
    os.makedirs(os.path.dirname(oof_path), exist_ok=True)

    # LABEL SEPARATION
    y_train = df_train[label_column]
    df_train = df_train.drop(label_column, axis=1)
    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # RUN
    # One row per sample, one column per class; each row is filled by the
    # fold in which that sample is held out.
    oof_preds = np.zeros((len(df_train), 2))

    for c, (train, valid) in enumerate(cv.split(df_train, y_train)):
        print("VAL %s" % c)
        X_train = df_train.iloc[train]
        Y_train = y_train.iloc[train]
        X_valid = df_train.iloc[valid]
        Y_valid = y_train.iloc[valid]

        model = RandomForestClassifier(n_estimators=100, verbose=2, random_state=seed)

        model.fit(X_train, Y_train)
        oof_preds[valid] = model.predict_proba(X_valid)
        filename = f'models_weights/rf_weights/rf_model_{c}.sav'
        # Context manager guarantees the file is flushed and closed.
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE
        classification_metrics(Y_valid, model.predict(X_valid), oof_preds[valid, 1])

    # Keep only the positive-class probability.
    positive_probs = oof_preds[:, 1]
    auc = roc_auc_score(y_train, positive_probs)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame({'ID_code': df_train.index.tolist(),
                                'target': positive_probs})
    oof_pred_df.to_csv(oof_path, index=False)



In [15]:
# Run the random-forest cross-validation on the one-hot encoded frame and
# print the elapsed wall-clock time in seconds.
start = time()
big_rf(df_dummies, 'income')
print(time() - start)

VAL 0
building tree 1 of 100


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   13.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 85.20%
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      7431
           1       0.73      0.61      0.66      2338

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.78      9769
weighted avg       0.85      0.85      0.85      9769

[[6908  523]
 [ 923 1415]]
AUC 0.897
VAL 1


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   13.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 84.80%
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      7431
           1       0.72      0.60      0.66      2338

    accuracy                           0.85      9769
   macro avg       0.80      0.76      0.78      9769
weighted avg       0.84      0.85      0.84      9769

[[6870  561]
 [ 924 1414]]
AUC 0.895
VAL 2


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   13.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 85.50%
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      7431
           1       0.74      0.61      0.67      2337

    accuracy                           0.86      9768
   macro avg       0.81      0.77      0.79      9768
weighted avg       0.85      0.86      0.85      9768

[[6918  513]
 [ 902 1435]]
AUC 0.903
VAL 3
building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   13.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 85.70%
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7431
           1       0.73      0.63      0.68      2337

    accuracy                           0.86      9768
   macro avg       0.81      0.78      0.79      9768
weighted avg       0.85      0.86      0.85      9768

[[6896  535]
 [ 860 1477]]
AUC 0.901
VAL 4
building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   13.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


Accuracy - test set: 85.30%
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      7431
           1       0.72      0.63      0.67      2337

    accuracy                           0.85      9768
   macro avg       0.81      0.78      0.79      9768
weighted avg       0.85      0.85      0.85      9768

[[6872  559]
 [ 875 1462]]
AUC 0.9
CV_AUC: 0.8989343424379796
72.79848980903625


In [None]:
# Accuracy - test set: 85.30%
#               precision    recall  f1-score   support

#            0       0.89      0.92      0.91      7431
#            1       0.72      0.63      0.67      2337

#     accuracy                           0.85      9768
#    macro avg       0.81      0.78      0.79      9768
# weighted avg       0.85      0.85      0.85      9768

# [[6872  559]
#  [ 875 1462]]
# AUC 0.9
# CV_AUC: 0.8989343424379796
# 72.7984898090362

In [82]:
def big_ext(df_train, label_column):
    """Train extra-trees with 5-fold stratified CV and save OOF predictions.

    For every fold a fresh ``ExtraTreesClassifier`` (100 trees) is fitted on
    the training split, pickled under ``model_weights/ext_weights/`` and
    evaluated on the validation split via ``classification_metrics``. After
    all folds the overall out-of-fold AUC is printed and the positive-class
    probabilities are written to ``oof_preds/oof_preds_big_ext_train.csv``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column; features must already be
        numeric (the notebook passes the one-hot encoded frame).
    label_column : str
        Name of the target column in ``df_train``.

    Returns
    -------
    None
    """
    import os
    import pickle
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.metrics import roc_auc_score

    # Same seed for the splitter and the estimator so repeated runs give the
    # same folds AND the same trees.
    seed = 7

    oof_path = 'oof_preds/oof_preds_big_ext_train.csv'
    # NOTE(review): this function saves under 'model_weights/' while big_rf and
    # big_catboost use 'models_weights/' — confirm which directory is intended.
    # Directories are created up front so a missing folder does not surface
    # only after a model has been fitted.
    os.makedirs('model_weights/ext_weights', exist_ok=True)
    os.makedirs(os.path.dirname(oof_path), exist_ok=True)

    # LABEL SEPARATION
    y_train = df_train[label_column]
    df_train = df_train.drop(label_column, axis=1)
    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # RUN
    # One row per sample, one column per class; each row is filled by the
    # fold in which that sample is held out.
    oof_preds = np.zeros((len(df_train), 2))

    for c, (train, valid) in enumerate(cv.split(df_train, y_train)):
        print("VAL %s" % c)
        X_train = df_train.iloc[train]
        Y_train = y_train.iloc[train]
        X_valid = df_train.iloc[valid]
        Y_valid = y_train.iloc[valid]

        model = ExtraTreesClassifier(n_estimators=100, verbose=2, random_state=seed)

        model.fit(X_train, Y_train)
        oof_preds[valid] = model.predict_proba(X_valid)
        filename = f'model_weights/ext_weights/ext_model_{c}.sav'
        # Context manager guarantees the file is flushed and closed.
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE
        classification_metrics(Y_valid, model.predict(X_valid), oof_preds[valid, 1])

    # Keep only the positive-class probability.
    positive_probs = oof_preds[:, 1]
    auc = roc_auc_score(y_train, positive_probs)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame({'ID_code': df_train.index.tolist(),
                                'target': positive_probs})
    oof_pred_df.to_csv(oof_path, index=False)

In [17]:
# Run the extra-trees cross-validation on the one-hot encoded frame and print
# the elapsed wall-clock time in seconds.
start = time()
big_ext(df_dummies, 'income')
print(time() - start)

VAL 0
building tree 1 of 100


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   22.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 82.80%
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      7431
           1       0.66      0.58      0.62      2338

    accuracy                           0.83      9769
   macro avg       0.77      0.74      0.75      9769
weighted avg       0.82      0.83      0.82      9769

[[6726  705]
 [ 978 1360]]
AUC 0.869
VAL 1


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   18.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 82.70%
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      7431
           1       0.66      0.58      0.62      2338

    accuracy                           0.83      9769
   macro avg       0.76      0.74      0.75      9769
weighted avg       0.82      0.83      0.82      9769

[[6714  717]
 [ 972 1366]]
AUC 0.867
VAL 2
building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   18.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 83.10%
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      7431
           1       0.67      0.59      0.62      2337

    accuracy                           0.83      9768
   macro avg       0.77      0.75      0.76      9768
weighted avg       0.83      0.83      0.83      9768

[[6751  680]
 [ 969 1368]]
AUC 0.875
VAL 3
building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   18.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy - test set: 83.20%
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      7431
           1       0.67      0.60      0.63      2337

    accuracy                           0.83      9768
   macro avg       0.77      0.75      0.76      9768
weighted avg       0.83      0.83      0.83      9768

[[6730  701]
 [ 936 1401]]
AUC 0.872
VAL 4
building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   18.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


Accuracy - test set: 82.80%
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      7431
           1       0.66      0.59      0.62      2337

    accuracy                           0.83      9768
   macro avg       0.77      0.75      0.75      9768
weighted avg       0.82      0.83      0.82      9768

[[6721  710]
 [ 967 1370]]
AUC 0.871
CV_AUC: 0.870979102261786
103.5144693851471


In [None]:
# Accuracy - test set: 82.80%
#               precision    recall  f1-score   support

#            0       0.87      0.90      0.89      7431
#            1       0.66      0.59      0.62      2337

#     accuracy                           0.83      9768
#    macro avg       0.77      0.75      0.75      9768
# weighted avg       0.82      0.83      0.82      9768

# [[6721  710]
#  [ 967 1370]]
# AUC 0.871
# CV_AUC: 0.870979102261786
# 103.5144693851471

In [83]:
def big_xgboost(df_train, label_column):
    """Run 5-fold stratified cross-validation with a default XGBoost classifier.

    For every fold a fresh ``xgb.XGBClassifier`` is fitted on the training
    split, pickled to ``model_weights/xgb_weights/xgboost_model_<fold>.sav``
    and evaluated on the held-out split through ``classification_metrics``.
    The out-of-fold probabilities (second column of ``predict_proba``) are
    then scored with ROC AUC and written to
    ``oof_preds_big_xgboost_train.csv``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column. The caller's frame is left
        untouched; the label is dropped from a copy.
    label_column : str
        Name of the column in ``df_train`` that holds the binary target.

    Returns
    -------
    None
        Results are printed and persisted to disk.
    """
    # Imported locally: `os` is not part of the notebook's top import cell.
    import os

    # LABEL SEPARATION
    y_train = df_train[label_column]
    df_train = df_train.drop(label_column, axis=1)

    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # Make sure the output directory exists before any training time is spent,
    # otherwise the first pickle dump fails after a full fold has been fitted.
    weights_dir = 'model_weights/xgb_weights'
    os.makedirs(weights_dir, exist_ok=True)

    # RUN
    # Only the second predict_proba column is ever consumed, so a flat
    # buffer is enough.
    oof_preds = np.zeros(len(df_train))

    for c, (train, valid) in enumerate(cv.split(df_train, y_train)):
        print("VAL %s" % c)
        X_train, Y_train = df_train.iloc[train], y_train.iloc[train]
        X_valid, Y_valid = df_train.iloc[valid], y_train.iloc[valid]

        model = xgb.XGBClassifier(objective='binary:logistic', verbosity=1)
        model.fit(X_train, Y_train)
        oof_preds[valid] = model.predict_proba(X_valid)[:, 1]

        # The context manager closes the file even if pickling raises.
        filename = f'{weights_dir}/xgboost_model_{c}.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE
        classification_metrics(Y_valid, model.predict(X_valid), list(oof_preds[valid]))

    auc = roc_auc_score(y_train, oof_preds)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame({
        'ID_code': df_train.index.tolist(),
        'target': oof_preds,
    })
    oof_pred_df.to_csv('oof_preds_big_xgboost_train.csv', index=False)

In [19]:
# Sanity-check the size (rows, columns) of the frame handed to the boosting runs below;
# the column count still includes the 'income' label.
print(df_dummies.shape)

(48842, 219)


In [20]:
# Wall-clock seconds for the complete XGBoost cross-validation on df_dummies.
start = time()
big_xgboost(df_train=df_dummies, label_column='income')
print(time() - start)

VAL 0


  if getattr(data, 'base', None) is not None and \


Accuracy - test set: 86.10%
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7431
           1       0.79      0.57      0.66      2338

    accuracy                           0.86      9769
   macro avg       0.83      0.76      0.79      9769
weighted avg       0.85      0.86      0.85      9769

[[7077  354]
 [1007 1331]]
AUC 0.911
VAL 1


  if getattr(data, 'base', None) is not None and \


Accuracy - test set: 85.80%
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      7431
           1       0.78      0.56      0.65      2338

    accuracy                           0.86      9769
   macro avg       0.83      0.76      0.78      9769
weighted avg       0.85      0.86      0.85      9769

[[7066  365]
 [1022 1316]]
AUC 0.91
VAL 2


  if getattr(data, 'base', None) is not None and \


Accuracy - test set: 86.30%
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7431
           1       0.79      0.58      0.67      2337

    accuracy                           0.86      9768
   macro avg       0.84      0.77      0.79      9768
weighted avg       0.86      0.86      0.86      9768

[[7075  356]
 [ 978 1359]]
AUC 0.917
VAL 3


  if getattr(data, 'base', None) is not None and \


Accuracy - test set: 86.40%
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7431
           1       0.80      0.58      0.67      2337

    accuracy                           0.86      9768
   macro avg       0.84      0.77      0.79      9768
weighted avg       0.86      0.86      0.86      9768

[[7086  345]
 [ 986 1351]]
AUC 0.919
VAL 4


  if getattr(data, 'base', None) is not None and \


Accuracy - test set: 86.50%
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7431
           1       0.80      0.58      0.67      2337

    accuracy                           0.86      9768
   macro avg       0.84      0.77      0.79      9768
weighted avg       0.86      0.86      0.86      9768

[[7082  349]
 [ 973 1364]]
AUC 0.916
CV_AUC: 0.9143841420990975
148.65156865119934


In [None]:
# Recorded output of the big_xgboost run above (VAL 4 metrics, overall CV AUC, elapsed seconds):
# Accuracy - test set: 86.50%
#               precision    recall  f1-score   support

#            0       0.88      0.95      0.91      7431
#            1       0.80      0.58      0.67      2337

#     accuracy                           0.86      9768
#    macro avg       0.84      0.77      0.79      9768
# weighted avg       0.86      0.86      0.86      9768

# [[7082  349]
#  [ 973 1364]]
# AUC 0.916
# CV_AUC: 0.9143841420990975
# 148.65156865119934

In [84]:
def big_lgbm(df_train, label_column):
    """Run 5-fold stratified cross-validation with a default LightGBM classifier.

    For every fold a fresh ``lgb.LGBMClassifier`` is fitted on the training
    split, pickled to ``model_weights/lgbm_weights/lgbmboost_model_<fold>.sav``
    and evaluated on the held-out split through ``classification_metrics``.
    The out-of-fold probabilities (second column of ``predict_proba``) are
    then scored with ROC AUC and written to
    ``oof_preds_big_lgbmboost_train.csv``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns plus the label column. The caller's frame is left
        untouched; the label is dropped from a copy.
    label_column : str
        Name of the column in ``df_train`` that holds the binary target.

    Returns
    -------
    None
        Results are printed and persisted to disk.
    """
    # Imported locally: `os` is not part of the notebook's top import cell.
    import os

    # LABEL SEPARATION
    y_train = df_train[label_column]
    df_train = df_train.drop(label_column, axis=1)

    # CROSS VALIDATION
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

    # Make sure the output directory exists before any training time is spent,
    # otherwise the first pickle dump fails after a full fold has been fitted.
    weights_dir = 'model_weights/lgbm_weights'
    os.makedirs(weights_dir, exist_ok=True)

    # RUN
    # Only the second predict_proba column is ever consumed, so a flat
    # buffer is enough.
    oof_preds = np.zeros(len(df_train))

    for c, (train, valid) in enumerate(cv.split(df_train, y_train)):
        print("VAL %s" % c)
        X_train, Y_train = df_train.iloc[train], y_train.iloc[train]
        X_valid, Y_valid = df_train.iloc[valid], y_train.iloc[valid]

        # NOTE(review): `silent` may not be accepted by every LightGBM
        # release — confirm against the installed version.
        model = lgb.LGBMClassifier(objective='binary', silent=False)
        model.fit(X_train, Y_train)
        oof_preds[valid] = model.predict_proba(X_valid)[:, 1]

        # The context manager closes the file even if pickling raises.
        filename = f'{weights_dir}/lgbmboost_model_{c}.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # EVALUATION PHASE
        classification_metrics(Y_valid, model.predict(X_valid), list(oof_preds[valid]))

    auc = roc_auc_score(y_train, oof_preds)
    print("CV_AUC: {}".format(auc))

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame({
        'ID_code': df_train.index.tolist(),
        'target': oof_preds,
    })
    oof_pred_df.to_csv('oof_preds_big_lgbmboost_train.csv', index=False)

In [38]:
# Wall-clock seconds for the complete LightGBM cross-validation on df_dummies.
start = time()
big_lgbm(df_train=df_dummies, label_column='income')
print(time() - start)

VAL 0
Accuracy - test set: 87.30%
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      7431
           1       0.78      0.66      0.71      2338

    accuracy                           0.87      9769
   macro avg       0.84      0.80      0.82      9769
weighted avg       0.87      0.87      0.87      9769

[[6997  434]
 [ 803 1535]]
AUC 0.926
VAL 1
Accuracy - test set: 87.00%
              precision    recall  f1-score   support

           0       0.89      0.94      0.92      7431
           1       0.78      0.64      0.70      2338

    accuracy                           0.87      9769
   macro avg       0.84      0.79      0.81      9769
weighted avg       0.87      0.87      0.87      9769

[[7002  429]
 [ 838 1500]]
AUC 0.924
VAL 2
Accuracy - test set: 87.60%
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      7431
           1       0.79      0.65      0.72      2337

    ac