In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, RidgeClassifier, ElasticNet, LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
    precision_recall_curve,
    average_precision_score,
    auc,
)

# Development-speed knob: the fraction of rows kept when the loaded data is
# down-sampled. Sampling is only applied for a float strictly between 0 and 1.
SAMPLE_FRAC: float = 0.1


In [2]:
# Load the raw and feature-engineered application tables, optionally
# down-sample them for faster iteration, and create an 80/20 hold-out split.

# One seed for both the row sampling and the train/test splits so that a
# "Restart & Run All" reproduces exactly the same rows and the same split.
RANDOM_STATE = 42


def _load_sampled(csv_path, frac, seed=RANDOM_STATE):
    """Read a CSV and optionally keep a reproducible random fraction of it.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    frac : float
        Fraction of rows to keep. Sampling is applied only when this is a
        float strictly between 0 and 1; any other value returns every row.
    seed : int
        Seed passed to ``DataFrame.sample`` so the same rows are drawn on
        every run.

    Returns
    -------
    pandas.DataFrame
        The full or down-sampled table.
    """
    frame = pd.read_csv(csv_path)
    if isinstance(frac, float) and 0 < frac < 1:
        # Seeded draw: an unseeded sample would change the data (and every
        # downstream score) on each execution of the cell.
        frame = frame.sample(frac=frac, random_state=seed)
    return frame


# Without feature engineering
app_df = _load_sampled("../data/processed/application.csv", SAMPLE_FRAC)

X_app = app_df.drop(["TARGET"], axis=1)
y_app = app_df["TARGET"]
X_train, X_test, y_train, y_test = train_test_split(
    X_app, y_app, test_size=0.2, random_state=RANDOM_STATE
)

# With feature engineering
# NOTE(review): the same seed selects the same row positions in both files
# only if they have the same number of rows in the same order -- confirm.
feat_app_df = _load_sampled("../data/processed/application_features.csv", SAMPLE_FRAC)

X_feat = feat_app_df.drop(["TARGET"], axis=1)
y_feat = feat_app_df["TARGET"]
X_feat_train, X_feat_test, y_feat_train, y_feat_test = train_test_split(
    X_feat, y_feat, test_size=0.2, random_state=RANDOM_STATE
)

In [3]:

# Define a result table as a DataFrame
# NOTE(review): nothing in this cell writes to result_table; it is kept
# because cells outside this view may rely on it.
result_table = pd.DataFrame(columns=['classifier', 'fpr','tpr','roc_auc', 'precision', 'recall', 'average_precision'])


def _positive_class_scores(search, features):
    """Return continuous scores for the positive class, or ``None``.

    Tries ``predict_proba`` first (taking column 1) and falls back to
    ``decision_function`` so that winners without probability estimates
    still get ROC / precision-recall curves. Returns ``None`` when the
    fitted search exposes neither method.
    """
    for method_name in ("predict_proba", "decision_function"):
        try:
            scores = getattr(search, method_name)(features)
        except AttributeError:
            # The selected estimator does not offer this method.
            continue
        return scores[:, 1] if method_name == "predict_proba" else scores
    return None


def _report_grid_scores(search):
    """Print the best parameter set and the mean/spread of every candidate."""
    print("Best parameters set found on development set:")
    print(search.best_params_)
    print()

    print("Grid scores on development set:")
    cv = search.cv_results_
    for mean, std, params in zip(cv['mean_test_score'], cv['std_test_score'], cv['params']):
        # The spread is reported as two standard deviations.
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()


def _report_hard_predictions(search, features, y_true):
    """Print the confusion matrix and classification report on held-out data.

    A ``ValueError`` from the metric functions is reported and skipped
    instead of being silently swallowed; anything else propagates.
    """
    try:
        y_pred = search.predict(features)

        print("Confusion matrix:")
        print(confusion_matrix(y_true, y_pred))
        print()

        print("Detailed classification report:")
        print(classification_report(y_true, y_pred))
        print()
    except ValueError as err:
        print("Skipping confusion matrix / classification report: %s" % err)
        print()


def _plot_ranking_curves(y_true, scores):
    """Draw the ROC curve and the precision-recall curve for ``scores``."""
    fpr, tpr, _ = roc_curve(y_true, scores)
    roc_auc = roc_auc_score(y_true, scores)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % roc_auc)
    ax.plot([0, 1], [0, 1], "k--")  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver operating characteristic example")
    ax.legend(loc="lower right")
    plt.show()

    precision, recall, _ = precision_recall_curve(y_true, scores)
    average_precision = average_precision_score(y_true, scores)

    fig, ax = plt.subplots()
    ax.step(recall, precision, color="b", alpha=0.2, where="post")
    ax.fill_between(recall, precision, step="post", alpha=0.2, color="b")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_ylim([0.0, 1.05])
    ax.set_xlim([0.0, 1.0])
    ax.set_title(
        "2-class Precision-Recall curve: AP={0:0.2f}".format(average_precision)
    )
    plt.show()


for df in [app_df]:
    # Dummy-encode the features, then hold out 20% of the rows for evaluation.
    X = pd.get_dummies(df.drop(["TARGET"], axis=1))
    y = df["TARGET"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Every stage defaults to 'passthrough'; the grid below decides what is
    # actually plugged into each slot.
    pipe = Pipeline([
        ('encode', 'passthrough'),
        ('scale', 'passthrough'),
        ('select', 'passthrough'),
        ('reduce', 'passthrough'),
        ('impute', 'passthrough'),
        ('classify', DummyClassifier()),
    ])

    parameters = {
        # NOTE(review): X is already dummy-encoded by pd.get_dummies above, and
        # this step one-hot encodes every column of X again, numeric columns
        # included -- confirm that this is intended.
        'encode': [OneHotEncoder(handle_unknown='ignore')],
        'scale': ['passthrough'],
        'select': ['passthrough'],
        'reduce': ['passthrough'],
        'impute': ['passthrough'],
        # NOTE(review): LinearRegression and ElasticNet are regressors, not
        # classifiers; if one of them is selected, the hard-prediction report
        # and the curves below are skipped with a message -- confirm they
        # belong in this comparison.
        'classify': [
            DummyClassifier(),
            LinearRegression(), RidgeClassifier(), ElasticNet(), LogisticRegression(), SGDClassifier(),
            SVC(),
            KNeighborsClassifier(),
            BaggingClassifier(), RandomForestClassifier(),
            MLPClassifier(),
            XGBClassifier(),
        ],
    }

    grid = GridSearchCV(
        pipe,
        parameters,
        cv=2,
        n_jobs=4,
        verbose=9,
        scoring='roc_auc',
    ).fit(X_train, y_train)

    print('Training set score: ' + str(grid.score(X_train, y_train)))
    print('Test set score: ' + str(grid.score(X_test, y_test)))
    print()

    print()

    _report_grid_scores(grid)

    _report_hard_predictions(grid, X_test, y_test)

    test_scores = _positive_class_scores(grid, X_test)
    if test_scores is None:
        print("Skipping ROC / precision-recall curves: the selected estimator "
              "exposes neither predict_proba nor decision_function.")
    else:
        try:
            _plot_ranking_curves(y_test, test_scores)
        except ValueError as err:
            # Report curve failures instead of hiding them.
            print("Skipping ROC / precision-recall curves: %s" % err)









Fitting 2 folds for each of 12 candidates, totalling 24 fits


KeyboardInterrupt: 