In [30]:
import pandas as pd
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn import metrics
from sklearn.metrics import make_scorer
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler



In [32]:
def custom_metric(estimator, X, y_true):
    y_prob = estimator.predict_proba(X)
    fpr, tpr, _ = metrics.roc_curve(y_true.astype('int64'),y_prob[:,1])
    tnr = 1-fpr
    i = len(tpr) - 1

    while tpr[i]==1:
        i-=1
        return tnr[i]
    
    

In [10]:
df = pandas.read_csv('credit_default_clients.csv', sep=';')

In [15]:
df = df.rename(columns={'default payment next month': "default"})
y = df["default"]
var = [col for col in list(df.columns) if col != "default"]
X = df[var]

X_train, X_test, y_train, y_test = train_test_split(X.astype(np.float64),
    y.astype(np.float64), train_size=0.75, test_size=0.25, random_state=42)

In [21]:
pipeline_optimizer = TPOTClassifier(generations=50, population_size = 50, scoring = "average_precision",
                                   n_jobs = 3, max_time_mins= 10, use_dask = True)

In [22]:
pipeline_optimizer.fit(X_train, y_train)

TPOTClassifier(generations=50,
               log_file=<ipykernel.iostream.OutStream object at 0x10ede6cd0>,
               max_time_mins=10, n_jobs=3, population_size=50,
               scoring='average_precision', use_dask=True)

In [35]:
pipeline_optimizer.export('tpot_exported.py')

In [33]:
pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier())
            ])
param_grid = [{'classifier' : [MLPClassifier()],
               'classifier__hidden_layer_sizes': [(12, 6),(12, 6, 3)],
              'classifier__solver': ['adam'],
              'classifier__alpha': [0.1, 0.5],
               'classifier__activation' : ['logistic'],
              'classifier__solver': ['lbfgs'],
              },
             {'classifier' : [RandomForestClassifier(n_jobs = -1)],
              'classifier__n_estimators': [10,20],
              'classifier__max_depth' : [5, 10]}]

gs = GridSearchCV(pipeline,param_grid=param_grid,scoring=custom_metric, n_jobs=-1)
gs.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('classifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'classifier': [MLPClassifier()],
                          'classifier__activation': ['logistic'],
                          'classifier__alpha': [0.1, 0.5],
                          'classifier__hidden_layer_sizes': [(12, 6),
                                                             (12, 6, 3)],
                          'classifier__solver': ['lbfgs']},
                         {'classifier': [RandomForestClassifier(max_depth=5,
                                                                n_estimators=10,
                                                                n_jobs=-1)],
                          'classifier__max_depth': [5, 10],
                          'classifier__n_estimators': [10, 20]}],
             scoring=<function cust

In [36]:
gs.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=5, n_estimators=10,
                                        n_jobs=-1))])