# Imports

In [5]:
import joblib
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (balanced_accuracy_score, classification_report,
                             confusion_matrix, f1_score, roc_auc_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Use matplotlib's 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')

# Load data

In [2]:

# Read the transactions table from a path relative to the notebook's working
# directory; the trailing expression displays its (rows, columns) shape.
df = pd.read_csv("data/creditcard.csv")
df.shape

(284807, 31)

# Split data

In [3]:
# Fixed seed so the train/test partition is identical on every
# Restart Kernel -> Run All.
RANDOM_STATE = 42

# Features are every column except the label. Selecting by name (rather than
# "all but the last column") stays correct if the CSV column order changes and
# can never leak the label into the feature matrix.
data = df.drop(columns="Class")
target = df["Class"]

# Hold out 30% of the rows for testing; stratify on the label so both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    stratify=target,
    random_state=RANDOM_STATE,
)

# Work on independent copies to avoid a warning on later modification
# (presumably pandas' SettingWithCopyWarning -- confirm).
X_train = X_train.copy()
X_test = X_test.copy()

print(f"Training set {X_train.shape}")
print(f"Testing set {X_test.shape}")

Training set (199364, 30)
Testing set (85443, 30)


## Optional: balance the training set by undersampling the majority class

In [74]:
# Build a balanced training subset by random undersampling: keep every
# positive (Class == 1) row and draw an equally sized random sample of
# negative (Class == 0) rows.
a = X_train[y_train == 1]
len_fraud = len(a)
# Seeded so the same negative rows are drawn on every run.
b = X_train[y_train == 0].sample(n=len_fraud, random_state=42)

# Labels follow the concat order below (positives first, then negatives).
# They are sized from len_fraud instead of a hard-coded count so X and y
# cannot fall out of sync when the data or the split ratio changes.
y = [1] * len_fraud + [0] * len_fraud
X = pd.concat([a, b])

# Train

In [88]:
# Baseline model: standardize the features, then fit one random forest
# (101 trees, depth capped at 10) on the full training split.
pipeRF = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(n_estimators=101, max_depth=10)),
    ]
)
pipeRF.fit(X_train, y_train)
# Alternative: train on the undersampled subset instead.
# pipeRF.fit(X, y)



Pipeline(steps=[('scaler', StandardScaler()),
                ('rf', RandomForestClassifier(max_depth=10, n_estimators=101))])

## Hyperparameter tuning and training

In [102]:
# Seed for the estimators so repeated runs of the search give the same result.
MODEL_SEED = 42


def make_grid_search(pipeline, param_grid):
    """Wrap ``pipeline`` in the grid-search configuration shared by all candidates.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        The estimator to tune.
    param_grid : dict
        Maps ``<step>__<parameter>`` names to the list of values to try.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        An unfitted 10-fold search that also records train scores.
    """
    return GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        # Binary F1 on the positive class, matching the
                        # f1_score() call used on the test set below.
                        # 'f1_micro' on a binary problem is just accuracy,
                        # which is uninformative for a rare positive class.
                        scoring='f1',
                        return_train_score=True,
                        verbose=10,
                        cv=10,
                        n_jobs=4)


# Candidate pipelines: a forest on raw features, a forest on standardized
# features, and an MLP on standardized features.
pipeRF1 = Pipeline([('rf', RandomForestClassifier(random_state=MODEL_SEED))])
pipeRF2 = Pipeline([('scaler', StandardScaler()),
                    ('rf', RandomForestClassifier(random_state=MODEL_SEED))])
pipeMLP = Pipeline([('scaler', StandardScaler()),
                    ('mlp', MLPClassifier(random_state=MODEL_SEED))])

# Search space shared by both random-forest pipelines.
grid_params_rf = {
    'rf__max_depth': [5, 10, 15],
    'rf__n_estimators': [51, 101],
}

# Search space for the MLP (each value is the width of a single hidden layer).
grid_params_mlp = {
    'mlp__hidden_layer_sizes': [10, 100, 200],
}

# Construct grid searches
gs_rf1 = make_grid_search(pipeRF1, grid_params_rf)
gs_rf2 = make_grid_search(pipeRF2, grid_params_rf)
gs_mlp = make_grid_search(pipeMLP, grid_params_mlp)

print('Performing model optimizations...')
best_f1 = 0.0
# None (rather than 0 / '') makes "no winner yet" distinguishable from a real
# selection, so a placeholder is never pickled or reported as the best model.
best_clf = None
best_gs = None

# Display names, keyed by position in `grids`.
grid_dict = {
    0: 'RandomForest_alone',
    1: 'RandomForest',
    2: 'MLPClassifier',
}

grids = [gs_rf1, gs_rf2, gs_mlp]
cv_results_pipelines = []
grid_set = []
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Predict on test data with the refit best estimator
    y_pred = gs.predict(X_test)
    grid_set.append(gs)
    cv_results_pipelines.append(gs.cv_results_)
    # Score once and reuse the value for reporting and comparison.
    test_f1 = f1_score(y_test, y_pred)
    print('Test set f1 score for best params: %.3f ' % test_f1)
    # Track the search with the highest test-set F1.
    # NOTE(review): choosing the winner on the test set makes the reported
    # test score optimistic; compare gs.best_score_ (cross-validated) instead
    # if an unbiased held-out estimate is needed.
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_gs = gs
        best_clf = idx

# Save best grid search pipeline to file
dump_file = 'best_gs_pipeline.pkl'
if best_gs is None:
    print('\nNo classifier achieved a test set f1 above 0; nothing was saved.')
else:
    print('\nClassifier with best test set f1: %s' % grid_dict[best_clf])
    joblib.dump(best_gs, dump_file, compress=1)
    print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

Performing model optimizations...

Estimator: RandomForest_std
Fitting 10 folds for each of 2 candidates, totalling 20 fits
Best params: {'rf__max_depth': 15, 'rf__n_estimators': 101}
Test set f1 score for best params: 0.856 

Classifier with best test set f1: RandomForest_std

Saved RandomForest_std grid search pipeline to file: best_gs_pipeline.pkl


In [44]:
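# Optional: export the held-out split to CSV (disabled; uncomment to run).
# A copy is taken so that X_test itself does not gain a 'target' column.
# test_data = X_test.copy()
# test_data['target'] = y_test
# test_data.to_csv("test.csv", index=False)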