In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score

from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from yellowbrick.classifier import discrimination_threshold
from yellowbrick.classifier import confusion_matrix


In [146]:
data = pd.read_csv('../10-Data/credit-card-fraud-data.zip')

In [147]:
data = data.set_index('Time')

In [148]:
data['logAmount'] = np.log1p(data.Amount)
data = data.drop(columns={'Amount'})

In [149]:
Xtrn, Xtst, ytrn, ytst = train_test_split(data.loc[:,data.columns != 'Class'], 
                                          data.Class, 
                                          test_size=0.50, 
                                          random_state=42)

In [150]:
def performance_with_optimal_threshold(model, X = Xtrn, y = ytrn):
    
    from scipy.stats import hmean
    
    prob = model.predict_proba(X)[:,1]
    precision, recall, thresh = precision_recall_curve(y, prob)
    
    F = hmean((precision,recall))
    max_F = max(F)
    threshold = thresh[np.where(F == max_F)][0]
    
    return((threshold, max_F))
    

In [151]:
learners = {'logistic': LogisticRegression(),
            'rf': RandomForestClassifier(random_state=0),
            'nb': GaussianNB(),
            'svm': SVC(),
            'mlp': MLPClassifier(random_state=0),
            'adaboost': AdaBoostClassifier(),
            #'lda': LinearDiscriminantAnalysis(), 
            #'knn': KNeighborsClassifier(),
            'gpc': GaussianProcessClassifier()
          }

results = {}

In [None]:
%%time
for i, (n, m) in enumerate(learners.items()):
    
    try:
        print(n)
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('model', m)])
        pipe.fit(Xtrn, ytrn)
        
        #Define optimal threshold on training data
        threshold, F = performance_with_optimal_threshold(pipe['model'])
        
        #Assess performance (F1 score) on test data
        yhat = (pipe['model'].predict_proba(Xtst)[:,1] > threshold)
        F = f1_score(ytst, yhat)
        
        results[n] = {'threshold': threshold,
                  'F': F}
        learners[n] = pipe['model']
    except:
        pass

In [None]:
sorted(results.items(), key = lambda item: -item[1]['F'])

Let's consider a voting model between our top performers

In [105]:
estimators = [('mlp', lerners['mlp']),
              ('rf', lerners['rf'])
             ]

voting = VotingClassifier(estimators, voting='soft')
voting.fit(Xtrn, ytrn)
threshold, F = performance_with_optimal_threshold(vote)
learners['voting'] = voting
results['voting'] = {'threshold': threshold,
                    'F': F}
print(f'Optimal threshold {threshold:.4f}')
print(f'Performance (F Statistic): {F:.4f}')

Optimal threshold 0.2826
Performance (F Statistic): 0.9798


In [131]:
#Assess performance on test data

In [130]:
yhat = voting.predict(Xtst)

In [134]:
print(classification_report(ytst, yhat))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    142158
           1       0.94      0.78      0.85       246

    accuracy                           1.00    142404
   macro avg       0.97      0.89      0.93    142404
weighted avg       1.00      1.00      1.00    142404

