In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier


from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from yellowbrick.classifier import discrimination_threshold
from yellowbrick.classifier import confusion_matrix


In [2]:
data = pd.read_csv('../10-Data/credit-card-fraud-data.zip')

In [3]:
data = data.set_index('Time')

In [4]:
data['logAmount'] = np.log1p(data.Amount)
data = data.drop(columns={'Amount'})

In [5]:
Xtrn, Xtst, ytrn, ytst = train_test_split(data.loc[:,data.columns != 'Class'], 
                                          data.Class, 
                                          test_size=0.50, 
                                          random_state=42)

In [6]:
def performance_with_optimal_threshold(model, X = Xtrn, y = ytrn):
    
    from scipy.stats import hmean
    
    prob = model.predict_proba(X)[:,1]
    precision, recall, thresh = precision_recall_curve(y, prob)
    
    F = hmean((precision,recall))
    max_F = max(F)
    threshold = thresh[np.where(F == max_F)][0]
    
    return((threshold, max_F))
    

In [25]:
lerners = {'logistic': LogisticRegression(),
           'rf': RandomForestClassifier(random_state=0),
           'nb': GaussianNB(),
           'svm': SVC(),
           'mlp': MLPClassifier(random_state=0)#,
           #'adaboost': AdaBoostClassifier()
           #'lda': LinearDiscriminantAnalysis(), 
           #'knn': KNeighborsClassifier()
          }

results = {}

In [26]:
%%time
for i, (n, m) in enumerate(lerners.items()):
    
    try:
        print(n)
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('model', m)])
        pipe.fit(Xtrn, ytrn)
        threshold, F = performance_with_optimal_threshold(pipe['model'])
        results[n] = {'threshold': threshold,
                  'F': F}
        lerners[n] = pipe['model']
    except:
        pass

logistic
rf
nb
svm
mlp
Wall time: 3min 5s


In [28]:
sorted(results.items(), key = lambda item: -item[1]['F'])

[('mlp', {'threshold': 0.9669707956519142, 'F': 0.840958605664488}),
 ('rf', {'threshold': 0.5, 'F': 0.8312236286919832}),
 ('logistic', {'threshold': 0.25372937576272014, 'F': 0.775347912524851}),
 ('nb', {'threshold': 0.9999999999999787, 'F': 0.14473190851143608})]