#### Imports

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

import warnings
warnings.simplefilter('ignore')

#### Data

In [34]:
# Load the raw credit-card transactions from the project-relative data folder.
df = pd.read_csv('data/creditcard.csv')
# Lower-case every column label, then rename the target column *by name*.
# Renaming by name (instead of filtering out 'Class' and appending 'fraud')
# keeps labels attached to the right columns even if the target is not the
# last column in the file.
df = df.rename(columns=str.lower).rename(columns={'class': 'fraud'})

#### Exploratory Analysis
 - Start with Basics
 - Plot Numerical Distributions
 - Plot Categorical Distributions
 - Plot Segmentations
 - Study Correlations

In [None]:
# Histogram of feature v2.
# sns.distplot is deprecated and scheduled for removal; sns.histplot is the
# documented replacement for a plain histogram (no KDE overlay).
f, ax = plt.subplots(figsize=(12, 9))
sns.histplot(data=df, x="v2", color="b", kde=False, ax=ax)
# Label the figure so it can be read on its own when skimming the notebook.
ax.set(title="Distribution of v2", xlabel="v2", ylabel="count")
plt.show()

In [12]:
# Preview the first rows of the frame to sanity-check column labels and values.
df.head()

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### Data Cleaning

The data is already cleaned, so none of the usual steps below were needed:
 - Remove Unwanted observations
 - Fix Structural Errors
 - Filter Unwanted Outliers
 - Handle Missing Data

In [24]:
# Overwrite the column labels with the canonical lower-case names:
# 'time', the 28 features 'v1'..'v28', 'amount', and the target 'cls'.
# The labels are generated rather than typed out to avoid typos in the list.
df.columns = ['time', *(f'v{idx}' for idx in range(1, 29)), 'amount', 'cls']

In [22]:
# Separate the feature matrix from the target.
# The target column is labelled 'cls' by the relabelling cell above, so it must
# be referenced by that name here; using 'class' raises a KeyError on a fresh run.
# 'time' and 'amount' are excluded from the features along with the target.
X = df.drop(['time', 'amount', 'cls'], axis=1)
y = df['cls']


In [None]:
# Hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions equal in both partitions, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#### Algorithm Selection

In [None]:
# Setting up the parameter grids.
# Keys of the form '<step>__<param>' address a named step inside a Pipeline;
# the random forest is tuned directly (no pipeline), so its keys are the bare
# estimator parameter names.
param_grid_lr  = [{'clf_lr__penalty': ['l2'],
                   'clf_lr__C': np.logspace(-4, 4, 9)}]

param_grid_knn = [{'clf_knn__n_neighbors': list(range(2, 10)),
                   'clf_knn__p': [1, 2],
                   'clf_knn__leaf_size': np.arange(10, 51, 10)}]

param_grid_rf  = [{'n_estimators': [10, 50, 100, 250, 500, 1000],
                   'min_samples_leaf': [1, 3, 5],
                   'max_features': ['sqrt', 'log2']}]

# Two sub-grids for the SVM: the first leaves the kernel at its default and
# tunes C and gamma; the second tunes C for an explicitly linear kernel.
param_grid_svc = [{'clf_svc__C': np.logspace(-4, 4, 9),
                   'clf_svc__gamma': np.logspace(-4, 0, 4)},
                  {'clf_svc__kernel': ['linear'],
                   'clf_svc__C': np.logspace(-4, 4, 9)}]

# Base estimators (seeded where the estimator accepts a random_state).
clf_lrn = LogisticRegression(solver='lbfgs', random_state=0, max_iter=1000)
clf_rft = RandomForestClassifier(random_state=0)
clf_svc = SVC(random_state=0)
clf_knn = KNeighborsClassifier()

# Building the model pipelines incl. preprocessing where needed.
# Note that the random forest does not need feature scaling.
# The step names ('clf_lr', 'clf_svc', 'clf_knn') must match the prefixes used
# in the parameter grids above.
pipe_lrn = Pipeline([('std', StandardScaler()),
                     ('clf_lr', clf_lrn)])  # was `clf_lr`, an undefined name

pipe_svc = Pipeline([('std', StandardScaler()),
                     ('clf_svc', clf_svc)])

pipe_knn = Pipeline([('std', StandardScaler()),
                     ('clf_knn', clf_knn)])

# Inner loop of the nested cross-validation: one GridSearchCV per candidate
# model, collected in `gridcvs` for the outer cross_val_score loop.
# refit=True is required so each fitted search can score the outer test fold.
# NOTE(review): the inner-CV settings (2 stratified folds, accuracy scoring)
# are an assumed default -- adjust if a different protocol was intended.
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
gridcvs = {
    name: GridSearchCV(estimator=est,
                       param_grid=grid,
                       scoring='accuracy',
                       cv=inner_cv,
                       n_jobs=1,
                       refit=True)
    for name, est, grid in (('LogReg', pipe_lrn, param_grid_lr),
                            ('KNN', pipe_knn, param_grid_knn),
                            ('RandomForest', clf_rft, param_grid_rf),
                            ('SVM', pipe_svc, param_grid_svc))
}

In [None]:
# Outer loop of the nested cross-validation.
# Every entry of `gridcvs` (name -> estimator, defined outside this cell) is
# scored on the training data with a seeded, shuffled 5-fold stratified split;
# the per-fold scores are kept in `outer_scores` and summarised as mean +/- std.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
outer_scores = {}

# Visit the models in alphabetical order of their names.
for name in sorted(gridcvs):
    gs_est = gridcvs[name]
    nested_score = cross_val_score(gs_est, X=X_train, y=y_train, cv=outer_cv, n_jobs=1)
    outer_scores[name] = nested_score
    print(f'{name}: outer accuracy {100*nested_score.mean():.2f} +/- {100*nested_score.std():.2f}')

#### Model Training