In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the credit card transactions into a DataFrame
csv_path = '../input/creditcard.csv'
df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first few records
print(df.shape)
df.head()

In [None]:
# Show how many rows fall into each class (the cells below treat 1 as fraud, 0 as not fraud)
print(df.Class.value_counts())

# Pass the column by keyword: seaborn deprecated positional data vectors in
# 0.11, and from 0.12 on the only positional argument is `data`, so the
# original positional call no longer plots the class counts.
sns.countplot(x='Class', data=df)

From the plot above, we can see we have a very imbalanced class.  

By working through the tutorial found here: https://elitedatascience.com/imbalanced-classes we can explore effective ways to deal with class imbalance.

In [None]:
# Split the frame into the feature matrix (everything except 'Class')
# and the target vector (the 'Class' column)
X = df.drop(columns='Class')
y = df['Class']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=27
)

In [None]:
# Baseline: model the data as is, without any class rebalancing
# Fit a logistic regression on the training split
# (fit() returns the estimator itself, so take_0 is the fitted model)
take_0 = LogisticRegression()
take_0.fit(X_train, y_train)

# Predict class labels for the held-out test set
take_0_y_ = take_0.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=take_0_y_)

In [None]:
# Count how often each class label was predicted by the baseline model
predictions = pd.DataFrame(take_0_y_)
predictions.iloc[:, 0].value_counts()

We have a very high accuracy score of 0.999, but that is only because the model predicts the majority (non-fraud) class for almost every transaction.  We can attempt to deal with this in several different ways.

## 1. Up-sample Minority Class

In [None]:
from sklearn.utils import resample

In [None]:
# separate minority and majority classes
not_fraud = df[df.Class==0]
fraud = df[df.Class==1]

# upsample minority
# n_samples is derived from the majority class instead of being hard-coded,
# so the classes stay balanced even if the input data changes.
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
upsampled.Class.value_counts()

In [None]:
# trying logistic regression again, this time trained on an upsampled training set

# Split the ORIGINAL data first. Upsampling with replacement before the split
# puts copies of the same fraud rows in both the training and the test set,
# which leaks test information into training and inflates the score.
y = df.Class
X = df.drop('Class', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# Recombine training features and labels so whole rows can be resampled together
train = pd.concat([X_train, y_train], axis=1)
train_not_fraud = train[train.Class == 0]
train_fraud = train[train.Class == 1]

# Upsample the minority class within the training data only
train_fraud_upsampled = resample(train_fraud,
                                 replace=True, # sample with replacement
                                 n_samples=len(train_not_fraud), # match majority count
                                 random_state=27) # reproducible results

upsampled_train = pd.concat([train_not_fraud, train_fraud_upsampled])
y_train = upsampled_train.Class
X_train = upsampled_train.drop('Class', axis=1)

# Fit on the balanced training data
take_1 = LogisticRegression().fit(X_train, y_train)

# Predict on the untouched test set, which keeps the original class distribution
take_1_y_ = take_1.predict(X_test)

In [None]:
# Fraction of test rows the upsampled model labels correctly
accuracy_score(y_true=y_test, y_pred=take_1_y_)

In [None]:
# Checking unique values
# Count from a fresh Series built from this model's predictions. Assigning
# them as a new column of the earlier `predictions` frame aligns on that
# frame's index, so any predictions beyond its length are silently dropped.
pd.Series(take_1_y_, name='Prediction').value_counts()

Our accuracy score decreased after upsampling, but the model is now predicting both classes more equally, making it a better model.

## 2. Down-sample Majority Class

In [None]:
# still using our separated classes fraud and not_fraud from above

# downsample majority
# n_samples is derived from the minority class instead of being hard-coded,
# so the classes stay balanced even if the input data changes.
not_fraud_downsampled = resample(not_fraud,
                                replace = False, # sample without replacement
                                n_samples = len(fraud), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, fraud])

# checking counts
downsampled.Class.value_counts()

In [None]:
# trying logistic regression again, this time trained on a downsampled training set

# Split the ORIGINAL data first. Downsampling before the split would score the
# model on an artificially balanced test set, so its accuracy would not be
# comparable with the models evaluated on the real class distribution.
y = df.Class
X = df.drop('Class', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# Recombine training features and labels so whole rows can be resampled together
train = pd.concat([X_train, y_train], axis=1)
train_not_fraud = train[train.Class == 0]
train_fraud = train[train.Class == 1]

# Downsample the majority class within the training data only
train_not_fraud_downsampled = resample(train_not_fraud,
                                       replace=False, # sample without replacement
                                       n_samples=len(train_fraud), # match minority count
                                       random_state=27) # reproducible results

downsampled_train = pd.concat([train_not_fraud_downsampled, train_fraud])
y_train = downsampled_train.Class
X_train = downsampled_train.drop('Class', axis=1)

# Fit on the balanced training data
take_2 = LogisticRegression().fit(X_train, y_train)

# Predict on the untouched test set, which keeps the original class distribution
take_2_y_ = take_2.predict(X_test)

In [None]:
# Fraction of test rows the downsampled model labels correctly
accuracy_score(y_true=y_test, y_pred=take_2_y_)

In [None]:
# Checking unique values
# Count from a fresh Series built from this model's predictions. Writing them
# into the earlier `predictions` frame aligns on that frame's index: a shorter
# result is padded with NaN (turning the labels into floats) and a longer one
# is truncated.
pd.Series(take_2_y_, name='Prediction').value_counts()

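Downsampling produced a higher accuracy than upsampling!  My concern here is the small number of total samples we had to train the model on, so I'm not sure this method is truly better than upsampling.  Note also that the two accuracy scores are only directly comparable if both models are evaluated on the same test set.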

## 3.  Change the performance metric

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Score the downsampled model with ROC AUC, which needs a continuous score
# rather than hard class labels.
# predict_proba returns one column per class in the order of take_2.classes_;
# with labels 0 and 1, column 1 is the probability of class 1 (fraud).
# A separate name is used so take_2_y_ keeps holding the predicted labels.
take_2_proba = take_2.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, take_2_proba)

## 4. Tree Based Algorithms

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Go back to the original, un-resampled data:
# features are every column except 'Class'; the target is 'Class'
X = df.drop(columns='Class')
y = df['Class']

# Same 75/25 split and seed as used for the baseline model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=27
)

In [None]:
# train model
# A random forest is stochastic (bootstrap samples and random feature subsets),
# so fix the seed to make the reported accuracy reproducible; 27 matches the
# seed used for the splits and resampling elsewhere in this notebook.
take_4 = RandomForestClassifier(random_state=27).fit(X_train, y_train)

# predict on test set
take_4_y_ = take_4.predict(X_test)

# fraction of test rows labelled correctly
accuracy_score(y_test, take_4_y_)

In [None]:
# Checking unique values
# Count from a fresh Series built from this model's predictions, so this cell
# does not depend on (or mutate) the `predictions` frame created for the first
# model and is not affected by index alignment against it.
pd.Series(take_4_y_, name='Prediction').value_counts()

The Random Forest has an accuracy score of 0.9995 - which is higher than our first model!  This seems to be the best option for this dataset!