**Credit card fraud detection**

This notebook handles the skewed (imbalanced) data by undersampling the majority class and then evaluates multiple classifiers on the result: decision tree, random forest, logistic regression, SVM (both RBF and linear kernels), k-Nearest Neighbors and Naive Bayes. We compare the models using cross-validation and tune the hyperparameters of the chosen model using grid search. The model's accuracy is validated with K-fold cross-validation and the confusion matrix is visualized.

The approach to handling the skewed data is taken from the link below:
https://www.kaggle.com/joparga3/in-depth-skewed-data-classif-93-recall-acc-now

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


# Show which files are available in the input directory. A pure-Python listing
# is used instead of shelling out to `ls`, which is not available on every
# platform. Hidden (dot) files are skipped, as `ls` does by default.
import os
print("\n".join(sorted(f for f in os.listdir("../input") if not f.startswith("."))))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the credit card transactions from the input directory into a DataFrame
# and preview its first rows.
csv_path = "../input/creditcard.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Checking the target classes: number of rows per value of 'Class', ordered by label.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
count_classes = dataset['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency");  # trailing ';' hides the Text(...) repr in the cell output

In [None]:
# Feature scaling is done on the values that have not been normalized yet.
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D input of shape (n_samples, n_features).
# pandas Series no longer has .reshape, so reshape the underlying ndarray;
# the scaled (n, 1) result is flattened back to 1-D before being stored as a column.
amount_2d = dataset['Amount'].values.reshape(-1, 1)
dataset['normAmount'] = StandardScaler().fit_transform(amount_2d).ravel()

# Dropping the column that has been replaced by its scaled copy ('Amount')
# and the unneeded 'Time' column.
dataset = dataset.drop(['Time','Amount'],axis=1)

In [None]:
# Assign x (features) and y (target) for the full dataset.
# Columns are selected by name rather than by position: 'normAmount' was
# appended after 'Class', so 'Class' is the second-to-last column and a
# positional slice such as iloc[:, :-1] would put the target into x while
# leaving 'normAmount' out.
x = np.array(dataset.loc[:, dataset.columns != 'Class'])
y = np.array(dataset['Class'])

In [None]:
# Number of data points in the minority (fraud, Class == 1) class
fraud_rows = dataset[dataset.Class == 1]
number_records_fraud = len(fraud_rows)
fraud_indices = np.array(fraud_rows.index)

# Picking the index labels of the normal (Class == 0) rows
normal_indices = dataset[dataset.Class == 0].index

# Out of the indices we picked, randomly select number_records_fraud of them
# without replacement. A dedicated seeded generator makes the undersample
# reproducible across runs without touching NumPy's global random state.
undersample_rng = np.random.RandomState(42)
random_normal_indices = undersample_rng.choice(normal_indices, number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)

# Appending the 2 sets of indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])

# Under-sampled dataset. The values come from .index, i.e. they are index
# *labels*, so label-based .loc is the correct accessor (positional .iloc only
# matches when the frame has a default 0..n-1 index).
under_sample_data = dataset.loc[under_sample_indices,:]

# .ix has been removed from pandas; boolean column masks work with .loc.
# y_undersample is selected with a column mask and is therefore 2-D, shape (n, 1);
# later cells flatten it with np.ravel where a 1-D target is needed.
x_undersample = np.array(under_sample_data.loc[:, under_sample_data.columns != 'Class'])
y_undersample = np.array(under_sample_data.loc[:, under_sample_data.columns == 'Class'])

In [None]:
# Splitting the sample data into train and test sets.
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split

# Undersampled dataset: 70% train / 30% test.
# A fixed random_state makes the split (and every score computed from it) reproducible.
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    x_undersample, y_undersample, test_size = 0.3, random_state = 0)

In [None]:
# Checking the target class balance in the undersampled training split.
# The target is flattened to 1-D and wrapped in a Series so that
# Series.value_counts() can be used instead of the deprecated pd.value_counts().
count_classes = pd.Series(np.ravel(y_train_undersample)).value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency");  # trailing ';' hides the Text(...) repr in the cell output

In [None]:
# Candidate models, each paired with the label used when reporting its scores.
labelled_models = [
    ("Logistic Regression", LogisticRegression()),
    ("Nearest Neighbors", KNeighborsClassifier()),
    ("Linear SVM", SVC(kernel="linear")),
    ("RBF SVM", SVC(kernel="rbf")),
    ("Decision Tree", DecisionTreeClassifier(criterion = 'entropy')),
    ("Random Forest", RandomForestClassifier(criterion = 'entropy')),
    ("Naive Bayes", GaussianNB()),
]

# Two parallel lists, in the same order, so they can be consumed with zip(names, classifiers).
names = [label for label, _ in labelled_models]
classifiers = [model for _, model in labelled_models]


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of every candidate on the undersampled training split.
# The target is flattened once because the estimators expect a 1-D label array.
train_labels = np.ravel(y_train_undersample)
results = {
    model_name: cross_val_score(model, x_train_undersample, train_labels, cv=5)
    for model_name, model in zip(names, classifiers)
}

# One line per model: mean fold score and twice the fold standard deviation, in percent.
for model_name, fold_scores in results.items():
    summary = (model_name, 100 * fold_scores.mean(), 100 * fold_scores.std() * 2)
    print("%20s | Accuracy: %0.2f%% (+/- %0.2f%%)" % summary)
    

In [None]:
# GridSearchCV lives in sklearn.model_selection; the former sklearn.grid_search
# module no longer exists in current scikit-learn.
from sklearn.model_selection import GridSearchCV

# A fixed random_state keeps the forests, and therefore the search result, reproducible.
clf = RandomForestClassifier(random_state=0)

# prepare a range of values to test
param_grid = [
  {'n_estimators': [10,30,50,80,100,200], 'criterion': ['gini','entropy']},
 ]

grid = GridSearchCV(estimator=clf, param_grid=param_grid)
grid.fit(x_train_undersample, np.ravel(y_train_undersample))
print(grid)
# Report what the search actually found, not just the search object's repr.
print("Best parameters:", grid.best_params_)
print("Best cross-validation score: %0.4f" % grid.best_score_)

# Use the tuned model for prediction instead of a fresh, untuned forest.
# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on the full training split using the best parameters.
clf = grid.best_estimator_
y_pred = clf.predict(x_test_undersample)


In [None]:
# Score the test-set predictions: confusion matrix, overall accuracy and the
# per-class text report.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

cm = confusion_matrix(y_true=y_test_undersample, y_pred=y_pred)
acc = accuracy_score(y_true=y_test_undersample, y_pred=y_pred)
clr = classification_report(y_true=y_test_undersample, y_pred=y_pred)

In [None]:
# Visualizing the confusion matrix, preceded by the accuracy and the classification report
import seaborn as sns
print(acc)
print(clr)
label = ["0","1"]
# fmt='d' prints the cell counts as plain integers; with seaborn's default
# format ('.2g') counts of 100 or more would be shown in scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d', xticklabels=label, yticklabels=label)
# sklearn's confusion_matrix has true labels on the rows and predictions on the columns.
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class");  # trailing ';' hides the Text(...) repr in the cell output

In [None]:
# k-fold cross-validation (k = 10) of the current classifier on the
# undersampled training split; the cell displays the mean score over the folds.
from sklearn.model_selection import cross_val_score

train_labels_1d = np.ravel(y_train_undersample)
accuracies = cross_val_score(clf, x_train_undersample, train_labels_1d, cv=10)
accuracies.mean()
