In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Load the credit-card transactions and discard rows with missing values.
# The original `W_Data.dropna(thresh=284315)` discarded its return value, so it
# never cleaned anything; had the result been kept, requiring 284315 non-null
# values per row would have removed every row. A plain dropna() is the apparent
# intent (TODO confirm with the author).
W_Data = pd.read_csv("../input/creditcard.csv").dropna()
# `Data` is an alias of the same frame, not a copy.
Data = W_Data

<h1>Exploring the Dataset</h1>

In [None]:
# Preview five randomly chosen rows. Sampling five rows directly avoids
# shuffling 10% of the frame just to keep five of them, and the fixed seed makes
# the preview identical on every Restart & Run All.
Data.sample(n=min(5, len(Data)), random_state=0)

From the above it can be inferred that the dataset has 28 anonymized features plus the non-anonymized columns Time, Amount and Class (whether the transaction was a fraud or not).

In [None]:
# Per-column summary statistics of the loaded dataset.
Data.describe()

In [None]:
# Partition the transactions by label: Class == 1 is fraud, Class == 0 is not.
Positives = W_Data.loc[W_Data['Class'].eq(1)]
Negatives = W_Data.loc[W_Data['Class'].eq(0)]

In [None]:
# Share of fraudulent transactions in the whole dataset, printed as a percentage.
fraud_ratio = len(Positives) / len(W_Data)
print(fraud_ratio * 100, "%")

It can be inferred that the dataset is skewed, with just 0.17274% fraudulent examples. One could simply get an overall accuracy of 99.82726% by predicting that every example is non-fraudulent, but that approach does not solve the problem. So the recall rate (True Positives / (True Positives + False Negatives)) is the metric to optimize.

<h2>Exploring Transaction Amount</h2>

**Fraudulent Data**

In [None]:
# Density estimate of the transaction amount over the fraudulent rows (red).
sns.kdeplot(
    Positives.loc[:, 'Amount'],
    color="red",
    shade=True,
)

**Non-Fraudulent Data**

In [None]:
# Density estimate of the transaction amount over the non-fraudulent rows (green).
sns.kdeplot(
    Negatives.loc[:, 'Amount'],
    color="green",
    shade=True,
)

<h2>Exploring Transaction Time</h2>

**Non-Fraudulent Data**

In [None]:
# Density estimate of the transaction time over the non-fraudulent rows.
# Drawn in green so the colour coding matches the amount plots, where
# non-fraudulent data is green and fraudulent data is red (this plot used red).
sns.kdeplot(Negatives['Time'],shade=True,color="green")

**Fraudulent Data**

In [None]:
# Density estimate of the transaction time over the fraudulent rows.
# Drawn in red so the colour coding matches the amount plots, where fraudulent
# data is red and non-fraudulent data is green (this plot used green).
sns.kdeplot(Positives['Time'],shade=True,color="red")

<h1>Supervised Learning Algorithms</h1> 

To compare the algorithms quickly, let's first evaluate them on part of the data, since running them on all 284315 samples would be cumbersome. Let's take 50,000 examples for this comparison. We will use the entire dataset to evaluate the final accuracy.

In [None]:
# Partition `Data` by label: Class == 0 is legitimate, Class == 1 is fraud.
Negatives = Data.loc[Data['Class'].eq(0)]
Positives = Data.loc[Data['Class'].eq(1)]

In [None]:
# Evaluation subset: the first 50,000 transactions.
# The original slice `Data[1:50000]` silently skipped row 0 and therefore kept
# only 49,999 rows. The feature frame is now built with a non-mutating drop,
# which avoids calling an in-place drop on a slice of `Data`
# (pandas SettingWithCopyWarning).
Train_Data = Data.iloc[:50000]
Target = Train_Data['Class']
Train_Data = Train_Data.drop('Class', axis=1)

In [None]:
# Hold out half of the subset for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    Train_Data,
    Target,
    test_size=0.5,
    random_state=0,
)

<h3>Support Vector Machine</h3>

In [None]:
# Linear-kernel support vector classifier, trained on the training half and
# scored with a per-class report on the held-out half.
clf_l = svm.SVC(kernel='linear').fit(x_train, y_train)
svm_test_predictions = clf_l.predict(x_test)
print(classification_report(y_test, svm_test_predictions))

<h3>Random Forest Classifier</h3>

In [None]:
# Shallow random forest (depth 2, fixed seed), trained on the training half and
# scored with a per-class report on the held-out half.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(x_train, y_train)
forest_test_predictions = clf.predict(x_test)
print(classification_report(y_test, forest_test_predictions))


From the above example it can be noted that the Random Forest Classifier does the best among the supervised learning classifiers. Let's try using the algorithm on the entire dataset.

In [None]:
# Reload the full dataset, separate the label from the features and split 50/50.
# The original `E_Data.dropna(thresh=284315)` discarded its result (a no-op, and
# a threshold that would have removed every row had it been kept); a plain
# dropna() is the apparent intent (TODO confirm).
E_Data = pd.read_csv("../input/creditcard.csv").dropna()
E_Target = E_Data['Class']
# Non-mutating drop: previously E_Train_Data aliased E_Data and the in-place
# drop removed `Class` from both names at once.
E_Train_Data = E_Data.drop('Class', axis=1)
x_train_E, x_test_E, y_train_E, y_test_E = train_test_split(
    E_Train_Data, E_Target, test_size=0.5, random_state=0
)

In [None]:
# Random forest trained and evaluated on the full-dataset split.
# The original fitted on `x_train, y_train` (the subset split from the earlier
# section) and then scored on `x_test_E`, so the model never saw the
# full-dataset training half and was evaluated on rows that overlap its
# training data. It is now fitted on the matching `x_train_E, y_train_E`.
clf_E = RandomForestClassifier(max_depth=2, random_state=0)
clf_E.fit(x_train_E, y_train_E)
print(classification_report(y_test_E, clf_E.predict(x_test_E)))

<h1>Anomaly Detection Algorithms </h1>

<h2>Anomaly Detection </h2>

In [None]:
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

<h3>One Class SVM</h3>

In [None]:
# Reload the dataset and keep the first 50,000 transactions for the
# anomaly-detection experiments.
# Fixes: the discarded `dropna(thresh=284315)` call was a no-op (plain dropna()
# is the apparent intent - TODO confirm), and the slice `[1:50000]` skipped
# row 0, keeping only 49,999 rows.
W_Data = pd.read_csv("../input/creditcard.csv").dropna()
Data = W_Data.iloc[:50000]

In [None]:
# Partition the subset by label: Class == 0 is legitimate, Class == 1 is fraud.
Negatives = Data.loc[Data['Class'].eq(0)]
Positives = Data.loc[Data['Class'].eq(1)]

In [None]:
#RBF Kernel
clf_AD = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf_AD.fit(Negatives)

In [None]:
#Linear Kernel
clf_AD_L = svm.OneClassSVM(nu=0.1, kernel="linear", gamma=0.1)
clf_AD_L.fit(Negatives)

In [None]:
IFA=IsolationForest()
IFA.fit(Negatives)

In [None]:
train_AD_L=clf_AD_L.predict(Negatives)
test_AD_L=clf_AD_L.predict(Positives)

In [None]:
train_IFA=IFA.predict(Negatives)
test_IFA=IFA.predict(Positives)

In [None]:
train_AD=clf_AD.predict(Negatives)
test_AD=clf_AD.predict(Positives)

In [None]:
def Train_Accuracy(Mat):
    """Return the percentage of entries of ``Mat`` that are equal to ``1``.

    Args:
        Mat: A sized iterable of predictions (for example a list or a
            one-dimensional array).

    Returns:
        ``100 * (number of entries equal to 1) / len(Mat)`` as a float.

    Raises:
        ZeroDivisionError: If ``Mat`` is empty.
    """
    inlier_hits = sum(1.0 for label in Mat if label == 1)
    return inlier_hits / len(Mat) * 100

def Test_Accuracy(Mat):
    """Return the percentage of entries of ``Mat`` that are equal to ``-1``.

    Args:
        Mat: A sized iterable of predictions (for example a list or a
            one-dimensional array).

    Returns:
        ``100 * (number of entries equal to -1) / len(Mat)`` as a float.

    Raises:
        ZeroDivisionError: If ``Mat`` is empty.
    """
    outlier_hits = sum(1.0 for label in Mat if label == -1)
    return outlier_hits / len(Mat) * 100


In [None]:
# RBF one-class SVM: share of legitimate rows kept as inliers, then share of
# fraudulent rows flagged as outliers.
rbf_train_pct = Train_Accuracy(train_AD)
rbf_test_pct = Test_Accuracy(test_AD)
print("Training: One Class SVM (RBF) : ", rbf_train_pct, "%")
print("Test: One Class SVM (RBF) : ", rbf_test_pct, "%")

In [None]:
# Isolation forest: share of legitimate rows kept as inliers, then share of
# fraudulent rows flagged as outliers.
iforest_train_pct = Train_Accuracy(train_IFA)
iforest_test_pct = Test_Accuracy(test_IFA)
print("Training: Isolation Forest: ", iforest_train_pct, "%")
print("Test: Isolation Forest: ", iforest_test_pct, "%")

In [None]:
# Linear one-class SVM: share of legitimate rows kept as inliers, then share of
# fraudulent rows flagged as outliers.
linear_train_pct = Train_Accuracy(train_AD_L)
linear_test_pct = Test_Accuracy(test_AD_L)
print("Training: One Class SVM (Linear) : ", linear_train_pct, "%")
print("Test: One Class SVM (Linear) : ", linear_test_pct, "%")

Let's test Isolation Forest over the entire dataset (all 284315 examples).

In [None]:
# Reload the full dataset and partition it by label for the full-size
# isolation-forest run.
# The original `W_Data.dropna(thresh=284315)` discarded its result (a no-op, and
# a threshold that would have removed every row had it been kept); a plain
# dropna() is the apparent intent (TODO confirm).
W_Data = pd.read_csv("../input/creditcard.csv").dropna()
# `Data` is an alias of the same frame, not a copy.
Data = W_Data
Positives_E = W_Data[W_Data['Class'] == 1]
Negatives_E = W_Data[W_Data['Class'] == 0]

In [None]:
# Isolation forest fitted on every legitimate transaction and scored on the
# full dataset.
# Fixes: (1) the original predicted on `Negatives`/`Positives`, the subset
#            frames from the earlier section, instead of the full-dataset
#            frames `Negatives_E`/`Positives_E`;
#        (2) the `Class` label is removed from the features so that it cannot
#            leak into the detector;
#        (3) a fixed seed makes the scores reproducible between runs.
Negatives_E_X = Negatives_E.drop('Class', axis=1)
Positives_E_X = Positives_E.drop('Class', axis=1)
IFA = IsolationForest(random_state=0)
IFA.fit(Negatives_E_X)
train_IFA = IFA.predict(Negatives_E_X)
test_IFA = IFA.predict(Positives_E_X)

In [None]:
# Isolation forest: share of legitimate rows kept as inliers, then share of
# fraudulent rows flagged as outliers.
iforest_train_pct = Train_Accuracy(train_IFA)
iforest_test_pct = Test_Accuracy(test_IFA)
print("Training: Isolation Forest: ", iforest_train_pct, "%")
print("Test: Isolation Forest: ", iforest_test_pct, "%")

From the above analysis it can be noted that Isolation Forest does the best among the anomaly detection algorithms.

In [None]:
# Pairwise correlation between all columns of `Data`, drawn as an annotated
# heatmap on a large canvas so the cell labels stay readable.
plt.figure(figsize=(20, 18))
Corr = Data.corr()
sns.heatmap(Corr, annot=True)

The above matrix shows the pairwise correlations between the variables. Eliminating weakly correlated variables did not help much in improving the algorithm.

<h1>Evaluating Models on OverSampled Data </h1>

In [None]:
from imblearn.over_sampling import SMOTE 

# Reload the dataset and oversample the minority class with SMOTE.
# Fixes: the discarded `dropna(thresh=284315)` call was a no-op (plain dropna()
# is the apparent intent - TODO confirm), and the feature matrix handed to SMOTE
# used to include the `Class` column itself, so the label leaked into every
# resampled feature row.
W_Data = pd.read_csv("../input/creditcard.csv").dropna()

sm = SMOTE(random_state=42)
# NOTE(review): newer imbalanced-learn releases expose this method as
# `fit_resample` - confirm against the installed version.
X_res, y_res = sm.fit_sample(W_Data.drop('Class', axis=1), W_Data['Class'])

In [None]:
# Distribute the resampled rows into two lists by label: rows labelled 0 go to
# S_Negatives, every other row goes to S_Positives.
S_Positives = []
S_Negatives = []

for row_idx in range(len(X_res)):
    bucket = S_Negatives if y_res[row_idx] == 0 else S_Positives
    bucket.append(X_res[row_idx])

In [None]:
# Isolation forest fitted on the resampled legitimate rows, then scored on
# those rows and on the resampled fraudulent rows.
# The estimator is now seeded: without `random_state` the reported percentages
# changed from run to run.
IFA = IsolationForest(random_state=0)
IFA.fit(S_Negatives)
S_train_IFA = IFA.predict(S_Negatives)
S_test_IFA = IFA.predict(S_Positives)

In [None]:
# Isolation forest on oversampled data: share of legitimate rows kept as
# inliers, then share of fraudulent rows flagged as outliers.
resampled_train_pct = Train_Accuracy(S_train_IFA)
resampled_test_pct = Test_Accuracy(S_test_IFA)
print("Training: Isolation Forest: ", resampled_train_pct, "%")
print("Test: Isolation Forest: ", resampled_test_pct, "%")

In [None]:
# Reload the dataset, split it, and oversample ONLY the training half.
# Fixes: (1) the original ran SMOTE on the whole dataset before splitting, so
#            the resampled training set contained (and interpolated from) the
#            very rows later used for evaluation in `x_test_O`;
#        (2) the discarded `dropna(thresh=284315)` call was a no-op; a plain
#            dropna() is the apparent intent (TODO confirm).
E_Data = pd.read_csv("../input/creditcard.csv").dropna()
Outcome = E_Data['Class']
E_Data = E_Data.drop('Class', axis=1)

# Split the original data first so the held-out half stays untouched.
x_train_O, x_test_O, y_train_O, y_test_O = train_test_split(
    E_Data, Outcome, test_size=0.5, random_state=0
)

# Oversample the minority class within the training half only (`sm` is the
# SMOTE instance created in the earlier oversampling cell).
X_res, y_res = sm.fit_sample(x_train_O, y_train_O)
x_train_E, y_train_E = X_res, y_res
# The "_E" test names are kept for compatibility; they now refer to the
# untouched original hold-out rather than to resampled rows.
x_test_E, y_test_E = x_test_O, y_test_O

In [None]:
# Shallow random forest (depth 2, fixed seed), trained on the oversampled
# training data and scored with a per-class report on the original hold-out.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(x_train_E, y_train_E)
holdout_predictions = clf.predict(x_test_O)
print(classification_report(y_test_O, holdout_predictions))