In [None]:
import warnings

import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

warnings.filterwarnings('ignore')

In [None]:
# Read the credit-card transactions CSV into `df` and report its (rows, columns) size.
df = pd.read_csv(filepath_or_buffer="data/creditcardfraud/creditcard.csv")
print(df.shape)

In [None]:
# Preview the first ten rows to eyeball the columns and their values.
df.head(n=10)

In [None]:
# Per-column count of missing values (isnull is the pandas alias of isna).
print(df.isnull().sum())

In [None]:
# One 50-bin histogram per numeric column, drawn on a single large figure.
df.hist(figsize=(20, 20), bins=50)
plt.show()

In [None]:
# Map the numeric labels to readable names so both the chart and the printed
# table are self-explanatory.
class_names = {0: 'Not Fraud', 1: 'Fraud'}

# Count each class once and reuse the result for the plot and the printout
# (the original recomputed value_counts() three times, one of them a no-op).
rvs = df['Class'].value_counts().rename(index=class_names)

# Horizontal bar chart of the class counts, labelled with the readable names.
rvs.plot.barh()
plt.title("Number of fraudulent and legal transactions")
plt.xlabel("Number of transactions")

print(rvs)

In [None]:
# The dataset has a huge difference in the number of records for each of its target classes,
# so it is an imbalanced dataset.
# We have far fewer records for class 1 than for class 0,
# so we would not be able to classify class 1 properly.

# To overcome this problem, we use two sampling techniques.
#     UnderSampling
#     OverSampling

# Undersampling techniques remove examples from the training dataset that belong to the majority class.
# Oversampling is the direct opposite of undersampling: it adds new examples, or copies and adds existing examples.

# Here, I use oversampling, since undersampling may lead to the loss of some information.
# SMOTE is an oversampling technique. First it selects a minority class instance a at random
# and finds its k nearest minority class neighbors. The synthetic instance is then created by
# choosing one of the k nearest neighbors b at random and connecting a and b to form a line segment
# in the feature space. The synthetic instances are generated as a convex combination of
# the two chosen instances a and b.


In [None]:
# Separate the predictors (every column except "Class") from the target column.
y = df["Class"]
x = df.drop(columns="Class")
print(x.shape, y.shape)

In [None]:
# Class distribution of the target before any resampling (i.e. before SMOTE).
y.value_counts().plot(kind="bar")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for evaluation, with a fixed seed for reproducibility.
# stratify=y keeps the class proportions of `y` the same in the train and test
# splits; without it the share of the rare class in the test set is left to
# chance, which makes recall/F1 on that class noisy.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.25, random_state=42, stratify=y
)

In [None]:
# Oversample with SMOTE on the training split only; the test split is left
# untouched so evaluation reflects the original class distribution.
smote = SMOTE(random_state=42)
X, Y = smote.fit_resample(x_train, y_train)

# Show the class counts of the resampled training target.
print("AFTER SMOTE")
Y.value_counts().plot(kind="bar")

In [None]:
# Shapes of the resampled training features and target.
(X.shape, Y.shape)

In [None]:
# Heatmap of the pairwise column correlations of the full DataFrame,
# drawn without per-cell annotations on a 10x10 inch figure.
plt.figure(figsize=(10, 10))
sns.heatmap(df.corr(), annot=False, linewidths=0.5)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [None]:
# Train a random forest on the SMOTE-resampled training data (X, Y) and evaluate
# it on the hold-out split (x_test, y_test), which was never resampled.
# random_state pins the forest's internal randomness so the metrics below are
# reproducible across runs (the split and SMOTE already use seed 42);
# n_jobs=-1 only parallelises the work across cores and does not change results.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_clf.fit(X, Y)

y_pred_rf_clf = rf_clf.predict(x_test)

# recall_score / f1_score use their default positive label (1), i.e. the class
# that `class_names` maps to 'Fraud'.
acc_rd_clf = accuracy_score(y_test, y_pred_rf_clf)
conf = confusion_matrix(y_test, y_pred_rf_clf)
clf_report = classification_report(y_test, y_pred_rf_clf)
recall_rf = recall_score(y_test, y_pred_rf_clf)
f1_rf = f1_score(y_test, y_pred_rf_clf)

print(f"Accuracy Score of Random Forest is : {acc_rd_clf}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
print(f"Recall : \n{recall_rf}")
print(f"F1 : \n{f1_rf}")

In [None]:
from sklearn import svm

In [None]:
# Train a support vector classifier on the SMOTE-resampled training data (X, Y)
# and evaluate it on the hold-out split (x_test, y_test).
# SVMs are not scale invariant and nothing earlier in this notebook standardises
# the columns, so the SVC is wrapped in a pipeline with a StandardScaler. The
# scaler is fitted on the training data only and re-applied inside predict(),
# so no test-set statistics leak into training.
# `svm_clf` remains an estimator exposing fit/predict, as before.
# NOTE(review): per the scikit-learn docs, SVC fit time grows at least
# quadratically with the number of samples - expect this cell to be slow.
svm_clf = make_pipeline(StandardScaler(), svm.SVC())
svm_clf.fit(X, Y)

y_pred_svm_clf = svm_clf.predict(x_test)

# recall_score / f1_score use their default positive label (1), i.e. the class
# that `class_names` maps to 'Fraud'.
acc_svm_clf = accuracy_score(y_test, y_pred_svm_clf)
conf_svm = confusion_matrix(y_test, y_pred_svm_clf)
clf_report_svm = classification_report(y_test, y_pred_svm_clf)
recall_svm = recall_score(y_test, y_pred_svm_clf)
f1_svm = f1_score(y_test, y_pred_svm_clf)

print(f"Accuracy Score of SVM is : {acc_svm_clf}")
print(f"Confusion Matrix : \n{conf_svm}")
print(f"Classification Report : \n{clf_report_svm}")
print(f"Recall : \n{recall_svm}")
print(f"F1 : \n{f1_svm}")

In [None]:
# Train a k-nearest-neighbours classifier on the SMOTE-resampled training data
# (X, Y) and evaluate it on the hold-out split (x_test, y_test).
# KNN classifies by distance between rows, so columns with large numeric ranges
# would dominate the neighbour search; nothing earlier in this notebook
# standardises the columns, so the classifier is wrapped in a pipeline with a
# StandardScaler. The scaler is fitted on the training data only and re-applied
# inside predict().
# `knn` remains an estimator exposing fit/predict, as before.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn.fit(X, Y)

y_pred_knn_clf = knn.predict(x_test)

# recall_score / f1_score use their default positive label (1), i.e. the class
# that `class_names` maps to 'Fraud'.
acc_knn_clf = accuracy_score(y_test, y_pred_knn_clf)
conf_knn = confusion_matrix(y_test, y_pred_knn_clf)
clf_report_knn = classification_report(y_test, y_pred_knn_clf)
recall_knn = recall_score(y_test, y_pred_knn_clf)
f1_knn = f1_score(y_test, y_pred_knn_clf)

print(f"Accuracy Score of KNN is : {acc_knn_clf}")
print(f"Confusion Matrix : \n{conf_knn}")
print(f"Classification Report : \n{clf_report_knn}")
print(f"Recall : \n{recall_knn}")
print(f"F1 : \n{f1_knn}")