In [None]:
import pandas as pd
import csv
# Load the transaction data set and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='fraud.csv')
df.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [None]:
# List the distinct transaction types. The column holds strings, so it has to
# be mapped to numbers before it can be fed to KNN.
print(pd.unique(df['type']))

['PAYMENT' 'TRANSFER' 'CASH_OUT' 'DEBIT' 'CASH_IN']


In [None]:
# Integer code for each transaction type printed by the previous cell.
# NOTE(review): integer codes give the types an arbitrary order/distance,
# which a distance-based model will pick up -- confirm this is acceptable.
mapped_type = {
    'PAYMENT': 0,
    'TRANSFER': 1,
    'CASH_OUT': 2,
    'DEBIT': 3,
    'CASH_IN': 4,
}
# Replace the categorical 'type' column with its numeric codes.
df = df.replace(to_replace={'type': mapped_type})
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,0,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,0,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,1,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,2,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,1,0,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [None]:
# Build the feature matrix X: drop isFraud because it is the target variable,
# and drop the categorical columns nameOrig and nameDest.
X = df.drop(columns=['isFraud', 'nameOrig', 'nameDest'])

# Target vector as a NumPy array.
y = df["isFraud"].values

# View the target values for the first 5 rows (the original cell announced
# this in a comment but never actually showed them).
print(y[:5])

# Preview the features. A notebook only renders the LAST expression of a cell,
# so X.head() has to come last; placed mid-cell it displayed nothing.
X.head()


In [None]:
# nameDest also has a lot of distinct values, which is why the column was
# dropped from the feature matrix.
print(pd.unique(df['nameDest']))

['M1979787155' 'M2044282225' 'C553264065' ... 'M1601421613' 'M524833426'
 'M334249577']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors  import KNeighborsClassifier


# Split the data set into a training part and a held-out test part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,   # 20% of the rows are used as the testing sample
    random_state=1,  # fixed seed: same split every run, so results are reproducible
    stratify=y,      # both splits keep the proportion of each value found in y
)

# Create a KNN classifier with 5 neighbours and fit it to the training data.
# NOTE(review): the features are used unscaled; KNN is distance-based, so
# large-valued columns may dominate -- confirm whether scaling is wanted.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Confusion matrix (rows: true class, columns: predicted class) and the
# per-class precision / recall / f1 report for the KNN predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# accuracy_score is imported by name above. The previous call went through
# `metrics.accuracy_score`, but no `metrics` module is ever imported in this
# notebook, so the line raised NameError after a kernel restart.
print("KNN model accuracy(in %):", accuracy_score(y_test, y_pred)*100)

[[20299     1]
 [   22     1]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20300
           1       0.50      0.04      0.08        23

    accuracy                           1.00     20323
   macro avg       0.75      0.52      0.54     20323
weighted avg       1.00      1.00      1.00     20323

KNN model accuracy(in %): 99.88682773212616


In [None]:
from sklearn.metrics import recall_score

# Macro-averaged recall of the KNN predictions: the unweighted mean of the
# recall of each class.
recall_score(y_true=y_test, y_pred=y_pred, average='macro')

0.5217144998929107

In [None]:
from sklearn.metrics import precision_score

# Macro-averaged precision of the KNN predictions: the unweighted mean of the
# precision of each class.
precision_score(y_true=y_test, y_pred=y_pred, average='macro')

0.7494586880566901

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute


# Cross-validation for KNN.
# The estimator has to be the classifier being evaluated. The previous version
# cross-validated a LinearRegression with mean absolute error, which says
# nothing about KNN and is not comparable with the Gaussian NB accuracy
# reported further down. Re-run this cell to refresh the stored output.
# KNeighborsClassifier is imported in the train/test-split cell above.
cv = KFold(n_splits=10, random_state=1, shuffle=True)  # same folds as the GNB cross-validation
model = KNeighborsClassifier(n_neighbors=5)            # same settings as the fitted `knn`
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# Mean accuracy over the 10 folds.
mean(scores)

0.0042112683927655065

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the same training split as KNN.
gnb = GaussianNB().fit(X_train, y_train)

# Make predictions on the testing set.
y_predgauss = gnb.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score

# Confusion matrix (rows: true class, columns: predicted class) and the
# per-class report for the Gaussian Naive Bayes predictions.
print(confusion_matrix(y_test, y_predgauss))
print(classification_report(y_test, y_predgauss))

# accuracy_score is imported by name above: `metrics` is never imported in
# this notebook, so `metrics.accuracy_score` raised NameError on a fresh kernel.
print("Gaussian Naive Bayes model accuracy(in %):", accuracy_score(y_test, y_predgauss)*100)

[[20034   266]
 [   23     0]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     20300
           1       0.00      0.00      0.00        23

    accuracy                           0.99     20323
   macro avg       0.50      0.49      0.50     20323
weighted avg       1.00      0.99      0.99     20323

Gaussian Naive Bayes model accuracy(in %): 98.5779658514983


In [None]:
# Macro-averaged recall of the Gaussian Naive Bayes predictions.
recall_score(y_true=y_test, y_pred=y_predgauss, average='macro')

0.49344827586206896

In [None]:
# Macro-averaged precision of the Gaussian Naive Bayes predictions.
precision_score(y_true=y_test, y_pred=y_predgauss, average='macro')

0.49942663409283544

In [None]:
from numpy import std
# Cross-validation of Gaussian Naive Bayes: 10 shuffled folds, fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# Fresh, unfitted model; cross_val_score fits it once per fold.
modelo = GaussianNB()
# One accuracy value per fold.
scores = cross_val_score(estimator=modelo, X=X, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
# Report the mean accuracy, with the standard deviation across folds in parentheses.
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

Accuracy: 0.982 (0.001)


In [None]:
#COMPARISON!!
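#ACCURACY: KNN: 99.88682773212616% / GNB: 98.5779658514983%  ----> KNN
#  (caveat: only 23 of the 20323 test rows are fraud, so always predicting "not fraud" would also score 20300/20323 = 99.8868%; accuracy alone says little on data this imbalanced)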
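#RECALL SCORE (macro-averaged over both classes): KNN: 0.5217144998929107 / GNB: 0.49344827586206896 ----> KNN has the greater value, therefore KNN wins (closer to 1)
#  (recall on the fraud class alone is only 0.04 for KNN and 0.00 for GNB, as shown in the classification reports)
#PRECISION SCORE (macro-averaged over both classes): KNN: 0.7494586880566901 / GNB: 0.49942663409283544 ----> KNN has the greater value, therefore KNN wins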
#CONFUSION MATRIX ----> KNN has higher true positive (1 vs 0) and true negative (20299 vs 20034) counts, while having lower false positive (1 vs 266) and false negative (22 vs 23) counts
#OVERALL WINNER: KNN (although it still detects only 1 of the 23 fraud cases in the test set)