# Fraud Detection

____

## Table of Contents
- [Importing libraries](#Importing-libraries)
- [Load data](#Load-data)
- [Data Cleaning and Preparation](#Data-Cleaning-and-Preparation)
- [Data Exploration & Visualization](#Data-Exploration-&-Visualization)
- [Machine Learning for Fraud Detection](#Machine-Learning-for-Fraud-Detection)

____

## Importing libraries 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

____

## Load data

source: https://www.kaggle.com/ealaxi/paysim1

In [None]:
# The Kaggle dataset URL is an HTML landing page (the download itself needs a
# login), so pandas cannot parse it as CSV. Download the archive from the
# source link and read the extracted CSV from disk instead.
# NOTE(review): path assumes the PaySim log sits under Datasets/ — adjust to
# wherever your local copy lives.
DATA_PATH = 'Datasets/PS_20174392719_1491204439457_log.csv'
df = pd.read_csv(DATA_PATH)

### Preview the data and check dimensions

In [None]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

____

## Data Cleaning and Preparation

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

### Check for nulls

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Check for uniqueness 

In [None]:
# Count of distinct values in the nameDest column.
df['nameDest'].nunique()

In [None]:
# Count of distinct values in the nameOrig column.
df['nameOrig'].nunique()

### Drop some columns 

In [None]:
# Remove the two name columns from the frame, in place.
df.drop(columns=['nameOrig', 'nameDest'], inplace=True)

In [None]:
# Preview the first rows after the name columns were dropped.
df.head()

____

## Data Exploration & Visualization

In [None]:
# Number of rows for each isFraud value.
df['isFraud'].value_counts()

In [None]:
# Bar chart of the target: one bar per isFraud value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='isFraud', data=df, ax=ax)
ax.set_title('isFraud Distribution')
ax.set_xlabel('isFraud')
# countplot puts the number of rows per category on the y-axis, so label it as
# a count rather than 'Yes or No'.
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of rows for each isFlaggedFraud value.
df['isFlaggedFraud'].value_counts()

In [None]:
# Bar chart of the isFlaggedFraud column: one bar per value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='isFlaggedFraud', data=df, ax=ax)
ax.set_title('isFlaggedFraud Distribution')
# The x-axis shows isFlaggedFraud (it was labelled 'isFraud' by copy-paste) and
# the y-axis is the row count per category.
ax.set_xlabel('isFlaggedFraud')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Total amount per transaction type, coloured by isFraud, one facet per
# isFlaggedFraud value.
# catplot is figure-level and builds its own grid, so a preceding plt.figure()
# only leaves an empty figure behind; it is dropped here.
# `estimator` is only used by aggregating kinds. With the default strip kind it
# was silently ignored, so kind='bar' is requested explicitly. ci=None skips
# the bootstrapped error bars, which are not meaningful for a sum.
sns.catplot(x='type', y='amount', hue='isFraud', col='isFlaggedFraud',
            kind='bar', estimator=np.sum, ci=None, data=df)
plt.show()

In [None]:
# Histogram of step with a KDE overlay; the y-axis is capped at 8000.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_ylim([0, 8000])
sns.histplot(df['step'], kde=True, ax=ax)
plt.show()

In [None]:
# Fold step into the range 0-23 (overwrites the original column).
# NOTE(review): presumably one step is one hour, making this the hour of day —
# confirm against the dataset description.
df['step'] = df['step'].mod(24)

In [None]:
# Mean amount per step value, one line per transaction type, no error band.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=df, x='step', y='amount', hue='type',
             estimator='mean', ci=None, ax=ax)
plt.show()

In [None]:
# Distribution of step (already reduced modulo 24 above), one facet per
# isFraud value.
sns.displot(x ='step', col ='isFraud', data=df)

In [None]:
# Row count per transaction type, split by isFraud.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='type', hue='isFraud', ax=ax)
ax.set_title('isFraud Distribution')
plt.show()

In [None]:
# Keep only rows whose type is TRANSFER or CASH_OUT.
kept_types = ['TRANSFER', 'CASH_OUT']
df = df.loc[df['type'].isin(kept_types)]

In [None]:
# (rows, columns) after keeping only TRANSFER and CASH_OUT rows.
df.shape

In [None]:
# Random sample of 10 rows; no random_state is passed, so it differs per run.
df.sample(10)

### Convert to categorical

In [None]:
# One-hot encode `type` (dropping the first level) and append the resulting
# indicator column(s) to the frame.
type_dummies = pd.get_dummies(df['type'], prefix='type', drop_first=True)
df = pd.concat([df, type_dummies], axis=1)

In [None]:
# Random sample of 10 rows to inspect the newly added indicator column.
df.sample(10)

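Because the encoding uses `drop_first=True`, only `type_TRANSFER` is kept: `type_TRANSFER = 0` (False) means the transaction is a `CASH_OUT`, and `type_TRANSFER = 1` (True) means it is a `TRANSFER`.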

### Drop more columns 

In [None]:
# Remove the raw `type` column and `isFlaggedFraud`, in place.
df.drop(columns=['type', 'isFlaggedFraud'], inplace=True)

In [None]:
# Preview the first rows after dropping `type` and `isFlaggedFraud`.
df.head()

In [None]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(inplace=True)

In [None]:
# Recheck the isFraud counts on the filtered frame.
df['isFraud'].value_counts()

In [None]:
# Origin-side discrepancy: new balance plus amount, minus old balance.
df['origBalanceDiscrepancy'] = df['newbalanceOrig'] + df['amount'] - df['oldbalanceOrg']

In [None]:
# Destination-side discrepancy: old balance plus amount, minus new balance.
df['destBalanceDiscrepancy'] = df['oldbalanceDest'] + df['amount'] - df['newbalanceDest']

In [None]:
# Preview the first rows including the two discrepancy columns.
df.head()

In [None]:
# Summed origin-side discrepancy per isFraud value, coloured by type_TRANSFER.
# `estimator` is only used by aggregating kinds; with the default strip kind it
# was silently ignored, so kind='bar' is requested explicitly. ci=None skips
# the bootstrapped error bars, which are not meaningful for a sum.
sns.catplot(x='isFraud', y='origBalanceDiscrepancy', hue='type_TRANSFER',
            kind='bar', estimator=np.sum, ci=None, data=df, aspect=2)

In [None]:
# Summed destination-side discrepancy per isFraud value, coloured by
# type_TRANSFER.
# `estimator` is only used by aggregating kinds; with the default strip kind it
# was silently ignored, so kind='bar' is requested explicitly. ci=None skips
# the bootstrapped error bars, which are not meaningful for a sum.
sns.catplot(x='isFraud', y='destBalanceDiscrepancy', hue='type_TRANSFER',
            kind='bar', estimator=np.sum, ci=None, data=df, aspect=2)

In [None]:
import os

# Persist the prepared frame for the modelling section.
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs('Datasets', exist_ok=True)
df.to_csv('Datasets/prepared_data.csv', index=False)

____

## Machine Learning for Fraud Detection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Reload the prepared frame from the CSV written at the end of data preparation.
df = pd.read_csv('Datasets/prepared_data.csv')

In [None]:
# Transposed preview: each column of the frame is shown as a row.
df.head().T

### Train Test Split

In [None]:
# Target is the isFraud column; every other column is used as a feature.
y = df['isFraud']
X = df.drop(columns=['isFraud'])

In [None]:
# 70/30 split with a fixed seed for reproducibility.
# stratify=y keeps the isFraud ratio the same in both splits; without it a
# rare positive class (presumably the case here — confirm with value_counts)
# can end up unevenly represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y
)

In [None]:
# Shapes of the four splits: train features, test features, train labels, test labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### ML Models

### LogisticRegression()

In [None]:
# Logistic regression with default settings, fitted on the training split.
# NOTE(review): the features are passed in unscaled — confirm the default
# solver converges within its default iteration budget.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Class predictions of the logistic regression for the test split.
y_pred_lr = model.predict(X_test)

In [None]:
# Accuracy, precision and recall of logistic regression on the test split.
print('Logistic Regression \n')
for caption, scorer in (
    ('accuracy: ', accuracy_score),
    ('precision: ', precision_score),
    ('recall: ', recall_score),
):
    print(caption, scorer(y_test, y_pred_lr))

### GaussianNB()

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training split.
gauss_model = GaussianNB()
gauss_model.fit(X_train, y_train)

In [None]:
# Class predictions of the naive Bayes model for the test split.
y_pred_gauss = gauss_model.predict(X_test)

In [None]:
# Accuracy, precision and recall of naive Bayes on the test split.
print('naive_bayes \n')
for caption, scorer in (
    ('accuracy: ', accuracy_score),
    ('precision: ', precision_score),
    ('recall: ', recall_score),
):
    print(caption, scorer(y_test, y_pred_gauss))

### SVC()

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings, fitted on the training split.
# NOTE(review): the features are unscaled, and kernel SVC training time grows
# quickly with the number of rows — confirm this is feasible on the full split.
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [None]:
# Class predictions of the support vector classifier for the test split.
y_pred_svc = svc_model.predict(X_test)

In [None]:
# Accuracy, precision and recall of the support vector classifier on the test split.
print('support vector classifier \n')
for caption, scorer in (
    ('accuracy: ', accuracy_score),
    ('precision: ', precision_score),
    ('recall: ', recall_score),
):
    print(caption, scorer(y_test, y_pred_svc))

### RandomForestClassifier()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default settings, fitted on the training split.
# NOTE(review): no random_state is passed, so the fitted forest (and its
# metrics) can differ between runs.
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [None]:
# Class predictions of the random forest for the test split.
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Accuracy, precision and recall of the random forest on the test split.
print('RandomForestClassifier \n')
for caption, scorer in (
    ('accuracy: ', accuracy_score),
    ('precision: ', precision_score),
    ('recall: ', recall_score),
):
    print(caption, scorer(y_test, y_pred_rf))

### ROC Curves & AUC

In [None]:
from sklearn.metrics import auc, roc_curve

In [None]:
# ROC curve and AUC for logistic regression.
# roc_curve sweeps a threshold over continuous scores. Feeding it hard 0/1
# predictions collapses the curve to a single operating point, so use the
# predicted probability of the positive class instead.
y_score_lr = model.predict_proba(X_test)[:, 1]
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, y_score_lr)
auc_logistic = auc(fpr_logistic, tpr_logistic)
print('auc for logistic regression', auc_logistic)

In [None]:
# ROC curve and AUC for Gaussian naive Bayes.
# roc_curve sweeps a threshold over continuous scores. Feeding it hard 0/1
# predictions collapses the curve to a single operating point, so use the
# predicted probability of the positive class instead.
y_score_gauss = gauss_model.predict_proba(X_test)[:, 1]
fpr_gauss, tpr_gauss, _ = roc_curve(y_test, y_score_gauss)
auc_gauss = auc(fpr_gauss, tpr_gauss)
print('auc for gauss', auc_gauss)

In [None]:
# ROC curve and AUC for the support vector classifier.
# roc_curve sweeps a threshold over continuous scores. Feeding it hard 0/1
# predictions collapses the curve to a single operating point. SVC() is built
# without probability=True, so the signed distance to the decision boundary is
# used as the ranking score.
y_score_svc = svc_model.decision_function(X_test)
fpr_svc, tpr_svc, _ = roc_curve(y_test, y_score_svc)
auc_svc = auc(fpr_svc, tpr_svc)
print('auc for svc', auc_svc)

In [None]:
# ROC curve and AUC for the random forest.
# roc_curve sweeps a threshold over continuous scores. Feeding it hard 0/1
# predictions collapses the curve to a single operating point, so use the
# predicted probability of the positive class instead.
y_score_rf = rf_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_score_rf)
auc_rf = auc(fpr_rf, tpr_rf)
print('auc for rf', auc_rf)

In [None]:
# Overlay the four ROC curves on one axes, with the diagonal as the chance line.
fig, ax = plt.subplots(figsize=(12, 8))

# (false-positive rates, true-positive rates, line colour, legend entry)
roc_series = [
    (fpr_logistic, tpr_logistic, 'purple', 'logistic_regression (area =%0.2f)' % auc_logistic),
    (fpr_gauss, tpr_gauss, 'blue', 'gauss (area =%0.2f)' % auc_gauss),
    (fpr_svc, tpr_svc, 'orange', 'support_vector_classifier (area =%0.2f)' % auc_svc),
    (fpr_rf, tpr_rf, 'green', 'random_forest (area =%0.2f)' % auc_rf),
]
for xs, ys, colour, legend_text in roc_series:
    ax.plot(xs, ys, color=colour, label=legend_text)

ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([-0.01, 1.0])
ax.set_ylim([-0.01, 1.0])
ax.set_xlabel('False Positive Rate (FP)')
ax.set_ylabel('True Positive Rate (TP)')
ax.set_title('ROC curves for ML models applied for Fraud Detection')
ax.legend(loc='lower right')
plt.show()