<a href="https://colab.research.google.com/github/giakomorssi/Deloitte_Project/blob/main/03_FraudDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import the Data

In [None]:
# Mount Google Drive so the dataset and the pickled models are reachable
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# NOTE(review): the Colab accelerator is chosen in the UI
# (Runtime > Change runtime type). These environment variables are kept for
# backward compatibility, but presumably they do not allocate a GPU by
# themselves, so no "switched to GPU" message is printed any more; the check
# below reports what TensorFlow actually sees.
import os
os.environ['COLAB_TPU_ADDR'] = ''
os.environ['COLAB_GPU_ALLOC'] = '1'
os.environ['COLAB_GPU'] = '1'

import tensorflow as tf

# Probe the GPU once and report the outcome a single time
gpu_device = tf.test.gpu_device_name()
if gpu_device:
    print('GPU device found:', gpu_device)
else:
    print('GPU device not found')

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# Read the supply-chain CSV from the mounted Drive, decoding it as latin-1
csv_path = '/content/drive/MyDrive/University/Deloitte/SupplyChainDataset.csv'
df = pd.read_csv(csv_path, encoding='latin-1')

# Cleaning

In [None]:
# Drop the columns listed below, then every row that still holds a missing value
unused_columns = [
    'Product Description',
    'Order Zipcode',
    'Order Profit Per Order',
    'Customer Email',
    'Customer Password',
]
df.drop(columns=unused_columns, inplace=True)

# According to the original note this removes a single remaining row
df.dropna(inplace=True)

## Create the Category Column

1. **COMPLETE**: The order or transaction has been successfully fulfilled and completed.
2. **PENDING**: The order or transaction is still in progress and has not yet been completed.
3. **CLOSED**: The order or transaction has been closed or terminated for some reason, such as a return or cancellation.
4. **PENDING_PAYMENT**: The order or transaction is awaiting payment before it can be processed.
5. **CANCELED**: The order or transaction has been canceled by the customer or the seller for some reason.
6. **PROCESSING**: The order or transaction is being processed by the seller or merchant.
7. **SUSPECTED_FRAUD**: The order or transaction is under review due to suspected fraudulent activity.
8. **ON_HOLD**: The order or transaction has been placed on hold for some reason, such as a delay in shipping or a credit hold.
9. **PAYMENT_REVIEW**: The payment for the order or transaction is under review by the payment processor or financial institution.

In [None]:
# Order statuses are grouped into three categories:
#   Regular   -> COMPLETE, PENDING, PENDING_PAYMENT, PROCESSING
#   Suspected -> CLOSED, CANCELED, ON_HOLD, PAYMENT_REVIEW
#   Fraud     -> SUSPECTED_FRAUD

# One lookup table per category (status -> category label)
regular_dict = dict.fromkeys(
    ['COMPLETE', 'PENDING', 'PENDING_PAYMENT', 'PROCESSING'], 'Regular'
)
suspected_dict = dict.fromkeys(
    ['CLOSED', 'CANCELED', 'ON_HOLD', 'PAYMENT_REVIEW'], 'Suspected'
)
fraud_dict = {'SUSPECTED_FRAUD': 'Fraud'}


def map_category(status):
    """Return the category label for an order status.

    The lookup tables are consulted in order (regular, suspected, fraud);
    a status found in none of them yields 'Unknown'.
    """
    for lookup in (regular_dict, suspected_dict, fraud_dict):
        if status in lookup:
            return lookup[status]
    return 'Unknown'

# Derive the 'Category' column from 'Order Status' through map_category
df['Category'] = df['Order Status'].apply(map_category)

# Count the rows of each category with boolean masks
category_column = df['Category']
regular_count = int((category_column == 'Regular').sum())
suspected_count = int((category_column == 'Suspected').sum())
fraud_count = int((category_column == 'Fraud').sum())

print('Regular: ', regular_count, '\n')
print('Suspected: ', suspected_count, '\n')
print('Fraud: ', fraud_count)

## Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is refit for each column in turn, so after the loop it
# only remembers the fit of the last column processed ('Category').
le = LabelEncoder()

columns_to_encode = (
    'Type',
    'Delivery Status',
    'Customer Segment',
    'Order Status',
    'Shipping Mode',
    'Category',
)
for column in columns_to_encode:
    # Replace the column's values by their integer codes
    df[column] = le.fit_transform(df[column])

# EDA

In [None]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Number of rows per encoded category
temp = df["Category"].value_counts()
df1 = pd.DataFrame({'Category': temp.index,'values': temp.values})

# LabelEncoder numbers the classes in sorted order of their labels, so the
# three categories are encoded alphabetically. The previous hard-coded
# branches named code 0 "Suspected" and code 2 "Fraud", which swapped the
# labels of those two bars.
category_names = {0: "Fraud", 1: "Regular", 2: "Suspected"}

# Define a list of colors for the bars
colors = ['red', 'blue', 'green']

# One bar trace per category so that each gets its own legend entry
traces = []
for i, (code, count) in enumerate(zip(df1['Category'], df1['values'])):
    # Fall back to the raw code if an unexpected class shows up
    name = category_names.get(code, str(code))
    trace = go.Bar(
        x=[name], y=[count],
        name=name,
        # Cycle through the palette instead of indexing past its end
        marker=dict(color=colors[i % len(colors)]),
        text=[count],
        legendgroup="group"
    )
    traces.append(trace)

layout = dict(title='Credit Card Fraud Class - data unbalance',
              xaxis=dict(title='Class', showticklabels=True),
              yaxis=dict(title='Number of transactions'),
              hovermode='closest', width=600,
              showlegend=True
             )
fig = go.Figure(data=traces, layout=layout)
iplot(fig, filename='class')

In [None]:
# Correlation Plot

# Only numeric columns can be correlated. Columns such as 'Customer City' or
# 'Product Name' are still in the frame at this point (presumably as text),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
numeric_df = df.select_dtypes(include='number')

# CSS property names use hyphens: 'max-width', not 'max_width'
numeric_df.corr().style.background_gradient(cmap='coolwarm').set_properties(**{'max-width': '50px'})

## Prepare the Data

In [None]:
from sklearn.model_selection import train_test_split

# Columns removed before modelling
columns_to_drop = [
    'Category Name',
    'Customer City',
    'Customer Country',
    'Customer Fname',
    'Customer Id',
    'Customer Lname',
    'Customer State',
    'Customer Street',
    'Department Name',
    'Market',
    'Order City',
    'Order Country',
    'Order Customer Id',
    'Order Region',
    'Order State',
    'Product Image',
    'Product Name',
    'shipping date (DateOrders)',
    'order date (DateOrders)',
    'Category Id',
    'Customer Zipcode',
    'Department Id',
    'Latitude',
    'Longitude',
    'Order Id',
    'Order Item Cardprod Id',
    'Order Item Id',
    'Product Card Id',
    'Product Category Id',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
from sklearn.model_selection import train_test_split  

# 'Category' is computed directly from 'Order Status', so the encoded status
# must not remain among the features: it would leak the target into every
# model. Models persisted with the previous feature set must be retrained.
X = df.drop(['Category', 'Order Status'], axis=1) #Not scaled
y = df['Category']

# Stratified 80/20 split so each class keeps its share in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training split only ...
s = StandardScaler()
s.fit(X_train)

# ... then apply the same transformation to both splits
X_train, X_test = s.transform(X_train), s.transform(X_test)

# PCA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component, to inspect the variance spectrum
pca = PCA()
pca.fit(X_train)
explained_ratio = pca.explained_variance_ratio_

# Scree plot: share of variance carried by each component
num_components = len(explained_ratio)
component_numbers = np.arange(1, num_components+1)
plt.plot(component_numbers, explained_ratio, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.show()

# Smallest number of leading components whose cumulative share reaches the threshold
variance_threshold = 0.95
cumulative_variance_ratio = np.cumsum(explained_ratio)
threshold_reached = cumulative_variance_ratio >= variance_threshold
num_components_to_keep = np.argmax(threshold_reached) + 1  # argmax = first True

print(f'\n Number of components to keep: {num_components_to_keep}')

# Refit with the selected number of components and project both splits
pca = PCA(n_components=num_components_to_keep)
X_train_p = pca.fit_transform(X_train)
X_test_p = pca.transform(X_test)

In [None]:
# Plot the cumulative variance explained

fig = plt.figure(figsize=(10, 5))

# Prepend 0 so the curve starts at "0 components -> 0 variance"
cum = np.insert(cumulative_variance_ratio, 0, 0)
# Y ticks come from the currently fitted `pca`; if that is the reduced PCA,
# only the retained components get a tick.
ylab = np.insert(np.cumsum(pca.explained_variance_ratio_), 0, 0)

plt.plot(cum, 'ro-', linewidth=2)
plt.title('Cumulative Variance Explained')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Variance Explained')
plt.yticks(ylab)
# One tick per plotted point instead of a hard-coded 0..20 range, so the
# axis stays correct when the number of features changes
plt.xticks(np.arange(len(cum)))
plt.show()

In [None]:
import seaborn as sns

# Annotated heatmap of the PCA loadings (rows: components, columns: features)
fig = plt.figure(figsize=(20, 10))
ax = sns.heatmap(pca.components_, cmap='coolwarm', annot=True, cbar=False)
ax.set_xlabel('Features')
ax.set_ylabel('Principal Components')
ax.set_title('PCA Loadings Heatmap')
plt.show()

In [None]:
# Loadings of each principal component on the original features
pc_vectors = pca.components_

# Names of the original feature columns
column_names = X.columns

# For every component, list the features whose absolute loading is at least 0.40
num_pcs = pc_vectors.shape[0]
for i, pc_loadings in enumerate(pc_vectors):
    pc_name = f'PC{i+1}'
    strong_mask = np.abs(pc_loadings) >= 0.40
    relevant_columns = column_names[strong_mask]
    print(f'{pc_name}:\n {relevant_columns.tolist()}, \n {pc_loadings[strong_mask]} \n ')

# Models

In [None]:
import pandas as pd
import numpy as np
import pickle

# Wrap the training features in a DataFrame and flatten the target to 1-D
X_train = pd.DataFrame(X_train)
y_train = np.ravel(pd.DataFrame(y_train))

# Sanity output: shapes of the training data, then types of the full X and y
print(X_train.shape, y_train.shape)
print(type(X), type(y))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

# Settings shared by the two class-weighted multinomial logistic regressions
balanced_lr_kwargs = dict(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=2000,
    class_weight='balanced',
)

# Stacked logistic regression + shallow tree, combined by a depth-5 tree
fraud = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(**balanced_lr_kwargs)),
        ('dt', DecisionTreeClassifier(max_depth=4)),
    ],
    final_estimator=DecisionTreeClassifier(max_depth=5),
)

# Plain class-weighted logistic regression
fraud1 = LogisticRegression(**balanced_lr_kwargs)

# Gradient boosting with a small learning rate and row subsampling
regular = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
)

# Logistic loss trained by SGD; elastic net with l1_ratio=1
regular2 = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    alpha=0.0001,
    l1_ratio=1,
    max_iter=1000,
)

# Shallow class-weighted decision tree
suspected = DecisionTreeClassifier(class_weight='balanced', max_depth= 4)

# Three-class XGBoost model
reg_sus = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.3,
)

## Voting2class1

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the logistic regression and the XGBoost model
voting_members = [('fraud', fraud1), ('reg_sus', reg_sus)]
model = VotingClassifier(estimators=voting_members, voting='soft')

model.fit(X_train, y_train)

# Persist the fitted ensemble on Drive
output_path = '/content/drive/MyDrive/University/Deloitte/models/Voting2class1.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(model, out_file)

## Voting2class

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble of the stacked classifier and the XGBoost model
voting_members = [('fraud', fraud), ('reg_sus', reg_sus)]
model = VotingClassifier(estimators=voting_members, voting='hard')

model.fit(X_train, y_train)

# Persist the fitted ensemble on Drive
output_path = '/content/drive/MyDrive/University/Deloitte/models/Voting2class.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(model, out_file)

## BaggingLogistic

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 30 multinomial logistic regressions
base_logistic = LogisticRegression(max_iter=1000, multi_class='multinomial')
model = BaggingClassifier(estimator=base_logistic, n_estimators=30)

model.fit(X_train, y_train)

# Persist the fitted ensemble on Drive
output_path = '/content/drive/MyDrive/University/Deloitte/models/BaggingLogistic.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(model, out_file)

# Evaluate the model over 100 splits

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.base import clone
import os
import pickle
import numpy as np
import pandas as pd

# Evaluation settings
N_SPLITS = 100              # number of random stratified train/test splits
VARIANCE_THRESHOLD = 0.95   # cumulative explained variance kept by the PCA
LOW_RECALL = 0.7            # fraud recalls below this value are listed separately
# Encoded categories; LabelEncoder orders them alphabetically:
# 0 = Fraud, 1 = Regular, 2 = Suspected
CLASS_LABELS = [0, 1, 2]

# Scaler refit on the training part of every split
s = StandardScaler()

# Per-split metrics
recall_scores = []
fraud_recall = []
suspected_recall = []
regular_recall = []
low = []
avg_conf_matrix = np.zeros((len(CLASS_LABELS), len(CLASS_LABELS)))

np.set_printoptions(precision=2)

# NOTE: unpickling can execute arbitrary code; only load files you created.
# NOTE(review): the training cells save to .../models/ while this reads from
# .../bests/ -- presumably the best models are copied there by hand; confirm.
with open('/content/drive/MyDrive/University/Deloitte/bests/Voting2class.pkl', 'rb') as f:
    model = pickle.load(f)

for i in range(1, N_SPLITS + 1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True)

    X_train = s.fit_transform(X_train)
    X_test = s.transform(X_test)

    # PCA: keep the smallest number of components reaching the variance threshold
    pca = PCA()
    pca.fit(X_train)

    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    num_components_to_keep = np.argmax(cumulative_variance_ratio >= VARIANCE_THRESHOLD) + 1

    pca = PCA(n_components=num_components_to_keep)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # Refit an unfitted copy of the loaded model on this split's training
    # data. Predicting with the loaded model as-is would score it on rows it
    # was probably trained on, in a PCA basis that differs from the one it
    # was fitted with.
    split_model = clone(model)
    split_model.fit(X_train, y_train)
    y_pred = split_model.predict(X_test)

    # Fixing the label order keeps the matrix 3x3 and the per-class recalls
    # aligned with CLASS_LABELS
    conf_matrix = confusion_matrix(y_test, y_pred, labels=CLASS_LABELS)
    recalls = recall_score(y_test, y_pred, labels=CLASS_LABELS, average=None)

    recall_scores.append(recall_score(y_test, y_pred, average="macro"))
    fraud_recall.append(recalls[0])
    regular_recall.append(recalls[1])
    suspected_recall.append(recalls[2])

    avg_conf_matrix += conf_matrix

    # Running mean / standard deviation every 10 splits
    if i % 10 == 0:
        print(f'Iteration: {i}')
        print(f'Fraud Recall {round(np.average(fraud_recall), 4)}, {round(np.std(fraud_recall), 4)}')
        print(f'Suspected Recall {round(np.average(suspected_recall), 4)}, {round(np.std(suspected_recall), 4)}')
        print(f'Regular Recall {round(np.average(regular_recall), 4)}, {round(np.std(regular_recall), 4)}')
        print(f'Total Recall {round(np.average(recall_scores), 4)}, {round(np.std(recall_scores), 4)} \n')

    # Keep track of the splits where fraud recall was poor
    if recalls[0] < LOW_RECALL:
        low.append(round(recalls[0], 4))

print(f'\n Fraud Recall: {round(np.average(fraud_recall), 4)}, std: {round(np.std(fraud_recall), 4)}, Under 0.7: {len(low)}, {low}\n Suspected Recall: {round(np.average(suspected_recall), 4)}, std: {round(np.std(suspected_recall), 4)}\n Regular Recall: {round(np.average(regular_recall), 4)}, std: {round(np.std(regular_recall), 4)}\n Total: {round(np.average(recall_scores), 4)}, std: {round(np.std(recall_scores), 4)}')

# Turn the summed confusion matrices into their average
avg_conf_matrix /= N_SPLITS
print("\n Average Confusion Matrix:")
print(avg_conf_matrix)

print('\n', model)


# Results

**Baseline Model**
```python
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000, class_weight='balanced')
```
```python
Fraud Recall: 0.9228, std: 0.1951
Average Confusion Matrix: [749 42 2]
```
```python
Fraud Recall: 0.9228, std: 0.1951
Average Confusion Matrix: [749 42 2]
```
```python
Fraud Recall: 0.9228, std: 0.1951
Average Confusion Matrix: [749 42 2]
```


**Best Fraud**

*Voting2class*
```python
VotingClassifier(estimators=[('fraud',
                              StackingClassifier(estimators=[('lr', LogisticRegression(class_weight='balanced', max_iter=2000, multi_class='multinomial')),
                                                             ('dt', DecisionTreeClassifier(max_depth=4))],
                                                              final_estimator=DecisionTreeClassifier(max_depth=5))),
                             ('reg_sus',
                              XGBClassifier(learning_rate=0.3, max_depth=4, n_estimators=100, num_class=3, objective='multi:softmax'))])
```
```python
Fraud Recall: 0.9228, std: 0.1951
Average Confusion Matrix: [749 42 2]
```

**Best Suspected**

*Voting2class1*
```python
VotingClassifier(estimators=[('fraud',
                              LogisticRegression(class_weight='balanced', max_iter=2000, multi_class='multinomial')),
                             ('reg_sus',
                              XGBClassifier(colsample_bytree=0.8,
                                            learning_rate=0.3, max_depth=4, n_estimators=100, num_class=3, objective='multi:softmax'))], 
                  voting='soft')
```
```python
Suspected Recall: 0.8843, std: 0.1285
Average Confusion Matrix: [24 787 6190]
```

**Best Regular**

*Bagging Logistic Model:*
```python
BaggingClassifier(estimator=LogisticRegression(max_iter=1000, multi_class='multinomial'),
                  n_estimators=30)
```
```python
Regular Recall: 0.9585, std: 0.0422
Average Confusion Matrix: [466  27115  708]
```

**Best Overall**
1. *Voting2class1* 

More balanced across the three recalls.
```python
VotingClassifier(estimators=[('fraud',
                              LogisticRegression(class_weight='balanced', max_iter=2000, multi_class='multinomial')),
                             ('reg_sus',
                              XGBClassifier(colsample_bytree=0.8,
                                            learning_rate=0.3, max_depth=4, n_estimators=100, num_class=3, objective='multi:softmax'))], 
                  voting='soft')
```
```python
Fraud Recall: 0.8637, std: 0.2231
```
```python
Suspected Recall: 0.8913, std: 0.1258
```
```python
Regular Recall: 0.9366, std: 0.0675
```
```python
Total Recall: 0.8972, std: 0.1322
```
```python
 Average Confusion Matrix:
[[701  87  24]
 [235  26500  1560]
 [25  736  6240]]
```

2. *Voting2class* 

Better on Regular and Fraud, worse on Suspected.
```python
VotingClassifier(estimators=[('fraud', StackingClassifier(estimators=[('lr', LogisticRegression(class_weight='balanced', max_iter=2000, multi_class='multinomial')), 
                                                                      ('dt', DecisionTreeClassifier(max_depth=4))], 
                                                          final_estimator=DecisionTreeClassifier(max_depth=5))),
                             ('reg_sus',
                              XGBClassifier(learning_rate=0.3, max_depth=4, n_estimators=100, num_class=3, objective='multi:softmax'))])
```

```python
Fraud Recall: 0.9387, std: 0.1318	
```
```python
Suspected Recall: 0.7848, std: 0.0763 
```
```python
Regular Recall: 0.9541, std: 0.0395
```
```python
Total Recall: 0.8925, std: 0.0708
```
```python
 Average Confusion Matrix:
[[762  5  0]
 [625  27000  674]
 [5  1500  5490]]
 ```