In [None]:

import random
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from ISLP.models import (ModelSpec as MS,
                         summarize)
from ISLP import confusion_table
warnings.filterwarnings('ignore')

# Load the transactions CSV; the path is relative to the notebook's directory.
df = pd.read_csv("../data/card_transdata.csv")
# A bare last expression renders the frame as a rich table instead of the
# plain-text dump that print() produces.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Pairwise correlations between all columns (df.corr() default method).
corr_matrix = df.corr()

# Visualize the correlation matrix.
# Pin the colour scale to [-1, 1] and centre it on 0 so the diverging
# 'coolwarm' palette puts its neutral midpoint at "no correlation"; without
# this the midpoint follows the data range and colours misrepresent the sign.
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",  # two decimals keeps the cell annotations readable
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# One histogram per column of df (pyplot is already imported as plt in the
# notebook's first cell, so it is not re-imported here).
df.hist(bins=20, figsize=(10, 8))
# Re-space the subplot grid so the per-column titles do not collide with
# neighbouring axes.
plt.tight_layout()
plt.show()

In [None]:
# Ordinary least squares of the fraud indicator on the seven transaction
# features, with the design matrix built through ISLP's ModelSpec.
predictor_cols = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
    'repeat_retailer',
    'used_chip',
    'used_pin_number',
    'online_order',
]
design = MS(predictor_cols)

y = df['fraud']
X = design.fit_transform(df)
# Alternative tried earlier: X = sm.add_constant(df.drop('fraud', axis='columns'))

model1 = sm.OLS(y, X)
results1 = model1.fit()

# Coefficient table (last expression -> rendered as the cell output).
summarize(results1)

In [None]:
# OLS via the formula interface: `a * b` expands to both main effects plus
# their interaction term.
formula = 'fraud ~ distance_from_home * distance_from_last_transaction'
model = sm.OLS.from_formula(formula, data=df)
result = model.fit()

# Full regression report (coefficients, standard errors, fit statistics).
print(result.summary())

In [None]:
# OLS with the full set of interactions among four predictors: chaining `*`
# expands to every main effect and every interaction up to the four-way term.
formula = 'fraud ~ distance_from_last_transaction * ratio_to_median_purchase_price* distance_from_home* used_chip'
model = sm.OLS.from_formula(formula, data=df)
result = model.fit()

# Full regression report for the interaction model.
print(result.summary())

In [None]:
# Try the KNN model.
# Select the predictors by name rather than by position, so the selection does
# not silently depend on 'fraud' being the last column of the CSV.
predictors = df.drop(columns='fraud')

# Create a random vector of True and False values (True -> training row,
# roughly 75% of the data). Seeded so the split is reproducible.
np.random.seed(4)
split = np.random.choice([True, False], size=len(predictors), replace=True, p=[0.75, 0.25])

# Standardize the predictors. The scaler is fitted on the training rows only
# and then applied to every row: fitting on the full frame would leak the
# test rows' mean/variance into the features the model is trained on.
scaler = StandardScaler()
scaler.fit(predictors[split])
predictors_standardized = pd.DataFrame(
    scaler.transform(predictors),
    columns=predictors.columns,
    index=predictors.index,  # keep row labels aligned with df
)

# Display the head of the standardized predictors
print(predictors_standardized.head())

# Define the training set for X (predictors)
training_X = predictors_standardized[split]

# Define the training set for Y (response)
training_Y = df.loc[split, 'fraud']

# Define the testing set for X (predictors)
testing_X = predictors_standardized[~split]

# Define the testing set for Y (response)
testing_Y = df.loc[~split, 'fraud']

In [None]:
# Display the unscaled predictor columns selected for the KNN model.
predictors

In [None]:
# Display the predictors after StandardScaler has been applied.
predictors_standardized

In [None]:
# Fit a 3-nearest-neighbour classifier on the training split, predict the
# held-out split, and cross-tabulate predicted vs. actual labels.
knn = KNeighborsClassifier(n_neighbors=3)
knn_fit = knn.fit(training_X, training_Y)  # fit() returns the estimator itself
knn_pred = knn_fit.predict(testing_X)

confusion_table(knn_pred, testing_Y)

In [None]:
# Test-set accuracy of the KNN model. Reuse the predictions already computed
# for the confusion table: knn.score() would run the neighbour search over the
# whole test set a second time to produce the same number.
prediction_accuracy = accuracy_score(testing_Y, knn_pred)
print(prediction_accuracy)

In [None]:
# The data is highly imbalanced: compare the number of rows per fraud label.
df['fraud'].value_counts()

In [None]:
# Liz: Trying random undersampling, reducing the samples of non-fraud to match that of fraud.
# NOTE(review): the train/test split in the next step is taken from this
# already-balanced frame, so the test set is balanced too and its metrics will
# not reflect the original class ratio. Consider splitting first and
# undersampling only the training part.

# Separate majority ("non-fraud") and minority ("fraud") classes.
df_majority = df[df['fraud'] == 0.0]
df_minority = df[df['fraud'] == 1.0]

# Downsample the majority class without replacement to the minority-class size.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

# Combine downsampled majority + minority class.
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Separate predictors (X) and target variable (y).
X_downsampled = df_downsampled.drop('fraud', axis=1)
y_downsampled = df_downsampled['fraud']


In [None]:
# (rows, columns) of the balanced frame: downsampled majority rows plus all minority rows.
df_downsampled.shape

In [None]:
# Hold out 25% of the downsampled data for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_downsampled,
    y_downsampled,
    test_size=0.25,
    random_state=42,
)

# Train a random forest; fit() returns the fitted estimator, so it can be
# chained onto the constructor.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Even after random undersampling the accuracy still seems high at 99.99%,
# so check how the two classes are distributed across the split.
for header, labels in (
    ("Class distribution in training set:", y_train),
    ("Class distribution in testing set", y_test),
):
    print(header)
    print(labels.value_counts())

In [None]:
# ROC AUC score.
# ROC AUC measures how well the model *ranks* positives above negatives, so it
# must be given a continuous score. Passing the hard 0/1 predictions collapses
# the curve to a single threshold and understates/misstates the AUC.
# predict_proba columns follow model.classes_ (sorted labels), so column 1 is
# the probability of the larger label, i.e. fraud == 1.0.
fraud_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, fraud_proba)
print("ROC AUC Score:", roc_auc)

# For comparison: KNN on the original dataset reached 99.87% accuracy, while the
# random forest on the undersampled dataset reached 99.99%.
# These numbers come from different models evaluated on different test sets, so
# they are not directly comparable. Potential overfitting is still worth checking.
