In [None]:
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import statsmodels.api as sm
from ISLP import confusion_table
from ISLP.models import (ModelSpec as MS,
                         summarize)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Load the card-transaction data set and preview its first rows.
transactions_csv = "../data/card_transdata.csv"
df = pd.read_csv(transactions_csv)
print(df.head())

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots()
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Plot the distribution of every variable as a grid of histograms.
# `plt` is already imported in the notebook's first cell, so it is not re-imported here.
df.hist(bins=20, figsize=(10, 8))
# Re-flow the grid so each subplot title does not collide with its neighbours' tick labels.
plt.tight_layout()
plt.show()

In [None]:
# Multiple linear regression of `fraud` on the columns that seem to be
# correlated with it; ModelSpec (MS) builds the design matrix from the list.
fraud_predictors = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
    'repeat_retailer',
    'used_chip',
    'used_pin_number',
    'online_order',
]
X = MS(fraud_predictors).fit_transform(df)
y = df['fraud']

model1 = sm.OLS(y, X)
results1 = model1.fit()
# Last expression of the cell: the coefficient table is rendered as the output.
summarize(results1)

In [None]:
# OLS with an interaction term to test the combined effect of two distances.
# In formula syntax `a * b` expands to a + b + a:b (both main effects plus their product).
two_way_formula = 'fraud ~ distance_from_home * distance_from_last_transaction'
model = sm.OLS.from_formula(formula=two_way_formula, data=df)
result = model.fit()

# Show the full regression summary
print(result.summary())

In [None]:
# OLS with interactions among four variables. Chaining `*` across four terms
# expands to every main effect and every interaction up to the four-way product.
four_way_formula = 'fraud ~ distance_from_last_transaction * ratio_to_median_purchase_price* distance_from_home* used_chip'
model = sm.OLS.from_formula(formula=four_way_formula, data=df)
result = model.fit()

# Show the full regression summary
print(result.summary())

In [None]:
# Build standardized train/test sets for the classifiers below.
#
# Two fixes relative to a naive "scale everything, then split":
#   * predictors are selected by dropping the target by name, so the result
#     does not depend on `fraud` happening to be the last column;
#   * the scaler is fitted on the training rows only, so the mean/std used
#     for scaling carry no information from the held-out test rows.

# Select predictors (every column except the response)
predictors = df.drop(columns='fraud')

# Create a random vector of True (train, ~75%) and False (test, ~25%) values
np.random.seed(4)
split = np.random.choice([True, False], size=len(predictors), replace=True, p=[0.75, 0.25])

# Learn the scaling parameters from the training rows only ...
scaler = StandardScaler()
scaler.fit(predictors[split])

# ... then apply that same transformation to every row. The original index is
# kept so the frame stays aligned with `df` for the label look-ups below.
predictors_standardized = pd.DataFrame(
    scaler.transform(predictors),
    columns=predictors.columns,
    index=predictors.index,
)

# Display the head of the standardized predictors
print(predictors_standardized.head())

# Define the training set for X (predictors)
training_X = predictors_standardized[split]

# Define the training set for Y (response)
training_Y = df.loc[split, 'fraud']

# Define the testing set for X (predictors)
testing_X = predictors_standardized[~split]

# Define the testing set for Y (response)
testing_Y = df.loc[~split, 'fraud']

In [None]:
# Fit a 3-nearest-neighbours classifier on the training split, predict the
# held-out split, and tabulate predicted vs. true labels.
knn = KNeighborsClassifier(n_neighbors=3)
knn_fit = knn.fit(training_X, training_Y)  # fit() returns the estimator itself
knn_pred = knn_fit.predict(testing_X)
confusion_table(knn_pred, testing_Y)

In [None]:
# Prediction accuracy: share of test rows whose predicted label equals the true label.
prediction_accuracy = accuracy_score(testing_Y, knn.predict(testing_X))
print(prediction_accuracy)

## Working with imbalanced data

We can see that KNN was really effective; however, we still fail to detect 204 of 21,906 cases of fraud, almost 1 in 100.

Can we get closer to understanding the predictors of credit card fraud by balancing out the data through subsampling? Subsampling will repeat the analysis using a dataframe that is 50/50 fraud and not-fraud.

This will help us enhance the fraud signal and create a more proactive fraud catching model. It will also help clarify the direction and relative magnitude of correlations between fraud and the predictors.

In [None]:
# Build a balanced (50/50) subsample: every fraud row plus an equally sized,
# uniformly random draw of non-fraud rows.
s = 5  # seed; change it to draw a different set of non-fraud rows
np.random.seed(s)

df_fraud = df.loc[df['fraud'] == 1]
df_full_notfraud = df.loc[df['fraud'] == 0]

# Pick exactly len(df_fraud) distinct non-fraud rows uniformly at random.
# (Slicing the first N rows of a random 50% mask would only ever keep rows
# from the top of the file, which is not a uniform sample.)
# np.random.choice raises ValueError if there are fewer non-frauds than frauds.
chosen_positions = np.random.choice(len(df_full_notfraud), size=len(df_fraud), replace=False)
undersplit = np.zeros(len(df_full_notfraud), dtype=bool)
undersplit[chosen_positions] = True
df_under_notfraud = df_full_notfraud[undersplit]

# Stack both classes, then shuffle so the classes are interleaved.
df_subsample_0 = pd.concat([df_under_notfraud, df_fraud])
df_subsample = df_subsample_0.sample(frac=1, random_state=s)

# Fail loudly if the subsample is not exactly balanced.
assert len(df_subsample) == 2 * len(df_fraud), "subsample is not balanced 50/50"

df_subsample.to_csv('subsample.csv', index=False)

# Last expression of the cell, so the preview is actually rendered.
df_subsample.head()

We now have the data frame `df_subsample` to work with. It contains all of the frauds from the original dataset as well as an equal number of randomly selected non-frauds. If we change the seed parameter `s`, an entirely different set of non-frauds can be drawn. Repeating the experiment across seeds varies the outcomes, which lets us use bootstrapping to estimate the standard error of our correlations and of our predicted accuracy.

In [None]:
# Compare correlations in the full data (top) with the balanced subsample (bottom).
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(24,20))

# Both panels share one fixed colour scale (-1 to 1) so that the same colour
# means the same correlation strength in each; otherwise every heatmap
# autoscales to its own min/max and the panels cannot be compared by eye.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax1)
ax1.set_title("Original Correlation Matrix", fontsize=14)

corr_subsample_matrix = df_subsample.corr()
sns.heatmap(corr_subsample_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax2)
ax2.set_title("Subsampled Correlation Matrix", fontsize=14)
plt.show()

Shown above are the correlation matrices for our original and subsampled dataframes. We see that the signal of our correlations has been enhanced in the subsample. We might expect that by fitting on this subsample and testing on the full sample we will reduce the false negatives (not-fraud labels applied to frauds) by a factor of 50. By the same logic, this validation should increase the false positives (fraud labels applied to non-frauds) 50-fold. We would then have a model that reduces undetected frauds from 1 missed fraud in 100 frauds to roughly 1 missed fraud in 4,000 frauds.