In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn.datasets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

## 1) Importing datasets
The datasets imported here are the output of the data cleaning and transformation (Appendix B) and sampling (Appendix C) steps.

In [2]:
# Directory holding the files exported after sampling
csv_dir = "./data/csv_balanced/"

# Creating list with names of files exported after sampling.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the position of each dataframe in list_df reproducible between runs
# (later cells address dataframes by position).
csv_files = sorted(file for file in os.listdir(csv_dir) if file.endswith('.csv'))

# Import each file into a dataframe, in the same order as csv_files
list_df = [pd.read_csv(os.path.join(csv_dir, csv_file)) for csv_file in csv_files]

# 2) Standardising features


In [3]:
# Standardised copies of the imported dataframes (same order as list_df)
list_std = []

for df_raw in list_df:
    # Every column except the target is standardised
    feature_cols = df_raw.columns.difference(['fraud_bool'])
    # Work on a copy so the frames in list_df keep their original values
    # and re-running this cell always starts from the raw data
    df = df_raw.copy()
    # A fresh scaler is fitted for each dataframe
    df[feature_cols] = StandardScaler().fit_transform(df[feature_cols])
    list_std.append(df)

# 3) Correlation analysis 

In [None]:
# Correlation heatmap for a single standardised dataframe, selected by its position in list_std
def plot_heatmap(dataframe_number):
    """Draw an annotated correlation heatmap for ``list_std[dataframe_number]``.

    The pairwise correlations of all columns are shown on a 15x15 inch
    figure, each cell annotated with its value rounded to one decimal.
    The figure is displayed; nothing is returned.
    """
    _, axis = plt.subplots(figsize=(15, 15))
    correlations = list_std[dataframe_number].corr()
    sns.heatmap(correlations, ax=axis, annot=True, fmt='.1f', linewidths=0.5)
    axis.set_title('Correlation heatmap', fontdict={'fontsize': 20}, pad=16)
    plt.tight_layout()
    plt.show()

# Show the heatmap for the dataframe at position 2
plot_heatmap(2)

In [None]:
# Identifying pairs with high correlation

def get_highcorr(dataframe_number, top_n=10):
    """Print the most strongly correlated column pairs of one dataframe.

    Parameters
    ----------
    dataframe_number : int
        Position of the dataframe in ``list_std``.
    top_n : int, default 10
        Number of pairs to print.

    Absolute correlations are used, so strong negative correlations rank
    as high as strong positive ones. Each pair is listed once and the
    correlation of a column with itself is excluded. Nothing is returned.
    """
    corr = list_std[dataframe_number].corr().abs()
    # Keep only the strict upper triangle: this removes the diagonal and the
    # mirrored (b, a) duplicate of every (a, b) pair, while still reporting
    # perfectly correlated pairs
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    # Masked-out cells (and undefined correlations) are NaN and dropped here
    pairs = corr.where(upper_mask).unstack().dropna()
    print(pairs.sort_values(ascending=False).head(top_n))

get_highcorr(2)

In [None]:
# VIF (variance inflation factor) analysis

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Use the last standardised dataframe explicitly instead of relying on a
# leftover loop variable, and exclude the target: VIF measures collinearity
# among the regressors only.
# Add a constant term to the data
df_with_const = add_constant(list_std[-1].drop(columns=['fraud_bool']))

# Calculate VIF for each variable (the array is extracted once, not per column)
exog_values = df_with_const.values
vif_data = pd.DataFrame({
    "Variable": df_with_const.columns,
    "VIF": [variance_inflation_factor(exog_values, i) for i in range(exog_values.shape[1])],
})

print(vif_data)

# 4) Logistic regression with backward elimination

In [None]:
# Optional: drop the columns reported as inf (presumably an infinite VIF in the analysis above) before fitting
# cols_to_drop = ['INTERNET', 'TELEAPP', 'linux', 'other', 'windows', 'x11', 'macintosh', 'velocity_4w', 'month']
# list_std = [df.drop(columns=cols_to_drop) for df in list_std]

In [None]:
# Creating function specifying the model
def backward_elimination(X, y, sig_lvl=0.05):
    """Fit a logit model by backward elimination on the p-values.

    The model is refitted after every removal. At each step a regressor
    whose p-value is NaN is removed first; otherwise the regressor with the
    largest p-value is removed as long as that p-value exceeds ``sig_lvl``.
    Every column of ``X`` is a candidate for removal, including a constant
    column if one is present.

    Parameters
    ----------
    X : pandas.DataFrame
        Regressors; cast to float for fitting.
    y : pandas.Series
        Response; cast to float for fitting.
    sig_lvl : float, default 0.05
        Significance level above which a regressor is removed.

    Returns
    -------
    tuple
        ``(results, X, y)`` where ``results`` is the final fit, ``X`` holds
        exactly the columns that fit was estimated on, and ``y`` is the
        response as passed in.

    Raises
    ------
    ValueError
        If ``X`` has no columns.
    """
    if X.shape[1] == 0:
        raise ValueError("X must contain at least one column")

    while True:
        regressor_Logit = sm.Logit(y.astype(float), X.astype(float)).fit()
        pvalues = regressor_Logit.pvalues

        # NaN p-values are handled explicitly: the builtin max() is
        # order-dependent when NaN is present and could leave them in the model
        missing = pvalues.isna().to_numpy()
        if missing.any():
            drop_pos = int(missing.argmax())  # first regressor with a NaN p-value
        else:
            drop_pos = int(pvalues.to_numpy().argmax())
            # Positional access via iloc: pvalues is indexed by column name
            if pvalues.iloc[drop_pos] <= sig_lvl:
                # Every remaining regressor is significant: stop refitting
                break

        if X.shape[1] == 1:
            # Removing the last regressor would leave the returned model
            # out of sync with the returned X, so it is kept
            print(f"keeping {X.columns[drop_pos]}: last remaining variable")
            break

        print(f"dropping {X.columns[drop_pos]}")
        X = X.drop(X.columns[drop_pos], axis=1)

    return (regressor_Logit, X, y)

# Wrapper that prepares one dataframe and fits the model on it
def run_model(dataframe):
    """Run backward elimination on one dataframe.

    The ``fraud_bool`` column is used as the response and all other
    columns, plus an added constant column for the intercept, as the
    candidate regressors. Returns whatever ``backward_elimination``
    returns for these inputs, using its default significance level.
    """
    target = dataframe['fraud_bool']
    # The intercept is supplied as an explicit constant column
    features = sm.add_constant(dataframe.drop('fraud_bool', axis=1))
    return backward_elimination(features, target)

In [None]:
# Fit one model per standardised dataframe; each entry of list_models is
# the value returned by run_model for the dataframe at the same position
list_models = list(map(run_model, list_std))

In [None]:
# Printing regression results (first table of each model summary).
# A plain loop is used so the cell does not also output a list of None values.
for model_tuple in list_models:
    print(model_tuple[0].summary().tables[0])

In [None]:
# Printing features' coefficients (second table of each model summary).
# A plain loop is used so the cell does not also output a list of None values.
for model_tuple in list_models:
    print(model_tuple[0].summary().tables[1])

In [None]:
accuracies = []
cross_tabs = []
for logit_model, X, y in list_models:
    # Each model is scored against the X and y it was fitted on. Predictions
    # are kept in local objects instead of being written into list_std, so
    # the feature dataframes are not changed by this cell.
    probability = np.asarray(logit_model.predict(X))
    yhat = pd.Series((probability > 0.5).astype(int), index=y.index, name='yhat')

    # Rows: actual fraud_bool, columns: predicted class.
    # Both classes are forced to be present so the table is always 2x2,
    # even when a model predicts a single class (assumes 0/1 coding of fraud_bool).
    ct = pd.crosstab(y, yhat).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    cross_tabs.append(ct)
    print(ct)

    # Find accuracy of the model using formula ACC=(TP+TN)/N
    acc = (ct.loc[0, 0] + ct.loc[1, 1]) / len(y)
    accuracies.append(acc)
    print('Accuracy of the model is', acc, '\n')


In [None]:
# Pool the confusion tables of all models (rows: actual fraud_bool, columns: yhat).
# Each table is aligned to a full 2x2 layout first so a missing class counts as 0.
pooled = sum(ct.reindex(index=[0, 1], columns=[0, 1], fill_value=0) for ct in cross_tabs)

# Row 1 (fraud_bool == 1) is taken to be the fraud class and row 0 the
# non-fraud class; each rate is the share of that row predicted correctly
fraud_rate = pooled.loc[1, 1] / pooled.loc[1].sum()
non_fraud_rate = pooled.loc[0, 0] / pooled.loc[0].sum()

print(f"Average accuracy for Fraud is {fraud_rate}")
print(f"Average accuracy for non-Fraud is {non_fraud_rate}")
print(f"Average accuracy is {sum(accuracies)/len(accuracies)}")