In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('ml_data/credit_card_fraud_dataset.csv')
df

Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35.462834,784.00,394,purchase,New York,0
3,4,2024-04-13 23:50:35.462850,3514.40,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0
...,...,...,...,...,...,...,...
99995,99996,2024-06-07 00:57:36.027591,1057.29,289,refund,San Antonio,0
99996,99997,2023-10-22 23:12:36.027594,297.25,745,refund,San Antonio,0
99997,99998,2024-05-31 19:27:36.027597,3448.56,690,purchase,San Antonio,0
99998,99999,2024-10-18 09:43:36.027601,3750.79,644,purchase,Philadelphia,0


In [21]:
from datetime import datetime

# List of federal holidays 2023 & 2024
hols_2023 = ["2023-01-02", "2023-01-16", "2023-05-29", "2023-06-19", "2023-07-04", "2023-09-04", "2023-11-10", "2023-11-23", "2023-12-25"]
hols_2024 = ["2024-01-01", "2024-01-15", "2024-05-27", "2024-06-19", "2024-07-04", "2024-09-02", "2024-11-11", "2024-11-24", "2024-12-25"]

for i in range(len(hols_2023)):
    hols_2023[i] = datetime.strptime(hols_2023[i], '%Y-%m-%d')

for i in range(len(hols_2024)):
    hols_2024[i] = datetime.strptime(hols_2024[i], '%Y-%m-%d')



In [4]:
# Hot encode Location
city_encoded = pd.get_dummies(df["Location"], dtype=int)
df = pd.merge(df, city_encoded, right_index=True, left_index=True, how='inner')

# Hot encode MerchantID
# Result -> Accuracy went down
#merchant_encoded = pd.get_dummies(df['MerchantID'], drop_first=True, dtype=int,prefix='MerchID', prefix_sep='_', )
#df = pd.merge(df, merchant_encoded, right_index=True, left_index=True, how='inner')

# Hot encode Year
df["Date"] = df["TransactionDate"].astype(str).str.split(" ").str[0]
df["Year"] = df["Date"].astype(str).str.split("-").str[0]
df['Year'] = np.where(df["Year"]==2024, 1, 0)

# Hot encode month
#df["Month"] = df["Date"].astype(str).str.split("-").str[1]
#month_encoded = pd.get_dummies(df["Month"], dtype=int, drop_first=True, prefix='Month', prefix_sep='_' )
#df = pd.merge(df, month_encoded, right_index=True, left_index=True, how='inner')
df['Date'] = pd.to_datetime(df['Date']) 
df['DayOfWeek'] = df['Date'].dt.day_name()
dayofweek = pd.get_dummies(df['DayOfWeek'], dtype=int, prefix='WD', prefix_sep='_' ) 
df = pd.merge(df, dayofweek, right_index=True, left_index=True, how='inner')
# df["Day"] = df["Date"].astype(str).str.split("-").str[2]


# Change TransactionType to binary
df['TransactionType'] = np.where(df["TransactionType"]=='refund', 1, 0)

df

Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud,Chicago,Dallas,Houston,...,Date,Year,DayOfWeek,WD_Friday,WD_Monday,WD_Saturday,WD_Sunday,WD_Thursday,WD_Tuesday,WD_Wednesday
0,1,2024-04-03 14:15:35.462794,4189.27,688,1,San Antonio,0,0,0,0,...,2024-04-03,0,Wednesday,0,0,0,0,0,0,1
1,2,2024-03-19 13:20:35.462824,2659.71,109,1,Dallas,0,0,1,0,...,2024-03-19,0,Tuesday,0,0,0,0,0,1,0
2,3,2024-01-08 10:08:35.462834,784.00,394,0,New York,0,0,0,0,...,2024-01-08,0,Monday,0,1,0,0,0,0,0
3,4,2024-04-13 23:50:35.462850,3514.40,944,0,Philadelphia,0,0,0,0,...,2024-04-13,0,Saturday,0,0,1,0,0,0,0
4,5,2024-07-12 18:51:35.462858,369.07,475,0,Phoenix,0,0,0,0,...,2024-07-12,0,Friday,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,2024-06-07 00:57:36.027591,1057.29,289,1,San Antonio,0,0,0,0,...,2024-06-07,0,Friday,1,0,0,0,0,0,0
99996,99997,2023-10-22 23:12:36.027594,297.25,745,1,San Antonio,0,0,0,0,...,2023-10-22,0,Sunday,0,0,0,1,0,0,0
99997,99998,2024-05-31 19:27:36.027597,3448.56,690,0,San Antonio,0,0,0,0,...,2024-05-31,0,Friday,1,0,0,0,0,0,0
99998,99999,2024-10-18 09:43:36.027601,3750.79,644,0,Philadelphia,0,0,0,0,...,2024-10-18,0,Friday,1,0,0,0,0,0,0


In [22]:
# Convert to NumPy array
public_holidays = np.array(hols_2023 + hols_2024, dtype='datetime64[D]')

In [25]:
# Check if each date is a business day
df['is_business_day'] = np.is_busday(df['Date'].values.astype('datetime64[D]'), holidays=public_holidays)

df['is_business_day'] = np.where(df['is_business_day']==True, 1, 0)

df


Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud,Chicago,Dallas,Houston,...,DayOfWeek,WD_Friday,WD_Monday,WD_Saturday,WD_Sunday,WD_Thursday,WD_Tuesday,WD_Wednesday,busday,is_business_day
0,1,2024-04-03 14:15:35.462794,4189.27,688,1,San Antonio,0,0,0,0,...,Wednesday,0,0,0,0,0,0,1,<attribute 'weekmask' of 'numpy.busdaycalendar...,1
1,2,2024-03-19 13:20:35.462824,2659.71,109,1,Dallas,0,0,1,0,...,Tuesday,0,0,0,0,0,1,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,1
2,3,2024-01-08 10:08:35.462834,784.00,394,0,New York,0,0,0,0,...,Monday,0,1,0,0,0,0,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,1
3,4,2024-04-13 23:50:35.462850,3514.40,944,0,Philadelphia,0,0,0,0,...,Saturday,0,0,1,0,0,0,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,0
4,5,2024-07-12 18:51:35.462858,369.07,475,0,Phoenix,0,0,0,0,...,Friday,1,0,0,0,0,0,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,2024-06-07 00:57:36.027591,1057.29,289,1,San Antonio,0,0,0,0,...,Friday,1,0,0,0,0,0,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,1
99996,99997,2023-10-22 23:12:36.027594,297.25,745,1,San Antonio,0,0,0,0,...,Sunday,0,0,0,1,0,0,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,0
99997,99998,2024-05-31 19:27:36.027597,3448.56,690,0,San Antonio,0,0,0,0,...,Friday,1,0,0,0,0,0,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,1
99998,99999,2024-10-18 09:43:36.027601,3750.79,644,0,Philadelphia,0,0,0,0,...,Friday,1,0,0,0,0,0,0,<attribute 'weekmask' of 'numpy.busdaycalendar...,1


In [26]:
#

df = df.drop(columns=['TransactionID','TransactionDate', 'MerchantID', 'Location', 'busday','Date', 'DayOfWeek'])
df

Unnamed: 0,Amount,TransactionType,IsFraud,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,...,San Jose,Year,WD_Friday,WD_Monday,WD_Saturday,WD_Sunday,WD_Thursday,WD_Tuesday,WD_Wednesday,is_business_day
0,4189.27,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,2659.71,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,784.00,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
3,3514.40,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,369.07,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1057.29,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
99996,297.25,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99997,3448.56,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
99998,3750.79,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [27]:
# Test train split
X_train, X_test, y_train, y_test = train_test_split(df.drop('IsFraud', axis=1),df['IsFraud'], test_size=0.25, random_state=42)

In [41]:
pen_var = 'l2'
tol_var = 1e-4
dual_var = False
c_var = 1.0
fit_intercept_var = False
intercept_scaling_var = 1
class_weight_var = 'balanced'
random_state_var = None
sol_var = 'saga'
max_iter_var = 10000
multi_class_var = 'auto'
verbose_var = 0
warm_start_var = False
n_jobs_var = None
l1_ratio_var = None


# Train the model
LogReg = LogisticRegression(penalty = pen_var, 
                            tol = tol_var,
                            dual=dual_var,
                            C = c_var,
                            fit_intercept = fit_intercept_var, 
                            intercept_scaling = intercept_scaling_var,
                            class_weight = class_weight_var,
                            random_state= random_state_var,
                            solver=sol_var,
                            max_iter=max_iter_var,
                            multi_class= multi_class_var,
                            verbose=verbose_var,
                            warm_start=warm_start_var,
                            n_jobs=n_jobs_var,
                            l1_ratio=l1_ratio_var
                             )
LogReg.fit(X_train, y_train)



In [42]:
# Scoring
train_score = LogReg.score(X_train, y_train)
print(f"Training Accuracy: {round(train_score*100)}%")
test_score = LogReg.score(X_test, y_test)
print(f"Testing Accuracy: {round(test_score*100)}%")

Training Accuracy: 83%
Testing Accuracy: 83%


In [31]:
# Predictions for X_test
y_pred = LogReg.predict(X_test)

In [32]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[18204  6535]
 [  192    69]]


In [43]:
#LogReg.decision_function(X_test)

In [44]:
var_list = [pen_var, tol_var, dual_var, c_var, fit_intercept_var, intercept_scaling_var, class_weight_var, random_state_var, sol_var, max_iter_var, multi_class_var, verbose_var, warm_start_var, n_jobs_var, l1_ratio_var, test_score]

def rec_findings(var_list):
    output=''

    for i in range(len(var_list)):
        
        if i != (len(var_list)-1):
            output = output + str(var_list[i]) + ','

        else:
            output = output + str(var_list[i])
        
    return output

rec_findings(var_list)

with open('output.txt', 'ab') as f:
    output = rec_findings(var_list)
    f.write((output + '\n').encode('utf-8'))
    f.close()