In [41]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
customer = pd.read_csv('/Users/charlotteli/Desktop/UTS/Year1 Sem2/36106 Machine Learning Algorithms and Applications - Autumn 2023/AT3/dataset/customers.csv', sep='|')

In [4]:
# Create an empty list to store the dataframes
dataframes = []

# Set the path to the directory containing the transaction datasets
directory_path = "/Users/charlotteli/Desktop/UTS/Year1 Sem2/36106 Machine Learning Algorithms and Applications - Autumn 2023/AT3/dataset/transactions/"


In [5]:
filename = os.listdir(directory_path)

i = 0
for trans in filename:
    if i == 0:
        filepath = os.path.join(directory_path, trans)
        df = pd.read_csv(filepath, delimiter='|')
        i += 1
    else:
        filepath = os.path.join(directory_path, trans)
        df = pd.concat([df, pd.read_csv(filepath, delimiter='|')])

In [6]:
df.shape

(4260904, 10)

In [7]:
df = pd.merge(df, customer, on='cc_num', how='left')

In [8]:
df.shape

(4260904, 24)

In [9]:
print(df['category'].unique())

['gas_transport' 'grocery_net' 'grocery_pos' 'personal_care'
 'health_fitness' 'food_dining' 'home' 'entertainment' 'shopping_net'
 'misc_net' 'misc_pos' 'kids_pets' 'shopping_pos' 'travel']


In [10]:
# Match the category as number
label_encoder = LabelEncoder()

# Fit the encoder on the column and transform the values
df['category_encoded'] = label_encoder.fit_transform(df['category'])
unique_encoded_values = df['category_encoded'].unique()
print(unique_encoded_values)

category_mapping = dict(zip(unique_encoded_values, label_encoder.classes_))
print(category_mapping)


[ 2  3  4 10  5  1  6  0 11  8  9  7 12 13]
{2: 'entertainment', 3: 'food_dining', 4: 'gas_transport', 10: 'grocery_net', 5: 'grocery_pos', 1: 'health_fitness', 6: 'home', 0: 'kids_pets', 11: 'misc_net', 8: 'misc_pos', 9: 'personal_care', 7: 'shopping_net', 12: 'shopping_pos', 13: 'travel'}


## Data Cleaning

In [11]:
# Check if there is any duplicate
assert df.duplicated().sum() == 0

In [12]:
# Check Null Value
total = df.isnull().sum().sort_values(ascending=False)
total

cc_num              0
gender              0
acct_num_y          0
dob                 0
job                 0
city_pop            0
long                0
lat                 0
zip                 0
state               0
city                0
street              0
last                0
acct_num_x          0
first               0
ssn                 0
merch_long          0
merch_lat           0
merchant            0
is_fraud            0
amt                 0
category            0
unix_time           0
trans_num           0
category_encoded    0
dtype: int64

In [13]:
df['is_fraud'] = df['is_fraud'].astype(int)

In [14]:
df['gender2'] = df['gender'].apply(lambda x: 0 if x == 'F' else 1)

In [15]:
df['dob'] = pd.to_datetime(df['dob'])
today = pd.to_datetime('today')
print(today)
age = today - df['dob']
df['age'] = np.round(age.dt.days / 365.25)

2023-05-24 15:14:00.868350


In [16]:
print(df.head())

                cc_num    acct_num_x                         trans_num   
0  4896331812335761701  149852234418  f3ec0819590302134f03ffdc2f44697f  \
1  4896331812335761701  149852234418  c1607c993e41f2c3b42d72d1506bef7b   
2  4896331812335761701  149852234418  6f530db25d20fe351249a54491fd3fde   
3  4896331812335761701  149852234418  6d11805f2acd938fec99376001afafe8   
4  4896331812335761701  149852234418  605342f297c575cb1ccf2c08cad082ee   

    unix_time       category    amt  is_fraud                     merchant   
0  1646060228  gas_transport  65.17         0       Larson, Ryan and Huang  \
1  1644848624  gas_transport  47.58         0                   Myers-Reed   
2  1645632153  gas_transport  64.43         0                Baker-Bullock   
3  1645311286  gas_transport  82.47         0                 Spencer-Hall   
4  1641571926  gas_transport  50.28         0  King, Rodriguez and Hancock   

   merch_lat  merch_long  ...    zip      lat     long city_pop   
0  38.143430  -90.3

In [17]:
fraud_distribution = df['is_fraud'].value_counts()
percentage_distribution = fraud_distribution / len(df) * 100
print(fraud_distribution)
print(percentage_distribution)

is_fraud
0    4255870
1       5034
Name: count, dtype: int64
is_fraud
0    99.881856
1     0.118144
Name: count, dtype: float64


## Split the dataset

In [18]:
x1 = df[['is_fraud','amt', 'age', 'gender2','zip','category_encoded','merch_lat','merch_long' ]]
y = x1.pop('is_fraud')
print(x1.shape)
print(y.shape)

(4260904, 7)
(4260904,)


In [19]:
x, x_test, y, y_test = train_test_split(x1, y, test_size=0.1, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

In [20]:
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(3451331, 7)
(383482, 7)
(426091, 7)
(3451331,)
(383482,)
(426091,)


## Build logistic regression model

In [46]:
log_reg = LogisticRegression()

In [47]:
log_reg.fit(x_train, y_train)
train_preds = log_reg.predict(x_train)
val_preds = log_reg.predict(x_val)

In [48]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {}\n".format(accuracy_score(y_train, train_preds), accuracy_score(y_val, val_preds))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {}\n".format(f1_score(y_train, train_preds), f1_score(y_val, val_preds))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {}\n".format(recall_score(y_train, train_preds), recall_score(y_val, val_preds))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {}".format(precision_score(y_train, train_preds), recall_score(y_val, val_preds))) 

Accuracy score: 

Training set : 0.9987775730580463 
Validation Set : 0.9987352731027793

f1 score: 

Training set : 0.0 
Validation Set : 0.0

Recall score: 

Training set : 0.0 
Validation Set : 0.0

Precision score: 

Training set : 0.0 
Validation Set : 0.0


In [59]:
l2_model = LogisticRegression(penalty='l2', solver='saga').fit(x_train, y_train)
train_preds_l2 = l2_model.predict(x_train)
val_preds_l2 = l2_model.predict(x_val)



In [60]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {}\n".format(accuracy_score(y_train, train_preds_l2), accuracy_score(y_val, val_preds_l2))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {}\n".format(f1_score(y_train, train_preds_l2), f1_score(y_val, val_preds_l2))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {}\n".format(recall_score(y_train, train_preds_l2), recall_score(y_val, val_preds_l2))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {}\n".format(precision_score(y_train, train_preds_l2), recall_score(y_val, val_preds_l2))) 


Accuracy score: 

Training set : 0.9987958268853379 
Validation Set : 0.9987717806833176

f1 score: 

Training set : 0.0 
Validation Set : 0.0

Recall score: 

Training set : 0.0 
Validation Set : 0.0

Precision score: 

Training set : 0.0 
Validation Set : 0.0



In [61]:
log_reg_l1 = LogisticRegression(penalty='l1', solver='saga').fit(x_train, y_train)



In [64]:
y_train_preds_elastic = log_reg_l1.predict(x_train)
y_val_preds_elastic = log_reg_l1.predict(x_val)

In [65]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {}\n".format(accuracy_score(y_train, y_train_preds_elastic), accuracy_score(y_val, y_val_preds_elastic))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {}\n".format(f1_score(y_train, y_train_preds_elastic), f1_score(y_val, y_val_preds_elastic))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {}\n".format(recall_score(y_train, y_train_preds_elastic), recall_score(y_val, y_val_preds_elastic))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {}\n".format(precision_score(y_train, y_train_preds_elastic), recall_score(y_val, y_val_preds_elastic))) 


Accuracy score: 

Training set : 0.9987958268853379 
Validation Set : 0.9987717806833176

f1 score: 

Training set : 0.0 
Validation Set : 0.0

Recall score: 

Training set : 0.0 
Validation Set : 0.0

Precision score: 

Training set : 0.0 
Validation Set : 0.0



## Build Decision Tree model

In [21]:
tree_1 = DecisionTreeClassifier(random_state=8).fit(x_train, y_train)

In [22]:
y_train_preds = tree_1.predict(x_train)
y_val_preds = tree_1.predict(x_val)

In [23]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {} \n".format(accuracy_score(y_train, y_train_preds), accuracy_score(y_val, y_val_preds))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {} \n".format(f1_score(y_train, y_train_preds),f1_score(y_val, y_val_preds))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {} \n".format(recall_score(y_train, y_train_preds), recall_score(y_val, y_val_preds))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {} \n".format(precision_score(y_train, y_train_preds), precision_score(y_val, y_val_preds))) 

# The difference in performance between the training and validation sets is huge, indicating that the model is overfitting.

Accuracy score: 

Training set : 1.0 
Validation Set : 0.9980859597060618 

f1 score: 

Training set : 1.0 
Validation Set : 0.25858585858585864 

Recall score: 

Training set : 1.0 
Validation Set : 0.27645788336933047 

Precision score: 

Training set : 1.0 
Validation Set : 0.2428842504743833 



### Reduce Overfitting with min_samples_split

In [24]:
# min_samples_split=20
tree_2 = DecisionTreeClassifier(random_state=8, min_samples_split=20).fit(x_train, y_train)
y_train_preds2 = tree_2.predict(x_train)
y_val_preds2 = tree_2.predict(x_val)

In [25]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {} \n".format(accuracy_score(y_train, y_train_preds2), accuracy_score(y_val, y_val_preds2))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {} \n".format(f1_score(y_train, y_train_preds2),f1_score(y_val, y_val_preds2))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {} \n".format(recall_score(y_train, y_train_preds2), recall_score(y_val, y_val_preds2))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {} \n".format(precision_score(y_train, y_train_preds2), precision_score(y_val, y_val_preds2))) 

Accuracy score: 

Training set : 0.9993683596270541 
Validation Set : 0.9985240506725218 

f1 score: 

Training set : 0.6836912362159026 
Validation Set : 0.3046683046683047 

Recall score: 

Training set : 0.5756169069142438 
Validation Set : 0.2678185745140389 

Precision score: 

Training set : 0.8417291889960701 
Validation Set : 0.35327635327635326 



In [26]:
# min_samples_split=30
tree_3 = DecisionTreeClassifier(random_state=8, min_samples_split=30).fit(x_train, y_train)
y_train_preds3 = tree_3.predict(x_train)
y_val_preds3 = tree_3.predict(x_val)

In [27]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {} \n".format(accuracy_score(y_train, y_train_preds3), accuracy_score(y_val, y_val_preds3))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {} \n".format(f1_score(y_train, y_train_preds3),f1_score(y_val, y_val_preds3))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {} \n".format(recall_score(y_train, y_train_preds3), recall_score(y_val, y_val_preds3))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {} \n".format(precision_score(y_train, y_train_preds3), precision_score(y_val, y_val_preds3))) 

# The difference in performance between the training and validation sets is large, indicating that the model still is overfitting.

Accuracy score: 

Training set : 0.9992770904905962 
Validation Set : 0.9986205349925159 

f1 score: 

Training set : 0.6233962264150943 
Validation Set : 0.31741935483870964 

Recall score: 

Training set : 0.5045199120449548 
Validation Set : 0.265658747300216 

Precision score: 

Training set : 0.8155608214849921 
Validation Set : 0.3942307692307692 



# Reduce Overfitting with max_depth

In [28]:
tree_3.tree_.max_depth

36

In [29]:
# min_samples_split=30 and max_depth=10
tree_depth = DecisionTreeClassifier(random_state=8, min_samples_split=30, max_depth=10).fit(x_train, y_train)
y_train_preds4 = tree_depth.predict(x_train)
y_val_preds4 = tree_depth.predict(x_val)

In [30]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {} \n".format(accuracy_score(y_train, y_train_preds4), accuracy_score(y_val, y_val_preds4))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {} \n".format(f1_score(y_train, y_train_preds4),f1_score(y_val, y_val_preds4))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {} \n".format(recall_score(y_train, y_train_preds4), recall_score(y_val, y_val_preds4))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {} \n".format(precision_score(y_train, y_train_preds4), precision_score(y_val, y_val_preds4))) 

Accuracy score: 

Training set : 0.9989916933496091 
Validation Set : 0.9988161113168285 

f1 score: 

Training set : 0.39075630252100835 
Validation Set : 0.30153846153846153 

Recall score: 

Training set : 0.27266064011727337 
Validation Set : 0.21166306695464362 

Precision score: 

Training set : 0.6893143915997529 
Validation Set : 0.5240641711229946 



In [31]:
# min_samples_split=15 and max_depth=20
tree_depth2 = DecisionTreeClassifier(random_state=8, min_samples_split=15, max_depth=20).fit(x_train, y_train)
y_train_preds5 = tree_depth.predict(x_train)
y_val_preds5 = tree_depth.predict(x_val)

In [32]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {} \n".format(accuracy_score(y_train, y_train_preds5), accuracy_score(y_val, y_val_preds5))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {} \n".format(f1_score(y_train, y_train_preds5),f1_score(y_val, y_val_preds5))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {} \n".format(recall_score(y_train, y_train_preds5), recall_score(y_val, y_val_preds5))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {} \n".format(precision_score(y_train, y_train_preds5), precision_score(y_val, y_val_preds5))) 

# The result is the same as the previous model with max_depth=10

Accuracy score: 

Training set : 0.9989916933496091 
Validation Set : 0.9988161113168285 

f1 score: 

Training set : 0.39075630252100835 
Validation Set : 0.30153846153846153 

Recall score: 

Training set : 0.27266064011727337 
Validation Set : 0.21166306695464362 

Precision score: 

Training set : 0.6893143915997529 
Validation Set : 0.5240641711229946 



## Assess Decision Tree model on Testing Set

In [33]:
y_test_preds = tree_depth2.predict(x_test)

In [34]:
print("Test set score: \n")
print("Accuracy  score : {} \nf1 score : {} \nRecall score : {} \nPrecision score : {} \n".format(accuracy_score(y_test, y_test_preds), f1_score(y_test, y_test_preds), recall_score(y_test, y_test_preds), precision_score(y_test, y_test_preds))) 

Test set score: 

Accuracy  score : 0.9985988908472603 
f1 score : 0.2546816479400749 
Recall score : 0.21338912133891214 
Precision score : 0.3157894736842105 



## Build a binary tree model with randomforest

In [35]:
rf = RandomForestClassifier(random_state=8,min_samples_split=30,max_depth=20).fit(x_train, y_train)

In [36]:
y_train_rfpreds = rf.predict(x_train)
y_val_rfpreds = rf.predict(x_val)

In [37]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {} \n".format(accuracy_score(y_train, y_train_rfpreds), accuracy_score(y_val, y_val_rfpreds))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {} \n".format(f1_score(y_train, y_train_rfpreds),f1_score(y_val, y_val_rfpreds))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {} \n".format(recall_score(y_train, y_train_rfpreds), recall_score(y_val, y_val_rfpreds))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {} \n".format(precision_score(y_train, y_train_rfpreds), precision_score(y_val, y_val_rfpreds))) 

Accuracy score: 

Training set : 0.9990794855665829 
Validation Set : 0.9989152033211468 

f1 score: 

Training set : 0.3891559315516247 
Validation Set : 0.26501766784452296 

Recall score: 

Training set : 0.2472514048375275 
Validation Set : 0.16198704103671707 

Precision score: 

Training set : 0.9133574007220217 
Validation Set : 0.7281553398058253 



In [38]:
rf2 = RandomForestClassifier(random_state=8, min_samples_split=30,max_depth=20, n_estimators=50).fit(x_train, y_train)
y_train_preds2 = rf2.predict(x_train)
y_val_preds2 = rf2.predict(x_val)

In [39]:
print("Accuracy score: \n")
print("Training set : {} \nValidation Set : {} \n".format(accuracy_score(y_train, y_train_preds2), accuracy_score(y_val, y_val_preds2))) 

print("f1 score: \n")
print("Training set : {} \nValidation Set : {} \n".format(f1_score(y_train, y_train_preds2),f1_score(y_val, y_val_preds2))) 

print("Recall score: \n")
print("Training set : {} \nValidation Set : {} \n".format(recall_score(y_train, y_train_preds2), recall_score(y_val, y_val_preds2))) 

print("Precision score: \n")
print("Training set : {} \nValidation Set : {} \n".format(precision_score(y_train, y_train_preds2), precision_score(y_val, y_val_preds2))) 

Accuracy score: 

Training set : 0.9990783265934214 
Validation Set : 0.9989099879524984 

f1 score: 

Training set : 0.3911961722488038 
Validation Set : 0.2535714285714285 

Recall score: 

Training set : 0.24969460053750306 
Validation Set : 0.15334773218142547 

Precision score: 

Training set : 0.9028268551236749 
Validation Set : 0.7319587628865979 



## Assess model on Testing Set

In [66]:
y_test_preds2 = rf.predict(x_test)

In [67]:
print("Test set score: \n")
print("Accuracy  score : {} \nf1 score : {} \nRecall score : {} \nPrecision score : {} \n".format(accuracy_score(y_test, y_test_preds2), f1_score(y_test, y_test_preds2), recall_score(y_test, y_test_preds2), precision_score(y_test, y_test_preds2))) 

Test set score: 

Accuracy  score : 0.9989650098218456 
f1 score : 0.2276707530647986 
Recall score : 0.13598326359832635 
Precision score : 0.6989247311827957 

