In [17]:
#imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [18]:
df = pd.read_csv('synthetic_fraud_dataset.csv')

In [22]:
df

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.8400,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.20,Tablet,New York,Clothing,0,...,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,TXN_11284,USER_4796,45.05,Online,2023-01-29 18:38:00,76960.11,Mobile,Tokyo,Clothing,0,...,2,389.00,3,Amex,98,1537.54,PIN,0.1493,1,0
49996,TXN_44732,USER_1171,126.15,POS,2023-05-09 08:55:00,28791.75,Mobile,Tokyo,Clothing,0,...,13,434.95,4,Visa,93,2555.72,Biometric,0.3653,0,1
49997,TXN_38158,USER_2510,72.02,Online,2023-01-30 19:32:00,29916.41,Laptop,Mumbai,Clothing,0,...,1,369.15,2,Visa,114,4686.59,Biometric,0.5195,0,0
49998,TXN_860,USER_2248,64.89,Bank Transfer,2023-03-09 19:47:00,67895.67,Mobile,Tokyo,Electronics,0,...,13,242.29,4,Discover,72,4886.92,Biometric,0.7063,0,1


In [23]:
df.columns

Index(['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type',
       'Timestamp', 'Account_Balance', 'Device_Type', 'Location',
       'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity',
       'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d',
       'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age',
       'Transaction_Distance', 'Authentication_Method', 'Risk_Score',
       'Is_Weekend', 'Fraud_Label'],
      dtype='object')

In [24]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

df['hour'] = df['Timestamp'].dt.hour
df['day'] = df['Timestamp'].dt.day
df['month'] = df['Timestamp'].dt.month
df['dayofweek'] = df['Timestamp'].dt.dayofweek

df = df.sort_values(['User_ID', 'Timestamp'])

In [25]:
# Transactions per user
user_txn_count = df.groupby('User_ID')['Transaction_ID'].transform('count')
df['user_transaction_count'] = user_txn_count

# Average amount per user
df['user_avg_amount'] = df.groupby('User_ID')['Transaction_Amount'].transform('mean')
#standard deviation in transaction amount
df['user_amt_std'] = (
    df.groupby('User_ID')['Transaction_Amount'].transform('std')
    .fillna(0)
)
# Time gap from previous transaction (seconds)
df['time_gap_prev_txn'] = (
    df.groupby('User_ID')['Timestamp']
      .diff()
      .dt.total_seconds()
      .fillna(0)
)


In [26]:
df.drop(columns=['User_ID', 'Transaction_ID', 'Timestamp','Card_Type','Card_Age','Risk_Score','Is_Weekend'], inplace=True)


In [27]:
df

Unnamed: 0,Transaction_Amount,Transaction_Type,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,...,Authentication_Method,Fraud_Label,hour,day,month,dayofweek,user_transaction_count,user_avg_amount,user_amt_std,time_gap_prev_txn
47834,69.02,Online,36077.57,Tablet,London,Groceries,0,0,6,225.91,...,Password,1,10,21,2,1,5,149.73600,143.502670,0.0
28102,191.87,Online,4201.83,Laptop,Tokyo,Electronics,0,0,7,357.91,...,OTP,0,23,26,4,2,5,149.73600,143.502670,5575920.0
13089,64.78,Online,25291.69,Tablet,London,Clothing,0,0,10,106.00,...,Password,1,20,1,9,4,5,149.73600,143.502670,11047920.0
21475,383.60,Bank Transfer,53558.88,Laptop,Mumbai,Clothing,0,0,8,441.82,...,OTP,0,3,29,10,6,5,149.73600,143.502670,4952280.0
41209,39.41,Bank Transfer,70995.83,Laptop,Mumbai,Clothing,0,0,2,120.49,...,PIN,0,6,3,12,6,5,149.73600,143.502670,3034560.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22745,90.20,Online,20400.56,Laptop,London,Restaurants,0,0,5,256.98,...,PIN,0,17,4,9,0,16,100.98625,92.502614,1105020.0
37716,14.89,Online,46190.69,Tablet,New York,Electronics,0,0,12,11.85,...,OTP,0,17,30,9,5,16,100.98625,92.502614,2246820.0
4367,49.53,Bank Transfer,81626.81,Tablet,New York,Electronics,0,0,12,220.06,...,OTP,0,4,4,11,5,16,100.98625,92.502614,2974440.0
29179,228.91,Online,34061.29,Mobile,Mumbai,Restaurants,0,0,6,181.33,...,OTP,0,9,13,12,2,16,100.98625,92.502614,3389220.0


In [28]:
df

Unnamed: 0,Transaction_Amount,Transaction_Type,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,...,Authentication_Method,Fraud_Label,hour,day,month,dayofweek,user_transaction_count,user_avg_amount,user_amt_std,time_gap_prev_txn
47834,69.02,Online,36077.57,Tablet,London,Groceries,0,0,6,225.91,...,Password,1,10,21,2,1,5,149.73600,143.502670,0.0
28102,191.87,Online,4201.83,Laptop,Tokyo,Electronics,0,0,7,357.91,...,OTP,0,23,26,4,2,5,149.73600,143.502670,5575920.0
13089,64.78,Online,25291.69,Tablet,London,Clothing,0,0,10,106.00,...,Password,1,20,1,9,4,5,149.73600,143.502670,11047920.0
21475,383.60,Bank Transfer,53558.88,Laptop,Mumbai,Clothing,0,0,8,441.82,...,OTP,0,3,29,10,6,5,149.73600,143.502670,4952280.0
41209,39.41,Bank Transfer,70995.83,Laptop,Mumbai,Clothing,0,0,2,120.49,...,PIN,0,6,3,12,6,5,149.73600,143.502670,3034560.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22745,90.20,Online,20400.56,Laptop,London,Restaurants,0,0,5,256.98,...,PIN,0,17,4,9,0,16,100.98625,92.502614,1105020.0
37716,14.89,Online,46190.69,Tablet,New York,Electronics,0,0,12,11.85,...,OTP,0,17,30,9,5,16,100.98625,92.502614,2246820.0
4367,49.53,Bank Transfer,81626.81,Tablet,New York,Electronics,0,0,12,220.06,...,OTP,0,4,4,11,5,16,100.98625,92.502614,2974440.0
29179,228.91,Online,34061.29,Mobile,Mumbai,Restaurants,0,0,6,181.33,...,OTP,0,9,13,12,2,16,100.98625,92.502614,3389220.0


In [29]:
# Numerical
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna("Unknown")


In [30]:

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])



In [31]:
# Numeric correlation only
corr_with_target = df.corr()['Fraud_Label']
corr_with_target


Transaction_Amount              0.001901
Transaction_Type               -0.004592
Account_Balance                -0.003153
Device_Type                     0.005368
Location                        0.004680
Merchant_Category               0.005734
IP_Address_Flag                 0.003028
Previous_Fraudulent_Activity   -0.000718
Daily_Transaction_Count        -0.007065
Avg_Transaction_Amount_7d       0.000703
Failed_Transaction_Count_7d     0.509871
Transaction_Distance           -0.000116
Authentication_Method          -0.000163
Fraud_Label                     1.000000
hour                            0.005822
day                             0.005262
month                          -0.001213
dayofweek                       0.005272
user_transaction_count         -0.000508
user_avg_amount                 0.008564
user_amt_std                    0.008512
time_gap_prev_txn               0.001515
Name: Fraud_Label, dtype: float64

In [32]:
df

Unnamed: 0,Transaction_Amount,Transaction_Type,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,...,Authentication_Method,Fraud_Label,hour,day,month,dayofweek,user_transaction_count,user_avg_amount,user_amt_std,time_gap_prev_txn
47834,69.02,2,36077.57,2,0,2,0,0,6,225.91,...,3,1,10,21,2,1,5,149.73600,143.502670,0.0
28102,191.87,2,4201.83,0,4,1,0,0,7,357.91,...,1,0,23,26,4,2,5,149.73600,143.502670,5575920.0
13089,64.78,2,25291.69,2,0,0,0,0,10,106.00,...,3,1,20,1,9,4,5,149.73600,143.502670,11047920.0
21475,383.60,1,53558.88,0,1,0,0,0,8,441.82,...,1,0,3,29,10,6,5,149.73600,143.502670,4952280.0
41209,39.41,1,70995.83,0,1,0,0,0,2,120.49,...,2,0,6,3,12,6,5,149.73600,143.502670,3034560.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22745,90.20,2,20400.56,0,0,3,0,0,5,256.98,...,2,0,17,4,9,0,16,100.98625,92.502614,1105020.0
37716,14.89,2,46190.69,2,2,1,0,0,12,11.85,...,1,0,17,30,9,5,16,100.98625,92.502614,2246820.0
4367,49.53,1,81626.81,2,2,1,0,0,12,220.06,...,1,0,4,4,11,5,16,100.98625,92.502614,2974440.0
29179,228.91,2,34061.29,1,1,3,0,0,6,181.33,...,1,0,9,13,12,2,16,100.98625,92.502614,3389220.0


In [33]:
X = df.drop(columns=['Fraud_Label'])
y = df['Fraud_Label']


In [34]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [35]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [36]:

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled
)


In [37]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42
)

rf.fit(X_train, y_train)


In [38]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))


              precision    recall  f1-score   support

           0       0.81      1.00      0.90      6787
           1       1.00      0.77      0.87      6787

    accuracy                           0.88     13574
   macro avg       0.91      0.88      0.88     13574
weighted avg       0.91      0.88      0.88     13574

ROC-AUC: 0.9187783898307569


In [39]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]),
    eval_metric='logloss',
    random_state=42
)

xgb.fit(X_train, y_train)


In [40]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred_xgb = xgb.predict(X_test)
y_prob_xgb = xgb.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_xgb))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92      6787
           1       1.00      0.82      0.90      6787

    accuracy                           0.91     13574
   macro avg       0.92      0.91      0.91     13574
weighted avg       0.92      0.91      0.91     13574

ROC-AUC: 0.9146283568620437


In [41]:
y_train.value_counts()

Fraud_Label
0    27146
1    27146
Name: count, dtype: int64