In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, recall_score, roc_auc_score, confusion_matrix

In [63]:
# Read the prepared transactions table; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("dataset_for_ML.csv")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,Amount,BankNameSent,BankNameReceived,RemainingBalance,City,Gender,TransactionType,Status,DeviceType,PaymentMethod,MerchantName,Purpose,CustomerAge,PaymentMode,Currency,Months,Days,Hours,Minutes
0,271.64,SBI Bank,HDFC Bank,5557.02,Delhi,Female,Transfer,Success,Tablet,Phone Number,Amazon,Food,21,Scheduled,USD,2,2,17,12
1,1064.63,ICICI Bank,SBI Bank,9753.32,Bangalore,Male,Payment,Success,Laptop,QR Code,Zomato,Travel,22,Instant,EUR,3,3,11,15
2,144.15,Axis Bank,Axis Bank,7597.35,Hyderabad,Female,Transfer,Success,Mobile,UPI ID,Swiggy,Bill Payment,23,Scheduled,GBP,4,4,21,29
3,612.89,HDFC Bank,ICICI Bank,2327.84,Mumbai,Male,Payment,Success,Tablet,Phone Number,IRCTC,Others,24,Instant,INR,5,5,6,27
4,743.32,SBI Bank,HDFC Bank,1136.84,Delhi,Female,Transfer,Failed,Laptop,QR Code,Flipkart,Shopping,25,Scheduled,USD,6,6,2,6


In [64]:
# "Purpose" is the prediction target; every other column is a predictor.
y = df["Purpose"]
X = df.drop(columns=["Purpose"])

# Hold out 30% of the rows for evaluation. The split is seeded so it is
# reproducible, and stratified on the target so both parts keep a similar
# class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print(f"Train Size : {X_train.shape}")
print(f"Test Size : {X_test.shape}")

Train Size : (14000, 18)
Test Size : (6000, 18)


In [65]:
# Categorical predictors that get expanded into indicator (0/1) columns.
cols = [
    "BankNameSent", "BankNameReceived", "City", "Gender",
    "TransactionType", "Status", "DeviceType", "PaymentMethod",
    "MerchantName", "PaymentMode", "Currency",
]

# Dense output so the encoded matrix can be wrapped in a DataFrame directly.
# handle_unknown="ignore" keeps transform() from raising on categories that
# were not present when the encoder was fitted.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Learn the category vocabulary from the training split only, then apply the
# same fitted encoder to both splits.
ohe.fit(X_train[cols])

X_train_ohe = pd.DataFrame(
    ohe.transform(X_train[cols]),
    columns=ohe.get_feature_names_out(cols),
    index=X_train.index,  # keep row labels so the concat below aligns row-for-row
)
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[cols]),
    columns=ohe.get_feature_names_out(cols),
    index=X_test.index,
)

# Swap the raw categorical columns for their encoded counterparts.
# NOTE: X_train / X_test are overwritten here, so re-running this cell without
# first re-running the split cell fails (the columns in `cols` are gone).
X_train = pd.concat([X_train.drop(columns=cols), X_train_ohe], axis=1)
X_test = pd.concat([X_test.drop(columns=cols), X_test_ohe], axis=1)

print(f"X_train After Encoding : {X_train.shape}")
print(f"X_test After Encoding : {X_test.shape}")

X_train After Encoding : (14000, 42)
X_test After Encoding : (6000, 42)


In [66]:
# Encode the target labels as integer class codes.
# The label -> code mapping is learned from the training labels only and then
# reused for the test labels, so both splits share the same encoding.
# NOTE: y_train / y_test are overwritten in place; re-running this cell alone
# would refit the encoder on the already-encoded integers.
lb = LabelEncoder()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [67]:
# Standardize the feature matrix. The scaler's statistics are estimated on the
# training split only, so nothing about the test split leaks into the
# preprocessing; the same fitted scaler is then applied to both splits.
scaling = StandardScaler().fit(X_train)
X_train_scaling = scaling.transform(X_train)
X_test_scaling = scaling.transform(X_test)