In [None]:
import pickle

import pandas as pd
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

**Load the data**

In [None]:
# Colab-only: open the browser upload widget; the chosen file is saved into
# the runtime's working directory. `pd` and `files` come from the imports cell.
uploaded = files.upload()
# The path is hard-coded, so the uploaded file must be named exactly
# 'cleaned_fraud_data.csv' for this read to pick it up.
df = pd.read_csv('cleaned_fraud_data.csv')


Saving cleaned_fraud_data.csv to cleaned_fraud_data.csv


**Prepare Features (X) and Target (y)**

In [None]:
# Label column first, then every other column except the label itself and the
# two columns excluded from modelling.
# NOTE: X and y are rebuilt from the sampled frame further down, so these
# full-dataset versions are only an intermediate view.
y = df['is_fraud']
X = df.drop(['trans_date_trans_time', 'dob', 'is_fraud'], axis=1)


Separate the dataset into two parts:

- `fraud_df`: transactions labeled as fraudulent (`is_fraud == 1`).

- `no_fraud_df`: transactions labeled as non-fraudulent (`is_fraud == 0`).

In [None]:
# Partition the rows by label value and record how many rows each class has.
fraud_df = df.loc[df['is_fraud'] == 1]
no_fraud_df = df.loc[df['is_fraud'] == 0]
fraud_count, no_fraud_count = fraud_df.shape[0], no_fraud_df.shape[0]


**Sampling Fraud and Non-Fraud Subsets**



In [None]:
# Cap each class at a fixed size (7000 fraud, 10000 non-fraud); if a class has
# fewer rows than its cap, all of its rows are taken.
fraud_subset_size = min(fraud_count, 7000)
no_fraud_subset_size = min(no_fraud_count, 10000)

# Draw without replacement, with a fixed seed so the same rows are picked on
# every run.
fraud_subset = fraud_df.sample(fraud_subset_size, replace=False, random_state=42)
no_fraud_subset = no_fraud_df.sample(no_fraud_subset_size, replace=False, random_state=42)


**Combine the Fraud and Non-Fraud Samples**

In [None]:
# Stack the two samples (fraud rows first) and renumber the rows 0..n-1 in the
# same step instead of resetting the index in place afterwards.
balanced_df = pd.concat([fraud_subset, no_fraud_subset], ignore_index=True)


**Features and Target After Sampling**

In [None]:
# Rebuild the modelling inputs from the sampled frame: the label column is the
# target, and the features are everything except the label and the two
# excluded columns.
y = balanced_df['is_fraud']
X = balanced_df.drop(['trans_date_trans_time', 'dob', 'is_fraud'], axis=1)


**Train-Test Split (After Sampling)**

In [None]:
# Step 1: hold out 20% of the sampled data as the final test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Step 2: carve 25% of the remaining 80% out as a validation set, which gives
# a 60/20/20 train/validation/test partition overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)


**Standardize the Data**

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to all three splits so validation/test data never
# influence the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


**Logistic Regression**

In [None]:
# Baseline linear classifier with scikit-learn's default hyperparameters,
# trained on the standardized training split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [None]:
# Class predictions from the fitted logistic regression, one array per split
# (train, validation, test - in that order).
y_train_pred_lr, y_val_pred_lr, y_test_pred_lr = (
    log_reg.predict(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Report accuracy, precision, recall and F1 for each split, one line per
# split, in train -> validation -> test order.
for split_name, y_true, y_pred in (
    ("Training", y_train, y_train_pred_lr),
    ("Validation", y_val, y_val_pred_lr),
    ("Test", y_test, y_test_pred_lr),
):
    print(
        f"Logistic Regression - {split_name} Set: "
        f"Accuracy: {accuracy_score(y_true, y_pred):.4f}, "
        f"Precision: {precision_score(y_true, y_pred):.4f}, "
        f"Recall: {recall_score(y_true, y_pred):.4f}, "
        f"F1-Score: {f1_score(y_true, y_pred):.4f}"
    )

Logistic Regression - Training Set: Accuracy: 0.8833, Precision: 0.9368, Recall: 0.7393, F1-Score: 0.8264
Logistic Regression - Validation Set: Accuracy: 0.8835, Precision: 0.9350, Recall: 0.7451, F1-Score: 0.8293
Logistic Regression - Test Set: Accuracy: 0.8782, Precision: 0.9159, Recall: 0.7375, F1-Score: 0.8171


**Decision Tree**

In [None]:
# Decision tree with default hyperparameters apart from the fixed seed, which
# keeps the fitted tree reproducible across runs.
# (confusion_matrix / classification_report are imported in the imports cell
# at the top of the notebook rather than here.)
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

In [None]:
# Class predictions from the fitted decision tree, one array per split
# (train, validation, test - in that order).
y_train_pred_dt, y_val_pred_dt, y_test_pred_dt = (
    decision_tree.predict(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Report accuracy, precision, recall and F1 for each split, one line per
# split, in train -> validation -> test order.
for split_name, y_true, y_pred in (
    ("Training", y_train, y_train_pred_dt),
    ("Validation", y_val, y_val_pred_dt),
    ("Test", y_test, y_test_pred_dt),
):
    print(
        f"Decision Tree - {split_name} Set: "
        f"Accuracy: {accuracy_score(y_true, y_pred):.4f}, "
        f"Precision: {precision_score(y_true, y_pred):.4f}, "
        f"Recall: {recall_score(y_true, y_pred):.4f}, "
        f"F1-Score: {f1_score(y_true, y_pred):.4f}"
    )

# Test-set breakdown: raw confusion counts, then the per-class text report.
cm = confusion_matrix(y_test, y_test_pred_dt)
report = classification_report(y_test, y_test_pred_dt)
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(report)


Decision Tree - Training Set: Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
Decision Tree - Validation Set: Accuracy: 0.9594, Precision: 0.9444, Recall: 0.9490, F1-Score: 0.9467
Decision Tree - Test Set: Accuracy: 0.9541, Precision: 0.9337, Recall: 0.9424, F1-Score: 0.9381
Confusion Matrix:
[[1942   79]
 [  68 1113]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2021
           1       0.93      0.94      0.94      1181

    accuracy                           0.95      3202
   macro avg       0.95      0.95      0.95      3202
weighted avg       0.95      0.95      0.95      3202



In [None]:
# Persist the fitted scaler and decision tree so they can be reused together:
# new data has to go through this same scaler before the tree sees it.
# `pickle` and `files` come from the imports cell at the top of the notebook.
# SECURITY: pickle files execute code when loaded - only unpickle these
# artifacts from a source you trust.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

with open('decision_tree_model.pkl', 'wb') as model_file:
    pickle.dump(decision_tree, model_file)

# Colab-only: trigger a browser download of each saved artifact.
files.download('scaler.pkl')
files.download('decision_tree_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Random Forest**

In [None]:
# Random forest of 50 trees, each limited to depth 10, with class weights
# set to 'balanced' and a fixed seed for reproducibility.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
rf.fit(X=X_train, y=y_train)

In [None]:
# Class predictions from the fitted random forest, one array per split
# (train, validation, test - in that order).
y_train_pred_rf, y_val_pred_rf, y_test_pred_rf = (
    rf.predict(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Report accuracy, precision, recall and F1 for each split, one line per
# split, in train -> validation -> test order.
for split_name, y_true, y_pred in (
    ("Training", y_train, y_train_pred_rf),
    ("Validation", y_val, y_val_pred_rf),
    ("Test", y_test, y_test_pred_rf),
):
    print(
        f"Random Forest - {split_name} Set: "
        f"Accuracy: {accuracy_score(y_true, y_pred):.4f}, "
        f"Precision: {precision_score(y_true, y_pred):.4f}, "
        f"Recall: {recall_score(y_true, y_pred):.4f}, "
        f"F1-Score: {f1_score(y_true, y_pred):.4f}"
    )

Random Forest - Training Set: Accuracy: 0.9783, Precision: 0.9746, Recall: 0.9676, F1-Score: 0.9711
Random Forest - Validation Set: Accuracy: 0.9535, Precision: 0.9464, Recall: 0.9301, F1-Score: 0.9382
Random Forest - Test Set: Accuracy: 0.9528, Precision: 0.9455, Recall: 0.9255, F1-Score: 0.9354


**SVM (Support Vector Machine)**

In [None]:
# Base linear SVM (seeded) and the candidate values for its regularization
# parameter C that the search below will try.
svm_model = LinearSVC(random_state=42)
param_distributions = dict(C=[0.1, 1, 10])

In [None]:
# Tune C with 2-fold cross-validation on the training split, using all
# available cores. The grid holds three values and n_iter is 3, so every
# candidate ends up being evaluated.
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_distributions,
    n_iter=3,
    cv=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X=X_train, y=y_train)

In [None]:
# Take the estimator the search selected as best.
best_svm_model = random_search.best_estimator_

# Class predictions from the tuned SVM, one array per split
# (train, validation, test - in that order).
y_train_pred_svm, y_val_pred_svm, y_test_pred_svm = (
    best_svm_model.predict(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Report accuracy, precision, recall and F1 for each split, one line per
# split, in train -> validation -> test order.
for split_name, y_true, y_pred in (
    ("Training", y_train, y_train_pred_svm),
    ("Validation", y_val, y_val_pred_svm),
    ("Test", y_test, y_test_pred_svm),
):
    print(
        f"SVM (Tuned) - {split_name} Set: "
        f"Accuracy: {accuracy_score(y_true, y_pred):.4f}, "
        f"Precision: {precision_score(y_true, y_pred):.4f}, "
        f"Recall: {recall_score(y_true, y_pred):.4f}, "
        f"F1-Score: {f1_score(y_true, y_pred):.4f}"
    )

SVM (Tuned) - Training Set: Accuracy: 0.8694, Precision: 0.9605, Recall: 0.6805, F1-Score: 0.7966
SVM (Tuned) - Validation Set: Accuracy: 0.8694, Precision: 0.9597, Recall: 0.6850, F1-Score: 0.7994
SVM (Tuned) - Test Set: Accuracy: 0.8729, Precision: 0.9521, Recall: 0.6901, F1-Score: 0.8002


**Best Model for Credit Card Fraud Detection: Decision Tree**

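The Decision Tree is the best-performing model for credit card fraud detection in this notebook. It scored 100% accuracy on the training set, 95.94% on the validation set, and 95.41% on the test set. With high test-set precision (93.37%) and recall (94.24%), the Decision Tree effectively identifies fraudulent transactions, and its strong F1-score of 93.81% shows that it balances precision and recall well. Note that the gap between the perfect training score and the roughly 95% held-out scores indicates some overfitting, and that the Random Forest performs almost identically on the test set (95.28% accuracy, 93.54% F1-score). The Decision Tree is also easy to interpret, making it a reliable model for credit card fraud detection.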