# Credit Card Fraud Detection - Colab Version

This notebook is adapted for Google Colab. It covers data preprocessing, handling imbalanced datasets (Undersampling & SMOTE), and benchmarking various machine learning models.

## 1. Colab Environment Setup and Data Loading
First, we import the necessary libraries. Since we are in Colab, we need a way to access the `creditcard.csv` file. You can either mount your Google Drive or upload the file directly.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")

# Option 1: Mount Google Drive (Uncomment if your file is in Drive)
# from google.colab import drive
# drive.mount('/content/drive')
# file_path = '/content/drive/MyDrive/path_to_your_file/creditcard.csv'

# Option 2: Upload file directly
import io

from google.colab import files

# files.upload() returns a dict mapping each uploaded file name to its bytes.
uploaded = files.upload()

# Do not hard-code the dictionary key: the upload may have been cancelled
# (empty dict) or the file may carry another name, e.g. a browser-duplicated
# 'creditcard (1).csv'. Prefer the expected name, otherwise take the first CSV.
expected_csv_name = 'creditcard.csv'
if expected_csv_name in uploaded:
    csv_name = expected_csv_name
else:
    csv_name = next((name for name in uploaded if name.lower().endswith('.csv')), None)

if csv_name is None:
    raise FileNotFoundError(
        f"No CSV file was uploaded (got: {list(uploaded)}). "
        f"Please upload '{expected_csv_name}'."
    )

# If you used Option 2, use this:
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# If you used Option 1, use this instead:
# df = pd.read_csv(file_path)

## 2. Exploratory Data Analysis (EDA)
Let's inspect the dataset structure, check for missing values, and visualize the class imbalance.

In [None]:
# Rich preview of the first five records
head_preview = df.head()
display(head_preview)

# Dimensions first, then per-column dtypes and non-null counts
dataset_shape = df.shape
print(f"Dataset Shape: {dataset_shape}")
df.info()

In [None]:
# Largest per-column null count (0 means no column has missing values)
print("\nMissing Values:")
max_missing_per_column = df.isnull().sum().max()
print(max_missing_per_column)

# Percentage of each class, derived from a single value_counts() pass
class_counts = df['Class'].value_counts()
total_rows = len(df)
non_fraud_pct = round(class_counts[0]/total_rows * 100,2)
fraud_pct = round(class_counts[1]/total_rows * 100,2)
print('\nNo Frauds', non_fraud_pct, '% of the dataset')
print('Frauds', fraud_pct, '% of the dataset')

# Bar chart of the raw class counts
sns.countplot(data=df, x='Class', palette='viridis')
plt.title('Class Distribution \n (0: No Fraud || 1: Fraud)', fontsize=14)
plt.show()

## 3. Feature Scaling and Data Splitting
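We scale `Amount` and `Time` using `RobustScaler`, which centers on the median and scales by the interquartile range, so it is less sensitive to outliers than mean/variance scaling. Then, we split the data using Stratified K-Fold so that the train and test sets preserve the original class ratio.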

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold

# Scale Amount and Time, then put the scaled versions in the first two columns.
#
# The scaled values are kept in local arrays until the raw columns are dropped.
# Assigning them to df first and then calling df.insert() with the same name
# raises "ValueError: cannot insert Amount_Scale, already exists".
#
# The guard keeps the cell safe to re-run once 'Amount'/'Time' have been replaced.
#
# NOTE(review): the scalers are fitted on the full dataset, before the
# train/test split below, so test-set statistics influence the scaling.
if {'Amount', 'Time'}.issubset(df.columns):
    amount_scaled = RobustScaler().fit_transform(df['Amount'].values.reshape(-1, 1)).ravel()
    time_scaled = RobustScaler().fit_transform(df['Time'].values.reshape(-1, 1)).ravel()

    # Drop original columns and insert scaled ones
    df.drop(['Time', 'Amount'], axis=1, inplace=True)
    df.insert(0, 'Amount_Scale', amount_scaled)
    df.insert(1, 'Time_Scale', time_scaled)

display(df.head())

In [None]:
# Prepare X and y
X = df.drop('Class', axis=1)
y = df['Class']

# Stratified 80/20 split: use ONE fold of a 5-fold StratifiedKFold.
# - random_state is fixed so the split (and every metric computed from it)
#   is reproducible across kernel restarts.
# - next() takes a single fold directly instead of looping over all five
#   folds and silently keeping only the last one.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_index, test_index = next(skf.split(X, y))

# Convert to numpy arrays
X_train = X.iloc[train_index].values
X_test = X.iloc[test_index].values
y_train = y.iloc[train_index].values
y_test = y.iloc[test_index].values

# Sanity check: the two parts must partition the dataset exactly.
assert len(X_train) + len(X_test) == len(X)

print("Training set class distribution:")
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts/len(y_train))))

## 4. Random Undersampling and Correlation Analysis
To better understand correlations without the bias of the majority class, we create a balanced subsample (50/50 ratio).

In [None]:
# Shuffle data (seeded, so the same non-fraud rows are picked on every run)
df = df.sample(frac=1, random_state=42)

# Create subsample: every fraud row plus an equally sized slice of the shuffled
# non-fraud rows. The size comes from the data rather than a hard-coded 492,
# so the subsample stays 50/50 if the number of fraud rows changes.
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0].iloc[:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])
new_df = normal_distributed_df.sample(frac=1, random_state=42)

print("Distribution of the Classes in the subsample dataset")
print(new_df['Class'].value_counts()/len(new_df))

sns.countplot(x='Class', data=new_df, palette='viridis')
plt.title('Equally Distributed Classes', fontsize=14)
plt.show()

In [None]:
# Pairwise correlations of the balanced subsample, drawn as a heatmap
sub_sample_corr = new_df.corr()

f, ax = plt.subplots(figsize=(24,20))
sns.heatmap(
    sub_sample_corr,
    ax=ax,
    cmap='coolwarm_r',
    annot_kws={'size':20},
)
plt.title('SubSample Correlation Matrix \n (use for reference)', fontsize=14)
plt.show()

In [None]:
# Boxplots of feature value by class for three negatively correlated features
# (V14, V12, V10: the lower the value, the more likely the transaction is a
# fraud) and one positively correlated feature (V11).
boxplot_panels = [
    ("V14", 'V14 vs Class Negative Correlation'),
    ("V12", 'V12 vs Class Negative Correlation'),
    ("V10", 'V10 vs Class Negative Correlation'),
    ("V11", 'V11 vs Class Positive Correlation'),
]

f, axes = plt.subplots(ncols=4, figsize=(20,4))

# One panel per feature, in the order listed above
for panel, (feature, panel_title) in zip(axes, boxplot_panels):
    sns.boxplot(x="Class", y=feature, data=new_df, palette=['blue', 'red'], ax=panel)
    panel.set_title(panel_title)

plt.show()

## 5. Outlier Removal and Feature Transformation
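We remove extreme outliers among the fraud transactions for the features most strongly negatively correlated with the class (V14, V12, V10) using the IQR method: fraud rows whose value lies more than 1.5 × IQR below the first quartile or above the third quartile are dropped. We also apply `PowerTransformer` (Yeo-Johnson) to reduce skewness in V2 and V10.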

In [None]:
from scipy.stats import norm

def outlier_cutoff(data, feature_name, threshold=1.5):
    """Drop fraud rows whose `feature_name` value is an IQR outlier.

    The quartiles are computed on the fraud rows only (``Class == 1``), and
    only fraud rows outside ``[q1 - threshold * IQR, q3 + threshold * IQR]``
    are removed. Non-fraud rows are never dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Class'`` column and the column `feature_name`.
    feature_name : str
        Column whose outliers are removed.
    threshold : float, default 1.5
        Multiplier applied to the IQR to obtain the fence distance.

    Returns
    -------
    pandas.DataFrame
        A new frame without the outlier rows; `data` itself is not modified.
    """
    is_fraud = data['Class'] == 1
    fraud_values = data.loc[is_fraud, feature_name].values

    # With no fraud rows there is nothing to fence; avoid taking percentiles
    # of an empty array.
    if fraud_values.size == 0:
        print(f'Feature {feature_name} Outliers removed: 0')
        return data.copy()

    q1, q3 = np.percentile(fraud_values, [25, 75])
    cut_off = (q3 - q1) * threshold
    lower, upper = q1 - cut_off, q3 + cut_off

    outside_fences = (data[feature_name] < lower) | (data[feature_name] > upper)
    outlier_index = data.index[is_fraud & outside_fences]
    print(f'Feature {feature_name} Outliers removed: {len(outlier_index)}')
    return data.drop(outlier_index)

# Remove outliers one feature at a time. Each pass works on the frame returned
# by the previous pass, so the quartiles are recomputed on the remaining rows.
df_cleaned = new_df.copy()
for outlier_feature in ('V14', 'V12', 'V10'):
    df_cleaned = outlier_cutoff(df_cleaned, outlier_feature)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform (with standardization) of the two listed features.
# NOTE: the result is written back into df_cleaned, so re-running this cell
# transforms the already transformed values again.
feature_to_transform = ['V2', 'V10']
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)

transformed_values = power_transformer.fit_transform(df_cleaned[feature_to_transform])
df_cleaned[feature_to_transform] = transformed_values

# Distribution of the transformed features, fraud rows only
fraud_rows = df_cleaned['Class'] == 1

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(df_cleaned.loc[fraud_rows, 'V2'], kde=True, color='#56F9BB', ax=ax1)
ax1.set_title('V2 Distribution (Transformed)')

sns.histplot(df_cleaned.loc[fraud_rows, 'V10'], kde=True, color='#FF5733', ax=ax2)
ax2.set_title('V10 Distribution (Transformed)')

plt.show()

## 6. Dimensionality Reduction Visualization
Using t-SNE, PCA, and TruncatedSVD to visualize if the fraud and non-fraud classes are separable in 2D space.

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD

# Feature matrix and labels of the cleaned, balanced subsample
y_red = df_cleaned['Class'].values
X_red = df_cleaned.drop('Class', axis=1).values


def _timed_fit_transform(reducer, report_template):
    """Project X_red to 2D with `reducer` and print the elapsed wall-clock time."""
    started_at = time.time()
    projected = reducer.fit_transform(X_red)
    finished_at = time.time()
    print(report_template.format(finished_at - started_at))
    return projected


# t-SNE
X_reduced_tsne = _timed_fit_transform(
    TSNE(n_components=2, random_state=42),
    "t-SNE took {:.2f} s",
)

# PCA
X_reduced_pca = _timed_fit_transform(
    PCA(n_components=2, random_state=42),
    "PCA took {:.2f} s",
)

# TruncatedSVD
X_reduced_svd = _timed_fit_transform(
    TruncatedSVD(n_components=2, algorithm='randomized', random_state=42),
    "Truncated SVD took {:.2f} s",
)

In [None]:
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 6))
f.suptitle('Clusters using Dimensionality Reduction', fontsize=14)

# Legend handles (proxy artists): blue = class 0, red = class 1
blue_patch = plt.Line2D([0], [0], marker='o', color='w', label='No Fraud', markerfacecolor='blue', markersize=10)
red_patch = plt.Line2D([0], [0], marker='o', color='w', label='Fraud', markerfacecolor='red', markersize=10)

# Each embedding is drawn ONCE, coloured by the label itself. The previous
# version drew every point twice per panel. Its first layer was coloured by
# (y_red == 0), i.e. inverted relative to the legend, and was only hidden
# because the second layer painted over it.
# vmin/vmax pin 0 -> blue and 1 -> red even if only one class is present.
embeddings = [
    (ax1, 't-SNE', X_reduced_tsne),
    (ax2, 'PCA', X_reduced_pca),
    (ax3, 'Truncated SVD', X_reduced_svd),
]

for panel, panel_title, points in embeddings:
    panel.scatter(points[:, 0], points[:, 1], c=y_red, cmap='coolwarm', vmin=0, vmax=1, linewidths=2)
    panel.set_title(panel_title, fontsize=14)
    panel.grid(True)
    panel.legend(handles=[blue_patch, red_patch])

plt.show()

## 7. SMOTE Oversampling Implementation
We apply SMOTE to the training data to handle the imbalance. Note: We only apply SMOTE to the training set to avoid data leakage.

In [None]:
from imblearn.over_sampling import SMOTE

# Work on copies so the arrays produced by the split stay untouched
X_train_org, y_train_org = X_train.copy(), y_train.copy()
X_test_org, y_test_org = X_test.copy(), y_test.copy()

n_train_X, n_train_y = len(X_train_org), len(y_train_org)
n_test_X, n_test_y = len(X_test_org), len(y_test_org)
print('Length of X (train): {} | Length of y (train): {}'.format(n_train_X, n_train_y))
print('Length of X (test): {} | Length of y (test): {}'.format(n_test_X, n_test_y))

# Oversample the minority class of the training data only; the test set is
# left as it is.
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_org, y_train_org)

n_smote_X = len(X_train_smote)
n_smote_y = len(y_train_smote)
print('Length of X (train) after SMOTE: {}'.format(n_smote_X))
print('Length of y (train) after SMOTE: {}'.format(n_smote_y))

## 8. Model Training and Evaluation
We will train Logistic Regression, Random Forest, XGBoost, Decision Tree, and MLP on the SMOTE dataset and evaluate them on the original test set.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

def plot_model_performance(model, X_test, y_test, y_pred, model_name):
    """Print a classification report and plot confusion matrix and ROC curve.

    Parameters
    ----------
    model : fitted classifier
        Used only to obtain scores for the ROC curve, via ``predict_proba``
        or, failing that, ``decision_function``.
    X_test : array-like
        Test features passed to the model to obtain scores.
    y_test : array-like
        True test labels.
    y_pred : array-like
        Hard predictions for `X_test` (used for the report and the matrix).
    model_name : str
        Label used in the printed header and the plot titles.
    """
    print(f"--- {model_name} Report ---")
    print(classification_report(y_test, y_pred))

    fig, (cm_ax, roc_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Confusion Matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax, annot_kws={"size": 16})
    cm_ax.set_title(f'{model_name} - Confusion Matrix')
    cm_ax.set_xlabel('Predicted label')
    cm_ax.set_ylabel('True label')

    # ROC Curve: needs continuous scores. Prefer the positive-class
    # probability and fall back to decision_function for models that do not
    # expose probabilities.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    if y_score is not None:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
        roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        roc_ax.set_title(f'{model_name} - ROC Curve')
        roc_ax.set_xlabel('False Positive Rate')
        roc_ax.set_ylabel('True Positive Rate')
        roc_ax.legend(loc="lower right")
    else:
        # No scores available: hide the panel instead of showing an empty frame.
        roc_ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
def _fit_and_report(model, model_name):
    """Fit `model` on the SMOTE training set, evaluate it on the original
    (non-resampled) test set, plot the results and return the predictions."""
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test_org)
    plot_model_performance(model, X_test_org, y_test_org, y_pred, model_name)
    return y_pred


# 1. Logistic Regression
log_reg_sm = LogisticRegression(solver='liblinear', random_state=42)
y_pred_log = _fit_and_report(log_reg_sm, "Logistic Regression (SMOTE)")

# 2. Random Forest
rf_sm = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf = _fit_and_report(rf_sm, "Random Forest (SMOTE)")

# 3. XGBoost
xgb_sm = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
y_pred_xgb = _fit_and_report(xgb_sm, "XGBoost (SMOTE)")

# 4. Decision Tree
# Announced in the section text and imported above, but previously never trained.
dt_sm = DecisionTreeClassifier(random_state=42)
y_pred_dt = _fit_and_report(dt_sm, "Decision Tree (SMOTE)")

# 5. MLP
# Also announced and imported but previously missing. early_stopping bounds
# the training time on the large SMOTE-resampled training set.
mlp_sm = MLPClassifier(hidden_layer_sizes=(64, 32), early_stopping=True, random_state=42)
y_pred_mlp = _fit_and_report(mlp_sm, "MLP (SMOTE)")

## 9. Hyperparameter Tuning for XGBoost
Using `RandomizedSearchCV` to find the best parameters for XGBoost to further improve performance.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

xgb_param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.5, 1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='aucpr', random_state=42)

# SMOTE must run INSIDE each cross-validation split. Searching on data that
# was resampled beforehand puts synthetic points interpolated from validation
# rows into the training folds and scores F1 on an artificial 50/50 class
# balance, which makes the CV scores over-optimistic. The imblearn Pipeline
# applies SMOTE only when fitting, so each validation fold keeps the original
# data and class ratio.
smote_xgb_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(sampling_strategy='minority', random_state=42)),
    ('xgb', xgb_base),
])

# Pipeline parameters are addressed as '<step>__<param>'.
pipeline_param_dist = {f'xgb__{name}': values for name, values in xgb_param_dist.items()}

xgb_random_search = RandomizedSearchCV(
    estimator=smote_xgb_pipeline,
    param_distributions=pipeline_param_dist,
    n_iter=10, # Reduced iterations for faster execution in Colab demo
    scoring='f1',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

print("Searching for best hyperparameters...")
# Fit on the ORIGINAL training data; the pipeline resamples within each fold.
xgb_random_search.fit(X_train_org, y_train_org)

# After the search, the best pipeline is refit on the whole training set (with
# SMOTE applied). Its 'xgb' step is therefore an XGBClassifier trained on
# SMOTE-resampled training data, as before.
best_xgb = xgb_random_search.best_estimator_.named_steps['xgb']
best_xgb_params = {
    name.split('__', 1)[1]: value
    for name, value in xgb_random_search.best_params_.items()
}
print(f"Best Parameters: {best_xgb_params}")

y_pred_best_xgb = best_xgb.predict(X_test_org)
plot_model_performance(best_xgb, X_test_org, y_test_org, y_pred_best_xgb, "XGBoost (Tuned)")