<a href="https://colab.research.google.com/github/greatermonk/Credit-Card-Fraud-Detection-Model/blob/main/Credit_Card_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pandas numpy matplotlib seaborn imblearn xgboost

In [None]:
# Mount Google Drive so files stored there are readable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
# Read the dataset from Drive into a DataFrame and preview its first 10 rows.
raw_data = pd.read_csv('/content/drive/My Drive/Credit Card Model/creditcard.csv')
print(raw_data.head(10))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   
5   2.0 -0.425966  0.960523  1.141109 -0.168252  0.420987 -0.029728  0.476201   
6   4.0  1.229658  0.141004  0.045371  1.202613  0.191881  0.272708 -0.005159   
7   7.0 -0.644269  1.417964  1.074380 -0.492199  0.948934  0.428118  1.120631   
8   7.0 -0.894286  0.286157 -0.113192 -0.271526  2.669599  3.721818  0.370145   
9   9.0 -0.338262  1.119593  1.044367 -0.222187  0.499361 -0.

# Create a Fraud Detection Model Class
**Initialize 3 ML models to use:**



1.   Random Forest
2.   Logistic Regression (Classifier)
3.   XGBoost




In [None]:
class FraudDetectionModel:
    """Hold the candidate fraud classifiers and the shared feature scaler."""

    def __init__(self):
        """Create the three candidate classifiers and an unfitted scaler."""
        # Candidate classifiers keyed by a short name. Every estimator gets the
        # same seed so repeated runs are reproducible.
        self.models = {
            'random_forest': RandomForestClassifier(random_state=42, n_jobs=-1),
            'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
            # `use_label_encoder` is deliberately not passed: it is a deprecated
            # XGBoost argument that current releases ignore with a warning.
            'xgboost': XGBClassifier(random_state=42, eval_metric='logloss'),
        }
        # Stays None until train_and_evaluate picks the best-scoring model.
        self.best_model = None
        # Fitted on the training features in load_and_preprocess.
        self.scaler = StandardScaler()

**Data Loading & Data Preprocessing:**



1.   Load the creditcard.csv file using pd.read_csv()
2.   Separate Features & Targets
3.   Split the data into:
     - Train Set: X_train, y_train (80%),
     - Test Set: X_test, y_test (20%)
4.   Use the StandardScaler class to scale the features (removing the mean and scaling to unit variance). The scaler is fitted on the training set only and then applied to both sets.

5.   Return the Scaled features and target values.



In [None]:
    def load_and_preprocess(self, data_path=raw_data):
        """Load the dataset, split it 80/20 and standardise the features.

        Args:
            data_path: Either an already-loaded ``pandas.DataFrame`` (the
                default is the module-level ``raw_data`` frame) or anything
                ``pd.read_csv`` accepts, e.g. a path to ``creditcard.csv``.
                The data must contain a ``Class`` column holding the target.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The
            feature arrays are scaled with ``self.scaler``; the targets are
            returned unchanged.
        """
        # The default argument is a DataFrame, which pd.read_csv cannot parse,
        # so only go through read_csv when given something that is not a frame.
        if isinstance(data_path, pd.DataFrame):
            df = data_path
        else:
            df = pd.read_csv(data_path)

        # Separate features and target (drop returns a copy, so the caller's
        # frame is left untouched).
        X = df.drop('Class', axis=1)
        y = df['Class']

        # Stratify so the train and test sets keep the same class proportions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Fit the scaler on the training split only, so no test-set statistics
        # leak into training, then apply the same transform to both splits.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

# What is SMOTE?
**SMOTE** is a technique used to address class imbalance in datasets by generating synthetic samples of the minority class.

We have 492 frauds out of 284,807 transactions (a 0.172% fraud rate).
This extreme imbalance can bias models towards the majority class (non-fraudulent transactions).

**How SMOTE Works:**

*For each minority class sample:*

1. Find its k-nearest neighbors (default k=5) in the minority class.

2. Randomly select one of these neighbors.

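3. Create a synthetic sample along the line between the original sample and the selected neighbor: $x_{\text{new}} = x_i + \lambda \cdot (x_{\text{neighbor}} - x_i)$, where $\lambda$ is drawn uniformly at random from $[0, 1]$.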

**Returns:**

- *X_train_balanced:* Features with synthetic samples added.
- *y_train_balanced:* Corresponding balanced labels





In [None]:
    def apply_smote(self, X_train, y_train):
        """Balance the training data by oversampling with SMOTE.

        Args:
            X_train: Training features.
            y_train: Training labels matching ``X_train``.

        Returns:
            Tuple ``(features, labels)`` as produced by ``SMOTE.fit_resample``.
        """
        # A fixed seed keeps the generated synthetic samples reproducible.
        oversampler = SMOTE(random_state=42)
        resampled_features, resampled_labels = oversampler.fit_resample(X_train, y_train)
        return resampled_features, resampled_labels

**Train And Evaluate**

Train the models using preprocessed dataset.

Also select the best model based on the F1 score.

In [None]:
    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        """Fit every model in ``self.models`` and score it on the test data.

        Each model is trained on ``(X_train, y_train)``, then its predicted
        labels and positive-class probabilities for ``X_test`` are passed to
        ``self.calculate_metrics``. The model with the highest ``'f1_score'``
        is stored in ``self.best_model``.

        Returns:
            Dict mapping each model name to its metrics dict.
        """
        scores = {}

        for model_name in self.models:
            estimator = self.models[model_name]
            print(f"\nTraining {model_name}...")

            estimator.fit(X_train, y_train)

            predicted_labels = estimator.predict(X_test)
            # Column 1 of predict_proba holds the positive-class probability.
            positive_proba = estimator.predict_proba(X_test)[:, 1]

            scores[model_name] = self.calculate_metrics(
                y_test, predicted_labels, positive_proba
            )

        # Keep the model whose F1 score is highest (first one wins on a tie).
        winner = max(scores, key=lambda model_name: scores[model_name]['f1_score'])
        self.best_model = self.models[winner]

        return scores

**Calculating Metrics**

1. TN (True Negative): Model correctly predicts a negative class.
2. FP (False Positive): Model incorrectly predicts a positive class.
3. FN (False Negative): Model incorrectly predicts a negative class.
4. TP (True Positive): Model correctly predicts a positive class.

**A) Performance Metrics:**

1. Accuracy: Overall proportion of correct predictions.
2. Precision: Proportion of positive predictions that are actually positive.
3. Recall (Sensitivity): Proportion of actual positive cases correctly identified.
4. Specificity: Proportion of actual negative cases correctly identified.
5. F1-score: Harmonic mean of precision and recall, balancing both metrics.

**B) Visualization Tool:**

1. ROC Curve (Receiver Operating Characteristic Curve): Plots the true positive rate (sensitivity) against the false positive rate (1-specificity) at various threshold settings.

**Evaluation Matrix:**

Confusion Matrix: A table that visualizes the performance of a classification model, showing correct and incorrect predictions for each class.

In [None]:
    def calculate_metrics(self, y_true, y_pred, y_pred_proba):
        """Compute classification metrics for one model's predictions.

        Args:
            y_true: True labels. Assumed to be encoded as 0 (negative) and
                1 (positive), as in the ``Class`` column.
            y_pred: Predicted labels, same encoding as ``y_true``.
            y_pred_proba: Predicted scores/probabilities for the positive class.

        Returns:
            Dict with ``accuracy``, ``precision``, ``recall``, ``specificity``,
            ``f1_score``, ``roc_auc``, ``pr_auc`` and a nested
            ``confusion_matrix`` dict holding ``tn``, ``fp``, ``fn``, ``tp``.
            A ratio whose denominator is zero is reported as ``0.0``.
        """
        def _ratio(numerator, denominator):
            # Report 0.0 for an undefined ratio instead of NaN: a NaN F1 score
            # would make the max()-based model selection unreliable.
            return numerator / denominator if denominator else 0.0

        # Pinning labels keeps the matrix 2x2 even when one class is missing
        # from both y_true and y_pred, so the 4-way unpack cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        # Calculate various metrics
        accuracy = _ratio(tp + tn, tp + tn + fp + fn)
        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        specificity = _ratio(tn, tn + fp)
        f1_score = _ratio(2 * (precision * recall), precision + recall)

        # Calculate ROC curve and the area under it
        fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        # Calculate Precision-Recall curve and the area under it
        precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_pred_proba)
        pr_auc = auc(recall_curve, precision_curve)

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'specificity': specificity,
            'f1_score': f1_score,
            'roc_auc': roc_auc,
            'pr_auc': pr_auc,
            'confusion_matrix': {
                'tn': tn, 'fp': fp,
                'fn': fn, 'tp': tp
            }
        }

**Plot the results using the matplotlib and seaborn libraries**

This function provides a clear visual comparison of various model performance metrics and confusion matrices in a single figure.

In [None]:
    def plot_results(self, results):
        """Draw a metrics bar chart and one confusion-matrix heatmap per model.

        Args:
            results: Dict mapping model name to a metrics dict containing the
                keys ``accuracy``, ``precision``, ``recall``, ``specificity``
                and a nested ``confusion_matrix`` dict with ``tn``, ``fp``,
                ``fn``, ``tp`` (the shape returned by ``train_and_evaluate``).

        The figure is shown with ``plt.show()``; nothing is returned.
        """
        # Set up the plotting style
        plt.style.use('ggplot')

        metrics = ['accuracy', 'precision', 'recall', 'specificity']
        model_names = list(results.keys())

        # One panel for the bar chart plus one per model, laid out two per row.
        # Sizing the grid from the model count avoids indexing past the grid
        # (three models still give the 2x2, 15x12 figure).
        n_cols = 2
        n_panels = 1 + len(model_names)
        n_rows = (n_panels + n_cols - 1) // n_cols
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False
        )
        panels = axes.ravel()

        # Grouped bar plot: one group per model, one bar per metric.
        ax = panels[0]
        x = np.arange(len(model_names))
        width = 0.2
        for i, metric in enumerate(metrics):
            ax.bar(x + i * width, [results[model][metric] for model in model_names],
                   width, label=metric.capitalize())
        # Centre each tick under its group of bars.
        ax.set_xticks(x + width * (len(metrics) - 1) / 2)
        ax.set_xticklabels(model_names)
        ax.set_title('Model Performance Metrics')
        ax.legend()

        # Confusion matrices fill the panels after the bar chart, in order.
        for ax, (name, result) in zip(panels[1:], results.items()):
            counts = result['confusion_matrix']
            cm = np.array([[counts['tn'], counts['fp']],
                           [counts['fn'], counts['tp']]])
            sns.heatmap(cm, annot=True, fmt='d', ax=ax)
            ax.set_title(f'Confusion Matrix - {name}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')

        # Hide any grid cell left over when the panel count is odd.
        for ax in panels[n_panels:]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()


# Main function

- Instantiate the FraudDetectionModel class.
- Load and preprocess the data.
- Apply SMOTE to balance the training set.
- Train the models and evaluate them using different metrics.
- Print the results.
- Visualize the model predictions and conclusion.

In [None]:
# Build the detector that owns the candidate models and the scaler.
fraud_detector = FraudDetectionModel()

# Step 1: load the data, split it and scale the features.
X_train, X_test, y_train, y_test = fraud_detector.load_and_preprocess()

# Step 2: oversample the training split only; the test split stays untouched.
X_train_balanced, y_train_balanced = fraud_detector.apply_smote(X_train, y_train)

# Step 3: fit every model on the balanced data and score it on the test split.
results = fraud_detector.train_and_evaluate(
    X_train_balanced, X_test, y_train_balanced, y_test
)

# Step 4: print the scalar metrics of each model.
for model_name, metrics in results.items():
    print(f"\nResults for {model_name}:")
    for metric_name, value in metrics.items():
        # The confusion matrix is a nested dict, not a number; skip it here.
        if metric_name == 'confusion_matrix':
            continue
        print(f"{metric_name}: {value:.4f}")

# Step 5: visual comparison of all models.
fraud_detector.plot_results(results)



AttributeError: 'FraudDetectionModel' object has no attribute 'load_and_preprocess'