In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Pretty plots: apply seaborn's global theme with the 'whitegrid' style,
# which affects every figure drawn after this cell runs.
sns.set(style='whitegrid')

In [None]:
# Read the transactions and discard rows with missing values in one chained
# step, so re-running this cell always rebuilds `df` straight from the file.
df = pd.read_csv('creditcard.csv').dropna()

# Preview the first few rows
df.head()

### Exploratory Data Analysis (EDA)

Before diving into modeling, we explore the dataset to understand its structure:

- The dataset contains **284,807 transactions**, out of which only **492 are fraudulent**
- This means only about **0.17%** of transactions are fraudulent — a highly imbalanced dataset
- Most features (`V1` to `V28`) are PCA components; `Amount` and `Time` are the only non-anonymized columns
- We'll need to scale features and use an anomaly detection model due to this extreme imbalance

In [None]:
# Summary statistics for every numeric column
print(df.describe())

# Count transactions per class (fraud vs. non-fraud) once and reuse the
# result for both the printed table and the chart below.
class_counts = df['Class'].value_counts()
print(class_counts)

# Bar chart of the class balance, drawn on an explicit Axes instead of the
# implicit pyplot state (pyplot is already imported as `plt` in the first cell,
# so no re-import is needed here).
fig, ax = plt.subplots()
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Fraud vs. Non-Fraud Transactions')
ax.set_xlabel('Class (0 = Not Fraud, 1 = Fraud)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Dimensions of the frame as (rows, columns)
frame_shape = df.shape
print("Shape:", frame_shape)

# Number of null entries in each column
nulls_per_column = df.isnull().sum()
print("Missing values:\n", nulls_per_column)

# How many rows carry each `Class` label
label_counts = df['Class'].value_counts()
print("\nClass distribution:\n", label_counts)

In [None]:
# Show the class imbalance as a seaborn count plot; the title is set on the
# Axes object that countplot returns.
imbalance_ax = sns.countplot(data=df, x='Class')
imbalance_ax.set_title("Fraud (1) vs Non-Fraud (0)")
plt.show()

### Preprocessing

To prepare the data for modeling:
- We separate the features from the target variable (`Class`)
- We scale the features using StandardScaler to normalize the range of values
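- Isolation Forest splits on one feature at a time, so it is largely insensitive to feature scale; standardizing mainly keeps the features comparable for scale-sensitive models (e.g. One-Class SVM or Autoencoders) that we may try later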

In [None]:
# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Modeling with Isolation Forest

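We apply the Isolation Forest algorithm — an unsupervised model that isolates each point with random splits; points that can be isolated in fewer splits are scored as more anomalous. It suits fraud detection because it does not need labeled data to flag rare cases. (In this notebook the labels are used only to set the `contamination` rate and to evaluate the predictions.)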

In [None]:
# Expected share of anomalies, taken from the observed fraud rate.
# NOTE: this reads the labels `y`, so the decision threshold is not chosen in a
# purely unsupervised way; in a label-free setting this value must be estimated.
fraud_rate = y.mean()

# Fixed random_state keeps the fitted forest reproducible across runs
iso_forest = IsolationForest(contamination=fraud_rate, random_state=42)
iso_forest.fit(X_scaled)

# IsolationForest.predict returns -1 for anomalies and 1 for inliers.
# Keep the raw output under its own name so `y_pred` only ever holds 0/1 labels.
raw_pred = iso_forest.predict(X_scaled)

# Map to the `Class` convention: -1 -> 1 (fraud), 1 -> 0 (normal).
# Vectorized with NumPy instead of a Python-level loop over every transaction.
y_pred = np.where(raw_pred == -1, 1, 0)

### Evaluation

We compare the predicted anomalies with the true fraud labels using:
- Confusion Matrix
- Classification Report (Precision, Recall, F1-score)

These metrics show how well the model identified actual fraud cases.


In [None]:
# Tabulate true labels against predicted labels
conf_mat = confusion_matrix(y, y_pred)
print("Confusion Matrix:\n", conf_mat)

# Per-class precision, recall and F1, reported to four decimals
report_text = classification_report(y, y_pred, digits=4)
print("\nClassification Report:\n", report_text)

In [None]:
# Confusion matrix as an annotated heatmap. seaborn is already imported as
# `sns` in the first cell, so it is not re-imported here.
# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted)
# even if one class never appears in the predictions, so the tick labels
# below always line up.
cm = confusion_matrix(y, y_pred, labels=[0, 1])
class_names = ['Not Fraud', 'Fraud']

# Draw on an explicit Axes rather than relying on the implicit pyplot state
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Purples',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

### Conclusion

- Isolation Forest successfully detected anomalies in a highly imbalanced credit card transaction dataset.
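- Despite being unsupervised, the model caught a meaningful portion of fraud cases; see the fraud-class precision and recall in the classification report above for the exact figures. Note that the model was fit and evaluated on the same data, and `contamination` was set from the true fraud rate, so these figures are optimistic compared with a truly label-free deployment.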
- Future work can include trying other models like Autoencoders, One-Class SVM, or combining models for better results.