In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTETomek

# Load Dataset
df = pd.read_csv('Creditcard_data.csv')
print("Original Shape:", df.shape)

# Check for missing values
print("\nMissing Values:\n", df.isnull().sum())

# Class Distribution
print("\nClass Distribution:\n", df['Class'].value_counts())

# Balance dataset using SMOTE (or choose another)
X = df.drop('Class', axis=1)
y = df['Class']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X_scaled, y)
print("\nBalanced Class Distribution:\n", pd.Series(y_bal).value_counts())

# Create 5 samples using sampling size formula (n = Z²*p*(1-p)/e²)
sample_size = int(1.96**2 * 0.5 * 0.5 / 0.05**2)  # ~385, rounding to 400 for simplicity
samples = []

for i in range(5):
    idx = np.random.choice(len(X_bal), sample_size, replace=False)
    samples.append((X_bal[idx], y_bal[idx]))

# Define models
models = {
    'M1': LogisticRegression(max_iter=1000),
    'M2': DecisionTreeClassifier(),
    'M3': RandomForestClassifier(),
    'M4': GaussianNB(),
    'M5': GradientBoostingClassifier()
}

# Define sampling techniques
samplers = {
    'Sampling1': RandomUnderSampler(),
    'Sampling2': RandomOverSampler(),
    'Sampling3': SMOTE(),
    'Sampling4': NearMiss(),
    'Sampling5': SMOTETomek()
}

# Accuracy matrix
accuracy_matrix = pd.DataFrame(columns=samplers.keys(), index=models.keys())

for model_name, model in models.items():
    for i, (samp_name, sampler) in enumerate(samplers.items()):
        X_s, y_s = sampler.fit_resample(samples[i][0], samples[i][1])
        X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.3, random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = round(accuracy_score(y_test, preds) * 100, 2)
        accuracy_matrix.loc[model_name, samp_name] = acc

# Show results
print("\n📊 Accuracy Matrix:\n")
print(accuracy_matrix)
m = accuracy_matrix

# Determine best technique per model
best = accuracy_matrix.astype(float).idxmax(axis=1)
print("\n✅ Best Sampling Technique for Each Model:")
for model, technique in best.items():
    print(f"{model} → {technique}")


Original Shape: (772, 31)

Missing Values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

Class Distribution:
 Class
0    763
1      9
Name: count, dtype: int64

Balanced Class Distribution:
 Class
0    763
1    763
Name: count, dtype: int64

📊 Accuracy Matrix:

   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1     94.44     94.92     89.83     89.57      90.6
M2     97.22     95.76     90.68     93.91     93.16
M3     100.0     100.0     95.76     99.13     100.0
M4     70.37     75.42     77.12     60.87     71.79
M5      96.3     99.15     92.37     96.52     97.44

✅ Best Sampling Technique for Each Model:
M1 → Sampling2
M2 → Sampling1
M3 → S