<a href="https://colab.research.google.com/github/coderboic/Sampling/blob/main/Untitled20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek


In [2]:
data = pd.read_csv("Creditcard_data.csv")

X = data.drop("Class", axis=1)
y = data["Class"]


In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X_scaled, y)

In [5]:
samples = {}

for i in range(5):
    X_s, _, y_s, _ = train_test_split(
        X_bal, y_bal, train_size=0.7, random_state=i
    )
    samples[f"Sample_{i+1}"] = (X_s, y_s)


In [6]:
sampling_methods = {
    "Sampling1": RandomOverSampler(random_state=1),
    "Sampling2": RandomUnderSampler(random_state=2),
    "Sampling3": SMOTE(random_state=3),
    "Sampling4": SMOTETomek(random_state=4),
    "Sampling5": RandomOverSampler(random_state=5)
}


In [7]:
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(n_estimators=100),
    "M4": GaussianNB(),
    "M5": SVC()
}


In [8]:
results = pd.DataFrame(
    index=models.keys(),
    columns=sampling_methods.keys()
)

for s_name, sampler in sampling_methods.items():
    X_res, y_res = sampler.fit_resample(X_bal, y_bal)

    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=0.3, random_state=42
    )

    for m_name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds) * 100
        results.loc[m_name, s_name] = round(acc, 2)


In [9]:
print("Accuracy Comparison Table:\n")
print(results)


Accuracy Comparison Table:

   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1      91.7     90.83      91.7      91.7      91.7
M2     98.47     99.56     98.47     98.03     98.03
M3     100.0     100.0     99.78     100.0     99.78
M4     78.17      65.5     78.17     78.17     78.17
M5     96.51      97.6     96.51     96.51     96.51


In [10]:
best_sampling = results.astype(float).idxmax(axis=1)

summary = pd.DataFrame({
    "Best Sampling Technique": best_sampling,
    "Accuracy (%)": results.max(axis=1)
})

print(summary)


   Best Sampling Technique Accuracy (%)
M1               Sampling1         91.7
M2               Sampling2        99.56
M3               Sampling1        100.0
M4               Sampling1        78.17
M5               Sampling2         97.6
