In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

df = pd.read_csv("Creditcard_data.csv")

X = df.drop("Class", axis=1)
y = df["Class"]

models = {
    "M1_Logistic": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_KNN": KNeighborsClassifier(),
    "M5_NaiveBayes": GaussianNB()
}

results = []

sample = df.sample(frac=0.3, random_state=42)
X_s, y_s = sample.drop("Class", axis=1), sample["Class"]

X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2)

for name, model in models.items():
    if y_train.nunique() < 2:
        continue
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results.append([name, "Simple Random", acc])

step = 10
sample = df.iloc[::step]
X_s, y_s = sample.drop("Class", axis=1), sample["Class"]

X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2)

for name, model in models.items():
    if y_train.nunique() < 2:
        continue
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results.append([name, "Systematic", acc])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

for name, model in models.items():
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results.append([name, "Stratified", acc])


kmeans = KMeans(n_clusters=2, random_state=42)
df["Cluster"] = kmeans.fit_predict(X)

sample = df[df["Cluster"] == 0]
X_s, y_s = sample.drop(["Class", "Cluster"], axis=1), sample["Class"]

X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2)

for name, model in models.items():
    if y_train.nunique() < 2:
        continue
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results.append([name, "Cluster", acc])


skf = StratifiedKFold(n_splits=5)

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=skf, scoring="accuracy")
    results.append([name, "Cross Validation", scores.mean()])


sample = df.sample(frac=1, replace=True, random_state=42)
X_s, y_s = sample.drop("Class", axis=1), sample["Class"]

X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2)

for name, model in models.items():
    if y_train.nunique() < 2:
        continue
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    results.append([name, "Bootstrap", acc])

results_df = pd.DataFrame(results, columns=["Model", "Sampling", "Accuracy"])
print(results_df.pivot(index="Model", columns="Sampling", values="Accuracy"))

Sampling         Bootstrap  Cluster  Cross Validation  Simple Random  \
Model                                                                  
M1_Logistic       0.987097   1.0000          0.985748            1.0   
M2_DecisionTree   0.987097   0.9875          0.790943            1.0   
M3_RandomForest   1.000000   1.0000          0.989644            1.0   
M4_KNN            0.974194   1.0000          0.988345            1.0   
M5_NaiveBayes     0.974194   0.9750          0.946862            1.0   

Sampling         Stratified  
Model                        
M1_Logistic        0.987097  
M2_DecisionTree    0.967742  
M3_RandomForest    0.987097  
M4_KNN             0.987097  
M5_NaiveBayes      0.980645  
