In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import RandomOverSampler


In [4]:
df = pd.read_csv("Creditcard_data.csv")
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [11]:
X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train)

train_bal_df = pd.concat([X_train_bal, y_train_bal], axis=1)


In [16]:
def simple_random_sampling(data, size=0.8):
    return data.sample(frac=size, random_state=42)
def systematic_sampling(data, step=2):
    return data.iloc[::step, :]
def stratified_sampling(X, y, frac=0.8):
    X_s, _, y_s, _ = train_test_split(
        X, y,
        train_size=frac,
        stratify=y,
        random_state=42
    )
    return pd.concat([X_s, y_s], axis=1)
def cluster_sampling(data, n_clusters=5):
    data = data.copy()
    data["cluster"] = np.random.randint(0, n_clusters, size=len(data))
    selected_cluster = np.random.choice(data["cluster"].unique())
    return data[data["cluster"] == selected_cluster].drop("cluster", axis=1)
def bootstrap_sampling(data):
    return data.sample(frac=1, replace=True, random_state=42)


In [17]:
models = {
    "M1_Logistic": LogisticRegression(
        max_iter=1000, class_weight="balanced"
    ),
    "M2_DecisionTree": DecisionTreeClassifier(
        max_depth=5, random_state=42
    ),
    "M3_RandomForest": RandomForestClassifier(
        n_estimators=100, max_depth=8, random_state=42
    ),
    "M4_SVM": SVC(
        class_weight="balanced"
    ),
    "M5_KNN": KNeighborsClassifier(
        n_neighbors=7
    )
}


In [18]:
samples = {
    "Sampling1": simple_random_sampling(train_bal_df),
    "Sampling2": systematic_sampling(train_bal_df),
    "Sampling3": stratified_sampling(X_train_bal, y_train_bal),
    "Sampling4": cluster_sampling(train_bal_df),
    "Sampling5": bootstrap_sampling(train_bal_df)
}


In [19]:
results = {}

for model_name, model in models.items():
    acc_list = []

    for sample_name, sample_df in samples.items():
        X_sample = sample_df.drop("Class", axis=1)
        y_sample = sample_df["Class"]

        model.fit(X_sample, y_sample)
        y_pred = model.predict(X_test)

        acc_list.append(round(accuracy_score(y_test, y_pred) * 100, 2))

    results[model_name] = acc_list



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
result_df = pd.DataFrame(
    results,
    index=["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"]
).T

print(result_df)


                 Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1_Logistic          90.32      90.97      90.97      87.74      87.10
M2_DecisionTree      98.71      96.77      98.71      96.13      96.77
M3_RandomForest      99.35      99.35      99.35      99.35      99.35
M4_SVM               69.03      69.03      69.03      67.10      70.97
M5_KNN               96.77      95.48      96.77      90.32      96.77
