In [1]:
import pandas as pd

df = pd.read_csv("Creditcard_data.csv")

print("Shape of dataset:", df.shape)
print("\nClass distribution:")
print(df['Class'].value_counts())


Shape of dataset: (772, 31)

Class distribution:
Class
0    763
1      9
Name: count, dtype: int64


In [2]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df.Class == 0]
df_minority = df[df.Class == 1]

# Oversample minority class
df_minority_oversampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

# Combine balanced dataset
df_balanced = pd.concat([df_majority, df_minority_oversampled])

print("\nBalanced class distribution:")
print(df_balanced['Class'].value_counts())



Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64


In [3]:
from sklearn.model_selection import train_test_split

samples = {}

for i in range(1, 6):
    samples[f"Sampling{i}"], _ = train_test_split(
        df_balanced,
        train_size=0.8,
        random_state=42 + i,
        stratify=df_balanced['Class']
    )
    print(f"Sampling{i} shape:", samples[f"Sampling{i}"].shape)


Sampling1 shape: (1220, 31)
Sampling2 shape: (1220, 31)
Sampling3 shape: (1220, 31)
Sampling4 shape: (1220, 31)
Sampling5 shape: (1220, 31)


In [4]:
X_samples = {}
y_samples = {}

for key, sample in samples.items():
    X_samples[key] = sample.drop('Class', axis=1)
    y_samples[key] = sample['Class']
#simple random sampling

In [5]:
#Systematic Sampling
def systematic_sampling(X, y, k=2):
    return X.iloc[::k], y.iloc[::k]

X_samples["Sampling2"], y_samples["Sampling2"] = systematic_sampling(
    X_samples["Sampling2"], y_samples["Sampling2"]
)


In [6]:
#bootstrap sampling
from sklearn.utils import resample

X_samples["Sampling4"], y_samples["Sampling4"] = resample(
    X_samples["Sampling4"],
    y_samples["Sampling4"],
    replace=True,
    n_samples=len(X_samples["Sampling4"]),
    random_state=42
)


In [7]:
#Cluster
X_samples["Sampling5"] = X_samples["Sampling5"].iloc[:len(X_samples["Sampling5"])//2]
y_samples["Sampling5"] = y_samples["Sampling5"].iloc[:len(y_samples["Sampling5"])//2]


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": KNeighborsClassifier(n_neighbors=5),
    "M3": DecisionTreeClassifier(random_state=42),
    "M4": RandomForestClassifier(random_state=42),
    "M5": SVC()
}


In [9]:
results = {}

for samp_key in X_samples:
    X = X_samples[samp_key]
    y = y_samples[samp_key]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    results[samp_key] = {}

    for model_key, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[samp_key][model_key] = round(acc * 100, 2)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [10]:
accuracy_table = pd.DataFrame(results).T

print("\nAccuracy Table:")
print(accuracy_table)

print("\nAverage Accuracy for each Model:")
print(accuracy_table.mean())



Accuracy Table:
              M1     M2      M3     M4     M5
Sampling1  92.62  97.54   99.59  100.0  72.13
Sampling2  95.08  95.90   99.18  100.0  63.93
Sampling3  91.39  96.72  100.00  100.0  75.41
Sampling4  92.62  98.36  100.00  100.0  75.41
Sampling5  95.90  93.44  100.00  100.0  65.57

Average Accuracy for each Model:
M1     93.522
M2     96.392
M3     99.754
M4    100.000
M5     70.490
dtype: float64
