In [30]:
from math import ceil

import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [31]:
# Read the credit-card dataset into a DataFrame.
# `credit` is an absolute path under /content, so the CSV must exist there.
credit = "/content/Creditcard_data.csv"
data = pd.read_csv(filepath_or_buffer=credit)

In [32]:
# Separate the label from the features: 'Class' is the target column,
# every remaining column of `data` is kept as a feature.
y = data["Class"]
X = data.drop(columns=["Class"])

In [33]:
# Handle the imbalanced dataset by resampling the whole of (X, y) with SMOTE.
# The seed is fixed so the resampled data is the same on every run.
# NOTE(review): this runs before the train/test splits made in later cells,
# so resampled rows can land in the held-out test data - confirm this is
# acceptable for the accuracy figures reported below.
smote = SMOTE(random_state=42)
(X_smote, y_smote) = smote.fit_resample(X, y)

In [34]:
# Sample size from n = Z^2 * p * (1 - p) / E^2, rounded up to a whole row.
#   Z: 1.96 is the z-score conventionally used for a 95% confidence level
#   p: 0.5 maximises p * (1 - p), giving the largest (most conservative) n
#   E: presumably the tolerated margin of error (5%)
Z, p, E = 1.96, 0.5, 0.05

numerator = Z**2 * p * (1 - p)
denominator = E**2
sample_size = ceil(numerator / denominator)

print(f"Calculated sample size: {sample_size}")

Calculated sample size: 385


In [35]:
# Draw five random samples of `sample_size` rows from the SMOTE-resampled data.
# Each draw uses its own seed (0..4), so the samples differ from one another
# but are identical from run to run.
# train_test_split returns [X_train, X_test, y_train, y_test]; the [::2] slice
# keeps the train-side X and y (positions 0 and 2) and drops the complements.
samples = [
    tuple(
        train_test_split(
            X_smote, y_smote, train_size=sample_size, random_state=seed
        )[::2]
    )
    for seed in range(5)
]
samples

[(      Time        V1        V2        V3        V4        V5        V6  \
  1213   501 -2.499838 -2.812298  1.807173  1.689883  1.934385 -0.199941   
  851    531  1.133531  0.384276  0.322125  0.630268 -0.233969 -0.929163   
  694    524 -0.292211  0.838605  1.360847 -0.001346  0.350836 -0.894645   
  137     84 -0.481376  1.003407  0.906184 -0.423864  0.760671  0.377627   
  1009   505 -0.195381  1.010810 -0.473275  2.036819 -0.424707 -1.213634   
  ...    ...       ...       ...       ...       ...       ...       ...   
  763    574 -0.402057  0.584300  2.474227  0.929684  0.014314  0.297490   
  835    532  0.244796  0.380242  0.971548  0.455242  0.255845 -0.990334   
  1216   424 -1.977808  1.576583 -0.800228  3.076116 -0.162537 -1.299571   
  559    417 -2.680348  1.872052  1.144712 -0.693664  0.155172  0.601325   
  684    517  1.314713 -0.328688  0.002645 -0.805044 -0.467260 -0.522747   
  
              V7        V8        V9  ...       V20       V21       V22  \
  1213 -0.

In [36]:
# Resampling strategies. Every entry is a callable taking (X, y) and returning
# a resampled (X, y) pair. Entries are looked up by position, so the order of
# this list is significant.
sampling_techniques = [
    # 0: passthrough - returns the data unchanged.
    lambda X, y: (X, y),
    # 1: random under-sampling (seeded).
    lambda X, y: RandomUnderSampler(random_state=42).fit_resample(X, y),
    # 2: SMOTE over-sampling (seeded).
    lambda X, y: SMOTE(random_state=42).fit_resample(X, y),
    # 3: combined SMOTE + ENN resampling (seeded).
    lambda X, y: SMOTEENN(random_state=42).fit_resample(X, y),
    # 4: Tomek-links under-sampling. This must be a single lambda so that
    # calling it yields the resampled (X, y) pair like the other entries,
    # rather than another function.
    lambda X, y: TomekLinks().fit_resample(X, y),
]

In [37]:
# Classifiers to compare. The position in this list becomes the "M1".."M5"
# label in the results table, so keep the order stable.
# The stochastic estimators are seeded so that re-running the notebook
# reproduces the same accuracies, matching the fixed seeds used for the
# resampling and the train/test splits.
models = [
    # 'saga' shuffles the data internally, so it needs a seed to be repeatable.
    LogisticRegression(max_iter=1000, solver='saga', random_state=42),
    # Bootstrap sampling and feature selection are random without a seed.
    RandomForestClassifier(random_state=42),
    SVC(),
    KNeighborsClassifier(),
    GaussianNB()
]

In [38]:
# Evaluate every model on every sample and collect the test accuracy.
# `results` ends up as a list of (sampling label, model label, accuracy)
# tuples, ordered sample-major then model.
scaler = StandardScaler()
results = []

for i, (X_sample, y_sample) in enumerate(samples):
    # The split, the scaling and the resampling depend only on the sample,
    # not on the model, so they are done once per sample instead of once per
    # (sample, model) pair. All of these steps are seeded, so the data each
    # model sees is the same as if they were recomputed for every model.
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.3, random_state=42
    )

    # Fit the scaler on the training split only and reuse it for the test
    # split, so no test statistics leak into the scaling.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The sample index doubles as the sampling-technique index. Only
    # indices 1-3 are applied; index 0 is a passthrough anyway.
    # NOTE(review): sampling_techniques[4] is never invoked, so "Sampling5"
    # is trained on un-resampled data just like "Sampling1" - confirm
    # whether that is intended.
    if i in [1, 2, 3]:
        X_res, y_res = sampling_techniques[i](X_train_scaled, y_train)
    else:
        X_res, y_res = X_train_scaled, y_train

    for j, model in enumerate(models):
        # fit() retrains the shared estimator instance from scratch each time.
        model.fit(X_res, y_res)

        y_pred = model.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        results.append((f"Sampling{i+1}", f"M{j+1}", acc))

In [39]:
# Turn the collected (sampling, model, accuracy) tuples into a table,
# persist it without the row index, and display it.
results_df = pd.DataFrame(
    data=results,
    columns=["Sampling Technique", "Model", "Accuracy"],
)
results_df.to_csv(path_or_buf="results.csv", index=False)
results_df

Unnamed: 0,Sampling Technique,Model,Accuracy
0,Sampling1,M1,0.887931
1,Sampling1,M2,0.982759
2,Sampling1,M3,0.974138
3,Sampling1,M4,0.827586
4,Sampling1,M5,0.836207
5,Sampling2,M1,0.896552
6,Sampling2,M2,1.0
7,Sampling2,M3,0.991379
8,Sampling2,M4,0.87931
9,Sampling2,M5,0.836207


In [40]:
# Best accuracy reached by each model across all samples.
# The result is a Series indexed by the model label ("M1".."M5").
best_accuracies = results_df.groupby("Model")["Accuracy"].max()

# Keep the index when writing: it holds the model labels, and without it the
# CSV would be a bare column of numbers with no indication of which model
# each accuracy belongs to.
best_accuracies.to_csv("best_accuracies.csv")
best_accuracies

Unnamed: 0_level_0,Accuracy
Model,Unnamed: 1_level_1
M1,0.931034
M2,1.0
M3,0.991379
M4,0.896552
M5,0.905172


In [41]:
# Final confirmation message once the cells above have written their CSV files.
print("results saved")

results saved
