In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
import random


In [2]:
def evaluate_models(models, model_names, X_train, X_test, y_train, y_test, sample_technique):
    """Fit each model on the training split and record its test accuracy.

    Parameters
    ----------
    models : iterable
        Objects exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called
        on the objects passed in; no copies are made.
    model_names : iterable of str
        Display names, paired positionally with ``models``.
    X_train, y_train
        Training features and labels handed to ``fit``.
    X_test, y_test
        Held-out features handed to ``predict`` and the labels the
        predictions are scored against with ``accuracy_score``.
    sample_technique : str
        Label stored under the ``'Sample Technique'`` key of the result.

    Returns
    -------
    dict
        ``{'Sample Technique': sample_technique, <name>: <accuracy>, ...}``
        with one accuracy entry per model.

    Raises
    ------
    ValueError
        If ``models`` and ``model_names`` do not have the same length.
    """
    # Materialise both inputs so their lengths can be compared; a bare zip()
    # would silently drop the unmatched tail of the longer one.
    models = list(models)
    model_names = list(model_names)
    if len(models) != len(model_names):
        raise ValueError(
            f"models and model_names must have the same length "
            f"(got {len(models)} and {len(model_names)})"
        )

    results = {'Sample Technique': sample_technique}

    for model, name in zip(models, model_names):
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        results[name] = accuracy
        print(f"{name} : {accuracy:.4f}")

    return results


In [3]:
# Read the credit-card dataset and show how many rows fall in each class.
df = pd.read_csv('/kaggle/input/creditcard/Creditcard_data.csv')
df['Class'].value_counts()

Class
0    763
1      9
Name: count, dtype: int64

In [4]:
# The whole 'Amount' column is passed to sklearn's normalize() as a single
# row, so it is rescaled as one vector rather than value by value.
Amount = normalize([df['Amount']])[0]
# Write the rescaled values back, then drop the first column by position.
# NOTE(review): the positional drop removes one more column every time this
# cell is re-run without reloading df.
df = df.assign(Amount=Amount).iloc[:, 1:]

In [5]:
# Simple Random Sampling
# Sample size computed as z^2 * p * (1 - p) / e^2 with z = 1.96, p = 0.5, e = 0.05.
n = int((1.96*1.96 * 0.5*0.5)/(0.05**2))
SimpleSampling = df.sample(n=n, random_state=42)
# Print the shape explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so nothing would be shown otherwise.
print(SimpleSampling.shape)
X = SimpleSampling.drop('Class', axis=1)
y = SimpleSampling['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Classifiers under comparison, each paired with the label used in the
# printed scores and in the results table.
named_models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('Decision Trees', DecisionTreeClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
]
models = [estimator for _, estimator in named_models]
model_names = [label for label, _ in named_models]


In [7]:
# Score every classifier on the simple-random-sampling split.
results_simple_random = evaluate_models(
    models=models,
    model_names=model_names,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    sample_technique='Simple Random Sampling',
)

Random Forest : 0.9870
Logistic Regression : 0.9870
Naive Bayes : 0.9610
Decision Trees : 0.9610
KNN : 0.9870


In [8]:
# Systematic Sampling
# Shuffle the rows once (seeded), renumber them, then keep every
# `sampling_interval`-th row.
SystematicSampling = df.sample(frac=1, random_state=42).reset_index(drop=True)
sampling_interval = 2
SystematicSample = SystematicSampling.iloc[::sampling_interval]
# Print the shape explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so nothing would be shown otherwise.
print(SystematicSample.shape)
X = SystematicSample.drop('Class', axis=1)
y = SystematicSample['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Score every classifier on the systematic-sampling split.
results_systematic = evaluate_models(
    models=models,
    model_names=model_names,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    sample_technique='Systematic Sampling',
)


Random Forest : 0.9744
Logistic Regression : 0.9744
Naive Bayes : 0.9615
Decision Trees : 0.8846
KNN : 0.9744


In [10]:
# Cluster Sampling
# NOTE(review): KMeans is fit on every column of df, 'Class' included.
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, n_init='auto', random_state=42)
# Give the labels df's own index so the boolean mask below lines up with df
# row-for-row whatever index df carries.
clusters = pd.Series(kmeans.fit_predict(df), index=df.index)
# Draw the clusters from a dedicated seeded generator so the same three are
# picked on every run, matching the fixed seed used by the other cells.
selected_clusters = random.Random(42).sample(range(num_clusters), 3)
ClusterSample = df.loc[clusters.isin(selected_clusters)]
print(ClusterSample.shape)
X = ClusterSample.drop('Class', axis=1)
y = ClusterSample['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(408, 30)


In [11]:
# Score every classifier on the cluster-sampling split.
results_cluster = evaluate_models(
    models=models,
    model_names=model_names,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    sample_technique='Cluster Sampling',
)

Random Forest : 0.9756
Logistic Regression : 0.9756
Naive Bayes : 0.9756
Decision Trees : 0.9756
KNN : 0.9756


In [12]:
# Stratified Sampling
# Draw the same fraction of rows from each 'Class' group. The draw is seeded
# so the sample is the same on every run.
StratifiedSampling = df.groupby('Class')
StratifiedSample = StratifiedSampling.sample(frac=0.45, random_state=42)
# Print the shape explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so nothing would be shown otherwise.
print(StratifiedSample.shape)
X = StratifiedSample.drop('Class', axis=1)
y = StratifiedSample['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Score every classifier on the stratified-sampling split.
results_stratified = evaluate_models(
    models=models,
    model_names=model_names,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    sample_technique='Stratified Sampling',
)


Random Forest : 1.0000
Logistic Regression : 0.9857
Naive Bayes : 1.0000
Decision Trees : 0.9714
KNN : 1.0000


In [14]:
# Bootstrap Sampling
# Resample df with replacement until at least `desired_sample_size` rows have
# been collected (at most `n_bootstrap` passes), then keep the first
# `desired_sample_size` rows.
n_bootstrap = 100
desired_sample_size = 400
resamples = []
rows_collected = 0
for i in range(n_bootstrap):
    # Offset the seed per pass: reusing one fixed seed would make every pass
    # return the identical resample. The first pass still uses seed 42.
    resampled_data = df.sample(n=len(df), replace=True, random_state=42 + i)
    resamples.append(resampled_data)
    rows_collected += len(resampled_data)
    if rows_collected >= desired_sample_size:
        break
# Concatenate once at the end instead of growing a DataFrame inside the loop.
BootstrapSamples = pd.concat(resamples).iloc[:desired_sample_size, :]
print("Final Shape of Bootstrap Samples DataFrame:", BootstrapSamples.shape)
X = BootstrapSamples.drop('Class', axis=1)
y = BootstrapSamples['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Final Shape of Bootstrap Samples DataFrame: (400, 30)


In [15]:
# Score every classifier on the bootstrap-sampling split.
results_bootstrap = evaluate_models(
    models=models,
    model_names=model_names,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    sample_technique='Bootstrap Sampling',
)

Random Forest : 1.0000
Logistic Regression : 0.9625
Naive Bayes : 0.9500
Decision Trees : 0.9875
KNN : 0.9500


In [16]:
# One row per sampling technique, one column per classifier; the table is
# also written to disk without the index column.
sampling_results = [
    results_simple_random,
    results_systematic,
    results_cluster,
    results_stratified,
    results_bootstrap,
]
results_df = pd.DataFrame(sampling_results)
results_df.to_csv('sampling_results.csv', index=False)

In [18]:
# Show the accuracy table: one row per sampling technique, one column per model.
results_df

Unnamed: 0,Sample Technique,Random Forest,Logistic Regression,Naive Bayes,Decision Trees,KNN
0,Simple Random Sampling,0.987013,0.987013,0.961039,0.961039,0.987013
1,Systematic Sampling,0.974359,0.974359,0.961538,0.884615,0.974359
2,Cluster Sampling,0.97561,0.97561,0.97561,0.97561,0.97561
3,Stratified Sampling,1.0,0.985714,1.0,0.971429,1.0
4,Bootstrap Sampling,1.0,0.9625,0.95,0.9875,0.95
