In [30]:
#importing dataset
import numpy as np
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv')

In [31]:
#defining sampling techniques
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import  NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
sample1 = RandomUnderSampler(random_state=42)
sample2 = RandomOverSampler(random_state=42)
sample3 = SMOTE(sampling_strategy='minority', random_state=42)
sample4 = TomekLinks(sampling_strategy='majority')#we’ll use ratio='majority' to resample the majority class.
sample5 = NearMiss(version=3, n_neighbors=3)


In [32]:
#defining models
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  
from sklearn.ensemble import ExtraTreesClassifier
model1 = GaussianNB()
model2 = SVC(kernel='linear')
model3 = DecisionTreeClassifier(random_state=42)
model4 = RandomForestClassifier(random_state=42)
model5 = ExtraTreesClassifier(random_state=42)



In [33]:
# Splitting the dataset into features and target
X = df.drop('Class', axis=1)
y = df['Class']

from sklearn.model_selection import train_test_split
# Splitting the data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
samples = {
    'Sampling1': sample1,
    'Sampling2': sample2,
    'Sampling3': sample3,
    'Sampling4': sample4,
    'Sampling5': sample5,
}
models = {
    'NaiveBayes  ': model1,
    'SVM         ': model2,
    'DecicionTree': model3,
    'RandomForest': model4,
    'ExtraTree   ': model5,
}

In [35]:
z = 1.96  # for 95% confidence interval
m = 0.05  # margin of error
n1 = int(np.ceil((z**2 * 0.05 * 0.5 / (m**2))))
n2 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n3 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n4 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n5 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))

In [40]:
# Evaluating accuracy of all models on each sampling technique
from sklearn.metrics import accuracy_score
result = {}
for sampler_name, sampler in samples.items():
    if sampler_name == 'Sampling1':
        n = n1
    elif sampler_name == 'Sampling2':
        n = n2
    elif sampler_name == 'Sampling3':
        n = n3
    elif sampler_name == 'Sampling4':
        n = n4
    else:
        n = n5

    # Undersample or oversample the training data
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    
    # Limit the resampled data to the sample size
    if len(X_resampled) > n:
        X_resampled = X_resampled[:n]
        y_resampled = y_resampled[:n]
    
    for model_name, model in models.items():
        # Train the model on the resampled data
        model.fit(X_resampled, y_resampled)
        
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Calculate the accuracy score
        accuracy = accuracy_score(y_test, y_pred)
        if model_name in result:
            result[model_name][sampler_name] = accuracy
        else:
            result[model_name] = {sampler_name: accuracy}
            


In [41]:
# Print the results
print('                  Sampling1   Sampling2   Sampling3   Sampling4   Sampling5')
for model_name, model_results in result.items():
    print(model_name, end='')
    for sampler_name in samplers.keys():
        if sampler_name in model_results:
            print(f'    {model_results[sampler_name]:.4f}   ', end='')
        else:
            print('              ', end='')
    print() 

                  Sampling1   Sampling2   Sampling3   Sampling4   Sampling5
NaiveBayes      0.6968       0.9935       0.9935       0.9935       0.4839   
SVM             0.6258       0.9871       0.9871       0.9871       0.3613   
DecicionTree    0.5742       0.9806       0.9806       0.9806       0.6968   
RandomForest    0.7097       0.9935       0.9935       0.9935       0.7677   
ExtraTree       0.8065       0.9935       0.9935       0.9935       0.6516   
