In [92]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
import pandas as pd


In [93]:
url = 'https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv'
data = pd.read_csv(url)

In [94]:
X = data.drop('Class', axis=1)
y = data['Class']


In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)


In [96]:
# z-value and margin of error 
z = 2.33  # 98% confidence interval
m = 0.05  # margin of error

# sample size 
n1 = int(np.ceil((z**2 * 0.5 * 0.5) / (m**2)))
n2 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n3 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n4 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n5 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))


In [97]:
s1 = RandomUnderSampler(sampling_strategy='majority', random_state=8)
s2 = TomekLinks(sampling_strategy='majority')
s3 = NearMiss(version=3, n_neighbors=3)
s4 = RandomOverSampler(sampling_strategy='minority', random_state=8)
s5 = SMOTE(sampling_strategy='minority', random_state=8)

m1 = SVC(random_state=8)
m2 = RandomForestClassifier(random_state=8)
m3 = DecisionTreeClassifier(random_state=8)
m4 = LogisticRegression(random_state=8,max_iter=500)
m5 = GaussianNB()


In [98]:
# Define a dictionary to hold the sampling techniques and models
samplers = {
    'Sampling1': s1,
    'Sampling2': s2,
    'Sampling3': s3,
    'Sampling4': s4,
    'Sampling5': s5,
}
models = {
    'M1': m1,
    'M2': m2,
    'M3': m3,
    'M4': m4,
    'M5': m5,
}



In [99]:
# Evaluate each model on each sampling technique
results = {}
for sampler_name, sampler in samplers.items():
    if sampler_name == 'Sampling1':
        n = n1
    elif sampler_name == 'Sampling2':
        n = n2
    elif sampler_name == 'Sampling3':
        n = n3
    elif sampler_name == 'Sampling4':
        n = n4
    else:
        n = n5

    # Undersample or oversample the training data
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    
    # Limit the resampled data to the sample size
    if len(X_resampled) > n:
        X_resampled = X_resampled[:n]
        y_resampled = y_resampled[:n]
    
    for model_name, model in models.items():
        # Train the model on the resampled data
        model.fit(X_resampled, y_resampled)
        
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Calculate the accuracy score
        accuracy = accuracy_score(y_test, y_pred)
        
        # Add the accuracy score to the results dictionary
        if model_name in results:
            results[model_name][sampler_name] = accuracy
        else:
            results[model_name] = {sampler_name: accuracy}


In [101]:
# Print the results
print('Results:')
print('     Sampling1   Sampling2   Sampling3   Sampling4   Sampling5')
for model_name, model_results in results.items():
    print(model_name, end='')
    for sampler_name in samplers.keys():
        if sampler_name in model_results:
            print(f'    {model_results[sampler_name]:.4f}   ', end='')
        else:
            print('              ', end='')
    print() 


Results:
     Sampling1   Sampling2   Sampling3   Sampling4   Sampling5
M1    0.4387       0.9871       0.4387       0.9871       0.9871   
M2    0.4581       0.9871       0.6645       0.9871       0.9871   
M3    0.4839       0.9742       0.6968       0.9742       0.9742   
M4    0.2968       0.9613       0.3226       0.9613       0.9613   
M5    0.2968       0.9871       0.4516       0.9871       0.9871   
