In [84]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [85]:
url = 'https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv'
data = pd.read_csv(url)

In [86]:
X = data.drop('Class', axis=1)
y = data['Class']

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
conf = 1.96  # 95% confidence interval
error = 0.05  # margin of error

In [89]:
n1 = int(np.ceil((conf**2 * 0.5 * 0.5) / (error**2)))
n2 = int(np.ceil((conf**2 * 0.05 * (1-0.05)) / (error**2)))
n3 = int(np.ceil((conf**2 * 0.05 * (1-0.05)) / (error**2)))
n4 = int(np.ceil((conf**2 * 0.05 * (1-0.05)) / (error**2)))
n5 = int(np.ceil((conf**2 * 0.05 * (1-0.05)) / (error**2)))

In [90]:
slr1 = RandomUnderSampler(sampling_strategy='majority', random_state=42)
slr2 = RandomOverSampler(sampling_strategy='minority', random_state=42)
slr3 = SMOTE(sampling_strategy='minority', random_state=42)
slr4 = TomekLinks(sampling_strategy='majority')
slr5 = NearMiss(version=3, n_neighbors=3)

In [91]:
mdl1 = GradientBoostingClassifier()
mdl2 = AdaBoostClassifier(n_estimators=100, random_state=42)
mdl3 = RandomForestClassifier(random_state=42)
mdl4 = SVC(random_state=42)
mdl5 = KNeighborsClassifier(n_neighbors=3)

In [92]:
samplers = {
    'Sampling1': slr1,
    'Sampling2': slr2,
    'Sampling3': slr3,
    'Sampling4': slr4,
    'Sampling5': slr5,
}

In [93]:
models = {
    'M1': mdl1,
    'M2': mdl2,
    'M3': mdl3,
    'M4': mdl4,
    'M5': mdl5,
}

In [94]:
results = {}
for sampler_name, sampler in samplers.items():
    if sampler_name == 'Sampling1':
        n = n1
    elif sampler_name == 'Sampling2':
        n = n2
    elif sampler_name == 'Sampling3':
        n = n3
    elif sampler_name == 'Sampling4':
        n = n4
    else:
        n = n5

    
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    
    
    if len(X_resampled) > n:
        X_resampled = X_resampled[:n]
        y_resampled = y_resampled[:n]
    
    for model_name, model in models.items():
        
        model.fit(X_resampled, y_resampled)
        
        
        y_pred = model.predict(X_test)
        
        
        accuracy = accuracy_score(y_test, y_pred)
        
        
        if model_name in results:
            results[model_name][sampler_name] = accuracy
        else:
            results[model_name] = {sampler_name: accuracy}
            

In [95]:

print('Results for different models and samples:')
print('        Sampling1   Sampling2   Sampling3   Sampling4   Sampling5')
for model_name, model_results in results.items():
    print(model_name, end='')
    for sampler_name in samplers.keys():
        if sampler_name in model_results:
            print(f'    {model_results[sampler_name]:.4f}   ', end='')
        else:
            print('              ', end='')
    print() 

Results:
        Sampling1   Sampling2   Sampling3   Sampling4   Sampling5
M1    0.5742       0.9871       0.9742       0.9871       0.8452   
M2    0.5742       0.9935       0.9935       0.9935       0.6194   
M3    0.7097       0.9935       0.9935       0.9935       0.7677   
M4    0.6258       0.9935       0.9935       0.9935       0.5161   
M5    0.8258       0.9935       0.9935       0.9935       0.5871   
