In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE
)

In [None]:
# kagglehub.dataset_download("nareshbhat/wine-quality-binary-classification")

wine = pd.read_csv("C:\\Users\\danis\\Downloads\\Datasets smote\\wine.csv")

In [3]:
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,bad
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,bad
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,bad
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,good
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,bad
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,bad
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,good
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,good
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,bad


In [4]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   object 
dtypes: float64(11), object(1)
memory usage: 150.0+ KB


In [5]:
wine['quality'].value_counts()

quality
good    855
bad     744
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
label_encoder = LabelEncoder()
wine['quality'] = label_encoder.fit_transform(wine['quality'])

In [8]:
wine['quality'].value_counts()

quality
1    855
0    744
Name: count, dtype: int64

In [9]:
from imblearn.datasets import make_imbalance

In [10]:
from sklearn.model_selection import train_test_split
from collections import Counter

In [11]:
df_resampled, y_resampled = make_imbalance(wine, wine['quality'], sampling_strategy={ 0 : 350, 1: 855},random_state=42)
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['quality']

Original class distribution: Counter({1: 855, 0: 350})


In [12]:
def evaluate_oversampling(X, y, sampler, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    print("Class distribution after SMOTE:", Counter(y_train_resampled))

    
    # Train the classifier
    classifier.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1, X_test, y_test

oversamplers = {
    "RandomOverSampler": RandomOverSampler(sampling_strategy='minority'),
    "SMOTE": SMOTE(sampling_strategy='minority'),
    "ADASYN": ADASYN(sampling_strategy='minority'),
    "BorderlineSMOTE": BorderlineSMOTE(sampling_strategy='minority'),
    "SVMSMOTE": SVMSMOTE(sampling_strategy='minority')
}
resultsNB = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=GaussianNB())
    resultsNB.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsNB = pd.DataFrame(resultsNB)

# Print results
print(resultsNB)

Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 735, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.726141   0.765912  0.726141  0.735019
1              SMOTE  0.721992   0.767169  0.721992  0.731342
2             ADASYN  0.709544   0.772414  0.709544  0.719806
3    BorderlineSMOTE  0.713693   0.774458  0.713693  0.723795
4           SVMSMOTE  0.705394   0.754629  0.705394  0.715488


In [13]:
resultsKNN = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=KNeighborsClassifier())
    resultsKNN.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsKNN = pd.DataFrame(resultsKNN)

# Print results
print(resultsKNN)

Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 735, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.597510   0.642771  0.597510  0.610736
1              SMOTE  0.597510   0.636534  0.597510  0.609933
2             ADASYN  0.597510   0.659425  0.597510  0.611712
3    BorderlineSMOTE  0.614108   0.662130  0.614108  0.627087
4           SVMSMOTE  0.618257   0.646063  0.618257  0.628137


In [14]:
resultsRF = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=RandomForestClassifier())
    resultsRF.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsRF = pd.DataFrame(resultsRF)

# Print results
print(resultsRF)

Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 735, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
Class distribution after SMOTE: Counter({0: 690, 1: 690})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.792531   0.785825  0.792531  0.785124
1              SMOTE  0.821577   0.818954  0.821577  0.819874
2             ADASYN  0.800830   0.805739  0.800830  0.802767
3    BorderlineSMOTE  0.813278   0.813961  0.813278  0.813604
4           SVMSMOTE  0.821577   0.818954  0.821577  0.819874


_______________________________________________________________________________________________________________________________________________________________________

In [15]:
wine[wine['quality']==0].corr().abs().sum().sort_values()

quality                 0.000000
residual sugar          2.303573
volatile acidity        2.355570
alcohol                 2.401095
free sulfur dioxide     2.569588
chlorides               2.702847
total sulfur dioxide    2.876960
sulphates               2.924946
fixed acidity           3.584450
pH                      3.733440
density                 3.832888
citric acid             4.104481
dtype: float64

In [16]:
df_resampled2[df_resampled2['quality']==0].count()

fixed acidity           350
volatile acidity        350
citric acid             350
residual sugar          350
chlorides               350
free sulfur dioxide     350
total sulfur dioxide    350
density                 350
pH                      350
sulphates               350
alcohol                 350
quality                 350
dtype: int64

In [17]:
690-350

340

In [18]:
minority_samples = df_resampled2[df_resampled2['quality']==0]
random_values = minority_samples['citric acid'].sample(n=340, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,citric acid
0,0.58
1,0.24
2,0.03
3,0.10
4,0.17
...,...
335,0.43
336,0.05
337,0.23
338,0.19


In [19]:
random_values['fixed acidity'] = np.nan
random_values['volatile acidity'] = np.nan
random_values['residual sugar'] = np.nan
random_values['chlorides'] = np.nan
random_values['free sulfur dioxide'] = np.nan
random_values['total sulfur dioxide'] = np.nan
random_values['density'] = np.nan
random_values['pH'] = np.nan
random_values['sulphates'] = np.nan
random_values['alcohol'] = np.nan
random_values['quality'] = 0

In [20]:
random_values

Unnamed: 0,citric acid,fixed acidity,volatile acidity,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.58,,,,,,,,,,,0
1,0.24,,,,,,,,,,,0
2,0.03,,,,,,,,,,,0
3,0.10,,,,,,,,,,,0
4,0.17,,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
335,0.43,,,,,,,,,,,0
336,0.05,,,,,,,,,,,0
337,0.23,,,,,,,,,,,0
338,0.19,,,,,,,,,,,0


In [21]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [22]:
random_values['citric acid'] = calculate_percentiles(random_values['citric acid'])
random_values['citric acid'] = 'P' + random_values['citric acid'].round().astype(str,errors='ignore')

In [24]:
random_values['citric acid'].unique()

array(['P96.0', 'P54.0', 'P13.0', 'P28.0', 'P38.0', 'P41.0', 'P31.0',
       'P45.0', 'P0.0', 'P36.0', 'P1.0', 'P21.0', 'P73.0', 'P63.0',
       'P80.0', 'P88.0', 'P60.0', 'P86.0', 'P19.0', 'P82.0', 'P11.0',
       'P77.0', 'P16.0', 'P70.0', 'P2.0', 'P94.0', 'P46.0', 'P52.0',
       'P97.0', 'P34.0', 'P61.0', 'P99.0', 'P84.0', 'P29.0', 'P74.0',
       'P55.0', 'P100.0', 'P39.0', 'P64.0', 'P42.0', 'P78.0', 'P72.0',
       'P89.0', 'P3.0', 'P81.0', 'P85.0', 'P69.0', 'P17.0', 'P93.0',
       'P56.0', 'P83.0', 'P98.0', 'P4.0', 'P65.0', 'P9.0', 'P37.0',
       'P26.0', 'P50.0', 'P90.0', 'P12.0', 'P14.0', 'P43.0', 'P47.0',
       'P27.0', 'P79.0', 'P95.0', 'P5.0', 'P23.0', 'P10.0', 'P32.0',
       'P6.0', 'P66.0', 'P44.0', 'P91.0', 'P30.0', 'P75.0', 'P25.0',
       'P71.0', 'P53.0', 'P24.0', 'P40.0', 'P35.0', 'P15.0', 'P57.0',
       'P7.0', 'P48.0', 'P58.0', 'P51.0', 'P22.0', 'P76.0', 'P33.0',
       'P18.0', 'P92.0', 'P87.0', 'P8.0', 'P67.0', 'P49.0', 'P62.0',
       'P59.0', 'P20.0', 'P68

In [25]:
wineP = dataframe_to_percentiles(minority_samples)
wineP = 'P' + wineP.round().astype(str,errors='ignore')
wineP['quality'] = 0

In [26]:
wineP

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1334,P29.0,P91.0,P0.0,P31.0,P95.0,P2.0,P3.0,P24.0,P68.0,P24.0,P63.0,0
1163,P77.0,P89.0,P53.0,P10.0,P34.0,P32.0,P19.0,P50.0,P40.0,P79.0,P63.0,0
1484,P16.0,P95.0,P20.0,P31.0,P7.0,P3.0,P3.0,P22.0,P90.0,P71.0,P86.0,0
686,P25.0,P68.0,P40.0,P13.0,P14.0,P44.0,P47.0,P52.0,P79.0,P59.0,P3.0,0
266,P59.0,P92.0,P1.0,P87.0,P76.0,P83.0,P65.0,P96.0,P97.0,P87.0,P52.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1597,P3.0,P67.0,P33.0,P42.0,P27.0,P93.0,P52.0,P12.0,P94.0,P87.0,P75.0,0
1355,P5.0,P4.0,P59.0,P22.0,P61.0,P13.0,P39.0,P6.0,P63.0,P5.0,P71.0,0
153,P43.0,P56.0,P16.0,P22.0,P75.0,P83.0,P86.0,P8.0,P58.0,P38.0,P71.0,0
81,P56.0,P20.0,P100.0,P30.0,P100.0,P77.0,P70.0,P63.0,P10.0,P99.0,P26.0,0


In [27]:
ZERO = pd.concat([wineP,random_values], ignore_index=True)
ZERO['quality']=0
ZERO

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,P29.0,P91.0,P0.0,P31.0,P95.0,P2.0,P3.0,P24.0,P68.0,P24.0,P63.0,0
1,P77.0,P89.0,P53.0,P10.0,P34.0,P32.0,P19.0,P50.0,P40.0,P79.0,P63.0,0
2,P16.0,P95.0,P20.0,P31.0,P7.0,P3.0,P3.0,P22.0,P90.0,P71.0,P86.0,0
3,P25.0,P68.0,P40.0,P13.0,P14.0,P44.0,P47.0,P52.0,P79.0,P59.0,P3.0,0
4,P59.0,P92.0,P1.0,P87.0,P76.0,P83.0,P65.0,P96.0,P97.0,P87.0,P52.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
685,,,P84.0,,,,,,,,,0
686,,,P20.0,,,,,,,,,0
687,,,P54.0,,,,,,,,,0
688,,,P44.0,,,,,,,,,0


In [28]:


# Step 1: Identify unique values of 'pH'
unique_size = wineP['citric acid'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'citric acid') that has missing values
for column in wineP.columns:
    if column != 'citric acid' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'citric acid'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'citric acid' is equal to the current size_value
            non_missing_values = wineP.loc[wineP['citric acid'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [29]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['citric acid']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [30]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   fixed acidity         690 non-null    object
 1   volatile acidity      690 non-null    object
 2   citric acid           690 non-null    object
 3   residual sugar        690 non-null    object
 4   chlorides             690 non-null    object
 5   free sulfur dioxide   690 non-null    object
 6   total sulfur dioxide  690 non-null    object
 7   density               690 non-null    object
 8   pH                    690 non-null    object
 9   sulphates             690 non-null    object
 10  alcohol               690 non-null    object
 11  quality               690 non-null    int64 
dtypes: int64(1), object(11)
memory usage: 64.8+ KB


In [31]:
print(myMode.shape)
print(ZERO.shape)

(690, 12)
(690, 12)


In [32]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = wine.columns.difference(['quality'])
for variable in columns_to_impute:
    mapping_data[variable] = [wine[wine['quality']==0][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,alcohol,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,residual sugar,sulphates,total sulfur dioxide,volatile acidity
P0,8.400,0.03900,0.00,0.992560,4.600,3.00,2.7400,1.200,0.3300,6.00,0.18000
P1,9.000,0.04800,0.00,0.993400,5.600,3.00,2.9343,1.400,0.4000,8.00,0.24000
P2,9.000,0.05000,0.00,0.993700,5.686,3.00,3.0000,1.400,0.4300,10.00,0.28000
P3,9.100,0.05329,0.00,0.993972,6.000,4.00,3.0300,1.500,0.4400,11.00,0.31000
P4,9.100,0.05572,0.00,0.994236,6.100,4.00,3.0400,1.500,0.4400,11.72,0.32000
...,...,...,...,...,...,...,...,...,...,...,...
P96,11.400,0.16656,0.59,1.000114,11.600,37.28,3.5900,5.628,1.0356,131.56,0.93500
P97,11.500,0.19594,0.61,1.000400,12.071,40.00,3.6071,6.100,1.0971,137.84,0.96500
P98,11.907,0.28022,0.65,1.000600,12.414,45.14,3.6300,6.742,1.1700,143.14,1.02000
P99,12.857,0.40927,0.68,1.001628,12.871,51.00,3.6800,7.900,1.2800,147.00,1.10425


In [33]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['quality'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [34]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [35]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [37]:
percent = pd.concat([a, df_resampled2[df_resampled2['quality']==1][0:690]], ignore_index=True)

In [39]:
percent['quality'].value_counts()

quality
0    690
1    690
Name: count, dtype: int64

In [40]:
def evaluate_oversampling2(X, y, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(xtest)
    
    # Evaluate metrics
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred, average='weighted')
    recall = recall_score(ytest, y_pred, average='weighted')
    f1 = f1_score(ytest, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1

In [41]:
percent.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [42]:
classifiers = {
    "GaussianNaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
}

Percentile_Results = []

for name, classifier in classifiers.items():
    accuracy, precision, recall, f1 = evaluate_oversampling2(
       percent[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']]
        , percent[['quality']] ,
        classifier
    )
    Percentile_Results.append({
        "Classifier": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

Percentile_Results = pd.DataFrame(Percentile_Results)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)


In [43]:
print('Percentile\n',Percentile_Results,'\n')
print('GaussianNaiveBayes\n',resultsNB,'\n')
print('KNN\n',resultsKNN,'\n')
print('RandomForest\n',resultsRF)

# Percentile
#             Classifier  Accuracy  Precision    Recall  F1 Score
# 0  GaussianNaiveBayes  0.780083   0.782574  0.780083  0.781202
# 1                 KNN  0.726141   0.717548  0.726141  0.720525
# 2        RandomForest  0.809129   0.817332  0.809129  0.790545 

# GaussianNaiveBayes
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.717842   0.757866  0.717842  0.726989
# 1              SMOTE  0.726141   0.769412  0.726141  0.735251
# 2             ADASYN  0.709544   0.772414  0.709544  0.719806
# 3    BorderlineSMOTE  0.697095   0.762121  0.697095  0.707801
# 4           SVMSMOTE  0.709544   0.756851  0.709544  0.719409 

# KNN
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.576763   0.630578  0.576763  0.591374
# 1              SMOTE  0.609959   0.653262  0.609959  0.622603
# 2             ADASYN  0.564315   0.633372  0.564315  0.579680
# 3    BorderlineSMOTE  0.614108   0.655702  0.614108  0.626432
# 4           SVMSMOTE  0.672199   0.685731  0.672199  0.677608 

# RandomForest
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.813278   0.808367  0.813278  0.808085
# 1              SMOTE  0.800830   0.800830  0.800830  0.800830
# 2             ADASYN  0.809129   0.809129  0.809129  0.809129
# 3    BorderlineSMOTE  0.804979   0.808916  0.804979  0.806585
# 4           SVMSMOTE  0.792531   0.789881  0.792531  0.790970

Percentile
            Classifier  Accuracy  Precision    Recall  F1 Score
0  GaussianNaiveBayes  0.726141   0.769412  0.726141  0.735251
1                 KNN  0.688797   0.718558  0.688797  0.697590
2        RandomForest  0.904564   0.904760  0.904564  0.902381 

GaussianNaiveBayes
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.726141   0.765912  0.726141  0.735019
1              SMOTE  0.721992   0.767169  0.721992  0.731342
2             ADASYN  0.709544   0.772414  0.709544  0.719806
3    BorderlineSMOTE  0.713693   0.774458  0.713693  0.723795
4           SVMSMOTE  0.705394   0.754629  0.705394  0.715488 

KNN
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.597510   0.642771  0.597510  0.610736
1              SMOTE  0.597510   0.636534  0.597510  0.609933
2             ADASYN  0.597510   0.659425  0.597510  0.611712
3    BorderlineSMOTE  0.614108   0.662130  0.614108  0.627087
4           SVMSMOTE  0.6