In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, KMeansSMOTE, SVMSMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# import kagglehub
# kagglehub.dataset_download("iammustafatz/diabetes-prediction-dataset")
# kagglehub.dataset_download("fedesoriano/stroke-prediction-dataset")
# kagglehub.dataset_download("l3llff/banana")
# kagglehub.dataset_download("gabrielsantello/cars-purchase-decision-dataset")
# kagglehub.dataset_download("youssefaboelwafa/hotel-booking-cancellation-prediction")
# kagglehub.dataset_download("shelvigarg/credit-card-buyers")
# kagglehub.dataset_download("shivamb/machine-predictive-maintenance-classification")
# kagglehub.dataset_download("fdemoribajolin/death-classification-icu")
# kagglehub.dataset_download("nareshbhat/wine-quality-binary-classification")
# kagglehub.dataset_download("mssmartypants/paris-housing-classification")
# kagglehub.dataset_download("rabieelkharoua/predict-online-course-engagement-dataset")
# kagglehub.dataset_download("amanalisiddiqui/fraud-detection-dataset")

In [3]:
banana = pd.read_csv("E:\\Workings\\Thesis\\Percentile v SMOTE\\Datasets smote\\banana_quality.csv")

In [4]:
banana

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,-1.924968,0.468078,3.077832,-1.472177,0.294799,2.435570,0.271290,Good
1,-2.409751,0.486870,0.346921,-2.495099,-0.892213,2.067549,0.307325,Good
2,-0.357607,1.483176,1.568452,-2.645145,-0.647267,3.090643,1.427322,Good
3,-0.868524,1.566201,1.889605,-1.273761,-1.006278,1.873001,0.477862,Good
4,0.651825,1.319199,-0.022459,-1.209709,-1.430692,1.078345,2.812442,Good
...,...,...,...,...,...,...,...,...
7995,-6.414403,0.723565,1.134953,2.952763,0.297928,-0.156946,2.398091,Bad
7996,0.851143,-2.217875,-2.812175,0.489249,-1.323410,-2.316883,2.113136,Bad
7997,1.422722,-1.907665,-2.532364,0.964976,-0.562375,-1.834765,0.697361,Bad
7998,-2.131904,-2.742600,-1.008029,2.126946,-0.802632,-3.580266,0.423569,Bad


In [5]:
banana.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Size         8000 non-null   float64
 1   Weight       8000 non-null   float64
 2   Sweetness    8000 non-null   float64
 3   Softness     8000 non-null   float64
 4   HarvestTime  8000 non-null   float64
 5   Ripeness     8000 non-null   float64
 6   Acidity      8000 non-null   float64
 7   Quality      8000 non-null   object 
dtypes: float64(7), object(1)
memory usage: 500.1+ KB


In [6]:
banana['Quality'].value_counts()

Quality
Good    4006
Bad     3994
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
label_encoder = LabelEncoder()
banana['Quality'] = label_encoder.fit_transform(banana['Quality'])

In [9]:
banana['Quality'].value_counts()

Quality
1    4006
0    3994
Name: count, dtype: int64

In [10]:
from imblearn.datasets import make_imbalance

In [11]:
from sklearn.model_selection import train_test_split
from collections import Counter

In [12]:
df_resampled, y_resampled = make_imbalance(banana, banana['Quality'], sampling_strategy={ 0 : 1200, 1: 4000},random_state=42)
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['Quality']

Original class distribution: Counter({1: 4000, 0: 1200})


In [13]:
# Function to calculate the required metrics 
def get_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, y_prob[:, 1])
    return accuracy, precision, recall, f1, auc

# Define classifiers
classifiers = {
    'GaussianNB': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Function to classify and store metrics
def classify_and_store(classifiers, X_train, y_train, X_test, y_test, oversampler=None):
    if oversampler:
        # Apply oversampling
        X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train
    
    # Store the metrics for each classifier
    classifier_metrics = {}
    
    for name, clf in classifiers.items():
        # Train the classifier
        clf.fit(X_train_resampled, y_train_resampled)
        
        # Predict on the test set
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)
        
        # Get metrics
        accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pred, y_prob)
        classifier_metrics[name] = {
            'accuracy': accuracy,
            'precision_0': precision[0],
            'precision_1': precision[1],
            'recall_0': recall[0],
            'recall_1': recall[1],
            'f1_0': f1[0],
            'f1_1': f1[1],
            'auc_0': auc if len(np.unique(y_test)) == 2 else None,
            'auc_1': auc if len(np.unique(y_test)) == 2 else None
        }
    
    return classifier_metrics


# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(df_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 1: Perform baseline evaluation (without oversampling) and store results
oversamplers = {
    'RandomOverSampler': RandomOverSampler(random_state=42, sampling_strategy='minority'),
    'SMOTE': SMOTE(random_state=42, sampling_strategy='minority'),
    'ADASYN': ADASYN(random_state=42, sampling_strategy='minority'),
    'KMeansSMOTE': KMeansSMOTE(random_state=42, sampling_strategy='minority'),
    'SVMSMOTE': SVMSMOTE(random_state=42, sampling_strategy='minority')
}

# Store all results in a single DataFrame
all_results = []

# Step 2: Baseline (no oversampling)
baseline_metrics = classify_and_store(classifiers, X_train, y_train, X_test, y_test)
for clf_name, m in baseline_metrics.items():
    all_results.append({
        'Classifier': clf_name,
        'Oversampler': 'None',
        'Accuracy': m['accuracy'],
        'Precision(0)': m['precision_0'],
        'Precision(1)': m['precision_1'],
        'Recall (0)': m['recall_0'],
        'Recall (1)': m['recall_1'],
        'F1(0)': m['f1_0'],
        'F1(1)': m['f1_1'],
        'AUC(0)': m['auc_0'],
        'AUC(1)': m['auc_1']
    })

# Step 3: Loop over classifiers and then oversamplers
for clf_name, clf in classifiers.items():
    for sampler_name, oversampler in oversamplers.items():
        oversampled_metrics = classify_and_store({clf_name: clf}, X_train, y_train, X_test, y_test, oversampler)
        
        # Now m will be the dictionary with metrics for the classifier
        m = oversampled_metrics[clf_name]
        
        all_results.append({
            'Classifier': clf_name,
            'Oversampler': sampler_name,
            'Accuracy': m['accuracy'],
            'Precision(0)': m['precision_0'],
            'Precision(1)': m['precision_1'],
            'Recall (0)': m['recall_0'],
            'Recall (1)': m['recall_1'],
            'F1(0)': m['f1_0'],
            'F1(1)': m['f1_1'],
            'AUC(0)': m['auc_0'],
            'AUC(1)': m['auc_1']
        })

# Convert to DataFrame
results_df = pd.DataFrame(all_results)


_______________________________________________________________________________________________________________________________________________________________________

In [15]:
banana[banana['Quality']==0].corr().abs().sum().sort_values()

Quality        0.000000
HarvestTime    1.788721
Weight         1.936141
Softness       2.108621
Acidity        2.109679
Sweetness      2.165802
Ripeness       2.195922
Size           2.436070
dtype: float64

In [16]:
df_resampled2[df_resampled2['Quality']==0].count()

Size           1200
Weight         1200
Sweetness      1200
Softness       1200
HarvestTime    1200
Ripeness       1200
Acidity        1200
Quality        1200
dtype: int64

In [17]:
3216-1200

2016

In [18]:
minority_samples = df_resampled2[df_resampled2['Quality']==0]
random_values = minority_samples['Size'].sample(n=2016, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,Size
0,-0.394360
1,-2.485234
2,-2.171047
3,1.196364
4,-0.211955
...,...
2011,-1.779156
2012,-2.867868
2013,-2.625883
2014,0.033828


In [19]:
random_values['Weight'] = np.nan
random_values['Sweetness'] = np.nan
random_values['Softness'] = np.nan
random_values['HarvestTime'] = np.nan
random_values['Ripeness'] = np.nan
random_values['Acidity'] = np.nan
random_values['Quality'] = 0

In [20]:
# a.replace(np.nan, 'Pnan', inplace=True)
random_values

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,-0.394360,,,,,,,0
1,-2.485234,,,,,,,0
2,-2.171047,,,,,,,0
3,1.196364,,,,,,,0
4,-0.211955,,,,,,,0
...,...,...,...,...,...,...,...,...
2011,-1.779156,,,,,,,0
2012,-2.867868,,,,,,,0
2013,-2.625883,,,,,,,0
2014,0.033828,,,,,,,0


In [21]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [22]:
random_values['Size'] = calculate_percentiles(random_values['Size'])
random_values['Size'] = 'P' + random_values['Size'].round().astype(str,errors='ignore')

In [23]:
random_values

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,P74.0,,,,,,,0
1,P30.0,,,,,,,0
2,P36.0,,,,,,,0
3,P93.0,,,,,,,0
4,P78.0,,,,,,,0
...,...,...,...,...,...,...,...,...
2011,P46.0,,,,,,,0
2012,P24.0,,,,,,,0
2013,P28.0,,,,,,,0
2014,P82.0,,,,,,,0


In [24]:
random_values['Size'].unique()

array(['P74.0', 'P30.0', 'P36.0', 'P93.0', 'P78.0', 'P81.0', 'P32.0',
       'P13.0', 'P37.0', 'P41.0', 'P75.0', 'P73.0', 'P11.0', 'P79.0',
       'P86.0', 'P64.0', 'P97.0', 'P92.0', 'P76.0', 'P42.0', 'P50.0',
       'P22.0', 'P68.0', 'P53.0', 'P65.0', 'P31.0', 'P40.0', 'P62.0',
       'P19.0', 'P2.0', 'P55.0', 'P80.0', 'P3.0', 'P85.0', 'P69.0',
       'P20.0', 'P47.0', 'P98.0', 'P77.0', 'P12.0', 'P5.0', 'P34.0',
       'P7.0', 'P58.0', 'P4.0', 'P87.0', 'P66.0', 'P84.0', 'P52.0',
       'P26.0', 'P94.0', 'P27.0', 'P82.0', 'P59.0', 'P33.0', 'P39.0',
       'P72.0', 'P16.0', 'P83.0', 'P90.0', 'P67.0', 'P51.0', 'P70.0',
       'P43.0', 'P63.0', 'P61.0', 'P18.0', 'P10.0', 'P95.0', 'P54.0',
       'P23.0', 'P38.0', 'P45.0', 'P9.0', 'P89.0', 'P49.0', 'P96.0',
       'P25.0', 'P91.0', 'P46.0', 'P29.0', 'P6.0', 'P99.0', 'P24.0',
       'P8.0', 'P88.0', 'P56.0', 'P35.0', 'P1.0', 'P28.0', 'P15.0',
       'P17.0', 'P48.0', 'P60.0', 'P21.0', 'P44.0', 'P71.0', 'P57.0',
       'P0.0', 'P14.0', 'P100

In [25]:
bananaP = dataframe_to_percentiles(minority_samples)
bananaP = 'P' + bananaP.round().astype(str,errors='ignore')
bananaP['Quality'] = 0

In [26]:
ZERO = pd.concat([bananaP,random_values], ignore_index=True)
ZERO['Quality']=0
ZERO

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,P48.0,P54.0,P21.0,P84.0,P33.0,P34.0,P67.0,0
1,P54.0,P50.0,P42.0,P37.0,P38.0,P90.0,P30.0,0
2,P27.0,P87.0,P40.0,P43.0,P21.0,P52.0,P9.0,0
3,P77.0,P35.0,P13.0,P21.0,P72.0,P96.0,P30.0,0
4,P85.0,P58.0,P5.0,P58.0,P30.0,P35.0,P76.0,0
...,...,...,...,...,...,...,...,...
3211,P46.0,,,,,,,0
3212,P24.0,,,,,,,0
3213,P28.0,,,,,,,0
3214,P82.0,,,,,,,0


In [27]:


# Step 1: Identify unique values of 'pH'
unique_size = bananaP['Size'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'pH') that has missing values
for column in bananaP.columns:
    if column != 'Size' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'pH'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'pH' is equal to the current pH_value
            non_missing_values = bananaP.loc[bananaP['Size'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [28]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['Size']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [29]:
# myMode[myMode.isna().any(axis=1)]
# myMode = myMode.fillna(myMode.iloc[0])
myMode

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Quality
0,P48.0,P54.0,P21.0,P84.0,P33.0,P34.0,P67.0,0
1,P54.0,P50.0,P42.0,P37.0,P38.0,P90.0,P30.0,0
2,P27.0,P87.0,P40.0,P43.0,P21.0,P52.0,P9.0,0
3,P77.0,P35.0,P13.0,P21.0,P72.0,P96.0,P30.0,0
4,P85.0,P58.0,P5.0,P58.0,P30.0,P35.0,P76.0,0
...,...,...,...,...,...,...,...,...
3211,P46.0,P12.0,P38.0,P34.0,P87.0,P92.0,P94.0,0
3212,P24.0,P34.0,P77.0,P54.0,P30.0,P51.0,P12.0,0
3213,P28.0,P70.0,P38.0,P31.0,P83.0,P72.0,P18.0,0
3214,P82.0,P24.0,P44.0,P81.0,P78.0,P10.0,P58.0,0


In [30]:
print(myMode.shape)
print(ZERO.shape)

(3216, 8)
(3216, 8)


In [31]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = banana.columns.difference(['Quality'])
for variable in columns_to_impute:
    mapping_data[variable] = [banana[banana['Quality']==0][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,Acidity,HarvestTime,Ripeness,Size,Softness,Sweetness,Weight
P0,-6.292339,-7.570008,-7.423155,-7.998074,-6.959320,-5.893079,-8.283002
P1,-4.608811,-5.382910,-5.035012,-5.497982,-4.860846,-4.627055,-5.330116
P2,-4.237076,-5.077713,-4.368813,-5.152749,-4.377747,-4.332015,-4.957582
P3,-3.918218,-4.702864,-4.077861,-4.804678,-4.064465,-4.071587,-4.708460
P4,-3.671925,-4.543398,-3.851930,-4.571162,-3.821002,-3.863941,-4.532867
...,...,...,...,...,...,...,...
P96,3.983738,1.496725,3.924699,1.667341,2.468967,0.866747,1.464498
P97,4.257724,1.661803,4.144970,1.921859,2.585758,1.109378,1.676883
P98,4.489732,1.995723,4.507051,2.229927,2.731268,1.326730,1.975377
P99,5.031205,2.524296,4.942510,2.737843,2.993727,1.719955,2.577319


In [32]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['Quality'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [33]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [34]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [35]:
percent = pd.concat([a, df_resampled2[df_resampled2['Quality']==1][0:3216]], ignore_index=True)


In [36]:
# classifiers = {
#     "GaussianNaiveBayes": GaussianNB(),
#     "KNN": KNeighborsClassifier(),
#     "RandomForest": RandomForestClassifier(),
# }

# Percentile_Results = []

# for name, classifier in classifiers.items():
#     accuracy, precision, recall, f1 = evaluate_oversampling2(
#         percent[['Size', 'Weight', 'Sweetness', 'Softness', 'HarvestTime', 'Ripeness',
#        'Acidity']],
#         percent[['Quality']],
#         classifier
#     )
#     Percentile_Results.append({
#         "Classifier": name,
#         "Accuracy": accuracy,
#         "Precision": precision,
#         "Recall": recall,
#         "F1 Score": f1
#     })

# Percentile_Results = pd.DataFrame(Percentile_Results)

In [37]:
X_percent = percent.drop(columns=['Quality'])  # Replace with actual target column name
y_percent = percent['Quality']  # Replace with actual target column name

# Standardize the 'percent' data as well
X_percent = scaler.fit_transform(X_percent)

def evaluate_custom_oversampling(X_train, y_train, X_test, y_test, classifiers):
    custom_results = []
    
    # Iterate over classifiers first
    for clf_name, clf in classifiers.items():
        # Train the classifier
        clf.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)
        
        # Get metrics
        accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pred, y_prob)
        
        # Append the result
        custom_results.append({
            'Classifier': clf_name,               # Classifier first
            'Oversampler': 'PERCENTILES',              # Oversampler (Custom in this case)
            'Accuracy': accuracy,
            'Precision(0)': precision[0],
            'Precision(1)': precision[1],
            'Recall (0)': recall[0],
            'Recall (1)': recall[1],
            'F1(0)': f1[0],
            'F1(1)': f1[1],
            'AUC(0)': auc,
            'AUC(1)': auc
        })
    
    # Convert to DataFrame and return
    return pd.DataFrame(custom_results)



# Evaluate on custom oversampled data (the 'percent' dataset)
custom_results_df = evaluate_custom_oversampling(X_percent, y_percent, X_test, y_test, classifiers)


# Output: Print or review the custom oversampling results
custom_results_df

Unnamed: 0,Classifier,Oversampler,Accuracy,Precision(0),Precision(1),Recall (0),Recall (1),F1(0),F1(1),AUC(0),AUC(1)
0,GaussianNB,PERCENTILES,0.775962,0.52452,0.982487,0.960938,0.715561,0.678621,0.828044,0.951939,0.951939
1,KNN,PERCENTILES,0.965385,0.881944,0.99734,0.992188,0.956633,0.933824,0.976562,0.993797,0.993797
2,RandomForest,PERCENTILES,0.954808,0.851852,0.995962,0.988281,0.943878,0.915009,0.969221,0.99292,0.99292


In [38]:
results_df

Unnamed: 0,Classifier,Oversampler,Accuracy,Precision(0),Precision(1),Recall (0),Recall (1),F1(0),F1(1),AUC(0),AUC(1)
0,GaussianNB,,0.9125,0.960894,0.902439,0.671875,0.991071,0.790805,0.944681,0.957375,0.957375
1,KNN,,0.981731,0.961089,0.988506,0.964844,0.987245,0.962963,0.987875,0.990566,0.990566
2,RandomForest,,0.973077,0.945312,0.982143,0.945312,0.982143,0.945312,0.982143,0.992633,0.992633
3,GaussianNB,RandomOverSampler,0.897115,0.742671,0.961801,0.890625,0.899235,0.809947,0.929466,0.956369,0.956369
4,GaussianNB,SMOTE,0.894231,0.735484,0.961644,0.890625,0.895408,0.805654,0.927345,0.955257,0.955257
5,GaussianNB,ADASYN,0.826923,0.597436,0.964615,0.910156,0.799745,0.721362,0.874477,0.94749,0.94749
6,GaussianNB,KMeansSMOTE,0.908654,0.896552,0.911589,0.710938,0.973214,0.793028,0.941394,0.9375,0.9375
7,GaussianNB,SVMSMOTE,0.802885,0.561743,0.961722,0.90625,0.769133,0.693572,0.854713,0.937296,0.937296
8,KNN,RandomOverSampler,0.972115,0.912727,0.993464,0.980469,0.969388,0.945386,0.981278,0.987193,0.987193
9,KNN,SMOTE,0.970192,0.903226,0.994744,0.984375,0.965561,0.942056,0.979935,0.988299,0.988299


In [39]:
combined_df = pd.concat([results_df, custom_results_df], ignore_index=True)

# Save to Excel
combined_df.to_excel("banana3.xlsx", index=False)
