In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE
)

In [None]:
# kagglehub.dataset_download("joshmcadams/oranges-vs-grapefruit")
citrus = pd.read_csv("C:\\Users\\danis\\Downloads\\Datasets smote\\citrus.csv")

In [3]:
citrus

Unnamed: 0,name,diameter,weight,red,green,blue
0,orange,2.96,86.76,172,85,2
1,orange,3.91,88.05,166,78,3
2,orange,4.42,95.17,156,81,2
3,orange,4.47,95.60,163,81,4
4,orange,4.48,95.76,161,72,9
...,...,...,...,...,...,...
9995,grapefruit,15.35,253.89,149,77,20
9996,grapefruit,15.41,254.67,148,68,7
9997,grapefruit,15.59,256.50,168,82,20
9998,grapefruit,15.92,260.14,142,72,11


In [4]:
citrus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      10000 non-null  object 
 1   diameter  10000 non-null  float64
 2   weight    10000 non-null  float64
 3   red       10000 non-null  int64  
 4   green     10000 non-null  int64  
 5   blue      10000 non-null  int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 468.9+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
label_encoder = LabelEncoder()
citrus['name'] = label_encoder.fit_transform(citrus['name'])

In [7]:
citrus['name'].value_counts()

name
1    5000
0    5000
Name: count, dtype: int64

In [8]:
from imblearn.datasets import make_imbalance

In [9]:
from sklearn.model_selection import train_test_split
from collections import Counter

In [10]:
df_resampled, y_resampled = make_imbalance(citrus, citrus['name'], sampling_strategy={ 0 : 1700, 1: 5000},random_state=42)
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['name']

Original class distribution: Counter({1: 5000, 0: 1700})


In [11]:
def evaluate_oversampling(X, y, sampler, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    print("Class distribution after SMOTE:", Counter(y_train_resampled))

    
    # Train the classifier
    classifier.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1, X_test, y_test

oversamplers = {
    "RandomOverSampler": RandomOverSampler(sampling_strategy='minority'),
    "SMOTE": SMOTE(sampling_strategy='minority'),
    "ADASYN": ADASYN(sampling_strategy='minority'),
    "BorderlineSMOTE": BorderlineSMOTE(sampling_strategy='minority'),
    "SVMSMOTE": SVMSMOTE(sampling_strategy='minority')
}
resultsNB = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=GaussianNB())
    resultsNB.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsNB = pd.DataFrame(resultsNB)

# Print results
print(resultsNB)

Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({0: 4051, 1: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.914179   0.924773  0.914179  0.916481
1              SMOTE  0.916418   0.926705  0.916418  0.918635
2             ADASYN  0.868657   0.904092  0.868657  0.874661
3    BorderlineSMOTE  0.870896   0.907223  0.870896  0.876863
4           SVMSMOTE  0.879104   0.909353  0.879104  0.884323


In [12]:
resultsKNN = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=KNeighborsClassifier())
    resultsKNN.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsKNN = pd.DataFrame(resultsKNN)

# Print results
print(resultsKNN)

Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({0: 4051, 1: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.898507   0.907923  0.898507  0.900949
1              SMOTE  0.891045   0.900360  0.891045  0.893597
2             ADASYN  0.867164   0.893774  0.867164  0.872522
3    BorderlineSMOTE  0.882836   0.899528  0.882836  0.886639
4           SVMSMOTE  0.892537   0.907043  0.892537  0.895822


In [13]:
resultsRF = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=RandomForestClassifier())
    resultsRF.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsRF = pd.DataFrame(resultsRF)

# Print results
print(resultsRF)

Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({0: 4051, 1: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
Class distribution after SMOTE: Counter({1: 4019, 0: 4019})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.939552   0.940607  0.939552  0.939938
1              SMOTE  0.941791   0.944334  0.941791  0.942506
2             ADASYN  0.930597   0.937091  0.930597  0.932065
3    BorderlineSMOTE  0.931343   0.935456  0.931343  0.932440
4           SVMSMOTE  0.933582   0.937214  0.933582  0.934571


_______________________________________________________________________________________________________________________________________________________________________

In [14]:
citrus[citrus['name']==0].corr().abs().sum().sort_values()

name        0.000000
green       1.017065
blue        1.017437
red         1.022842
diameter    2.015328
weight      2.015351
dtype: float64

In [15]:
df_resampled2[df_resampled2['name']==0].count()

name        1700
diameter    1700
weight      1700
red         1700
green       1700
blue        1700
dtype: int64

In [16]:
4019-1700

2319

In [17]:
minority_samples = df_resampled2[df_resampled2['name']==0]
random_values = minority_samples['weight'].sample(n=2319, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,weight
0,208.70
1,182.60
2,197.98
3,215.61
4,225.40
...,...
2314,194.72
2315,145.52
2316,198.53
2317,183.11


In [18]:
random_values['diameter'] = np.nan
random_values['red'] = np.nan
random_values['green'] = np.nan
random_values['blue'] = np.nan
random_values['name'] = 0

In [19]:
random_values

Unnamed: 0,weight,diameter,red,green,blue,name
0,208.70,,,,,0
1,182.60,,,,,0
2,197.98,,,,,0
3,215.61,,,,,0
4,225.40,,,,,0
...,...,...,...,...,...,...
2314,194.72,,,,,0
2315,145.52,,,,,0
2316,198.53,,,,,0
2317,183.11,,,,,0


In [20]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [21]:
random_values['weight'] = calculate_percentiles(random_values['weight'])
random_values['weight'] = 'P' + random_values['weight'].round().astype(str,errors='ignore')

In [23]:
random_values['weight'].unique()

array(['P71.0', 'P22.0', 'P50.0', 'P81.0', 'P91.0', 'P97.0', 'P78.0',
       'P77.0', 'P41.0', 'P87.0', 'P34.0', 'P92.0', 'P47.0', 'P82.0',
       'P5.0', 'P96.0', 'P52.0', 'P93.0', 'P39.0', 'P58.0', 'P60.0',
       'P68.0', 'P20.0', 'P63.0', 'P72.0', 'P33.0', 'P79.0', 'P45.0',
       'P26.0', 'P67.0', 'P16.0', 'P36.0', 'P86.0', 'P38.0', 'P62.0',
       'P73.0', 'P27.0', 'P84.0', 'P42.0', 'P31.0', 'P35.0', 'P76.0',
       'P56.0', 'P70.0', 'P4.0', 'P85.0', 'P18.0', 'P99.0', 'P3.0',
       'P30.0', 'P19.0', 'P51.0', 'P95.0', 'P98.0', 'P90.0', 'P44.0',
       'P69.0', 'P11.0', 'P94.0', 'P17.0', 'P23.0', 'P21.0', 'P74.0',
       'P53.0', 'P24.0', 'P14.0', 'P54.0', 'P46.0', 'P40.0', 'P66.0',
       'P57.0', 'P55.0', 'P1.0', 'P2.0', 'P6.0', 'P88.0', 'P29.0',
       'P12.0', 'P8.0', 'P13.0', 'P64.0', 'P48.0', 'P43.0', 'P25.0',
       'P28.0', 'P37.0', 'P49.0', 'P89.0', 'P65.0', 'P59.0', 'P80.0',
       'P61.0', 'P10.0', 'P32.0', 'P7.0', 'P83.0', 'P9.0', 'P75.0',
       'P15.0', 'P100.0', 'P0

In [24]:
citrusP = dataframe_to_percentiles(minority_samples)
citrusP = 'P' + citrusP.round().astype(str,errors='ignore')
citrusP['name'] = 0

In [25]:
citrusP

Unnamed: 0,name,diameter,weight,red,green,blue
6501,0,P31.0,P31.0,P61.0,P71.0,P0.0
7586,0,P53.0,P53.0,P10.0,P23.0,P89.0
7653,0,P55.0,P55.0,P68.0,P12.0,P23.0
6055,0,P22.0,P22.0,P56.0,P67.0,P77.0
5705,0,P16.0,P16.0,P68.0,P88.0,P29.0
...,...,...,...,...,...,...
9111,0,P83.0,P83.0,P53.0,P71.0,P67.0
6902,0,P40.0,P40.0,P15.0,P59.0,P67.0
6703,0,P36.0,P36.0,P24.0,P21.0,P52.0
5121,0,P3.0,P3.0,P27.0,P54.0,P70.0


In [26]:
ZERO = pd.concat([citrusP,random_values], ignore_index=True)
ZERO['name']=0
ZERO

Unnamed: 0,name,diameter,weight,red,green,blue
0,0,P31.0,P31.0,P61.0,P71.0,P0.0
1,0,P53.0,P53.0,P10.0,P23.0,P89.0
2,0,P55.0,P55.0,P68.0,P12.0,P23.0
3,0,P22.0,P22.0,P56.0,P67.0,P77.0
4,0,P16.0,P16.0,P68.0,P88.0,P29.0
...,...,...,...,...,...,...
4014,0,,P44.0,,,
4015,0,,P0.0,,,
4016,0,,P52.0,,,
4017,0,,P23.0,,,


In [27]:


# Step 1: Identify unique values of 'pH'
unique_size = citrusP['weight'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'citric acid') that has missing values
for column in citrusP.columns:
    if column != 'weight' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'citric acid'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'citric acid' is equal to the current size_value
            non_missing_values = citrusP.loc[citrusP['weight'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [28]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['weight']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [29]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4019 entries, 0 to 4018
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      4019 non-null   int64 
 1   diameter  4019 non-null   object
 2   weight    4019 non-null   object
 3   red       4019 non-null   object
 4   green     4019 non-null   object
 5   blue      4019 non-null   object
dtypes: int64(1), object(5)
memory usage: 188.5+ KB


In [30]:
print(myMode.shape)
print(ZERO.shape)

(4019, 6)
(4019, 6)


In [31]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = citrus.columns.difference(['name'])
for variable in columns_to_impute:
    mapping_data[variable] = [citrus[citrus['name']==0][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,blue,diameter,green,red,weight
P0,2.0,7.6300,31.0,115.0,126.7900
P1,2.0,8.7900,46.0,127.0,152.5894
P2,2.0,9.0600,49.0,130.0,157.5696
P3,2.0,9.1900,51.0,132.0,160.7591
P4,2.0,9.3300,52.0,133.0,163.1276
...,...,...,...,...,...
P96,33.0,13.6104,87.0,168.0,230.7312
P97,34.0,13.7900,89.0,170.0,233.6127
P98,36.0,13.9900,91.0,172.0,236.9422
P99,38.0,14.3200,94.0,174.0,241.5519


In [32]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['name'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [33]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [34]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [35]:
a

Unnamed: 0,name,diameter,weight,red,green,blue
0,0,10.86,187.9476,154.0,75.0,2.0
1,0,11.54,198.8600,138.0,63.0,28.0
2,0,11.62,199.8500,155.0,58.0,8.0
3,0,10.54,182.3934,152.0,74.0,23.0
4,0,10.25,178.0568,155.0,82.0,10.0
...,...,...,...,...,...,...
4014,0,11.26,194.5700,146.0,46.0,28.0
4015,0,7.63,126.7900,141.0,58.0,11.0
4016,0,11.51,198.3848,130.0,73.0,9.0
4017,0,10.58,183.0062,150.0,83.0,4.0


In [36]:
percent = pd.concat([a, df_resampled2[df_resampled2['name']==1][0:4019]], ignore_index=True)

In [37]:
percent['name'].value_counts()

name
0    4019
1    4019
Name: count, dtype: int64

In [38]:
def evaluate_oversampling2(X, y, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(xtest)
    
    # Evaluate metrics
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred, average='weighted')
    recall = recall_score(ytest, y_pred, average='weighted')
    f1 = f1_score(ytest, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1

In [39]:
classifiers = {
    "GaussianNaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
}

Percentile_Results = []

for name, classifier in classifiers.items():
    accuracy, precision, recall, f1 = evaluate_oversampling2(
        percent[['diameter', 'weight', 'red', 'green', 'blue']],
        percent[['name']],
        classifier
    )
    Percentile_Results.append({
        "Classifier": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

Percentile_Results = pd.DataFrame(Percentile_Results)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)


In [41]:
print('Percentile\n',Percentile_Results,'\n')
print('GaussianNaiveBayes\n',resultsNB,'\n')
print('KNN\n',resultsKNN,'\n')
print('RandomForest\n',resultsRF)

# Percentile
#             Classifier  Accuracy  Precision    Recall  F1 Score
# 0  GaussianNaiveBayes  0.921642   0.922284  0.921642  0.918998
# 1                 KNN  0.896269   0.905194  0.896269  0.888276
# 2        RandomForest  0.859701   0.879005  0.859701  0.841837 

# GaussianNaiveBayes
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.914925   0.924867  0.914925  0.917130
# 1              SMOTE  0.915672   0.925785  0.915672  0.917883
# 2             ADASYN  0.868657   0.904092  0.868657  0.874661
# 3    BorderlineSMOTE  0.869403   0.906502  0.869403  0.875483
# 4           SVMSMOTE  0.880597   0.910128  0.880597  0.885705 

# KNN
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.892537   0.902197  0.892537  0.895123
# 1              SMOTE  0.895522   0.903876  0.895522  0.897836
# 2             ADASYN  0.864179   0.890957  0.864179  0.869657
# 3    BorderlineSMOTE  0.887313   0.903780  0.887313  0.890971
# 4           SVMSMOTE  0.891791   0.905628  0.891791  0.895007 

# RandomForest
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.943284   0.943758  0.943284  0.943480
# 1              SMOTE  0.937313   0.939277  0.937313  0.937939
# 2             ADASYN  0.935075   0.940743  0.935075  0.936361
# 3    BorderlineSMOTE  0.926866   0.931402  0.926866  0.928087
# 4           SVMSMOTE  0.931343   0.935174  0.931343  0.932391

Percentile
            Classifier  Accuracy  Precision    Recall  F1 Score
0  GaussianNaiveBayes  0.916418   0.925104  0.916418  0.918429
1                 KNN  0.932836   0.932836  0.932836  0.932836
2        RandomForest  0.962687   0.962651  0.962687  0.962267 

GaussianNaiveBayes
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.914179   0.924773  0.914179  0.916481
1              SMOTE  0.916418   0.926705  0.916418  0.918635
2             ADASYN  0.868657   0.904092  0.868657  0.874661
3    BorderlineSMOTE  0.870896   0.907223  0.870896  0.876863
4           SVMSMOTE  0.879104   0.909353  0.879104  0.884323 

KNN
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.898507   0.907923  0.898507  0.900949
1              SMOTE  0.891045   0.900360  0.891045  0.893597
2             ADASYN  0.867164   0.893774  0.867164  0.872522
3    BorderlineSMOTE  0.882836   0.899528  0.882836  0.886639
4           SVMSMOTE  0.8