In [60]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE
)

In [None]:
# kagglehub.dataset_download("gabrielsantello/cars-purchase-decision-dataset")

car = pd.read_csv("C:\\Users\\danis\\Downloads\\Datasets smote\\car_data.csv")

In [62]:
del car['User ID']

In [63]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Gender        1000 non-null   object
 1   Age           1000 non-null   int64 
 2   AnnualSalary  1000 non-null   int64 
 3   Purchased     1000 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 31.4+ KB


In [64]:
from sklearn.preprocessing import LabelEncoder

In [65]:
label_encoder = LabelEncoder()
car['Gender'] = label_encoder.fit_transform(car['Gender'])

In [66]:
car

Unnamed: 0,Gender,Age,AnnualSalary,Purchased
0,1,35,20000,0
1,1,40,43500,0
2,1,49,74000,0
3,1,40,107500,1
4,1,25,79000,0
...,...,...,...,...
995,1,38,59000,0
996,0,47,23500,0
997,0,28,138500,1
998,0,48,134000,1


In [67]:
from imblearn.datasets import make_imbalance
from sklearn.model_selection import train_test_split
from collections import Counter

In [68]:
car['Purchased'].value_counts()

Purchased
0    598
1    402
Name: count, dtype: int64

In [69]:
df_resampled, y_resampled = make_imbalance(car, car['Purchased'], sampling_strategy={ 0 : 598, 1: 200},random_state=42)
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['Purchased']

Original class distribution: Counter({0: 598, 1: 200})


In [70]:
def evaluate_oversampling(X, y, sampler, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    print("Class distribution after SMOTE:", Counter(y_train_resampled))

    
    # Train the classifier
    classifier.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1, X_test, y_test

oversamplers = {
    "RandomOverSampler": RandomOverSampler(sampling_strategy='minority'),
    "SMOTE": SMOTE(sampling_strategy='minority'),
    "ADASYN": ADASYN(sampling_strategy='minority'),
    "BorderlineSMOTE": BorderlineSMOTE(sampling_strategy='minority'),
    "SVMSMOTE": SVMSMOTE(sampling_strategy='minority')
}
resultsNB = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=GaussianNB())
    resultsNB.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsNB = pd.DataFrame(resultsNB)

# Print results
print(resultsNB)

Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({1: 501, 0: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
              Method  Accuracy  Precision   Recall  F1 Score
0  RandomOverSampler   0.90625   0.913884  0.90625  0.908161
1              SMOTE   0.90625   0.913884  0.90625  0.908161
2             ADASYN   0.89375   0.901821  0.89375  0.895916
3    BorderlineSMOTE   0.89375   0.901821  0.89375  0.895916
4           SVMSMOTE   0.90625   0.913884  0.90625  0.908161


In [71]:
resultsKNN = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=KNeighborsClassifier())
    resultsKNN.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsKNN = pd.DataFrame(resultsKNN)

# Print results
print(resultsKNN)

Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({1: 501, 0: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
              Method  Accuracy  Precision   Recall  F1 Score
0  RandomOverSampler   0.74375   0.770421  0.74375  0.752245
1              SMOTE   0.76875   0.781183  0.76875  0.773465
2             ADASYN   0.73750   0.766897  0.73750  0.746661
3    BorderlineSMOTE   0.73750   0.757522  0.73750  0.744659
4           SVMSMOTE   0.77500   0.778173  0.77500  0.776459


In [72]:
resultsRF = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=RandomForestClassifier())
    resultsRF.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsRF = pd.DataFrame(resultsRF)

# Print results
print(resultsRF)

Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({1: 501, 0: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
Class distribution after SMOTE: Counter({0: 483, 1: 483})
              Method  Accuracy  Precision   Recall  F1 Score
0  RandomOverSampler   0.88750   0.885543  0.88750  0.885845
1              SMOTE   0.89375   0.893099  0.89375  0.893383
2             ADASYN   0.87500   0.879321  0.87500  0.876553
3    BorderlineSMOTE   0.87500   0.873559  0.87500  0.874118
4           SVMSMOTE   0.89375   0.893099  0.89375  0.893383


_______________________________________________________________________________________________________________________________________________________________________

In [73]:
car[car['Purchased']==1].corr().abs().sum().sort_values()

Purchased       0.000000
Gender          1.151233
AnnualSalary    1.384483
Age             1.433290
dtype: float64

In [74]:
df_resampled2[df_resampled2['Purchased']==1].count()

Gender          200
Age             200
AnnualSalary    200
Purchased       200
dtype: int64

In [75]:
483-200

283

In [76]:
minority_samples = df_resampled2[df_resampled2['Purchased']==1]
random_values = minority_samples['Age'].sample(n=283, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,Age
0,52
1,47
2,54
3,60
4,47
...,...
278,60
279,47
280,40
281,48


In [77]:
car.columns

Index(['Gender', 'Age', 'AnnualSalary', 'Purchased'], dtype='object')

In [78]:
random_values['Gender'] = np.nan
random_values['AnnualSalary'] = np.nan
random_values['Purchased'] = 1

In [79]:
random_values

Unnamed: 0,Age,Gender,AnnualSalary,Purchased
0,52,,,1
1,47,,,1
2,54,,,1
3,60,,,1
4,47,,,1
...,...,...,...,...
278,60,,,1
279,47,,,1
280,40,,,1
281,48,,,1


In [80]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [81]:
random_values['Age'] = calculate_percentiles(random_values['Age'])
random_values['Age'] = 'P' + random_values['Age'].round().astype(str,errors='ignore')

In [82]:
random_values['Age'].unique()

array(['P62.0', 'P37.0', 'P71.0', 'P90.0', 'P15.0', 'P2.0', 'P10.0',
       'P80.0', 'P63.0', 'P24.0', 'P94.0', 'P59.0', 'P32.0', 'P85.0',
       'P81.0', 'P45.0', 'P72.0', 'P27.0', 'P9.0', 'P19.0', 'P33.0',
       'P28.0', 'P0.0', 'P73.0', 'P91.0', 'P38.0', 'P25.0', 'P16.0',
       'P7.0', 'P98.0', 'P76.0', 'P39.0', 'P55.0', 'P29.0', 'P20.0',
       'P52.0', 'P64.0', 'P69.0', 'P53.0', 'P5.0', 'P56.0', 'P46.0',
       'P92.0', 'P11.0', 'P14.0', 'P86.0', 'P34.0', 'P96.0', 'P65.0',
       'P4.0', 'P74.0', 'P1.0', 'P77.0', 'P40.0', 'P35.0', 'P30.0',
       'P21.0', 'P66.0', 'P87.0', 'P70.0', 'P82.0', 'P95.0', 'P47.0',
       'P31.0', 'P41.0', 'P97.0', 'P3.0', 'P75.0', 'P22.0', 'P12.0',
       'P57.0', 'P54.0', 'P48.0', 'P83.0', 'P88.0', 'P78.0', 'P67.0',
       'P99.0', 'P60.0', 'P26.0', 'P23.0', 'P17.0', 'P42.0', 'P6.0',
       'P93.0', 'P49.0', 'P43.0', 'P13.0', 'P58.0', 'P50.0', 'P36.0',
       'P89.0', 'P84.0', 'P61.0', 'P8.0', 'P18.0', 'P100.0', 'P68.0',
       'P51.0', 'P79.0', 'P44

In [83]:
carP = dataframe_to_percentiles(minority_samples)
carP = 'P' + carP.round().astype(str,errors='ignore')
carP['Purchased'] = 1

In [84]:
minority_samples

Unnamed: 0,Gender,Age,AnnualSalary,Purchased
34,0,44,113500,1
884,0,52,41500,1
232,1,48,33000,1
62,1,49,88000,1
298,1,50,29500,1
...,...,...,...,...
960,1,32,100000,1
661,1,34,112000,1
919,0,52,21000,1
794,1,50,87500,1


In [85]:
ZERO = pd.concat([carP,random_values], ignore_index=True)
ZERO

Unnamed: 0,Gender,Age,AnnualSalary,Purchased
0,P0.0,P30.0,P65.0,1
1,P1.0,P62.0,P25.0,1
2,P52.0,P44.0,P14.0,1
3,P52.0,P52.0,P44.0,1
4,P53.0,P56.0,P12.0,1
...,...,...,...,...
478,,P94.0,,1
479,,P44.0,,1
480,,P19.0,,1
481,,P52.0,,1


In [86]:


# Step 1: Identify unique values of 'pH'
unique_size = carP['Age'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'pH') that has missing values
for column in carP.columns:
    if column != 'Age' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'pH'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'pH' is equal to the current pH_value
            non_missing_values = carP.loc[carP['Age'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [87]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['Age']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [88]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Gender        483 non-null    object
 1   Age           483 non-null    object
 2   AnnualSalary  483 non-null    object
 3   Purchased     483 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 15.2+ KB


In [89]:
print(myMode.shape)
print(ZERO.shape)

(483, 4)
(483, 4)


In [90]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = car.columns.difference(['Purchased'])
for variable in columns_to_impute:
    mapping_data[variable] = [car[car['Purchased']==1][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,Age,AnnualSalary,Gender
P0,27.00,20000.0,0.0
P1,28.01,21505.0,0.0
P2,30.00,22510.0,0.0
P3,31.03,23015.0,0.0
P4,32.00,24500.0,0.0
...,...,...,...
P96,61.00,147500.0,1.0
P97,61.00,148500.0,1.0
P98,62.00,149500.0,1.0
P99,63.00,150500.0,1.0


In [91]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['Purchased'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [92]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [93]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [94]:
a

Unnamed: 0,Gender,Age,AnnualSalary,Purchased
0,0.0,44.00,109150.0,1
1,0.0,51.00,43500.0,1
2,0.0,48.00,33570.0,1
3,0.0,49.00,88000.0,1
4,0.0,49.56,31560.0,1
...,...,...,...,...
478,1.0,60.94,144500.0,1
479,0.0,48.00,33570.0,1
480,1.0,40.00,100280.0,1
481,0.0,49.00,88000.0,1


In [95]:
percent = pd.concat([a, df_resampled2[df_resampled2['Purchased']==0][0:483]], ignore_index=True)

In [96]:
percent['Purchased'].value_counts()

Purchased
1    483
0    483
Name: count, dtype: int64

In [97]:
def evaluate_oversampling2(X, y, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(xtest)
    
    # Evaluate metrics
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred, average='weighted')
    recall = recall_score(ytest, y_pred, average='weighted')
    f1 = f1_score(ytest, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1

In [98]:
classifiers = {
    "GaussianNaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
}

Percentile_Results = []

for name, classifier in classifiers.items():
    accuracy, precision, recall, f1 = evaluate_oversampling2(
        percent[['Gender','Age', 'AnnualSalary']]
        , percent[['Purchased']], 
        classifier
    )
    Percentile_Results.append({
        "Classifier": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

Percentile_Results = pd.DataFrame(Percentile_Results)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)


In [99]:
print('Percentile\n',Percentile_Results,'\n')
print('GaussianNaiveBayes\n',resultsNB,'\n')
print('KNN\n',resultsKNN,'\n')
print('RandomForest\n',resultsRF)

# Percentile
#             Classifier  Accuracy  Precision  Recall  F1 Score
# 0  GaussianNaiveBayes    0.8625   0.859786  0.8625  0.856771
# 1                 KNN    0.8000   0.791914  0.8000  0.793605
# 2        RandomForest    0.8500   0.867106  0.8500  0.832283 

# GaussianNaiveBayes
#                Method  Accuracy  Precision   Recall  F1 Score
# 0  RandomOverSampler   0.90625   0.913884  0.90625  0.908161
# 1              SMOTE   0.90625   0.913884  0.90625  0.908161
# 2             ADASYN   0.90000   0.906368  0.90000  0.901786
# 3    BorderlineSMOTE   0.90000   0.906368  0.90000  0.901786
# 4           SVMSMOTE   0.90625   0.913884  0.90625  0.908161 

# KNN
#                Method  Accuracy  Precision   Recall  F1 Score
# 0  RandomOverSampler   0.76250   0.777189  0.76250  0.767915
# 1              SMOTE   0.76875   0.781183  0.76875  0.773465
# 2             ADASYN   0.75000   0.778780  0.75000  0.758725
# 3    BorderlineSMOTE   0.76250   0.773172  0.76250  0.766741
# 4           SVMSMOTE   0.74375   0.757056  0.74375  0.748974 

# RandomForest
#                Method  Accuracy  Precision   Recall  F1 Score
# 0  RandomOverSampler   0.89375   0.892214  0.89375  0.892602
# 1              SMOTE   0.88750   0.887500  0.88750  0.887500
# 2             ADASYN   0.90000   0.903751  0.90000  0.901243
# 3    BorderlineSMOTE   0.88750   0.886243  0.88750  0.886707
# 4           SVMSMOTE   0.87500   0.873559  0.87500  0.874118

Percentile
            Classifier  Accuracy  Precision   Recall  F1 Score
0  GaussianNaiveBayes   0.90000   0.909452  0.90000  0.902280
1                 KNN   0.77500   0.797841  0.77500  0.782043
2        RandomForest   0.96875   0.970145  0.96875  0.969048 

GaussianNaiveBayes
               Method  Accuracy  Precision   Recall  F1 Score
0  RandomOverSampler   0.90625   0.913884  0.90625  0.908161
1              SMOTE   0.90625   0.913884  0.90625  0.908161
2             ADASYN   0.89375   0.901821  0.89375  0.895916
3    BorderlineSMOTE   0.89375   0.901821  0.89375  0.895916
4           SVMSMOTE   0.90625   0.913884  0.90625  0.908161 

KNN
               Method  Accuracy  Precision   Recall  F1 Score
0  RandomOverSampler   0.74375   0.770421  0.74375  0.752245
1              SMOTE   0.76875   0.781183  0.76875  0.773465
2             ADASYN   0.73750   0.766897  0.73750  0.746661
3    BorderlineSMOTE   0.73750   0.757522  0.73750  0.744659
4           SVMSMOTE   0.77500   0.77817