In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE
)

In [None]:
# kagglehub.dataset_download("whenamancodes/blood-transfusion-dataset")
transfusion = pd.read_csv("C:\\Users\\danis\\Downloads\\Datasets smote\\transfusion.csv")

In [12]:
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [13]:
from sklearn.model_selection import train_test_split
from collections import Counter

In [14]:
transfusion['whether he/she donated blood in March 2007'].value_counts()

whether he/she donated blood in March 2007
0    570
1    178
Name: count, dtype: int64

In [15]:
df_resampled, y_resampled = transfusion.copy(), transfusion['whether he/she donated blood in March 2007']
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['whether he/she donated blood in March 2007']

Original class distribution: Counter({0: 570, 1: 178})


In [16]:
def evaluate_oversampling(X, y, sampler, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    print("Class distribution after SMOTE:", Counter(y_train_resampled))

    
    # Train the classifier
    classifier.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1, X_test, y_test

oversamplers = {
    "RandomOverSampler": RandomOverSampler(sampling_strategy='minority'),
    "SMOTE": SMOTE(sampling_strategy='minority'),
    "ADASYN": ADASYN(sampling_strategy='minority'),
    "BorderlineSMOTE": BorderlineSMOTE(sampling_strategy='minority'),
    "SVMSMOTE": SVMSMOTE(sampling_strategy='minority')
}
resultsNB = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=GaussianNB())
    resultsNB.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsNB = pd.DataFrame(resultsNB)

# Print results
print(resultsNB)

Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({0: 457, 1: 445})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.626667   0.755022  0.626667  0.652270
1              SMOTE  0.733333   0.719129  0.733333  0.725017
2             ADASYN  0.626667   0.738767  0.626667  0.652377
3    BorderlineSMOTE  0.733333   0.695533  0.733333  0.705367
4           SVMSMOTE  0.626667   0.755022  0.626667  0.652270


In [17]:
resultsKNN = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=KNeighborsClassifier())
    resultsKNN.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsKNN = pd.DataFrame(resultsKNN)

# Print results
print(resultsKNN)

Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({0: 457, 1: 445})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.640000   0.695809  0.640000  0.659640
1              SMOTE  0.740000   0.742378  0.740000  0.741156
2             ADASYN  0.726667   0.734212  0.726667  0.730153
3    BorderlineSMOTE  0.720000   0.725119  0.720000  0.722434
4           SVMSMOTE  0.786667   0.773413  0.786667  0.777384


In [18]:
resultsRF = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=RandomForestClassifier())
    resultsRF.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsRF = pd.DataFrame(resultsRF)

# Print results
print(resultsRF)

Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({0: 457, 1: 445})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
Class distribution after SMOTE: Counter({1: 457, 0: 457})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.713333   0.705534  0.713333  0.709155
1              SMOTE  0.653333   0.677714  0.653333  0.663883
2             ADASYN  0.660000   0.675123  0.660000  0.666913
3    BorderlineSMOTE  0.666667   0.684489  0.666667  0.674621
4           SVMSMOTE  0.706667   0.690554  0.706667  0.697519


_______________________________________________________________________________________________________________________________________________________________________

In [19]:
transfusion[transfusion['whether he/she donated blood in March 2007']==1].corr().abs().sum().sort_values()

whether he/she donated blood in March 2007    0.000000
Recency (months)                              1.452802
Time (months)                                 2.551066
Frequency (times)                             2.931335
Monetary (c.c. blood)                         2.931335
dtype: float64

In [20]:
df_resampled2[df_resampled2['whether he/she donated blood in March 2007']==1].count()

Recency (months)                              178
Frequency (times)                             178
Monetary (c.c. blood)                         178
Time (months)                                 178
whether he/she donated blood in March 2007    178
dtype: int64

In [21]:
457-178

279

In [22]:
minority_samples = df_resampled2[df_resampled2['whether he/she donated blood in March 2007']==1]
random_values = minority_samples['Frequency (times)'].sample(n=279, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,Frequency (times)
0,2
1,5
2,2
3,11
4,8
...,...
274,3
275,5
276,8
277,1


In [25]:
transfusion.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'whether he/she donated blood in March 2007'],
      dtype='object')

In [26]:
random_values['Recency (months)'] = np.nan
random_values['Monetary (c.c. blood)'] = np.nan
random_values['Time (months)'] = np.nan
random_values['whether he/she donated blood in March 2007'] = 1

In [27]:
random_values

Unnamed: 0,Frequency (times),Recency (months),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,,,,1
1,5,,,,1
2,2,,,,1
3,11,,,,1
4,8,,,,1
...,...,...,...,...,...
274,3,,,,1
275,5,,,,1
276,8,,,,1
277,1,,,,1


In [28]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [29]:
random_values['Frequency (times)'] = calculate_percentiles(random_values['Frequency (times)'])
random_values['Frequency (times)'] = 'P' + random_values['Frequency (times)'].round().astype(str,errors='ignore')

In [30]:
random_values['Frequency (times)'].unique()

array(['P11.0', 'P37.0', 'P82.0', 'P66.0', 'P87.0', 'P49.0', 'P99.0',
       'P76.0', 'P38.0', 'P58.0', 'P0.0', 'P95.0', 'P88.0', 'P67.0',
       'P39.0', 'P31.0', 'P22.0', 'P83.0', 'P40.0', 'P100.0', 'P79.0',
       'P68.0', 'P23.0', 'P59.0', 'P50.0', 'P92.0', 'P84.0', 'P1.0',
       'P97.0', 'P12.0', 'P98.0', 'P80.0', 'P60.0', 'P41.0', 'P32.0',
       'P51.0', 'P13.0', 'P24.0', 'P69.0', 'P42.0', 'P89.0', 'P77.0',
       'P14.0', 'P70.0', 'P52.0', 'P2.0', 'P61.0', 'P25.0', 'P85.0',
       'P96.0', 'P93.0', 'P90.0', 'P33.0', 'P26.0', 'P43.0', 'P71.0',
       'P27.0', 'P44.0', 'P53.0', 'P78.0', 'P62.0', 'P3.0', 'P34.0',
       'P15.0', 'P94.0', 'P28.0', 'P86.0', 'P72.0', 'P54.0', 'P63.0',
       'P35.0', 'P73.0', 'P45.0', 'P16.0', 'P4.0', 'P17.0', 'P64.0',
       'P5.0', 'P18.0', 'P19.0', 'P6.0', 'P74.0', 'P91.0', 'P55.0',
       'P46.0', 'P36.0', 'P7.0', 'P81.0', 'P20.0', 'P8.0', 'P29.0',
       'P9.0', 'P56.0', 'P75.0', 'P21.0', 'P65.0', 'P30.0', 'P57.0',
       'P47.0', 'P48.0', 'P10

In [31]:
transfusionP = dataframe_to_percentiles(minority_samples)
transfusionP = 'P' + transfusionP.round().astype(str,errors='ignore')
transfusionP['whether he/she donated blood in March 2007'] = 1

In [32]:
minority_samples

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
6,2,7,1750,14,1
...,...,...,...,...,...
680,20,14,3500,69,1
695,17,7,1750,58,1
708,11,2,500,38,1
712,14,2,500,35,1


In [33]:
ZERO = pd.concat([transfusionP,random_values], ignore_index=True)
ZERO

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,P3.0,P100.0,P100.0,P98.0,1
1,P1.0,P84.0,P84.0,P46.0,1
2,P2.0,P90.0,P90.0,P60.0,1
3,P3.0,P94.0,P94.0,P75.0,1
4,P4.0,P58.0,P58.0,P21.0,1
...,...,...,...,...,...
452,,P30.0,,,1
453,,P48.0,,,1
454,,P76.0,,,1
455,,P10.0,,,1


In [34]:


# Step 1: Identify unique values of 'pH'
unique_size = transfusionP['Frequency (times)'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'pH') that has missing values
for column in transfusionP.columns:
    if column != 'Age' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'pH'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'pH' is equal to the current pH_value
            non_missing_values = transfusionP.loc[transfusionP['Frequency (times)'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [35]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['Frequency (times)']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [57]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   Recency (months)                            457 non-null    object
 1   Frequency (times)                           457 non-null    object
 2   Monetary (c.c. blood)                       457 non-null    object
 3   Time (months)                               457 non-null    object
 4   whether he/she donated blood in March 2007  457 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 18.0+ KB


In [54]:
myMode[myMode.isnull().any(axis=1)]['Frequency (times)'].value_counts()
myMode = myMode.fillna(myMode.iloc[0])

In [55]:
print(myMode.shape)
print(ZERO.shape)

(457, 5)
(457, 5)


In [56]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = transfusion.columns.difference(['whether he/she donated blood in March 2007'])
for variable in columns_to_impute:
    mapping_data[variable] = [transfusion[transfusion['whether he/she donated blood in March 2007']==1][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,Frequency (times),Monetary (c.c. blood),Recency (months),Time (months)
P0,1.00,250.0,0.00,2.00
P1,1.00,250.0,0.77,2.00
P2,1.00,250.0,1.54,2.00
P3,1.00,250.0,2.00,2.31
P4,1.00,250.0,2.00,4.00
...,...,...,...,...
P96,21.92,5480.0,16.92,85.44
P97,30.83,7707.5,20.69,88.38
P98,37.22,9305.0,21.00,98.00
P99,43.69,10922.5,22.23,98.00


In [59]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['whether he/she donated blood in March 2007'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [60]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [61]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [62]:
a

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2.00,50.00,12500.0,98.00,1
1,0.77,12.68,3170.0,28.00,1
2,1.54,16.00,4000.0,35.00,1
3,2.00,20.00,5000.0,44.50,1
4,2.00,6.66,1665.0,14.00,1
...,...,...,...,...,...
452,2.00,4.00,1000.0,14.00,1
453,4.00,5.00,1250.0,17.23,1
454,2.00,10.00,2500.0,28.00,1
455,4.00,1.00,250.0,4.05,1


In [65]:
percent = pd.concat([a, df_resampled2[df_resampled2['whether he/she donated blood in March 2007']==0][0:457]], ignore_index=True)

In [66]:
percent['whether he/she donated blood in March 2007'].value_counts()

whether he/she donated blood in March 2007
1    457
0    457
Name: count, dtype: int64

In [67]:
def evaluate_oversampling2(X, y, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(xtest)
    
    # Evaluate metrics
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred, average='weighted')
    recall = recall_score(ytest, y_pred, average='weighted')
    f1 = f1_score(ytest, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1

In [68]:
transfusion.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'whether he/she donated blood in March 2007'],
      dtype='object')

In [69]:
classifiers = {
    "GaussianNaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
}

Percentile_Results = []

for name, classifier in classifiers.items():
    accuracy, precision, recall, f1 = evaluate_oversampling2(
        percent[['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)']]
        , percent[['whether he/she donated blood in March 2007']], 
        classifier
    )
    Percentile_Results.append({
        "Classifier": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

Percentile_Results = pd.DataFrame(Percentile_Results)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)


In [70]:
print('Percentile\n',Percentile_Results,'\n')
print('GaussianNaiveBayes\n',resultsNB,'\n')
print('KNN\n',resultsKNN,'\n')
print('RandomForest\n',resultsRF)

Percentile
            Classifier  Accuracy  Precision    Recall  F1 Score
0  GaussianNaiveBayes  0.733333   0.695533  0.733333  0.705367
1                 KNN  0.793333   0.808202  0.793333  0.798960
2        RandomForest  0.893333   0.890579  0.893333  0.890007 

GaussianNaiveBayes
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.626667   0.755022  0.626667  0.652270
1              SMOTE  0.733333   0.719129  0.733333  0.725017
2             ADASYN  0.626667   0.738767  0.626667  0.652377
3    BorderlineSMOTE  0.733333   0.695533  0.733333  0.705367
4           SVMSMOTE  0.626667   0.755022  0.626667  0.652270 

KNN
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.640000   0.695809  0.640000  0.659640
1              SMOTE  0.740000   0.742378  0.740000  0.741156
2             ADASYN  0.726667   0.734212  0.726667  0.730153
3    BorderlineSMOTE  0.720000   0.725119  0.720000  0.722434
4           SVMSMOTE  0.7