In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE
)

In [None]:
# download data
# kagglehub.dataset_download("sameepvani/nasa-nearest-earth-objects") 

In [4]:
nasa = pd.read_csv("C:\\Users\\danis\\Downloads\\Datasets smote\\nearest earth object NASA.csv")

In [5]:
nasa

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,5.483974e+07,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.265800,0.594347,73588.726663,6.143813e+07,Earth,False,20.00,True
2,2512244,512244 (2015 YE18),0.722030,1.614507,114258.692129,4.979872e+07,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,2.543497e+07,Earth,False,22.20,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,4.627557e+07,Earth,False,20.09,True
...,...,...,...,...,...,...,...,...,...,...
90831,3763337,(2016 VX1),0.026580,0.059435,52078.886692,1.230039e+07,Earth,False,25.00,False
90832,3837603,(2019 AD3),0.016771,0.037501,46114.605073,5.432121e+07,Earth,False,26.00,False
90833,54017201,(2020 JP3),0.031956,0.071456,7566.807732,2.840077e+07,Earth,False,24.60,False
90834,54115824,(2021 CN5),0.007321,0.016370,69199.154484,6.869206e+07,Earth,False,27.80,False


In [None]:
del nasa['id']
del nasa['name']
del nasa['orbiting_body']
del nasa['sentry_object']
del nasa['est_diameter_max']

In [7]:
nasa

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,16.73,False
1,0.265800,0.594347,73588.726663,6.143813e+07,20.00,True
2,0.722030,1.614507,114258.692129,4.979872e+07,17.83,False
3,0.096506,0.215794,24764.303138,2.543497e+07,22.20,False
4,0.255009,0.570217,42737.733765,4.627557e+07,20.09,True
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00,False
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00,False
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60,False
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80,False


In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
label_encoder = LabelEncoder()
nasa['hazardous'] = label_encoder.fit_transform(nasa['hazardous'])

In [10]:
nasa

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,16.73,0
1,0.265800,0.594347,73588.726663,6.143813e+07,20.00,1
2,0.722030,1.614507,114258.692129,4.979872e+07,17.83,0
3,0.096506,0.215794,24764.303138,2.543497e+07,22.20,0
4,0.255009,0.570217,42737.733765,4.627557e+07,20.09,1
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00,0
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00,0
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60,0
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80,0


In [11]:
from imblearn.datasets import make_imbalance
from sklearn.model_selection import train_test_split
from collections import Counter

In [12]:
nasa['hazardous'].value_counts()

hazardous
0    81996
1     8840
Name: count, dtype: int64

In [136]:
df_resampled, y_resampled = nasa.copy(), nasa['hazardous']
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['hazardous']

Original class distribution: Counter({0: 81996, 1: 8840})


In [137]:
def evaluate_oversampling(X, y, sampler, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    print("Class distribution after SMOTE:", Counter(y_train_resampled))

    
    # Train the classifier
    classifier.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1, X_test, y_test

oversamplers = {
    "RandomOverSampler": RandomOverSampler(sampling_strategy='minority'),
    "SMOTE": SMOTE(sampling_strategy='minority'),
    "ADASYN": ADASYN(sampling_strategy='minority'),
    "BorderlineSMOTE": BorderlineSMOTE(sampling_strategy='minority'),
    "SVMSMOTE": SVMSMOTE(sampling_strategy='minority')
}
resultsNB = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=GaussianNB())
    resultsNB.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsNB = pd.DataFrame(resultsNB)

# Print results
print(resultsNB)

Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({1: 68357, 0: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.681308   0.861955  0.681308  0.745435
1              SMOTE  0.634522   0.866857  0.634522  0.710004
2             ADASYN  0.607552   0.867286  0.607552  0.688316
3    BorderlineSMOTE  0.652246   0.864668  0.652246  0.723757
4           SVMSMOTE  0.793813   0.857950  0.793813  0.821088


In [138]:
resultsKNN = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=KNeighborsClassifier())
    resultsKNN.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsKNN = pd.DataFrame(resultsKNN)

# Print results
print(resultsKNN)

Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({1: 68357, 0: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.724020   0.843825  0.724020  0.772785
1              SMOTE  0.677400   0.847283  0.677400  0.741245
2             ADASYN  0.662649   0.847694  0.662649  0.730614
3    BorderlineSMOTE  0.723415   0.845718  0.723415  0.772764
4           SVMSMOTE  0.745707   0.845869  0.745707  0.787382


In [139]:
resultsRF = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=RandomForestClassifier())
    resultsRF.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsRF = pd.DataFrame(resultsRF)

# Print results
print(resultsRF)

Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({1: 68357, 0: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
Class distribution after SMOTE: Counter({0: 65557, 1: 65557})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.910392   0.911557  0.910392  0.910962
1              SMOTE  0.909621   0.914001  0.909621  0.911650
2             ADASYN  0.910282   0.914585  0.910282  0.912275
3    BorderlineSMOTE  0.907365   0.912954  0.907365  0.909917
4           SVMSMOTE  0.906649   0.911155  0.906649  0.908744


_______________________________________________________________________________________________________________________________________________________________________

In [19]:
nasa[nasa['hazardous']==1].corr().abs().sum().sort_values()

hazardous             0.000000
miss_distance         1.334394
relative_velocity     1.561733
est_diameter_min      2.069745
absolute_magnitude    2.105509
dtype: float64

In [20]:
df_resampled2[df_resampled2['hazardous']==1].count()

est_diameter_min      8840
est_diameter_max      8840
relative_velocity     8840
miss_distance         8840
absolute_magnitude    8840
hazardous             8840
dtype: int64

In [None]:
# carP = df_resampled2.rank(pct=True)*100
# carP = 'P' + carP.round().astype(str,errors='ignore')

In [21]:
65557-8840

56717

In [None]:
minority_samples = nasa[nasa['hazardous']==1]
random_values = minority_samples['absolute_magnitude'].sample(n=56717, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,absolute_magnitude
0,20.85
1,17.12
2,20.60
3,22.00
4,17.99
...,...
56712,19.33
56713,20.64
56714,16.36
56715,18.60


In [23]:
nasa.columns

Index(['est_diameter_min', 'relative_velocity', 'miss_distance',
       'absolute_magnitude', 'hazardous'],
      dtype='object')

In [94]:
random_values['est_diameter_min'] = np.nan
random_values['relative_velocity'] = np.nan
random_values['miss_distance'] = np.nan
random_values['hazardous'] = 1

In [87]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [None]:
random_values['absolute_magnitude'] = calculate_percentiles(random_values['absolute_magnitude'])
random_values['absolute_magnitude'] = 'P' + random_values['absolute_magnitude'].round().astype(str,errors='ignore')

In [116]:
random_values['absolute_magnitude'].unique()

array(['P58.0', 'P3.0', 'P50.0', 'P98.0', 'P6.0', 'P68.0', 'P23.0',
       'P29.0', 'P28.0', 'P86.0', 'P76.0', 'P4.0', 'P24.0', 'P9.0',
       'P19.0', 'P59.0', 'P55.0', 'P89.0', 'P10.0', 'P90.0', 'P71.0',
       'P16.0', 'P38.0', 'P17.0', 'P32.0', 'P47.0', 'P52.0', 'P5.0',
       'P44.0', 'P65.0', 'P20.0', 'P31.0', 'P62.0', 'P41.0', 'P1.0',
       'P21.0', 'P8.0', 'P72.0', 'P94.0', 'P64.0', 'P100.0', 'P80.0',
       'P0.0', 'P15.0', 'P82.0', 'P43.0', 'P79.0', 'P2.0', 'P18.0',
       'P85.0', 'P40.0', 'P13.0', 'P14.0', 'P37.0', 'P33.0', 'P35.0',
       'P26.0', 'P22.0', 'P49.0', 'P93.0', 'P11.0', 'P53.0', 'P97.0',
       'P75.0', 'P67.0', 'P7.0', 'P39.0', 'P25.0', 'P61.0', 'P46.0',
       'P12.0', 'P45.0', 'P92.0', 'P30.0', 'P81.0', 'P36.0', 'P99.0',
       'P56.0', 'P57.0', 'P74.0', 'P66.0', 'P77.0', 'P69.0', 'P42.0',
       'P73.0', 'P87.0', 'P27.0', 'P91.0', 'P83.0', 'P95.0', 'P48.0',
       'P63.0', 'P51.0', 'P60.0', 'P34.0', 'P88.0', 'P54.0', 'P70.0',
       'P78.0', 'P96.0', 'P84

In [None]:
nasaP = dataframe_to_percentiles(minority_samples)
nasaP = 'P' + nasaP.round().astype(str,errors='ignore')
nasaP['hazardous'] = 1

In [123]:
nasaP

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
1,P65.0,P70.0,P79.0,P33.0,1
4,P64.0,P25.0,P58.0,P35.0,1
10,P63.0,P50.0,P46.0,P36.0,1
23,P48.0,P95.0,P17.0,P50.0,1
27,P3.0,P34.0,P39.0,P94.0,1
...,...,...,...,...,...
90782,P20.0,P63.0,P73.0,P81.0,1
90794,P1.0,P39.0,P69.0,P99.0,1
90811,P79.0,P13.0,P95.0,P21.0,1
90812,P97.0,P42.0,P59.0,P3.0,1


In [124]:
ZERO = pd.concat([nasaP,random_values], ignore_index=True)
ZERO

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,P65.0,P70.0,P79.0,P33.0,1
1,P64.0,P25.0,P58.0,P35.0,1
2,P63.0,P50.0,P46.0,P36.0,1
3,P48.0,P95.0,P17.0,P50.0,1
4,P3.0,P34.0,P39.0,P94.0,1
...,...,...,...,...,...
65552,,,,P20.0,1
65553,,,,P52.0,1
65554,,,,P1.0,1
65555,,,,P12.0,1


In [125]:


# Step 1: Identify unique values of 'pH'
unique_size = nasaP['absolute_magnitude'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'pH') that has missing values
for column in nasaP.columns:
    if column != 'absolute_magnitude' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'pH'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'pH' is equal to the current pH_value
            non_missing_values = nasaP.loc[nasaP['absolute_magnitude'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [127]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['absolute_magnitude']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [128]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65557 entries, 0 to 65556
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   est_diameter_min    65557 non-null  object
 1   relative_velocity   65557 non-null  object
 2   miss_distance       65557 non-null  object
 3   absolute_magnitude  65557 non-null  object
 4   hazardous           65557 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.5+ MB


In [130]:
# myMode[myMode.isnull().any(axis=1)]['Age'].value_counts()
# myMode = myMode.fillna(myMode.iloc[0])

In [129]:
print(myMode.shape)
print(ZERO.shape)

(65557, 5)
(65557, 5)


In [131]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = nasa.columns.difference(['hazardous'])
for variable in columns_to_impute:
    mapping_data[variable] = [nasa[nasa['hazardous']==1][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,absolute_magnitude,est_diameter_min,miss_distance,relative_velocity
P0,14.04,0.088015,1.432727e+05,5908.291826
P1,16.11,0.104847,3.064562e+06,18246.126515
P2,16.59,0.105817,4.199650e+06,21515.177473
P3,17.10,0.110804,5.134106e+06,23209.603904
P4,17.40,0.110804,5.981734e+06,24601.152489
...,...,...,...,...
P96,21.90,0.880147,7.232378e+07,117588.328525
P97,21.90,1.010543,7.300132e+07,123378.594723
P98,22.00,1.278071,7.360449e+07,129829.777855
P99,22.02,1.594245,7.423233e+07,140707.035702


In [132]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['hazardous'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [133]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [134]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [35]:
# a[0:200] = df_resampled2[df_resampled2['Purchased']==1]

In [135]:
a

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,0.258556,73964.741586,6.117170e+07,20.00,1
1,0.253837,43017.810183,4.651343e+07,20.06,1
2,0.249204,58658.010358,3.814665e+07,20.10,1
3,0.198863,114397.000352,1.506881e+07,20.60,1
4,0.110804,48444.753387,3.290172e+07,21.90,1
...,...,...,...,...,...
65552,0.363542,60160.412592,3.675092e+07,19.32,1
65553,0.198863,45458.215694,5.364725e+07,20.63,1
65554,1.594245,129829.777855,5.207575e+07,16.11,1
65555,0.497227,80924.641588,8.344978e+06,18.64,1


In [140]:
percent = pd.concat([a, df_resampled2[df_resampled2['hazardous']==0][0:65557]], ignore_index=True)

In [143]:
Counter(percent['hazardous'])

Counter({1: 65557, 0: 65557})

In [144]:
def evaluate_oversampling2(X, y, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(xtest)
    
    # Evaluate metrics
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred, average='weighted')
    recall = recall_score(ytest, y_pred, average='weighted')
    f1 = f1_score(ytest, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1

In [146]:
classifiers = {
    "GaussianNaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
}

Percentile_Results = []

for name, classifier in classifiers.items():
    accuracy, precision, recall, f1 = evaluate_oversampling2(
        percent[['est_diameter_min',	'relative_velocity', 'miss_distance', 'absolute_magnitude']]
        , percent[['hazardous']], 
        classifier
    )
    Percentile_Results.append({
        "Classifier": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

Percentile_Results = pd.DataFrame(Percentile_Results)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)


In [147]:
print('Percentile\n',Percentile_Results,'\n')
print('GaussianNaiveBayes\n',resultsNB,'\n')
print('KNN\n',resultsKNN,'\n')
print('RandomForest\n',resultsRF)


Percentile
            Classifier  Accuracy  Precision    Recall  F1 Score
0  GaussianNaiveBayes  0.759247   0.857830  0.759247  0.798975
1                 KNN  0.889586   0.839332  0.889586  0.859287
2        RandomForest  0.936317   0.934682  0.936317  0.923794 

GaussianNaiveBayes
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.681308   0.861955  0.681308  0.745435
1              SMOTE  0.634522   0.866857  0.634522  0.710004
2             ADASYN  0.607552   0.867286  0.607552  0.688316
3    BorderlineSMOTE  0.652246   0.864668  0.652246  0.723757
4           SVMSMOTE  0.793813   0.857950  0.793813  0.821088 

KNN
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.724020   0.843825  0.724020  0.772785
1              SMOTE  0.677400   0.847283  0.677400  0.741245
2             ADASYN  0.662649   0.847694  0.662649  0.730614
3    BorderlineSMOTE  0.723415   0.845718  0.723415  0.772764
4           SVMSMOTE  0.7