In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, KMeansSMOTE, SVMSMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [43]:
# download data
# kagglehub.dataset_download("sameepvani/nasa-nearest-earth-objects")

In [44]:
nasa = pd.read_csv("/content/nearest earth object NASA.csv")

In [45]:
nasa

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,5.483974e+07,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.265800,0.594347,73588.726663,6.143813e+07,Earth,False,20.00,True
2,2512244,512244 (2015 YE18),0.722030,1.614507,114258.692129,4.979872e+07,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,2.543497e+07,Earth,False,22.20,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,4.627557e+07,Earth,False,20.09,True
...,...,...,...,...,...,...,...,...,...,...
90831,3763337,(2016 VX1),0.026580,0.059435,52078.886692,1.230039e+07,Earth,False,25.00,False
90832,3837603,(2019 AD3),0.016771,0.037501,46114.605073,5.432121e+07,Earth,False,26.00,False
90833,54017201,(2020 JP3),0.031956,0.071456,7566.807732,2.840077e+07,Earth,False,24.60,False
90834,54115824,(2021 CN5),0.007321,0.016370,69199.154484,6.869206e+07,Earth,False,27.80,False


In [46]:
del nasa['id']
del nasa['name']
del nasa['orbiting_body']
del nasa['sentry_object']
del nasa['est_diameter_max']

In [47]:
nasa

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,13569.249224,5.483974e+07,16.73,False
1,0.265800,73588.726663,6.143813e+07,20.00,True
2,0.722030,114258.692129,4.979872e+07,17.83,False
3,0.096506,24764.303138,2.543497e+07,22.20,False
4,0.255009,42737.733765,4.627557e+07,20.09,True
...,...,...,...,...,...
90831,0.026580,52078.886692,1.230039e+07,25.00,False
90832,0.016771,46114.605073,5.432121e+07,26.00,False
90833,0.031956,7566.807732,2.840077e+07,24.60,False
90834,0.007321,69199.154484,6.869206e+07,27.80,False


In [48]:
from sklearn.preprocessing import LabelEncoder

In [49]:
label_encoder = LabelEncoder()
nasa['hazardous'] = label_encoder.fit_transform(nasa['hazardous'])

In [50]:
nasa

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,13569.249224,5.483974e+07,16.73,0
1,0.265800,73588.726663,6.143813e+07,20.00,1
2,0.722030,114258.692129,4.979872e+07,17.83,0
3,0.096506,24764.303138,2.543497e+07,22.20,0
4,0.255009,42737.733765,4.627557e+07,20.09,1
...,...,...,...,...,...
90831,0.026580,52078.886692,1.230039e+07,25.00,0
90832,0.016771,46114.605073,5.432121e+07,26.00,0
90833,0.031956,7566.807732,2.840077e+07,24.60,0
90834,0.007321,69199.154484,6.869206e+07,27.80,0


In [51]:
from imblearn.datasets import make_imbalance
from sklearn.model_selection import train_test_split
from collections import Counter

In [52]:
nasa['hazardous'].value_counts()

Unnamed: 0_level_0,count
hazardous,Unnamed: 1_level_1
0,81996
1,8840


In [53]:
df_resampled, y_resampled = nasa.copy(), nasa['hazardous']
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method
print("Original class distribution:", Counter(y_resampled))
del df_resampled['hazardous']

Original class distribution: Counter({0: 81996, 1: 8840})


In [54]:
# Function to calculate the required metrics
def get_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, y_prob[:, 1])
    return accuracy, precision, recall, f1, auc

# Define classifiers
classifiers = {
    'GaussianNB': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Function to classify and store metrics
def classify_and_store(classifiers, X_train, y_train, X_test, y_test, oversampler=None):
    if oversampler:
        # Apply oversampling
        X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    # Store the metrics for each classifier
    classifier_metrics = {}

    for name, clf in classifiers.items():
        # Train the classifier
        clf.fit(X_train_resampled, y_train_resampled)

        # Predict on the test set
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)

        # Get metrics
        accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pred, y_prob)
        classifier_metrics[name] = {
            'accuracy': accuracy,
            'precision_0': precision[0],
            'precision_1': precision[1],
            'recall_0': recall[0],
            'recall_1': recall[1],
            'f1_0': f1[0],
            'f1_1': f1[1],
            'auc_0': auc if len(np.unique(y_test)) == 2 else None,
            'auc_1': auc if len(np.unique(y_test)) == 2 else None
        }

    return classifier_metrics


# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(df_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Step 1: Perform baseline evaluation (without oversampling) and store results
oversamplers = {
    'RandomOverSampler': RandomOverSampler(random_state=42, sampling_strategy='minority'),
    'SMOTE': SMOTE(random_state=42, sampling_strategy='minority'),
    'ADASYN': ADASYN(random_state=42, sampling_strategy='minority'),
    # Adjusting parameters for KMeansSMOTE
    'KMeansSMOTE': KMeansSMOTE(random_state=42, sampling_strategy='minority', cluster_balance_threshold=0.05, k_neighbors=3),
    'SVMSMOTE': SVMSMOTE(random_state=42, sampling_strategy='minority')
}

# Store all results in a single DataFrame
all_results = []

# Step 2: Baseline (no oversampling)
baseline_metrics = classify_and_store(classifiers, X_train, y_train, X_test, y_test)
for clf_name, m in baseline_metrics.items():
    all_results.append({
        'Classifier': clf_name,
        'Oversampler': 'None',
        'Accuracy': m['accuracy'],
        'Precision(0)': m['precision_0'],
        'Precision(1)': m['precision_1'],
        'Recall (0)': m['recall_0'],
        'Recall (1)': m['recall_1'],
        'F1(0)': m['f1_0'],
        'F1(1)': m['f1_1'],
        'AUC(0)': m['auc_0'],
        'AUC(1)': m['auc_1']
    })

# Step 3: Loop over classifiers and then oversamplers
for clf_name, clf in classifiers.items():
    for sampler_name, oversampler in oversamplers.items():
        try:
            # Wrap the classify_and_store call in a try-except block
            # to gracefully handle potential errors with specific oversamplers
            oversampled_metrics = classify_and_store({clf_name: clf}, X_train, y_train, X_test, y_test, oversampler)

            # Now m will be the dictionary with metrics for the classifier
            m = oversampled_metrics[clf_name]

            all_results.append({
                'Classifier': clf_name,
                'Oversampler': sampler_name,
                'Accuracy': m['accuracy'],
                'Precision(0)': m['precision_0'],
                'Precision(1)': m['precision_1'],
                'Recall (0)': m['recall_0'],
                'Recall (1)': m['recall_1'],
                'F1(0)': m['f1_0'],
                'F1(1)': m['f1_1'],
                'AUC(0)': m['auc_0'],
                'AUC(1)': m['auc_1']
            })
        except RuntimeError as e:
            print(f"Skipping {sampler_name} for {clf_name} due to error: {e}")
            # Optionally, append an entry indicating failure or skip adding results

# Convert to DataFrame
results_df = pd.DataFrame(all_results)

In [56]:
results_df

Unnamed: 0,Classifier,Oversampler,Accuracy,Precision(0),Precision(1),Recall (0),Recall (1),F1(0),F1(1),AUC(0),AUC(1)
0,GaussianNB,,0.860194,0.926062,0.281637,0.918852,0.302487,0.922443,0.29169,0.867825,0.867825
1,KNN,,0.901695,0.93054,0.475239,0.963258,0.316368,0.946616,0.379861,0.855326,0.855326
2,RandomForest,,0.920299,0.939903,0.624446,0.974208,0.40775,0.956748,0.493352,0.931076,0.931076
3,GaussianNB,RandomOverSampler,0.774439,0.986517,0.284047,0.761117,0.901099,0.859282,0.431938,0.867798,0.867798
4,GaussianNB,SMOTE,0.773503,0.987192,0.283877,0.759535,0.906304,0.858528,0.432335,0.867907,0.867907
5,GaussianNB,ADASYN,0.770586,0.981102,0.274792,0.761117,0.860613,0.857221,0.416573,0.860457,0.860457
6,GaussianNB,KMeansSMOTE,0.864707,0.920279,0.262541,0.931139,0.233083,0.925677,0.246936,0.866214,0.866214
7,GaussianNB,SVMSMOTE,0.772402,0.990903,0.286551,0.755399,0.934066,0.857271,0.438561,0.874389,0.874389
8,KNN,RandomOverSampler,0.8488,0.968519,0.356862,0.86088,0.73395,0.911533,0.480227,0.854167,0.854167
9,KNN,SMOTE,0.841259,0.970366,0.346369,0.850538,0.753036,0.906509,0.47449,0.872803,0.872803


_______________________________________________________________________________________________________________________________________________________________________

In [57]:
nasa[nasa['hazardous']==1].corr().abs().sum().sort_values()

Unnamed: 0,0
hazardous,0.0
miss_distance,1.334394
relative_velocity,1.561733
est_diameter_min,2.069745
absolute_magnitude,2.105509


In [58]:
df_resampled2[df_resampled2['hazardous']==1].count()

Unnamed: 0,0
est_diameter_min,8840
relative_velocity,8840
miss_distance,8840
absolute_magnitude,8840
hazardous,8840


In [59]:
# carP = df_resampled2.rank(pct=True)*100
# carP = 'P' + carP.round().astype(str,errors='ignore')

In [60]:
65557-8840

56717

In [61]:
minority_samples = nasa[nasa['hazardous']==1]
random_values = minority_samples['absolute_magnitude'].sample(n=56717, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,absolute_magnitude
0,19.86
1,20.65
2,22.10
3,20.80
4,19.90
...,...
56712,21.30
56713,21.60
56714,18.02
56715,18.80


In [None]:
nasa.columns

In [62]:
random_values['est_diameter_min'] = np.nan
random_values['relative_velocity'] = np.nan
random_values['miss_distance'] = np.nan
random_values['hazardous'] = 1

In [63]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [64]:
random_values['absolute_magnitude'] = calculate_percentiles(random_values['absolute_magnitude'])
random_values['absolute_magnitude'] = 'P' + random_values['absolute_magnitude'].round().astype(str,errors='ignore')

In [65]:
random_values['absolute_magnitude'].unique()

array(['P30.0', 'P52.0', 'P99.0', 'P55.0', 'P31.0', 'P47.0', 'P54.0',
       'P44.0', 'P21.0', 'P86.0', 'P72.0', 'P61.0', 'P20.0', 'P64.0',
       'P49.0', 'P68.0', 'P0.0', 'P42.0', 'P46.0', 'P76.0', 'P37.0',
       'P13.0', 'P5.0', 'P41.0', 'P19.0', 'P33.0', 'P79.0', 'P11.0',
       'P36.0', 'P22.0', 'P65.0', 'P12.0', 'P85.0', 'P24.0', 'P97.0',
       'P71.0', 'P26.0', 'P94.0', 'P8.0', 'P35.0', 'P82.0', 'P39.0',
       'P7.0', 'P90.0', 'P14.0', 'P4.0', 'P98.0', 'P81.0', 'P59.0',
       'P29.0', 'P74.0', 'P10.0', 'P70.0', 'P58.0', 'P92.0', 'P62.0',
       'P25.0', 'P93.0', 'P43.0', 'P67.0', 'P16.0', 'P57.0', 'P75.0',
       'P9.0', 'P17.0', 'P89.0', 'P28.0', 'P15.0', 'P3.0', 'P32.0',
       'P1.0', 'P45.0', 'P23.0', 'P40.0', 'P2.0', 'P51.0', 'P6.0',
       'P60.0', 'P18.0', 'P100.0', 'P50.0', 'P38.0', 'P80.0', 'P66.0',
       'P87.0', 'P53.0', 'P27.0', 'P69.0', 'P91.0', 'P56.0', 'P77.0',
       'P73.0', 'P83.0', 'P95.0', 'P48.0', 'P88.0', 'P63.0', 'P34.0',
       'P96.0', 'P78.0', 'P84

In [66]:
nasaP = dataframe_to_percentiles(minority_samples)
nasaP = 'P' + nasaP.round().astype(str,errors='ignore')
nasaP['hazardous'] = 1

In [67]:
nasaP

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
1,P65.0,P70.0,P79.0,P33.0,1
4,P64.0,P25.0,P58.0,P35.0,1
10,P63.0,P50.0,P46.0,P36.0,1
23,P48.0,P95.0,P17.0,P50.0,1
27,P3.0,P34.0,P39.0,P94.0,1
...,...,...,...,...,...
90782,P20.0,P63.0,P73.0,P81.0,1
90794,P1.0,P39.0,P69.0,P99.0,1
90811,P79.0,P13.0,P95.0,P21.0,1
90812,P97.0,P42.0,P59.0,P3.0,1


In [68]:
ZERO = pd.concat([nasaP,random_values], ignore_index=True)
ZERO

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,P65.0,P70.0,P79.0,P33.0,1
1,P64.0,P25.0,P58.0,P35.0,1
2,P63.0,P50.0,P46.0,P36.0,1
3,P48.0,P95.0,P17.0,P50.0,1
4,P3.0,P34.0,P39.0,P94.0,1
...,...,...,...,...,...
65552,,,,P74.0,1
65553,,,,P85.0,1
65554,,,,P7.0,1
65555,,,,P14.0,1


In [69]:


# Step 1: Identify unique values of 'pH'
unique_size = nasaP['absolute_magnitude'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'pH') that has missing values
for column in nasaP.columns:
    if column != 'absolute_magnitude' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}

        # Iterate over each unique value of 'pH'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'pH' is equal to the current pH_value
            non_missing_values = nasaP.loc[nasaP['absolute_magnitude'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [70]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['absolute_magnitude']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [71]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65557 entries, 0 to 65556
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   est_diameter_min    65557 non-null  object
 1   relative_velocity   65557 non-null  object
 2   miss_distance       65557 non-null  object
 3   absolute_magnitude  65557 non-null  object
 4   hazardous           65557 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.5+ MB


In [72]:
# myMode[myMode.isnull().any(axis=1)]['Age'].value_counts()
# myMode = myMode.fillna(myMode.iloc[0])

In [73]:
print(myMode.shape)
print(ZERO.shape)

(65557, 5)
(65557, 5)


In [74]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = nasa.columns.difference(['hazardous'])
for variable in columns_to_impute:
    mapping_data[variable] = [nasa[nasa['hazardous']==1][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,absolute_magnitude,est_diameter_min,miss_distance,relative_velocity
P0,14.04,0.088015,1.432727e+05,5908.291826
P1,16.11,0.104847,3.064562e+06,18246.126515
P2,16.59,0.105817,4.199650e+06,21515.177473
P3,17.10,0.110804,5.134106e+06,23209.603904
P4,17.40,0.110804,5.981734e+06,24601.152489
...,...,...,...,...
P96,21.90,0.880147,7.232378e+07,117588.328525
P97,21.90,1.010543,7.300132e+07,123378.594723
P98,22.00,1.278071,7.360449e+07,129829.777855
P99,22.02,1.594245,7.423233e+07,140707.035702


In [75]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['hazardous'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [76]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [77]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [78]:
# a[0:200] = df_resampled2[df_resampled2['Purchased']==1]

In [79]:
a

Unnamed: 0,est_diameter_min,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,0.258556,73964.741586,6.117170e+07,20.00,1
1,0.253837,43017.810183,4.651343e+07,20.06,1
2,0.249204,58658.010358,3.814665e+07,20.10,1
3,0.198863,114397.000352,1.506881e+07,20.60,1
4,0.110804,48444.753387,3.290172e+07,21.90,1
...,...,...,...,...,...
65552,0.146068,100002.365708,7.667261e+06,21.30,1
65553,0.128397,46715.062410,5.364725e+07,21.61,1
65554,0.646480,129829.777855,6.434242e+07,18.07,1
65555,0.461907,42253.937868,5.640048e+07,18.80,1


In [80]:
percent = pd.concat([a, df_resampled2[df_resampled2['hazardous']==0][0:65557]], ignore_index=True)

In [81]:
Counter(percent['hazardous'])

Counter({1: 65557, 0: 65557})

In [82]:
X_percent = percent.drop(columns=['hazardous'])  # Replace with actual target column name
y_percent = percent['hazardous']  # Replace with actual target column name

# Standardize the 'percent' data as well
X_percent = scaler.fit_transform(X_percent)

def evaluate_custom_oversampling(X_train, y_train, X_test, y_test, classifiers):
    custom_results = []

    # Iterate over classifiers first
    for clf_name, clf in classifiers.items():
        # Train the classifier
        clf.fit(X_train, y_train)

        # Predict on the test set
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)

        # Get metrics
        accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pred, y_prob)

        # Append the result
        custom_results.append({
            'Classifier': clf_name,               # Classifier first
            'Oversampler': 'PERCENTILES',              # Oversampler (Custom in this case)
            'Accuracy': accuracy,
            'Precision(0)': precision[0],
            'Precision(1)': precision[1],
            'Recall (0)': recall[0],
            'Recall (1)': recall[1],
            'F1(0)': f1[0],
            'F1(1)': f1[1],
            'AUC(0)': auc,
            'AUC(1)': auc
        })

    # Convert to DataFrame and return
    return pd.DataFrame(custom_results)



# Evaluate on custom oversampled data (the 'percent' dataset)
custom_results_df = evaluate_custom_oversampling(X_percent, y_percent, X_test, y_test, classifiers)


# Output: Print or review the custom oversampling results
custom_results_df

Unnamed: 0,Classifier,Oversampler,Accuracy,Precision(0),Precision(1),Recall (0),Recall (1),F1(0),F1(1),AUC(0),AUC(1)
0,GaussianNB,PERCENTILES,0.642944,1.0,0.210443,0.60539,1.0,0.754197,0.347712,0.862712,0.862712
1,KNN,PERCENTILES,0.844011,0.92066,0.223335,0.905651,0.257953,0.913094,0.239399,0.74097,0.74097
2,RandomForest,PERCENTILES,0.810821,0.908765,0.122792,0.87919,0.160787,0.893733,0.139244,0.744101,0.744101


In [83]:
combined_df = pd.concat([results_df, custom_results_df], ignore_index=True)

# Save to Excel
combined_df.to_excel("nasa3.xlsx", index=False)
