In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, KMeansSMOTE, SVMSMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# kagglehub.dataset_download("shivamb/machine-predictive-maintenance-classification")

maintenance = pd.read_csv("E:\Workings\Thesis\Percentile v SMOTE\Datasets smote\predictive_maintenance.csv")

  maintenance = pd.read_csv("E:\Workings\Thesis\Percentile v SMOTE\Datasets smote\predictive_maintenance.csv")


In [3]:
maintenance

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,No Failure


In [4]:
del maintenance['UDI']
del maintenance['Product ID']
del maintenance['Type']
del maintenance['Failure Type']

In [5]:
maintenance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air temperature [K]      10000 non-null  float64
 1   Process temperature [K]  10000 non-null  float64
 2   Rotational speed [rpm]   10000 non-null  int64  
 3   Torque [Nm]              10000 non-null  float64
 4   Tool wear [min]          10000 non-null  int64  
 5   Target                   10000 non-null  int64  
dtypes: float64(3), int64(3)
memory usage: 468.9 KB


In [6]:
from sklearn.model_selection import train_test_split
from collections import Counter

In [7]:
maintenance['Target'].value_counts()

Target
0    9661
1     339
Name: count, dtype: int64

In [8]:
df_resampled, y_resampled = maintenance.copy(), maintenance['Target']
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['Target']

Original class distribution: Counter({0: 9661, 1: 339})


In [11]:
# Function to calculate the required metrics 
def get_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, y_prob[:, 1])
    return accuracy, precision, recall, f1, auc

# Define classifiers
classifiers = {
    'GaussianNB': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Function to classify and store metrics
def classify_and_store(classifiers, X_train, y_train, X_test, y_test, oversampler=None):
    if oversampler:
        # Apply oversampling
        X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train
    
    # Store the metrics for each classifier
    classifier_metrics = {}
    
    for name, clf in classifiers.items():
        # Train the classifier
        clf.fit(X_train_resampled, y_train_resampled)
        
        # Predict on the test set
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)
        
        # Get metrics
        accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pred, y_prob)
        classifier_metrics[name] = {
            'accuracy': accuracy,
            'precision_0': precision[0],
            'precision_1': precision[1],
            'recall_0': recall[0],
            'recall_1': recall[1],
            'f1_0': f1[0],
            'f1_1': f1[1],
            'auc_0': auc if len(np.unique(y_test)) == 2 else None,
            'auc_1': auc if len(np.unique(y_test)) == 2 else None
        }
    
    return classifier_metrics


# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(df_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Step 1: Perform baseline evaluation (without oversampling) and store results
oversamplers = {
    'RandomOverSampler': RandomOverSampler(random_state=42, sampling_strategy='minority'),
    'SMOTE': SMOTE(random_state=42, sampling_strategy='minority'),
    'ADASYN': ADASYN(random_state=42, sampling_strategy='minority'),
    # Adjusting parameters for KMeansSMOTE
    'KMeansSMOTE': KMeansSMOTE(random_state=42, sampling_strategy='minority', cluster_balance_threshold=0.05, k_neighbors=3),
    'SVMSMOTE': SVMSMOTE(random_state=42, sampling_strategy='minority')
}

# Store all results in a single DataFrame
all_results = []

# Step 2: Baseline (no oversampling)
baseline_metrics = classify_and_store(classifiers, X_train, y_train, X_test, y_test)
for clf_name, m in baseline_metrics.items():
    all_results.append({
        'Classifier': clf_name,
        'Oversampler': 'None',
        'Accuracy': m['accuracy'],
        'Precision(0)': m['precision_0'],
        'Precision(1)': m['precision_1'],
        'Recall (0)': m['recall_0'],
        'Recall (1)': m['recall_1'],
        'F1(0)': m['f1_0'],
        'F1(1)': m['f1_1'],
        'AUC(0)': m['auc_0'],
        'AUC(1)': m['auc_1']
    })

# Step 3: Loop over classifiers and then oversamplers
for clf_name, clf in classifiers.items():
    for sampler_name, oversampler in oversamplers.items():
        try:
            # Wrap the classify_and_store call in a try-except block
            # to gracefully handle potential errors with specific oversamplers
            oversampled_metrics = classify_and_store({clf_name: clf}, X_train, y_train, X_test, y_test, oversampler)

            # Now m will be the dictionary with metrics for the classifier
            m = oversampled_metrics[clf_name]

            all_results.append({
                'Classifier': clf_name,
                'Oversampler': sampler_name,
                'Accuracy': m['accuracy'],
                'Precision(0)': m['precision_0'],
                'Precision(1)': m['precision_1'],
                'Recall (0)': m['recall_0'],
                'Recall (1)': m['recall_1'],
                'F1(0)': m['f1_0'],
                'F1(1)': m['f1_1'],
                'AUC(0)': m['auc_0'],
                'AUC(1)': m['auc_1']
            })
        except RuntimeError as e:
            print(f"Skipping {sampler_name} for {clf_name} due to error: {e}")
            # Optionally, append an entry indicating failure or skip adding results

# Convert to DataFrame
results_df = pd.DataFrame(all_results)

# Step 1: Perform baseline evaluation (without oversampling) and store results
oversamplers = {
    'RandomOverSampler': RandomOverSampler(random_state=42, sampling_strategy='minority'),
    'SMOTE': SMOTE(random_state=42, sampling_strategy='minority'),
    'ADASYN': ADASYN(random_state=42, sampling_strategy='minority'),
    'KMeansSMOTE': KMeansSMOTE(random_state=42, sampling_strategy='minority', cluster_balance_threshold=0.05, k_neighbors=3),
    'SVMSMOTE': SVMSMOTE(random_state=42, sampling_strategy='minority')
}

# Store all results in a single DataFrame
all_results = []

# Step 2: Baseline (no oversampling)
baseline_metrics = classify_and_store(classifiers, X_train, y_train, X_test, y_test)
for clf_name, m in baseline_metrics.items():
    all_results.append({
        'Classifier': clf_name,
        'Oversampler': 'None',
        'Accuracy': m['accuracy'],
        'Precision(0)': m['precision_0'],
        'Precision(1)': m['precision_1'],
        'Recall (0)': m['recall_0'],
        'Recall (1)': m['recall_1'],
        'F1(0)': m['f1_0'],
        'F1(1)': m['f1_1'],
        'AUC(0)': m['auc_0'],
        'AUC(1)': m['auc_1']
    })

# Step 3: Loop over classifiers and then oversamplers
for clf_name, clf in classifiers.items():
    for sampler_name, oversampler in oversamplers.items():
        oversampled_metrics = classify_and_store({clf_name: clf}, X_train, y_train, X_test, y_test, oversampler)
        
        # Now m will be the dictionary with metrics for the classifier
        m = oversampled_metrics[clf_name]
        
        all_results.append({
            'Classifier': clf_name,
            'Oversampler': sampler_name,
            'Accuracy': m['accuracy'],
            'Precision(0)': m['precision_0'],
            'Precision(1)': m['precision_1'],
            'Recall (0)': m['recall_0'],
            'Recall (1)': m['recall_1'],
            'F1(0)': m['f1_0'],
            'F1(1)': m['f1_1'],
            'AUC(0)': m['auc_0'],
            'AUC(1)': m['auc_1']
        })

# Convert to DataFrame
results_df = pd.DataFrame(all_results)


_______________________________________________________________________________________________________________________________________________________________________

In [12]:
maintenance[maintenance['Target']==1].corr()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
Air temperature [K],1.0,0.848669,-0.142306,0.035488,-0.229465,
Process temperature [K],0.848669,1.0,-0.074655,0.01621,-0.104193,
Rotational speed [rpm],-0.142306,-0.074655,1.0,-0.879944,-0.10868,
Torque [Nm],0.035488,0.01621,-0.879944,1.0,-0.021317,
Tool wear [min],-0.229465,-0.104193,-0.10868,-0.021317,1.0,
Target,,,,,,


In [13]:
maintenance[maintenance['Target']==1].corr().abs().sum().sort_values()

Target                     0.000000
Tool wear [min]            1.463655
Torque [Nm]                1.952960
Process temperature [K]    2.043727
Rotational speed [rpm]     2.205584
Air temperature [K]        2.255928
dtype: float64

In [14]:
maintenance[maintenance['Target']==1].count()

Air temperature [K]        339
Process temperature [K]    339
Rotational speed [rpm]     339
Torque [Nm]                339
Tool wear [min]            339
Target                     339
dtype: int64

In [15]:
7722-339

7383

In [16]:
minority_samples = maintenance[maintenance['Target']==1]
random_values = minority_samples['Air temperature [K]'].sample(n=7383, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,Air temperature [K]
0,299.2
1,298.6
2,298.1
3,302.3
4,298.4
...,...
7378,298.4
7379,297.3
7380,300.7
7381,298.9


In [17]:
maintenance.columns

Index(['Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target'],
      dtype='object')

In [18]:
random_values['Process temperature [K]'] = np.nan
random_values['Rotational speed [rpm]'] = np.nan
random_values['Torque [Nm]'] = np.nan
random_values['Tool wear [min]'] = np.nan
random_values['Target'] = 1

In [19]:
random_values

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,299.2,,,,,1
1,298.6,,,,,1
2,298.1,,,,,1
3,302.3,,,,,1
4,298.4,,,,,1
...,...,...,...,...,...,...
7378,298.4,,,,,1
7379,297.3,,,,,1
7380,300.7,,,,,1
7381,298.9,,,,,1


In [20]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [21]:
random_values['Air temperature [K]'] = calculate_percentiles(random_values['Air temperature [K]'])
random_values['Air temperature [K]'] = 'P' + random_values['Air temperature [K]'].round().astype(str,errors='ignore')

In [22]:
random_values['Air temperature [K]'].unique()

array(['P25.0', 'P18.0', 'P11.0', 'P65.0', 'P15.0', 'P56.0', 'P5.0',
       'P72.0', 'P4.0', 'P99.0', 'P89.0', 'P19.0', 'P68.0', 'P98.0',
       'P14.0', 'P78.0', 'P74.0', 'P17.0', 'P90.0', 'P94.0', 'P22.0',
       'P23.0', 'P1.0', 'P44.0', 'P83.0', 'P8.0', 'P48.0', 'P60.0',
       'P42.0', 'P85.0', 'P9.0', 'P86.0', 'P88.0', 'P53.0', 'P51.0',
       'P95.0', 'P32.0', 'P7.0', 'P39.0', 'P63.0', 'P35.0', 'P2.0',
       'P45.0', 'P31.0', 'P33.0', 'P6.0', 'P38.0', 'P29.0', 'P47.0',
       'P46.0', 'P0.0', 'P81.0', 'P27.0', 'P20.0', 'P3.0', 'P30.0',
       'P50.0', 'P69.0', 'P97.0', 'P12.0', 'P100.0', 'P13.0', 'P82.0',
       'P28.0', 'P49.0', 'P52.0', 'P66.0', 'P64.0', 'P10.0', 'P61.0',
       'P21.0', 'P40.0', 'P57.0', 'P24.0', 'P79.0', 'P54.0', 'P75.0',
       'P36.0', 'P91.0', 'P73.0', 'P84.0', 'P96.0', 'P70.0', 'P34.0',
       'P87.0', 'P26.0', 'P76.0', 'P67.0', 'P58.0', 'P80.0', 'P16.0',
       'P62.0', 'P43.0', 'P92.0', 'P55.0', 'P37.0', 'P41.0', 'P71.0',
       'P59.0', 'P77.0', 'P93

In [23]:
maintenanceP = dataframe_to_percentiles(minority_samples)
maintenanceP = 'P' + maintenanceP.round().astype(str,errors='ignore')
maintenanceP['Target'] = 1

In [24]:
maintenanceP

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
50,P22.0,P21.0,P99.0,P1.0,P43.0,1
69,P22.0,P20.0,P72.0,P87.0,P59.0,1
77,P21.0,P19.0,P80.0,P19.0,P75.0,1
160,P16.0,P9.0,P9.0,P72.0,P86.0,1
161,P15.0,P6.0,P73.0,P45.0,P88.0,1
...,...,...,...,...,...,...
9758,P19.0,P30.0,P91.0,P9.0,P90.0,1
9764,P19.0,P25.0,P12.0,P88.0,P5.0,1
9822,P19.0,P24.0,P45.0,P73.0,P57.0,1
9830,P16.0,P23.0,P33.0,P59.0,P72.0,1


In [25]:
ZERO = pd.concat([maintenanceP,random_values], ignore_index=True)
ZERO

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,P22.0,P21.0,P99.0,P1.0,P43.0,1
1,P22.0,P20.0,P72.0,P87.0,P59.0,1
2,P21.0,P19.0,P80.0,P19.0,P75.0,1
3,P16.0,P9.0,P9.0,P72.0,P86.0,1
4,P15.0,P6.0,P73.0,P45.0,P88.0,1
...,...,...,...,...,...,...
7717,P17.0,,,,,1
7718,P5.0,,,,,1
7719,P42.0,,,,,1
7720,P22.0,,,,,1


In [26]:
ZERO['Target'].value_counts()

Target
1    7722
Name: count, dtype: int64

In [27]:


# Step 1: Identify unique values of 'pH'
unique_size = maintenanceP['Air temperature [K]'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'pH') that has missing values
for column in maintenanceP.columns:
    if column != 'Air temperature [K]' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'pH'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'pH' is equal to the current pH_value
            non_missing_values = maintenanceP.loc[maintenanceP['Air temperature [K]'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [28]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['Air temperature [K]']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [29]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7722 entries, 0 to 7721
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Air temperature [K]      7722 non-null   object
 1   Process temperature [K]  7722 non-null   object
 2   Rotational speed [rpm]   7722 non-null   object
 3   Torque [Nm]              7722 non-null   object
 4   Tool wear [min]          7722 non-null   object
 5   Target                   7722 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 362.1+ KB


In [30]:
# myMode[myMode.isnull().any(axis=1)]['Air temperature [K]'].value_counts()
# myMode = myMode.fillna(myMode.iloc[0])

In [31]:
print(myMode.shape)
print(ZERO.shape)

(7722, 6)
(7722, 6)


In [32]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = maintenance.columns.difference(['Target'])
for variable in columns_to_impute:
    mapping_data[variable] = [maintenance[maintenance['Target']==1][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Tool wear [min],Torque [Nm]
P0,295.600,306.100,1181.00,0.00,3.800
P1,296.338,307.238,1223.90,2.00,5.676
P2,296.776,307.476,1235.76,5.76,9.180
P3,296.914,307.714,1255.14,8.14,9.926
P4,297.000,307.800,1258.00,10.52,11.400
...,...,...,...,...,...
P96,303.600,312.400,2673.92,229.96,69.648
P97,303.686,312.400,2703.90,231.86,70.586
P98,303.724,312.824,2724.84,234.00,71.848
P99,303.862,313.100,2829.96,242.96,74.158


In [33]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['Target'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [34]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [35]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [36]:
df_resampled2[df_resampled2['Target']==1]

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
50,298.9,309.1,2861,4.6,143,1
69,298.9,309.0,1410,65.7,191,1
77,298.8,308.9,1455,41.3,208,1
160,298.4,308.2,1282,60.7,216,1
161,298.3,308.1,1412,52.3,218,1
...,...,...,...,...,...,...
9758,298.6,309.8,2271,16.2,218,1
9764,298.5,309.5,1294,66.7,12,1
9822,298.5,309.4,1360,60.9,187,1
9830,298.3,309.3,1337,56.1,206,1


In [37]:
# a[0:339] = df_resampled2[df_resampled2['Target']==1]

In [38]:
a

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,298.900,309.198,2829.96,5.676,143.34,1
1,298.900,309.000,1406.80,65.706,191.00,1
2,298.800,308.900,1459.80,41.366,207.50,1
3,298.400,308.200,1284.00,60.636,216.00,1
4,298.300,308.028,1414.96,52.500,217.44,1
...,...,...,...,...,...,...
7717,298.400,308.700,1421.50,60.700,119.00,1
7718,297.200,308.104,1322.36,44.908,207.00,1
7719,300.700,311.800,1337.00,58.500,189.00,1
7720,298.900,309.198,2829.96,5.676,143.34,1


In [39]:
percent = pd.concat([a, df_resampled2[df_resampled2['Target']==0][0:7722]], ignore_index=True)

In [40]:
percent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15444 entries, 0 to 15443
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air temperature [K]      15444 non-null  float64
 1   Process temperature [K]  15444 non-null  float64
 2   Rotational speed [rpm]   15444 non-null  float64
 3   Torque [Nm]              15444 non-null  float64
 4   Tool wear [min]          15444 non-null  float64
 5   Target                   15444 non-null  int64  
dtypes: float64(5), int64(1)
memory usage: 724.1 KB


In [41]:
X_percent = percent.drop(columns=['Target'])  # Replace with actual target column name
y_percent = percent['Target']  # Replace with actual target column name

# Standardize the 'percent' data as well
X_percent = scaler.fit_transform(X_percent)

def evaluate_custom_oversampling(X_train, y_train, X_test, y_test, classifiers):
    custom_results = []
    
    # Iterate over classifiers first
    for clf_name, clf in classifiers.items():
        # Train the classifier
        clf.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)
        
        # Get metrics
        accuracy, precision, recall, f1, auc = get_metrics(y_test, y_pred, y_prob)
        
        # Append the result
        custom_results.append({
            'Classifier': clf_name,               # Classifier first
            'Oversampler': 'PERCENTILES',              # Oversampler (Custom in this case)
            'Accuracy': accuracy,
            'Precision(0)': precision[0],
            'Precision(1)': precision[1],
            'Recall (0)': recall[0],
            'Recall (1)': recall[1],
            'F1(0)': f1[0],
            'F1(1)': f1[1],
            'AUC(0)': auc,
            'AUC(1)': auc
        })
    
    # Convert to DataFrame and return
    return pd.DataFrame(custom_results)



# Evaluate on custom oversampled data (the 'percent' dataset)
custom_results_df = evaluate_custom_oversampling(X_percent, y_percent, X_test, y_test, classifiers)


# Output: Print or review the custom oversampling results
custom_results_df

Unnamed: 0,Classifier,Oversampler,Accuracy,Precision(0),Precision(1),Recall (0),Recall (1),F1(0),F1(1),AUC(0),AUC(1)
0,GaussianNB,PERCENTILES,0.607,0.996555,0.067938,0.596699,0.934426,0.746452,0.126667,0.871862,0.871862
1,KNN,PERCENTILES,0.859,0.994037,0.157895,0.859722,0.836066,0.922013,0.265625,0.907621,0.907621
2,RandomForest,PERCENTILES,0.8755,0.989571,0.156934,0.880866,0.704918,0.93206,0.256716,0.907896,0.907896


In [42]:
combined_df = pd.concat([results_df, custom_results_df], ignore_index=True)

# Save to Excel
combined_df.to_excel("predictivemaintenance3.xlsx", index=False)
