# Handling Atypical Values — Approach 2

Adding a "problematic" feature  
Instead of deleting outliers, create a binary feature:  
0 = normal data, 1 = extreme/unusual case  
This allows the model to learn about unusual medical conditions without removing them.  

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the raw dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
# Flag rows with BMI > 60 as problematic (extremely high BMI, rare medical condition).
# Every row starts at 0 (normal); only rows matching the mask are raised to 1.
extreme_bmi_mask = df["bmi"].gt(60)
df["problematic"] = 0
df.loc[extreme_bmi_mask, "problematic"] = 1

In [4]:
# Flag rows whose lab values disagree with the recorded diabetes label
# (label inconsistency / suspicious case). Rows flagged by an earlier rule keep their 1.
# NOTE(review): this flag is computed from the `diabetes` column itself. If `diabetes`
# is the prediction target, the feature leaks the label and cannot be reproduced at
# inference time — confirm before training on it.
low_markers = df["HbA1c_level"].lt(5.7) & df["blood_glucose_level"].lt(200)
high_markers = df["HbA1c_level"].ge(6.4) & df["blood_glucose_level"].ge(200)
labelled_diabetic = df["diabetes"].eq(1)
labelled_non_diabetic = df["diabetes"].eq(0)

# HbA1c < 5.7 and glucose < 200, yet diabetes == 1.
df.loc[low_markers & labelled_diabetic, "problematic"] = 1

# HbA1c >= 6.4 and glucose >= 200, yet diabetes == 0.
df.loc[high_markers & labelled_non_diabetic, "problematic"] = 1

In [5]:
# Drop rows with the 'Other' gender category (too few samples, may cause noise)
# and renumber the index so the remaining rows are labelled 0..n-1.
df = df[df['gender'] != 'Other'].reset_index(drop=True)

# Start from the columns that are already 0/1 flags; 'gender' is copied along
# as a placeholder and overwritten with its numeric encoding below.
df_encoded = df[['hypertension', 'heart_disease', 'diabetes', 'problematic', 'gender']].copy()

# Encode gender as binary: Female=0, Male=1.
gender_label_encoding = df['gender'].map({'Female': 0, 'Male': 1})
# Series.map turns every value missing from the dict into NaN; fail loudly
# instead of silently exporting NaN codes.
if gender_label_encoding.isna().any():
    unmapped_gender = sorted(
        df.loc[gender_label_encoding.isna(), 'gender'].astype(str).unique()
    )
    raise ValueError(f"Unmapped gender values: {unmapped_gender}")
df_encoded['gender'] = gender_label_encoding

# Simplify smoking history categories:
# 'former', 'ever', 'never' -> all merged into 'not current'.
df['smoking_history'] = df['smoking_history'].replace({
    'former': 'not current',
    'ever': 'not current',
    'never': 'not current'
})

# Encode the simplified categories as integer codes.
# The codes are arbitrary labels: 'No Info' (1) is not "between" the other two.
smoking_label_encoding = df['smoking_history'].map({'not current': 0, 'No Info': 1, 'current': 2})
# Same guard as for gender: an unexpected category would otherwise become NaN.
if smoking_label_encoding.isna().any():
    unmapped_smoking = sorted(
        df.loc[smoking_label_encoding.isna(), 'smoking_history'].astype(str).unique()
    )
    raise ValueError(f"Unmapped smoking_history values: {unmapped_smoking}")
df_encoded['smoking_history'] = smoking_label_encoding

In [6]:
# Independent copy of the continuous measurement columns, kept unscaled.
numeric_columns = ["age", "bmi", "HbA1c_level", "blood_glucose_level"]
df_numerics = df.loc[:, numeric_columns].copy()

In [7]:
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows by position.
df_encoded, df_numerics = (
    frame.reset_index(drop=True) for frame in (df_encoded, df_numerics)
)

# Place the encoded columns first, followed by the numeric columns.
model = pd.concat(objs=[df_encoded, df_numerics], axis="columns")
model

Unnamed: 0,hypertension,heart_disease,diabetes,problematic,gender,smoking_history,age,bmi,HbA1c_level,blood_glucose_level
0,0,1,0,0,0,0,80.0,25.19,6.6,140
1,0,0,0,0,0,1,54.0,27.32,6.6,80
2,0,0,0,0,1,0,28.0,27.32,5.7,158
3,0,0,0,0,0,2,36.0,23.45,5.0,155
4,1,1,0,0,1,2,76.0,20.14,4.8,155
...,...,...,...,...,...,...,...,...,...,...
99977,0,0,0,0,0,1,80.0,27.32,6.2,90
99978,0,0,0,0,0,1,2.0,17.37,6.5,100
99979,0,0,0,0,1,0,66.0,27.83,5.7,155
99980,0,0,0,0,0,0,24.0,35.42,4.0,100


In [8]:
# Write the cleaned, encoded (but unscaled) dataset to disk for model training.
# The row index is omitted so the file holds only the feature and label columns.
model.to_csv(path_or_buf="not_scaling.csv", index=False)