# Handling of Atypical Values Approach 2:	

Adding a "problematic" feature  
Instead of deleting outliers, create a binary feature:  
0 = normal data, 1 = extreme/unusual case (an extreme measurement, or a diabetes label that contradicts the lab values)  
This allows the model to learn about unusual medical conditions without removing them.  

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")
df.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
# Flag rows with BMI above 60 (extremely high BMI, rare medical condition)
# instead of dropping them. Every row starts as 0 (= normal); the rows picked
# out by the mask are then switched to 1.
df["problematic"] = 0
extreme_bmi_mask = df["bmi"].gt(60)
sample = df.loc[extreme_bmi_mask]
df.loc[sample.index, "problematic"] = 1

In [4]:
# Flag rows whose diabetes label disagrees with the lab measurements
# (label inconsistency / suspicious case).
# NOTE(review): the flag is computed from the `diabetes` column, and both
# columns are kept in the exported table. If `diabetes` is the prediction
# target, this feature leaks label information into the inputs -- confirm.

# Case 1: HbA1c < 5.7 and glucose < 200, yet diabetes == 1.
low_hba1c = df["HbA1c_level"].lt(5.7)
low_glucose = df["blood_glucose_level"].lt(200)
labelled_diabetic = df["diabetes"].eq(1)
sample2 = df.loc[low_hba1c & low_glucose & labelled_diabetic]
df.loc[sample2.index, "problematic"] = 1

# Case 2: HbA1c >= 6.4 and glucose >= 200, yet diabetes == 0.
high_hba1c = df["HbA1c_level"].ge(6.4)
high_glucose = df["blood_glucose_level"].ge(200)
labelled_healthy = df["diabetes"].eq(0)
sample3 = df.loc[high_hba1c & high_glucose & labelled_healthy]
df.loc[sample3.index, "problematic"] = 1

In [5]:
# Remove the 'Other' gender rows (too few samples, may cause noise) and
# renumber the index from 0 so the remaining rows are labelled contiguously.
df = df.loc[df["gender"].ne("Other")].reset_index(drop=True)

# Seed the encoded frame with these five columns, then overwrite 'gender'
# with its binary encoding: Female=0, Male=1.
encoded_source_columns = ["hypertension", "heart_disease", "diabetes", "problematic", "gender"]
df_encoded = df.loc[:, encoded_source_columns].copy()
gender_label_encoding = df["gender"].map({"Female": 0, "Male": 1})
df_encoded["gender"] = gender_label_encoding

# Simplify smoking history: 'former', 'ever' and 'never' are all merged into
# 'not current'; every other category is left as it is.
df["smoking_history"] = df["smoking_history"].replace(
    to_replace=["former", "ever", "never"],
    value="not current",
)

In [6]:
# One-hot encode 'smoking_history': each category becomes its own indicator
# column, which avoids the misleading correlations a single numeric code for
# the categories could introduce.
ohe = OneHotEncoder()
smoking_matrix = ohe.fit_transform(df[["smoking_history"]])
xd = pd.DataFrame(smoking_matrix.toarray(), columns=ohe.get_feature_names_out())

# Attach the indicator columns to the encoded frame. The assignment aligns on
# the row index, so it relies on df_encoded and xd both being indexed 0..n-1.
df_encoded.loc[:, xd.columns] = xd
df_encoded.head(3)

Unnamed: 0,hypertension,heart_disease,diabetes,problematic,gender,smoking_history_No Info,smoking_history_current,smoking_history_not current
0,0,1,0,0,0,0.0,0.0,1.0
1,0,0,0,0,0,1.0,0.0,0.0
2,0,0,0,0,1,0.0,0.0,1.0


In [7]:
cols_to_scale = ["age", "bmi", "HbA1c_level", "blood_glucose_level"]

# Min-max scale the selected columns onto the range [-1, 1].
# NOTE(review): the scaler is fitted on every row of df. If the data is later
# split into train/test sets, consider fitting on the training part only.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_array = scaler.fit_transform(df.loc[:, cols_to_scale])
df_scaled = pd.DataFrame(data=scaled_array, columns=cols_to_scale)

# Sanity check: shape plus per-column summary statistics (min/max should be -1/1).
print(df_scaled.shape)
print(df_scaled.describe().round(3).T)

(99982, 4)
                       count   mean    std  min    25%    50%    75%  max
age                  99982.0  0.046  0.563 -1.0 -0.401  0.074  0.499  1.0
bmi                  99982.0 -0.596  0.155 -1.0 -0.682 -0.596 -0.543  1.0
HbA1c_level          99982.0 -0.263  0.389 -1.0 -0.527 -0.164 -0.018  1.0
blood_glucose_level  99982.0 -0.472  0.370 -1.0 -0.818 -0.455 -0.282  1.0


In [8]:
# Give both frames a fresh 0..n-1 index so the column-wise concat below pairs
# rows by position instead of producing NaN rows for mismatched labels.
df_encoded, df_scaled = (
    frame.reset_index(drop=True) for frame in (df_encoded, df_scaled)
)

# Final table: encoded columns first, scaled columns after.
model = pd.concat([df_encoded, df_scaled], axis="columns")
model

Unnamed: 0,hypertension,heart_disease,diabetes,problematic,gender,smoking_history_No Info,smoking_history_current,smoking_history_not current,age,bmi,HbA1c_level,blood_glucose_level
0,0,1,0,0,0,0.0,0.0,1.0,1.000000,-0.645658,0.127273,-0.454545
1,0,0,0,0,0,1.0,0.0,0.0,0.349349,-0.595938,0.127273,-1.000000
2,0,0,0,0,1,0.0,0.0,1.0,-0.301301,-0.595938,-0.200000,-0.290909
3,0,0,0,0,0,0.0,1.0,0.0,-0.101101,-0.686275,-0.454545,-0.318182
4,1,1,0,0,1,0.0,1.0,0.0,0.899900,-0.763539,-0.527273,-0.318182
...,...,...,...,...,...,...,...,...,...,...,...,...
99977,0,0,0,0,0,1.0,0.0,0.0,1.000000,-0.595938,-0.018182,-0.909091
99978,0,0,0,0,0,1.0,0.0,0.0,-0.951952,-0.828198,0.090909,-0.818182
99979,0,0,0,0,1,0.0,0.0,1.0,0.649650,-0.584034,-0.200000,-0.318182
99980,0,0,0,0,0,0.0,0.0,1.0,-0.401401,-0.406863,-0.818182,-0.818182


In [9]:
# Save the cleaned and encoded dataset for model training; the row index is
# not written to the file.
model.to_csv(path_or_buf="status.csv", index=False)