In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from imblearn.over_sampling import SMOTE

In [55]:
data = pd.read_csv('ressources/kaggle/diabetes.csv')
data.count()

Pregnancies                 768
Glucose                     768
BloodPressure               768
SkinThickness               768
Insulin                     768
BMI                         768
DiabetesPedigreeFunction    768
Age                         768
Outcome                     768
dtype: int64

In [None]:
data = pd.read_csv('ressources/kaggle/diabetes.csv')

def find_dynamic_inconsistencies(data):
    inconsistent_conditions = []

    # IQR for outlier detection in each column
    for column in ['Age', 'Pregnancies', 'BMI', 'Glucose', 'BloodPressure', 'SkinThickness	', 'Insulin']:
        if column in data.columns:
            Q1 = data[column].quantile(0.25)
            Q3 = data[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            condition = (data[column] < lower_bound) | (data[column] > upper_bound)
            inconsistent_conditions.append(condition)
            print(f"{column}: Outliers identified as below {lower_bound:.2f} or above {upper_bound:.2f}")

# Having 14 pregnancies is rare, especially for younger individuals. For a 20-year- woman, 
# this is biologically improbable due to the time constraints (each pregnancy lasting about 9 months) 
# and the physical toll of multiple pregnancies.
    # if 'Age' in data.columns and 'Pregnancies' in data.columns:
    #     condition = (data['Age'] < 15) & (data['Pregnancies'] > data['Pregnancies'].quantile(0.90))
    #     inconsistent_conditions.append(condition)
    #     print("Rule 1: Unusually high pregnancies detected for Age < 15.")

# LOGICAL ERROR : under 20?
    # if 'Age' in data.columns and 'Pregnancies' in data.columns:
    #     condition = (data['Age'] > 20) & (data['Pregnancies'] > 4)
    #     inconsistent_conditions.append(condition)
    #     print("Rule 2: Unusually high pregnancies detected for Age under 20.")

    if 'Age' in data.columns and 'Pregnancies' in data.columns:
        condition = (data['Age'] <= 25) & (data['Pregnancies'] > 5)
        inconsistent_conditions.append(condition)
        print("Rule: Unusually high pregnancies (>5) detected for Age ≤ 25.")

# BMI values between 18.5–24.9 are considered  normal, 
# while values above 30 indicate obesity. 
# Values >50 represent severe or morbid obesity but possible 
# Dzhambulat Khatokhov (Russia): At 4 years old, he weighed 48 kg, 
# and by 9 years old, he was around 146 kg, with a BMI likely exceeding 60. source : Guinness World Records in 2023
    if 'Age' in data.columns and 'BMI' in data.columns:
        condition = (data['Age'] < 10) & (data['BMI'] > 60)
        inconsistent_conditions.append(condition)
        print("Rule 3: Unusually high BMI detected for Age < 60 or this children reach a new Guinness World Record.")

# Plasma glucose levels are measured after 2 hours of an oral glucose tolerance test (OGTT). 
# normal levels are <140 mg/dL. Levels above 200 mg/dL typically indicate diabetes.
# Why It's Unusual:
# - While a value of 192 mg/dL is high, it falls within plausible ranges for individuals 
#   with diabetes or impaired glucose tolerance.
# - This is not necessarily an error, but high values (close to 200) 
#   should be verified to ensure correct measurement.
    if 'Glucose' in data.columns and 'Insulin' in data.columns:
        condition = (data['Glucose'] > 200) & (data['Insulin'] < 10)
        inconsistent_conditions.append(condition)
        print("Rule 4: Glucose > 200 with low Insulin detected.")

# Normal diastolic blood pressure ranges between 60–80 mmHg. Values below 40 mmHg are physiologically implausible in healthy individuals.
# Why It's Unusual:
# A diastolic blood pressure of 24 mmHg indicates extreme hypotension, often associated with life-threatening conditions like:
# - Cardiovascular shock.
# - Severe blood loss or dehydration.
# In a dataset focused on diabetes prediction, this value is most likely an error rather than a valid measurement.
# not sure about the 120 value
    if 'BloodPressure' in data.columns:
        condition = (data['BloodPressure'] < 40) | (data['BloodPressure'] > 120) 
        inconsistent_conditions.append(condition)
        print("Rule 5: BloodPressure detected outside normal range (40-120).")


# Normal fasting serum insulin levels are typically between 16–166 μU/mL. Even after glucose intake, levels rarely exceed 300–400 μU/mL in most populations.
# Why It's Unusual:
# - 799 μU/mL is extremely high and suggests:
# - Severe insulin resistance or a rare metabolic condition like insulinoma (insulin-producing tumor).
# Likely causes:
# - A true outlier reflecting a unique physiological state.
# - Data-entry or measurement error.
    if 'Insulin' in data.columns:
        condition = (data['Insulin'] < 1) | (data['Insulin'] > 500)
        inconsistent_conditions.append(condition)
        print("Rule 6: Insulin value detected outside realistic bounds (1-500).")

    # Combine all conditions
    combined_condition = inconsistent_conditions[0]
    for condition in inconsistent_conditions[1:]:
        combined_condition |= condition

    # Identify inconsistent rows
    inconsistent_rows = data[combined_condition]
    return inconsistent_rows

# Apply the dynamic detection
dynamic_inconsistent_rows = find_dynamic_inconsistencies(data)

print("\nDynamic inconsistencies identified:")
print(dynamic_inconsistent_rows)

# Remove inconsistent rows
data = data.drop(dynamic_inconsistent_rows.index)

print(f"\nNumber of inconsistent rows removed: {len(dynamic_inconsistent_rows)}")
print("\nCleaned dataset:")
print(data)

Age: Outliers identified as below -1.50 or above 66.50
Pregnancies: Outliers identified as below -6.50 or above 13.50
BMI: Outliers identified as below 13.35 or above 50.55
Glucose: Outliers identified as below 37.12 or above 202.12
BloodPressure: Outliers identified as below 35.00 or above 107.00
Insulin: Outliers identified as below -190.88 or above 318.12
Rule: Unusually high pregnancies (>5) detected for Age ≤ 25.
Rule 3: Unusually high BMI detected for Age < 60 or this children reach a new Guinness World Record.
Rule 4: Glucose > 200 with low Insulin detected.
Rule 5: BloodPressure detected outside normal range (40-120).

Dynamic inconsistencies identified:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
7             10      115              0              0        0  35.3   
8              2      197             70             45      543  30.5   
9              8      125             96              0        0   0.0   
13             1      189        

In [39]:
data = pd.read_csv('ressources/kaggle/diabetes.csv')
data[(data['Age'] < 10) & (data['BMI'] > 60)]


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [None]:
data = pd.read_csv('ressources/kaggle/diabetes.csv')
data[(data['Age'] <= 25) & (data['Pregnancies'] > 4)]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
49,7,105,0,0,0,0.0,0.305,24,0
98,6,93,50,30,64,28.7,0.356,23,0
117,5,78,48,0,0,33.7,0.654,25,0
121,6,111,64,39,0,34.2,0.26,24,0
189,5,139,80,35,160,31.6,0.361,25,1
216,5,109,62,41,129,35.8,0.514,25,1
457,5,86,68,28,71,30.2,0.364,24,0
731,8,120,86,0,0,28.4,0.259,22,1


In [71]:
data_raw = pd.read_csv('ressources/kaggle/diabetes.csv')
data_raw[ (data_raw['Insulin'] > 500)]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
8,2,197,70,45,543,30.5,0.158,53,1
13,1,189,60,23,846,30.1,0.398,59,1
228,4,197,70,39,744,36.7,2.329,31,0
247,0,165,90,33,680,52.3,0.427,23,0
286,5,155,84,44,545,38.7,0.619,34,0
409,1,172,68,49,579,42.4,0.702,28,1
584,8,124,76,24,600,28.7,0.687,52,1
655,2,155,52,27,540,38.7,0.24,25,1
753,0,181,88,44,510,43.3,0.222,26,1
