In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer

# Load the raw survey export.
file_path = "/Users/91life/Desktop/diabetes-project/data/Responses.csv"
df = pd.read_csv(file_path)

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# Survey question text (Albanian) -> short snake_case feature name.
column_names = {
    'Gjinia': 'gender',
    'Numri i shtatzanive': 'pregnancies',
    'Mosha': 'age',
    'Niveli i glukozës në gjak (plotësojeni vetëm nëse e dini)': 'glucose',
    'Tensioni mesatar i gjakut  (plotësojeni vetëm nëse e dini)': 'blood_pressure',
    'Pesha (kg)': 'weight',
    'Gjatësia (cm)': 'height',
    'Niveli i insulinës  (plotësojeni vetëm nëse e dini)': 'insulin',
    'A jeni fizikisht aktiv?': 'physically_active',
    'A pini duhan?': 'smoking',
    'A konsumoni shpesh ushqime të përpunuara, të ëmbla ose me yndyrë të lartë?': 'junk_food',
    'A keni ndonjë anëtar të familjes me diabet?': 'family_history',
    'A jeni të diagnostifikuar me diabet nga mjeku?': 'diabetes_diagnosed',
}

# Rename first, then discard the submission timestamp column.
df = df.rename(columns=column_names).drop(columns=['Timestamp'])

print("Columns:", df.columns.tolist())


Dataset shape: (314, 14)
Columns: ['Timestamp', 'Gjinia', 'Numri i shtatzanive', 'Mosha', 'Niveli i glukozës në gjak (plotësojeni vetëm nëse e dini)', 'Tensioni mesatar i gjakut  (plotësojeni vetëm nëse e dini)', 'Pesha (kg)', 'Gjatësia (cm)', 'Niveli i insulinës  (plotësojeni vetëm nëse e dini)', 'A jeni fizikisht aktiv?', 'A pini duhan?', 'A konsumoni shpesh ushqime të përpunuara, të ëmbla ose me yndyrë të lartë?', 'A keni ndonjë anëtar të familjes me diabet?', 'A jeni të diagnostifikuar me diabet nga mjeku?']
Columns: ['gender', 'pregnancies', 'age', 'glucose', 'blood_pressure', 'weight', 'height', 'insulin', 'physically_active', 'smoking', 'junk_food', 'family_history', 'diabetes_diagnosed']


In [11]:
# Data cleaning functions
def clean_pregnancies(x):
    """Convert a free-text "number of pregnancies" answer to a count.

    Parameters
    ----------
    x : Any
        Raw cell value (number, string, or missing).

    Returns
    -------
    int or float
        * ``np.nan`` when the value is missing or contains no digits.
        * ``0`` when the answer is empty or one of the "none" words
          ('asnje', 'asnjë', 'none'), compared case-insensitively.
        * Otherwise the first run of digits found in the text, as ``int``
          (so ``"2 shtatzani"`` -> 2 and ``3.0`` -> 3).
    """
    if pd.isna(x):
        return np.nan

    text = str(x).strip()

    # Explicit "none" answers (and blank strings) mean zero pregnancies.
    if text.lower() in {'asnje', 'asnjë', 'none', ''}:
        return 0

    first_number = re.search(r'\d+', text)
    if first_number:
        return int(first_number.group())

    # No digits at all. The previous int(float(...)) fallback under a bare
    # `except:` could never succeed here (a digit-free string is either not
    # a float or is nan/inf, which int() rejects), so return NaN directly.
    return np.nan

def clean_height(x):
    """Normalise a height answer to centimetres.

    Parameters
    ----------
    x : Any
        Raw cell value: a number, a numeric string, a number written with a
        decimal comma ("1,75"), or a single number followed by a "cm"/"m"
        unit ("175 cm", "1.80m"). Missing values are allowed.

    Returns
    -------
    float
        Height in centimetres. Values below 3 are taken to be metres and are
        multiplied by 100; everything else is returned unchanged. ``np.nan``
        is returned for missing or unparseable input.
    """
    if pd.isna(x):
        return np.nan

    try:
        value = float(x)
    except (TypeError, ValueError):
        # Not directly numeric. Accept exactly one number, optionally with a
        # decimal comma and a trailing unit; anything more complex
        # (e.g. "1m 75cm") is left as NaN rather than guessed at.
        match = re.fullmatch(
            r'\s*(\d+(?:[.,]\d+)?)\s*(?:cm|m)?\.?\s*',
            str(x),
            flags=re.IGNORECASE,
        )
        if match is None:
            return np.nan
        value = float(match.group(1).replace(',', '.'))

    # Heuristic: nobody is under 3 cm tall, so small values are metres.
    if value < 3:
        return value * 100
    return value

def clean_glucose(x):
    """Parse a free-text blood glucose answer into a float.

    Parameters
    ----------
    x : Any
        Raw cell value (number, string, or missing).

    Returns
    -------
    float
        The first number found in the text, with a decimal comma accepted
        ("5,6" -> 5.6). ``np.nan`` for missing values, for the answers
        'nuk e di' / 'jo' / '' (case-insensitive), and for text that
        contains no number.

    Notes
    -----
    Only the first numeric token is used. Stripping every non-numeric
    character instead would glue separate numbers together, turning a range
    such as "120-130" into 120130.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).strip()

    # "I don't know" / "no" style answers carry no measurement.
    if text.lower() in {'nuk e di', 'jo', ''}:
        return np.nan

    # First numeric token: digits with an optional decimal part, or a bare
    # fractional part such as ".5".
    token = re.search(r'\d+(?:[.,]\d+)?|[.,]\d+', text)
    if token is None:
        return np.nan
    return float(token.group().replace(',', '.'))

def clean_blood_pressure(x):
    """Parse a free-text blood pressure answer into a single float.

    Parameters
    ----------
    x : Any
        Raw cell value (number, string such as "120/80", or missing).

    Returns
    -------
    float
        * For "a/b" style answers, the value before the first slash.
        * Otherwise the first run of digits in the text.
        * ``np.nan`` for missing values, for the answers 'nuk e di' / 'jo' /
          '' (case-insensitive), and for text with no usable number.

    Notes
    -----
    The slash branch never raises: if the text before the slash is not a
    plain number (e.g. "rreth 120/80"), the first number inside it is used,
    and if there is none (e.g. "/80") the result is NaN.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).strip()

    if text.lower() in {'nuk e di', 'jo', ''}:
        return np.nan

    if "/" in text:
        leading = text.split("/")[0]
        try:
            return float(leading)
        except ValueError:
            # Extra words or units before the slash: salvage the first
            # number instead of letting the exception abort the whole apply.
            number = re.search(r'\d+(?:[.,]\d+)?', leading)
            if number is None:
                return np.nan
            return float(number.group().replace(',', '.'))

    digits = re.search(r'\d+', text)
    if digits:
        return float(digits.group())

    # Digit-free text: only special float spellings can parse here.
    try:
        return float(text)
    except ValueError:
        return np.nan

def clean_insulin(x):
    """Parse a free-text insulin level answer into a float.

    Parameters
    ----------
    x : Any
        Raw cell value (number, string, or missing).

    Returns
    -------
    float
        The first number found in the text, with a decimal comma accepted
        ("12,5" -> 12.5). ``np.nan`` for missing values, for the answers
        'nuk e di' / 'jo' / '' (case-insensitive), and for text that
        contains no number.

    Notes
    -----
    Only the first numeric token is used, so a range like "15-20" yields
    15.0 rather than the concatenation 1520 that removing all non-numeric
    characters would produce.
    """
    if pd.isna(x):
        return np.nan

    answer = str(x).strip()

    # Non-answers ("I don't know", "no", blank) are treated as missing.
    if answer.lower() in {'nuk e di', 'jo', ''}:
        return np.nan

    # Digits with an optional decimal part, or a bare fraction like ",5".
    found = re.search(r'\d+(?:[.,]\d+)?|[.,]\d+', answer)
    if found is None:
        return np.nan
    return float(found.group().replace(',', '.'))

In [12]:
# Apply data cleaning

# Free-text columns, each paired with its dedicated cleaner.
column_cleaners = {
    'pregnancies': clean_pregnancies,
    'height': clean_height,
    'glucose': clean_glucose,
    'blood_pressure': clean_blood_pressure,
    'insulin': clean_insulin,
}
for column, cleaner in column_cleaners.items():
    df[column] = df[column].apply(cleaner)

# Remaining numeric columns: anything unparseable becomes NaN.
for column in ('weight', 'age'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

print("\nData types after cleaning:", df.dtypes, sep="\n")



Data types after cleaning:
gender                 object
pregnancies           float64
age                   float64
glucose               float64
blood_pressure        float64
weight                float64
height                float64
insulin               float64
physically_active      object
smoking                object
junk_food              object
family_history         object
diabetes_diagnosed     object
dtype: object


In [13]:
# Encode categorical variables

# Target: yes/no answer -> 1/0.
yes_no_codes = {'Po': 1, 'Jo': 0}
df['diabetes_diagnosed'] = df['diabetes_diagnosed'].map(yes_no_codes)

# Features: same yes/no codes plus the two gender labels.
map_dict = {'Po': 1, 'Jo': 0, 'Mashkull': 1, 'Femër': 0}
binary_feature_cols = ['gender', 'physically_active', 'smoking', 'junk_food', 'family_history']

for col in binary_feature_cols:
    encoded = df[col].map(map_dict)
    df[col] = encoded

print("\nUnique values in categorical columns:")
for col in binary_feature_cols + ['diabetes_diagnosed']:
    counts = df[col].value_counts().to_dict()
    print(f"{col}: {counts}")



Unique values in categorical columns:
gender: {0: 167, 1: 147}
physically_active: {0: 174, 1: 140}
smoking: {0: 222, 1: 92}
junk_food: {0: 168, 1: 146}
family_history: {0: 226, 1: 88}
diabetes_diagnosed: {0: 211, 1: 103}


In [14]:
# Feature engineering

# Body Mass Index: weight divided by height squared, with height (kept in
# centimetres) converted to metres first.
height_m = df['height'] / 100
df['bmi'] = df['weight'] / (height_m ** 2)

print("New feature created: bmi")


New feature created: bmi


In [15]:
# Data validation and outlier detection

# Plausible bounds per column; values strictly outside are flagged.
valid_ranges = {
    'age': (0, 120),
    'height': (100, 250),          # cm
    'weight': (20, 300),           # kg
    'bmi': (10, 80),
    'glucose': (50, 600),          # mg/dL
    'blood_pressure': (60, 250),   # mmHg
}


def _out_of_range(column):
    """Boolean mask of rows in ``df`` whose ``column`` falls outside its valid range."""
    low, high = valid_ranges[column]
    values = df[column]
    return (values < low) | (values > high)


# Report how many rows violate each range (NaN never counts as a violation).
print("\nData validation results:")

invalid_age = df[_out_of_range('age')]
print(f"Invalid age values: {len(invalid_age)} rows")

invalid_height = df[_out_of_range('height')]
print(f"Invalid height values: {len(invalid_height)} rows")

invalid_weight = df[_out_of_range('weight')]
print(f"Invalid weight values: {len(invalid_weight)} rows")

invalid_bmi = df[_out_of_range('bmi')]
print(f"Invalid BMI values: {len(invalid_bmi)} rows")

invalid_glucose = df[_out_of_range('glucose')]
print(f"Invalid glucose values: {len(invalid_glucose)} rows")

invalid_bp = df[_out_of_range('blood_pressure')]
print(f"Invalid blood pressure values: {len(invalid_bp)} rows")

# Blank out the implausible raw measurements.
for column in ('age', 'height', 'weight', 'glucose', 'blood_pressure'):
    df.loc[_out_of_range(column), column] = np.nan

# BMI is derived, so rebuild it from the cleaned weight/height and only then
# apply its own range check.
df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)
df.loc[_out_of_range('bmi'), 'bmi'] = np.nan




Data validation results:
Invalid age values: 0 rows
Invalid height values: 9 rows
Invalid weight values: 0 rows
Invalid BMI values: 8 rows
Invalid glucose values: 18 rows
Invalid blood pressure values: 1 rows


In [21]:
# Missing value analysis
missing_per_column = df.isnull().sum()
total_missing = missing_per_column.sum()
n_rows, n_cols = df.shape

print("Missing value analysis:")
print(missing_per_column)

print(f"\nTotal missing values: {total_missing}")
print(f"Percentage of missing values: {(total_missing / (n_rows * n_cols)) * 100:.2f}%")

# Column groups and their imputation strategy: median for numeric, mode for
# the binary-encoded categoricals.
numerical_cols = ['age', 'pregnancies', 'glucose', 'blood_pressure', 'weight', 'height', 'insulin', 'bmi']
categorical_cols = ['gender', 'physically_active', 'smoking', 'junk_food', 'family_history']

print("\nPerforming missing value imputation...")

# Numeric columns: fill gaps with the column median.
for col in numerical_cols:
    if col not in df.columns:
        continue
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)
    print(f"Imputed {col} with median: {median_value:.2f}")

# Categorical columns: fill gaps with the most frequent value, falling back
# to 0 when the column has no mode (i.e. no non-missing values).
for col in categorical_cols:
    if col not in df.columns:
        continue
    modes = df[col].mode()
    mode_value = 0 if modes.empty else modes[0]
    df[col] = df[col].fillna(mode_value)
    print(f"Imputed {col} with mode: {mode_value}")

print("\nMissing values after imputation:", df.isnull().sum(), sep="\n")



Missing value analysis:
gender                  0
pregnancies            40
age                     6
glucose               105
blood_pressure         53
weight                  8
height                 17
insulin               145
physically_active       0
smoking                 0
junk_food               0
family_history          0
diabetes_diagnosed      0
bmi                    26
dtype: int64

Total missing values: 400
Percentage of missing values: 9.10%

Performing missing value imputation...
Imputed age with median: 39.00
Imputed pregnancies with median: 0.00
Imputed glucose with median: 178.00
Imputed blood_pressure with median: 130.00
Imputed weight with median: 82.00
Imputed height with median: 176.00
Imputed insulin with median: 32.00
Imputed bmi with median: 26.18
Imputed gender with mode: 0
Imputed physically_active with mode: 0
Imputed smoking with mode: 0
Imputed junk_food with mode: 0
Imputed family_history with mode: 0

Missing values after imputation:
gender          

In [22]:
# Data summary and final checks
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

print("\nData types:", df.dtypes, sep="\n")

print("\nBasic statistics:", df.describe(), sep="\n")

# Class balance of the target column.
target = df['diabetes_diagnosed']
print("\nTarget variable distribution:", target.value_counts(), sep="\n")
prevalence_pct = (target.sum() / len(df)) * 100
print(f"Diabetes prevalence: {prevalence_pct:.2f}%")

# Persist the cleaned frame without the index column.
output_path = "/Users/91life/Desktop/diabetes-project/data/cleaned_responses.csv"
df.to_csv(output_path, index=False)

print(f"\nCleaned dataset saved to: {output_path}")
print("Note: Missing values have been imputed with medians (numerical) and modes (categorical) to maintain sufficient data for model training.")


Shape: (314, 14)
Columns: ['gender', 'pregnancies', 'age', 'glucose', 'blood_pressure', 'weight', 'height', 'insulin', 'physically_active', 'smoking', 'junk_food', 'family_history', 'diabetes_diagnosed', 'bmi']

Data types:
gender                  int64
pregnancies           float64
age                   float64
glucose               float64
blood_pressure        float64
weight                float64
height                float64
insulin               float64
physically_active       int64
smoking                 int64
junk_food               int64
family_history          int64
diabetes_diagnosed      int64
bmi                   float64
dtype: object

Basic statistics:
           gender  pregnancies         age     glucose  blood_pressure  \
count  314.000000   314.000000  314.000000  314.000000      314.000000   
mean     0.468153     0.796178   39.375796  173.031847      134.996815   
std      0.499781     1.244740   14.124238   43.018448       18.647135   
min      0.000000     0.000