# In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler
import os

# --- 1. Load the dataset ---
# Path of the input CSV that the rest of the script preprocesses.
file_name = '/content/kiwni1.csv'

# Check up front so a missing file produces a readable message rather than a
# pandas traceback.
if not os.path.exists(file_name):
    print(f"Error: The file '{file_name}' was not found in the current directory.")
    print("Please make sure the CSV file is in the same folder as the script.")
    # The bare `exit()` helper is injected by the `site` module for interactive
    # sessions and, called without an argument, terminates with status 0
    # (success).  Raising SystemExit with a non-zero code needs no import and
    # lets callers (shell, CI, notebook runner) see that the run failed.
    raise SystemExit(1)

# Read the CSV into a pandas DataFrame
df = pd.read_csv(file_name)
print(f"Successfully loaded '{file_name}'")


# --- 2. Clean and Rename Columns ---
# Trim stray whitespace around the header names so the lookups below match.
stripped_headers = df.columns.str.strip()
df.columns = stripped_headers

# Original header -> descriptive name used by the remaining steps.
column_mapping = {
    'Genetics': 'Family_History_Diabetes',
    'Weight': 'Weight_kg',
    'Height': 'Height_cm',
    'Hypertension': 'High_Blood_Pressure',
    'Exercise': 'Exercise_Frequency',
    'Diabetes': 'Diabetes_Diagnosis',
}
# Headers that are not keys of the mapping keep their current name.
df = df.rename(columns=column_mapping)
print("\nCleaned and renamed columns.")


# --- 3. Preprocess the data ---

# Function to extract numerical values from a cell (used for Age and Weight)
def clean_numeric(value):
    """Return the first number found in *value*, or NaN when there is none.

    Args:
        value: A raw cell value.  Strings are searched for the first decimal
            number (e.g. ``"72.5kg"`` -> ``72.5``); Python and NumPy numeric
            scalars are returned unchanged.

    Returns:
        A float parsed from the string, the original number for numeric
        input, or ``np.nan`` for anything else (``None``, strings without
        digits, other types).
    """
    if isinstance(value, str):
        # Require at least one digit.  A looser character class such as
        # `[\d.]+` also matches a lone "." (as in "approx. 60 kg") or
        # "1.2.3", both of which make float() raise ValueError.
        number = re.search(r'\d+(?:\.\d+)?|\.\d+', value)
        return float(number.group(0)) if number else np.nan
    # Already numeric (including NaN): pass through untouched.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return value
    return np.nan

# Constants and helpers used by convert_height_to_cm.
_CM_PER_INCH = 2.54
_INCHES_PER_FOOT = 12
# Bare numbers above this are taken to be centimetres, the rest to be feet.
_MAX_FEET_VALUE = 30
# A decimal number containing at least one digit (never a lone ".").
_HEIGHT_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?|\.\d+')
# Substrings that mark a value as "feet [inches]" ("," covers the "4,9" form).
_FEET_MARKERS = ("'", "feet", "foot", "ft", ",")


def _feet_inches_to_cm(feet, inches):
    """Return the length of *feet* plus *inches* in centimetres."""
    return (feet * _INCHES_PER_FOOT + inches) * _CM_PER_INCH


def _bare_number_to_cm(number, token=None):
    """Convert a unit-less number: > 30 is cm, otherwise feet.inches shorthand.

    *token* is the text the number was parsed from, when available.  It lets
    "5.10"/"5.11" be read as 10/11 inches; a float alone cannot distinguish
    5.1 from 5.10, so numeric input keeps the "first decimal digit is the
    inches" rule.
    """
    if number > _MAX_FEET_VALUE:
        return number
    feet = int(number)
    inches = (number - feet) * 10
    if token is not None:
        _, dot, decimals = token.partition('.')
        # Only accept the digits as whole inches when they form a valid
        # inch count; otherwise fall back to the rule above.
        if dot and int(decimals) < _INCHES_PER_FOOT:
            inches = float(int(decimals))
    return _feet_inches_to_cm(feet, inches)


# Function to convert various height formats to centimeters
def convert_height_to_cm(height_val):
    """Convert a free-form height value to centimetres.

    Recognised string forms (case-insensitive, double quotes ignored):
      * feet/inches: ``5'5``, ``5' 2``, ``5 feet 3.5 inches``, ``5 ft 3``
      * comma form: ``4,9`` (read as 4 feet 9 inches)
      * bare number or number with ``cm``: values above 30 are centimetres,
        smaller values are feet.inches shorthand (``5.3`` -> 5'3",
        ``5.11`` -> 5'11").

    Numeric input follows the same bare-number rule.

    Args:
        height_val: Raw cell value (str, number, or anything else).

    Returns:
        The height in centimetres, or ``np.nan`` when the value is missing,
        contains no number, or is of an unsupported type.
    """
    if isinstance(height_val, str):
        text = height_val.lower().strip().replace('"', '')
        numbers = _HEIGHT_NUMBER_RE.findall(text)
        if not numbers:
            # Nothing to parse (e.g. "", "feet", "n/a"): report as missing
            # rather than as a height of 0 cm.
            return np.nan
        if any(marker in text for marker in _FEET_MARKERS):
            # Numbers are pulled out with the regex (not by splitting), so
            # trailing units such as "4,9 ft" do not break float().
            feet = float(numbers[0])
            inches = float(numbers[1]) if len(numbers) > 1 else 0.0
            return _feet_inches_to_cm(feet, inches)
        return _bare_number_to_cm(float(numbers[0]), numbers[0])

    if isinstance(height_val, (int, float, np.integer, np.floating)):
        # Missing cells arrive as float NaN; int(nan) would raise ValueError.
        if np.isnan(height_val):
            return np.nan
        return _bare_number_to_cm(height_val)

    return np.nan  # Unsupported type (None, list, ...)

# Run the matching per-cell cleaner over each raw measurement column.
print("Cleaning Age, Weight, and Height columns...")
column_cleaners = {
    'Age': clean_numeric,
    'Weight_kg': clean_numeric,
    'Height_cm': convert_height_to_cm,
}
for measurement_col, cleaner in column_cleaners.items():
    df[measurement_col] = df[measurement_col].apply(cleaner)


# --- 4. Encode categorical variables ---
print("Encoding categorical columns...")


def _encode_labels(frame, column, codes):
    """Strip whitespace from a text column and swap its labels for codes."""
    frame[column] = frame[column].str.strip().replace(codes)


# Gender: Female -> 0, Male -> 1
_encode_labels(df, 'Gender', {'Female': 0, 'Male': 1})

# Yes/No columns: No -> 0, Yes -> 1
binary_cols = ['Family_History_Diabetes', 'High_Blood_Pressure', 'Diabetes_Diagnosis']
for col in binary_cols:
    _encode_labels(df, col, {'No': 0, 'Yes': 1})

# Exercise frequency is ordinal: a larger code means more frequent exercise.
exercise_map = {
    'Never': 0,
    '1-2 times': 1,
    '3-4 times': 2,
    '5 or more times': 3,
    '5 or more': 3,  # alternate spelling of the top category
}
_encode_labels(df, 'Exercise_Frequency', exercise_map)

# The eating-habits column is not part of the requested output.
df = df.drop('EatingHabits', axis=1)

# Force every column to a numeric dtype; any value that is still not a number
# (for example a label missing from the tables above) turns into NaN.
df = df.apply(pd.to_numeric, errors='coerce')

# Impute remaining gaps with the per-column median.
# NOTE(review): this is applied to every column, Diabetes_Diagnosis included —
# confirm that imputing that column is intended.
missing_per_column = df.isnull().sum()
if missing_per_column.any():
    print("\nFound missing values after cleaning. Filling with column median.")
    df = df.fillna(df.median())


# --- 5. Feature Engineering: Calculate BMI ---
print("Calculating BMI...")
# BMI = weight (kg) / (height (m))^2
height_m = df['Height_cm'] / 100
# A zero height is replaced by NaN first, so that row gets a NaN BMI (filled
# below) instead of an infinite one.
df['BMI'] = df['Weight_kg'] / (height_m.replace(0, np.nan) ** 2)
# Fill any NaN BMI values (from zero height) with the median BMI.
# Assign the result back to the column: `df['BMI'].fillna(..., inplace=True)`
# is a chained in-place call on a column selection, which does not update `df`
# under pandas Copy-on-Write.
if df['BMI'].isnull().any():
    df['BMI'] = df['BMI'].fillna(df['BMI'].median())


# --- 6. Normalize Age and BMI ---
print("Normalizing Age and BMI columns...")
# Fit the min-max scaler on the two continuous features and write the scaled
# values back over the originals.
scaler = MinMaxScaler()
scaled_features = ['Age', 'BMI']
scaled_values = scaler.fit_transform(df[scaled_features])
df[scaled_features] = scaled_values


# --- 7. Remove original Height and Weight columns ---
print("Removing original Height and Weight columns...")
# The raw measurements are dropped now that BMI has been derived from them.
df = df.drop(['Weight_kg', 'Height_cm'], axis=1)


# --- 8. Display and Save the Preprocessed Data ---
print("\nPreprocessing complete. Displaying the first 20 rows of the final data:")
# Remove pandas' limit on the number of displayed columns.
pd.set_option('display.max_columns', None)
preview_rows = df.head(20)
print(preview_rows.to_string())

# Write the result next to the script, without the index column.
output_filename = 'preprocessed_diabetes_data.csv'
df.to_csv(output_filename, index=False)
print(f"\nYour preprocessed file has been saved as '{output_filename}' in the current directory.")