# 🧼 Data Preprocessing and EDA – UCI Diabetes Dataset
This notebook is part of the *Smart Health: Summer Research in Diabetes Prediction Using ML* project. It includes data cleaning, encoding, and visualization before modeling.

In [None]:
# 📦 Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style: apply seaborn's theme with the "whitegrid" axes style
# once here so the figures drawn in the cells below share the same look.
sns.set(style="whitegrid")


In [None]:
# 📂 Read the diabetes CSV from Google Drive into `df`
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount step is visible here.
DATA_PATH = "/content/drive/MyDrive/ucidata.csv"  # Change if the file lives elsewhere
df = pd.read_csv(DATA_PATH)

# Preview the first rows as a quick check that the file parsed
df.head()


In [None]:
# ℹ️ Data overview
# Structural summary of the frame (printed by info() itself)
df.info()

# Descriptive statistics for every column, not only the numeric ones
summary_stats = df.describe(include='all')
summary_stats


In [None]:
# 🔍 Count the missing entries in each column
missing_per_column = df.isna().sum()
missing_per_column


In [None]:
# 🔄 Encode Yes/No, Male/Female and Positive/Negative as 1/0
BINARY_LABEL_CODES = {
    'Yes': 1, 'No': 0,
    'Male': 1, 'Female': 0,
    'Positive': 1, 'Negative': 0,
}

# replace() returns a new frame, so `df` stays untouched without an explicit copy().
# infer_objects() makes the object -> integer conversion of the replaced columns
# explicit instead of relying on replace()'s implicit downcasting, which pandas 2.2
# deprecates; without it the encoded columns can be left with object dtype.
# Values that are not keys of the mapping are left unchanged.
df_encoded = df.replace(BINARY_LABEL_CODES).infer_objects()
df_encoded.head()


In [None]:
# 📊 Class distribution: one bar per value of the `class` column
fig, ax = plt.subplots()
sns.countplot(data=df_encoded, x='class', ax=ax)
ax.set_title("Distribution of Diabetes Outcome")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()


In [None]:
# 📊 Countplots for categorical features
# Every two-valued column except the target counts as a binary categorical feature.
# Filtering (rather than list.remove) avoids a ValueError when "class" is not in the list.
categorical_cols = [
    col for col in df_encoded.columns[df_encoded.nunique() == 2]
    if col != "class"
]

if categorical_cols:
    # Derive the grid from the number of features so that more than 16 columns
    # cannot overflow a fixed 4x4 layout.
    n_grid_cols = 4
    n_grid_rows = -(-len(categorical_cols) // n_grid_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_grid_rows, n_grid_cols,
        figsize=(15, 3 * n_grid_rows),  # 3 in. per row -> (15, 12) for a 4-row grid
        squeeze=False,                  # always a 2-D array, even for a single row
    )
    panels = axes.ravel()

    for ax, col in zip(panels, categorical_cols):
        # Recent seaborn deprecates `palette` without `hue`; assigning the same column
        # to `hue` keeps the per-category Set2 colours. dodge=False keeps full-width
        # bars, and the redundant legend (if one was drawn) is removed.
        sns.countplot(x=col, hue=col, data=df_encoded, palette='Set2', dodge=False, ax=ax)
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_title(col)

    # Hide the unused trailing panels of the grid
    for ax in panels[len(categorical_cols):]:
        ax.set_visible(False)

    # Lay out once, after all panels are drawn, instead of on every iteration
    fig.tight_layout()
    plt.show()
else:
    print("No two-valued feature columns found; nothing to plot.")


In [None]:
# 🔗 Correlation matrix: pairwise correlations between the columns of df_encoded
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = df_encoded.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
