In [2]:
# Data Cleaning and Preparation for Disease Prediction

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset (replace with actual file path or source)
df = pd.read_csv("healthcare_dataset.csv")

# ------------------------
# 1. Handling Missing Values
# ------------------------
# Summary of missing values
missing_summary = df.isnull().sum()
print("Missing Values per Column:\n", missing_summary)

# Impute numerical features with median
num_cols = df.select_dtypes(include=[np.number]).columns
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical features with mode
cat_cols = df.select_dtypes(include=["object"]).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# ------------------------
# 2. Removing Duplicate Records
# ------------------------
initial_shape = df.shape
df = df.drop_duplicates()
final_shape = df.shape
print(f"Removed {initial_shape[0] - final_shape[0]} duplicate records")

# ------------------------
# 3. Outlier Detection and Treatment
# ------------------------
# Using IQR method for numeric features
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Cap outliers
    df[col] = np.where(df[col] < lower_bound, lower_bound,
                       np.where(df[col] > upper_bound, upper_bound, df[col]))

# ------------------------
# 4. Convert Data Types and Ensure Consistency
# ------------------------
# Convert categorical-like columns (e.g., 'Gender') to category dtype
df['Gender'] = df['Gender'].astype('category')

# ------------------------
# 5. Encode Categorical Variables
# ------------------------
# Label Encoding for binary categorical columns
label_enc_cols = ['Gender']  # Example binary category
le = LabelEncoder()
for col in label_enc_cols:
    df[col] = le.fit_transform(df[col])

# One-hot Encoding for multiclass columns
df = pd.get_dummies(df, columns=[col for col in cat_cols if col not in label_enc_cols], drop_first=True)

# ------------------------
# 6. Normalize/Standardize Features
# ------------------------
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Final cleaned dataset
df.to_csv("cleaned_healthcare_data.csv", index=False)
print("Data cleaning and preprocessing complete. Saved to 'cleaned_healthcare_data.csv'")


Missing Values per Column:
 Name                  50384
Age                   50384
Gender                50384
Blood Type            50384
Medical Condition     50384
Date of Admission     50384
Doctor                50384
Hospital              50384
Insurance Provider    50384
Billing Amount        50384
Room Number           50384
Admission Type        50384
Discharge Date        50384
Medication            50384
Test Results          50384
dtype: int64
Removed 50383 duplicate records


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.where(df[col] < lower_bound, lower_bound,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.where(df[col] < lower_bound, lower_bound,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.where(df[col] < lower_bound, lower_bound,
A value is trying to be set on a copy of

Data cleaning and preprocessing complete. Saved to 'cleaned_healthcare_data.csv'
