* NOTEBOOK: 03_data_preprocessing.ipynb
* DESCRIPTION: Data Preprocessing Pipeline (Cleaning & Encoding)


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. LOAD THE DATASET

In [11]:
try:
    df = pd.read_csv('../data/medical_insurance_data.csv')
    print("Data Loaded Successfully!")
except FileNotFoundError:
    print("Error: File not found! Please check the path.")

Data Loaded Successfully!


# 2. DATA CLEANING (Handling Missing Values)

In [12]:
# Removing rows with missing data to ensure high model accuracy.
print(f"Original Dataset Size: {df.shape[0]} rows")
df_clean = df.dropna().reset_index(drop=True)
print(f"Cleaned Dataset Size:  {df_clean.shape[0]} rows (Missing values removed)")

Original Dataset Size: 2772 rows
Cleaned Dataset Size:  2736 rows (Missing values removed)


# 3. FEATURE ENCODING (Text to Numbers)

In [13]:
# We use 'Manual Mapping' here to have full control over the values.
# This ensures that specific categories are assigned specific numbers consistently.

# Encoding 'sex' column
# Mapping: male = 0, female = 1
df_clean.replace({'sex': {'male': 0, 'female': 1}}, inplace=True)

# Encoding 'smoker' column
# Mapping: yes = 0, no = 1
# (Note: We are assigning 0 to smokers and 1 to non-smokers)
df_clean.replace({'smoker': {'yes': 0, 'no': 1}}, inplace=True)

# Encoding 'region' column
# Mapping regions to specific integer codes (0, 1, 2, 3)
df_clean.replace({'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3}}, inplace=True)

print("\nData after Encoding (Ready for AI):")
print(df_clean.head())


Data after Encoding (Ready for AI):
   age  sex     bmi  children  smoker  region      charges
0   19    1  27.900       0.0       0       1  16884.92400
1   18    0  33.770       1.0       1       0   1725.55230
2   28    0  33.000       3.0       1       0   4449.46200
3   33    0  22.705       0.0       1       3  21984.47061
4   32    0  28.880       0.0       1       3   3866.85520


  df_clean.replace({'sex': {'male': 0, 'female': 1}}, inplace=True)
  df_clean.replace({'smoker': {'yes': 0, 'no': 1}}, inplace=True)
  df_clean.replace({'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3}}, inplace=True)


# 4. DATA SPLITTING (Train vs Test)

In [14]:
# X contains the Input Features (Age, BMI, Smoker, etc.)
# y contains the Target Variable (Insurance Charges)
X = df_clean.drop('charges', axis=1)
y = df_clean['charges']

# We split the data: 80% for Training the model, 20% for Testing performance.
# random_state is fixed to ensure consistent results across the team.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# 5. SAVE PROCESSED DATA (Golden Data)

In [15]:
# Saving the processed files so all team members use the same data for modeling.
X_train.to_csv('../data/X_train.csv', index=False)
X_test.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

print("\n PHASE 3 COMPLETE: Processed Data Saved Successfully.")
print("The 'X_train.csv' and 'X_test.csv' files are ready for model building.")


 PHASE 3 COMPLETE: Processed Data Saved Successfully.
The 'X_train.csv' and 'X_test.csv' files are ready for model building.
