# Part 2b: Add Validation

Load and process patient data with BMI calculations.

**Your task:** Add schema and bounds validation to catch data quality issues early.

---

## Load data

In [5]:
import pandas as pd

# TODO: Define a function validate_schema(df, required_columns) that:
#       - Checks if all required columns are present
#       - Return a list of missing columns if any are missing

# TODO: Define a function validate_bounds(df, bounds_dict) that:
#       - For each column in bounds_dict, check if values are within (min, max)
#       - Use df[col].between(min, max) to find out-of-bounds values
#       - Print patient_id and value for any out-of-bounds rows

df = pd.read_csv("patient_intake_bad_values.csv")
df

# TODO: Call validate_schema() to check for required columns:
#       ["patient_id", "weight_kg", "height_cm", "age"]

# TODO: Call validate_bounds() with bounds:
#       weight_kg: (30, 250)
#       height_cm: (120, 230)
#       age: (0, 110)



Unnamed: 0,patient_id,first_name,last_name,weight_kg,height_cm,age,sex
0,P001,Mark,Johnson,500.0,177,46,M
1,P002,Donald,Walker,80.5,50,29,M
2,P003,Nancy,Rhodes,74.3,163,150,F
3,P004,Steven,Miller,0.0,171,71,M
4,P005,Javier,Johnson,72.8,300,18,M
5,P006,Daniel,Wagner,78.5,169,37,M
6,P007,Alexander,Gonzalez,92.5,174,29,M
7,P008,Marie,Gardner,85.9,170,51,F
8,P009,Daniel,Lawrence,80.1,166,66,M
9,P010,Robert,Smith,60.6,171,64,M


In [6]:
def validate_schema(df, required_columns):
  missing_columns = []
  for i in range(len(required_columns)):
    if required_columns[i] in df.columns:
      pass
    else:
      missing_columns.append(required_columns[i])
  if missing_columns == []:
    return "Schema Check Complete!"
  else:
    return "The following columns are missing: " + str(missing_columns)

In [7]:
validate_schema(df, ["patient_id", "weight_kg", "height_cm", "sex"])

'Schema Check Complete!'

In [8]:
def validate_bounds(df, bounds_dict):
  keys = list(bounds_dict.keys())
  values = list(bounds_dict.values())
  result = list(range(len(bounds_dict)))
  outOfBounds = dict.fromkeys(keys)
  for i in range(len(bounds_dict)):
     for j in range(len(values[i])):
      if values[i][j] <= df[keys[i]].max() and values[i][j] >= df[keys[i]].min():
        pass
      else:
        outOfBounds.update({keys[i]:str(values[i][j])})
  oKeys = list(outOfBounds.keys())
  oValues = list(outOfBounds.values())
  for i in range(len(bounds_dict)):
    if oValues[i] is None:
      result[i] = "No out-of-bounds values from category '" + oKeys[i] + "'"
    else:
      result[i] = "Category '" + oKeys[i] + "' has these values out-of-bounds: " + oValues[i]
  return result


In [9]:
bounds_dict = {'weight_kg': (30, 250), 'height_cm': (120, 230), 'age':(0, 110)}
values = list(bounds_dict.values())
print(list(bounds_dict.keys())[0])

weight_kg


In [10]:
validate_bounds(df, {'weight_kg': (30, 250), 'height_cm': (120, 230), 'age':(0, 110)})

["No out-of-bounds values from category 'weight_kg'",
 "No out-of-bounds values from category 'height_cm'",
 "Category 'age' has these values out-of-bounds: 0"]

---

## Calculate BMI

In [11]:
df["height_m"] = df["height_cm"] / 100
df["bmi"] = df["weight_kg"] / (df["height_m"] ** 2)
df["bmi"] = df["bmi"].round(1)

df[["patient_id", "weight_kg", "height_cm", "bmi"]].head()

Unnamed: 0,patient_id,weight_kg,height_cm,bmi
0,P001,500.0,177,159.6
1,P002,80.5,50,322.0
2,P003,74.3,163,28.0
3,P004,0.0,171,0.0
4,P005,72.8,300,8.1


---

## Categorize BMI

In [12]:
df["bmi_category"] = pd.cut(
    df["bmi"],
    bins=[0, 18.5, 25, 30, float("inf")],
    labels=["Underweight", "Normal", "Overweight", "Obese"],
    right=False
)

df[["patient_id", "bmi", "bmi_category"]].head()

Unnamed: 0,patient_id,bmi,bmi_category
0,P001,159.6,Obese
1,P002,322.0,Obese
2,P003,28.0,Overweight
3,P004,0.0,Underweight
4,P005,8.1,Underweight


---

## Summary statistics

In [13]:
summary = df.groupby("bmi_category")["patient_id"].count()
print("\nBMI category distribution:")
print(summary)

high_risk = df[df["bmi"] > 30]
print(f"\nHigh-risk patients (BMI > 30): {len(high_risk)}")


BMI category distribution:
bmi_category
Underweight    2
Normal         1
Overweight     4
Obese          3
Name: patient_id, dtype: int64

High-risk patients (BMI > 30): 3


  summary = df.groupby("bmi_category")["patient_id"].count()
