In [5]:
# notebooks/validation/03_validate_enrichment.ipynb

from pathlib import Path

import pandas as pd

# 1. Load enriched data
df = pd.read_parquet("../../data/processed/incidents_enriched.parquet")
print("Shape:", df.shape)
# Only the final expression of a cell is rendered automatically; a bare
# `df.head()` in the middle of the cell is evaluated and thrown away.
# `display` is the function the IPython kernel injects into the namespace,
# so the preview is shown regardless of its position in the cell.
display(df.head())

# 2. Column overview
# `info()` writes its summary (dtypes, non-null counts) straight to stdout.
df.info()

# 3. Null value report
# Fraction of missing values per column, highest first. The frame is shown
# with an explicit `display` call because a bare expression that is not the
# last statement of the cell produces no output.
null_report = df.isna().mean().sort_values(ascending=False).to_frame("null_ratio")
display(null_report)

# 4. Categorical distributions
# One frequency table per categorical column. `dropna=False` keeps missing
# values as their own row so gaps are visible in the counts.
categoricals = [
    "ride_type",
    "season",
    "day_of_week",
    "age_group",
    "gender",
    "incident_type",
    "simulated_medical_condition",
]
for col in categoricals:
    level_counts = df[col].value_counts(dropna=False)
    # Heading and table are emitted on separate lines via the separator.
    print(f"\n🧾 {col}:", level_counts, sep="\n")

# 5. Numeric sanity checks
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. Displayed explicitly: as a bare mid-cell expression the result
# would be computed and silently discarded.
display(df[["age", "duration_min"]].describe())

# 6. Logical consistency
# A flag derived from `age` should split the ages into two non-overlapping
# ranges. The values that can reveal a mislabelled row are the ones closest
# to the split: the oldest flagged-minor vs. the youngest non-minor, and the
# youngest senior vs. the oldest non-senior. The outer extremes are still
# printed as a plain range check on `age`.
# Explicit comparisons are kept (rather than using the column as a mask)
# because the dtype of the flag columns is not visible here.
minor_mask = df["is_minor"].eq(True)
non_minor_mask = df["is_minor"].eq(False)
senior_mask = df["is_senior"].eq(True)
non_senior_mask = df["is_senior"].eq(False)

print("\nAge vs. is_minor")
print(df["age"].where(minor_mask).min(), "← youngest minor")
print(df["age"].where(minor_mask).max(), "← oldest minor")
print(df["age"].where(non_minor_mask).min(), "← youngest non-minor")
print(df["age"].where(non_minor_mask).max(), "← oldest non-minor")

print("\nAge vs. is_senior")
print(df["age"].where(senior_mask).min(), "← youngest senior")
print(df["age"].where(non_senior_mask).max(), "← oldest non-senior")

# Every distinct (day, flag) pair; a day listed with both flag values would
# indicate an inconsistent `is_weekend` assignment.
print("\nday_of_week vs. is_weekend")
print(df[["day_of_week", "is_weekend"]].drop_duplicates().sort_values("day_of_week"))

# 7. Simulated feature confirmation
# Reminder of which columns are synthetic, followed by their empirical
# summaries so they can be compared with the stated generation settings.
print(
    "\nSimulated features:",
    "→ simulated_medical_condition (controlled randomness)",
    "→ first_time_visitor (Bernoulli 40%)",
    sep="\n",
)

# Observed share of first-time visitors.
print("first_time_visitor mean:", df["first_time_visitor"].mean())

# Relative frequency of each simulated condition.
print("simulated_medical_condition distribution:")
condition_shares = df["simulated_medical_condition"].value_counts(normalize=True)
print(condition_shares)

# 8. Optional: export nulls
# Writes the per-column count of missing values to a CSV report.
# `to_csv` does not create missing parent folders, so the target directory
# is created up front; otherwise the cell would fail on a fresh checkout
# after every preceding check has already run.
nulls_report_path = Path("../../outputs/reports/nulls_enriched.csv")
nulls_report_path.parent.mkdir(parents=True, exist_ok=True)
df.isna().sum().to_csv(nulls_report_path)


Shape: (682, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   company                      682 non-null    object        
 1   incident_date                682 non-null    object        
 2   ride_name_dirty              682 non-null    object        
 3   ride_name                    682 non-null    object        
 4   theme_park                   682 non-null    object        
 5   age_gender                   682 non-null    object        
 6   description                  682 non-null    object        
 7   incident_date_parsed         682 non-null    datetime64[ns]
 8   age                          682 non-null    int64         
 9   gender                       682 non-null    object        
 10  incident_type                682 non-null    object        
 11  ride_type                   