In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up on the next cell run without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

In [50]:
from src.io import (
    load_incident_data, 
    save_processed_data
)
from src.utils import (
    standardize_column_names, 
    normalize_text_column
)
from src.data_cleaning.incidents import ( 
    parse_incident_date_column,
    split_age_gender_column,
    classify_incident_type
)
from src.enrichment.ride_metadata import (
    fetch_wikipedia_ride_metadata
)
from src.enrichment.temporal_features import (
    enrich_temporal_features
)
from src.enrichment.visitor_profile import (
    enrich_visitor_profile
)
from src.enrichment.aggregate_features import (
    enrich_aggregate_features
)


In [None]:
# Starting point of the pipeline: read the raw incidents CSV into `df`.
# Every later cell reassigns `df` from the result of the previous one.
raw_incidents_path = "data/raw/incidents.csv"
df = load_incident_data(raw_incidents_path)

In [None]:
# Run the frame through the project's column-name standardizer (src.utils).
df = df.pipe(standardize_column_names)

In [None]:
# Apply the shared text normalizer to the "company" column.
df = df.pipe(normalize_text_column, col="company")

In [None]:
# Parse the incident date column.
# NOTE(review): presumably this is what adds `incident_date_parsed`, which a later
# preview cell selects — confirm against src.data_cleaning.incidents.
df = df.pipe(parse_incident_date_column)


In [None]:
# Apply the shared text normalizer to the "theme_park" column, which is later
# used as one of the two ride lookup keys.
df = df.pipe(normalize_text_column, col="theme_park")

In [None]:
# Run the project's age/gender splitter over the frame (src.data_cleaning.incidents).
df = df.pipe(split_age_gender_column)


In [None]:
# Run the project's incident-type classifier over the frame (src.data_cleaning.incidents).
df = df.pipe(classify_incident_type)

In [None]:
# Persist the cleaned frame here, before the enrichment cells below add columns.
processed_incidents_path = "data/processed/incidents_clean.parquet"
save_processed_data(df, processed_incidents_path)

In [None]:
# Enrich incidents with per-ride Wikipedia metadata.
# Lookups run once per distinct (ride_name, theme_park) pair, not once per incident row.
ride_keys = ["ride_name", "theme_park"]
df_unique_rides = df[ride_keys].drop_duplicates().copy()

# Iterate the two key columns directly instead of a row-wise `apply(axis=1)`:
# on an empty frame `apply` hands back the key columns themselves, which would
# duplicate them in the concat below. Each result is still wrapped in pd.Series,
# so whatever the helper returns is interpreted exactly as before.
# NOTE(review): the helper presumably returns a dict of fields per ride and hits
# the network on every run — confirm, and consider caching if Run All gets slow.
metadata = pd.DataFrame(
    [
        pd.Series(fetch_wikipedia_ride_metadata(ride_name, theme_park))
        for ride_name, theme_park in zip(
            df_unique_rides["ride_name"], df_unique_rides["theme_park"]
        )
    ],
    index=df_unique_rides.index,  # keep row alignment with df_unique_rides
)

df_rides = pd.concat([df_unique_rides, metadata], axis=1)
df_rides.to_csv("data/external/rides_metadata_wikipedia.csv", index=False)

# Re-running this cell would otherwise merge the metadata a second time and leave
# `_x` / `_y` suffixed duplicates in `df`, so drop columns from a previous run first.
previously_merged = df.columns.intersection(metadata.columns).difference(ride_keys)
df = df.drop(columns=previously_merged).merge(df_rides, on=ride_keys, how="left")


In [42]:
# Add temporal features via the project's enrichment helper (src.enrichment.temporal_features).
df = df.pipe(enrich_temporal_features)

In [43]:
# Spot-check the parsed date next to the calendar columns
# (presumably the ones enrich_temporal_features just added — confirm).
temporal_preview_cols = [
    "incident_date_parsed",
    "day_of_week",
    "month",
    "season",
    "is_weekend",
    "is_summer",
]
df[temporal_preview_cols].head()


Unnamed: 0,incident_date_parsed,day_of_week,month,season,is_weekend,is_summer
0,2022-06-09,Thursday,6,summer,False,True
1,2014-10-05,Sunday,10,fall,True,False
2,2009-12-03,Thursday,12,winter,False,False
3,2022-08-07,Sunday,8,summer,True,True
4,2022-04-19,Tuesday,4,spring,False,False


In [46]:
# Add visitor-profile features via the project's enrichment helper (src.enrichment.visitor_profile).
df = df.pipe(enrich_visitor_profile)

In [None]:
# Add aggregate features via the project's enrichment helper (src.enrichment.aggregate_features).
df = df.pipe(enrich_aggregate_features)