In [None]:
import random
from pathlib import Path

import polars as pl
import yaml

In [56]:
root = Path("~/Data/foundata/QHTS")

In [None]:
def sampler(bounds: tuple[int, int]) -> int:
    a, b = bounds
    return random.randint(int(a), int(b))


def euro_sampler(bounds: tuple[int, int]) -> int:
    if bounds is None:
        return 0
    a, b = bounds
    return int(random.randint(int(a), int(b)) * 0.6)


def default(config, year):
    return config.get(year, config["default"])

In [82]:
def preprocess_hhs(hhs, config: dict, year: str):
    column_mapping = default(config["column_mappings"], year)
    dwell_mapping = default(config["dwelling_type"], year)
    zone_mapping = default(config["zone"], year)

    hhs = hhs.select(column_mapping.keys()).rename(column_mapping)

    # dwell type
    hhs = hhs.with_columns(
        pl.col("dwelling_type").replace_strict(dwell_mapping).fill_null("unknown")
    )

    # urban/rural
    hhs = hhs.with_columns(
        pl.col("zone")
        .replace_strict(zone_mapping, default=pl.col("zone"))
        .fill_null("unknown")
        .alias("urban/rural")
    )
    hhs = hhs.drop("zone")

    # remove rows with any nulls
    hhs = hhs.drop_nulls()

    return hhs


hhs = pl.read_csv(root / "2022-25" / "1_QTS_HOUSEHOLDS.csv")
config = yaml.safe_load(open("qhts/hh_dictionary.yaml"))

hhs = preprocess_hhs(hhs, config, year="2022-25")

hhs.head()

hid,year,day,month,dwelling_type,hh_size,num_vehicles,num_bikes,weight,urban/rural
i64,i64,i64,i64,str,i64,i64,i64,f64,str
107999,2022,3,7,"""unknown""",1,0,0,32.484449,"""suburban"""
108000,2022,4,7,"""unknown""",1,1,1,32.484449,"""suburban"""
108002,2022,6,7,"""unknown""",1,1,3,32.484449,"""suburban"""
108004,2022,3,7,"""unknown""",1,1,1,32.484449,"""suburban"""
108008,2022,2,7,"""unknown""",1,1,0,32.484449,"""suburban"""


In [None]:
def preprocess_persons(persons, config: dict, year: str):
    column_mapping = default(config["column_mappings"], year)
    persons = persons.select(column_mapping.keys()).rename(column_mapping)

    age_mapping = default(config["age"], year)
    sex_mapping = default(config["sex"], year)
    relationship_mapping = default(config["relationship"], year)
    has_licence_mapping = default(config["has_licence"], year)
    employment_mapping = default(config["employment"], year)
    occupation_mapping = default(config["occupation"], year)
    income_mapping = default(config["income"], year)

    # age
    persons = persons.with_columns(
        pl.col("age")
        .replace_strict(age_mapping)
        .map_elements(sampler, pl.Float64)
        .alias("age")
    )

    # sex
    persons = persons.with_columns(
        pl.col("sex").replace_strict(sex_mapping).alias("sex")
    )

    # relationship
    persons = persons.with_columns(
        pl.col("relationship")
        .replace_strict(relationship_mapping, default=pl.col("relationship"))
        .fill_null("self")
    )

    # has_licence
    persons = persons.with_columns(
        pl.col("has_licence").replace_strict(has_licence_mapping)
    )

    # employment
    persons = persons.with_columns(
        pl.col("employment").replace_strict(employment_mapping)
    )

    # occupation
    persons = persons.with_columns(
        pl.col("occupation").replace_strict(occupation_mapping).fill_null("unknown")
    )

    # disability
    persons = persons.with_columns(
        pl.col("disability").cast(pl.Boolean).fill_null(False)
    )

    # income TODO missing assumed to be 0
    persons = persons.with_columns(
        pl.col("income")
        .replace_strict(income_mapping, default=None)
        .map_elements(euro_sampler, return_dtype=pl.Int32)
    )

    return persons


year = "2022-25"
config = yaml.safe_load(open("qhts/person_dictionary.yaml"))
columns = list(default(config["column_mappings"], year).keys())

persons = pl.read_csv(root / "2022-25" / "2_QTS_PERSONS.csv", columns=columns)

persons = preprocess_persons(persons, config, year="2022-25")

persons.head()

pid,hid,age,sex,relationship,has_licence,employment,occupation,disability,income
str,i64,f64,str,str,bool,str,str,bool,i32
"""107999/1000""",107999,70.0,"""female""","""self""",False,"""retired""","""unknown""",True,
"""108000/1000""",108000,15.0,"""male""","""self""",True,"""other""","""unknown""",True,
"""108002/1000""",108002,43.0,"""female""","""self""",True,"""education""","""professional""",False,
"""108004/1000""",108004,53.0,"""male""","""self""",True,"""employed""","""technical""",False,
"""108008/1000""",108008,73.0,"""female""","""self""",True,"""retired""","""unknown""",False,


In [93]:
def preprocess_trips(trips, config: dict, year: str):

    column_mapping = default(config["column_mappings"], year)
    trips = trips.select(column_mapping.keys()).rename(column_mapping)

    mask = pl.any_horizontal(pl.all().is_null())
    keep = (
        trips.group_by("pid")
        .agg(mask.any().alias("flag"))
        .filter(~pl.col("flag"))
        .select("pid")
    )
    trips = trips.join(keep, on="pid")

    # modes & acts
    mode_map = default(config["mode_mappings"], year)
    act_map = default(config["act_mappings"], year)
    trips = trips.with_columns(
        pl.col("mode").replace_strict(mode_map),
        pl.col("oact").replace_strict(act_map),
        pl.col("dact").replace_strict(act_map),
    )

    return trips


year = "2022-25"
config = yaml.safe_load(open("qhts/trip_dictionary.yaml"))
columns = list(default(config["column_mappings"], year).keys())

trips = pl.read_csv(
    root / "2022-25" / "5_QTS_TRIPS.csv",
    columns=columns,
    # null_values="Missing",
)
trips = preprocess_trips(trips, config, year="2022-25")

trips.head()

hid,pid,seq,mode,distance,tst,tet,oact,dact,ozone,dzone
i64,str,i64,str,f64,i64,i64,str,str,i64,i64
108034,"""108034/1003""",1,"""car""",5.9,240,260,"""home""","""education""",30303106324,30302105327
108034,"""108034/1003""",2,"""car""",5.89,670,690,"""education""","""home""",30302105327,30303106324
108038,"""108038/1000""",1,"""car""",4.63,145,155,"""home""","""other""",30303106324,30302105613
108038,"""108038/1000""",2,"""car""",4.29,558,567,"""other""","""shop""",30302105613,30303106221
108038,"""108038/1000""",3,"""car""",2.96,645,652,"""shop""","""home""",30303106221,30303106324


In [92]:
trips["ORIGPURP"].unique().to_list()

['Childcare or kindergarten',
 'Just accompanying someone',
 'Go home',
 'Education',
 'Eat/drink',
 'Work (other work reason)',
 'Refuel my vehicle',
 'Shopping',
 'Personal business',
 'Pick up or drop off a passenger',
 'Pickup/deliver something (not work-related)',
 'Work (my workplace)',
 'Recreation/leisure activity',
 'At Home',
 'Other Place',
 'Social Visit']

In [None]:
years = ["2012-2020", "2022-2023", "2023-2024"]
hhs_names = [
    "households_vista_2012_2020_lga_v1.csv",
    "household_vista_2022_2023.csv",
    "household_vista_2023_2024.csv",
]
persons_names = [
    "persons_vista_2012_2020_lga_v1.csv",
    "person_vista_2022_2023.csv",
    "person_vista_2023_2024.csv",
]

trips_names = [
    "trips_vista_2012_2020_lga_v1.csv",
    "trips_vista_2022_2023.csv",
    "trips_vista_2023_2024.csv",
]

for year, hh_name, persons_name, trips_name in zip(
    years, hhs_names, persons_names, trips_names
):

    hh_config = yaml.safe_load(open("vista/hh_dictionary.yaml"))
    hh_columns = list(default(hh_config["column_mappings"], year).keys())

    person_config = yaml.safe_load(open("vista/person_dictionary.yaml"))
    person_columns = list(default(person_config["column_mappings"], year).keys())

    trips_config = yaml.safe_load(open("vista/trip_dictionary.yaml"))
    trips_columns = list(default(trips_config["column_mappings"], year).keys())

    print(year, ":")

    hhs = pl.read_csv(
        root / year / hh_name, columns=hh_columns, null_values="Missing/Refused"
    )
    hhs = preprocess_hhs(hhs, hh_config, year=year)

    persons = pl.read_csv(root / year / persons_name, columns=person_columns)
    persons = preprocess_persons(persons, person_config, year=year)

    trips = pl.read_csv(
        root / year / trips_name,
        columns=trips_columns,
        null_values="Missing",
    )
    trips = preprocess_trips(trips, config, year=year)

    print(
        "number of hhs: ",
        len(hhs),
        " & persons: ",
        len(persons),
        " & trips: ",
        len(trips),
    )

2012-2020 :
number of hhs:  30195  & persons:  77428  & trips:  206268
2022-2023 :
number of hhs:  3551  & persons:  9218  & trips:  26456
2023-2024 :
number of hhs:  2930  & persons:  8175  & trips:  24054
