In [None]:
import random
from pathlib import Path

import polars as pl
import yaml

In [2]:
# Root directory of the VISTA survey extracts.
# pathlib never expands "~" by itself, so resolve it explicitly; otherwise the
# path only works with consumers that happen to expand the tilde on their own.
root = Path("~/Data/foundata/VISTA").expanduser()

In [None]:
def euro_sampler(bounds: tuple[int, int], rate: float = 0.6) -> int:
    """Draw a random integer from an inclusive band and scale it by ``rate``.

    Args:
        bounds: Two-element ``(low, high)`` sequence. Each element is passed
            through ``int()`` before sampling.
        rate: Multiplier applied to the sampled value. Defaults to 0.6, the
            factor this notebook uses elsewhere for AUS $ to EURO.

    Returns:
        The scaled sample, truncated towards zero by ``int()``.
    """
    a, b = bounds
    # randint is inclusive on both ends.
    return int(random.randint(int(a), int(b)) * rate)


def sampler(bounds: tuple[int, int]) -> int:
    """Pick a uniformly random integer between two inclusive limits.

    ``bounds`` must unpack into exactly two values; each one is coerced with
    ``int()`` first, so numeric strings such as ``"25"`` are accepted.
    """
    low, high = bounds
    lower_limit = int(low)
    upper_limit = int(high)
    return random.randint(lower_limit, upper_limit)


def default(config, year):
    """Return the entry for ``year``, falling back to the ``"default"`` entry.

    The fallback is only looked up when ``year`` is missing, so a mapping that
    lists the requested year explicitly does not need a ``"default"`` key.

    Args:
        config: Mapping keyed by year label, optionally with a ``"default"`` key.
        year: Year label to look up.

    Returns:
        ``config[year]`` if present, otherwise ``config["default"]``.

    Raises:
        KeyError: If neither ``year`` nor ``"default"`` is in ``config``.
    """
    if year in config:
        return config[year]
    return config["default"]

In [4]:
def preprocess_hhs(hhs, config: dict, year: str):
    """Reshape a raw household table into the notebook's household schema.

    Steps, in order:
      1. keep only the columns listed in the year's column mapping and rename them;
      2. reduce ``year`` to its first four characters, cast to Int32;
      3. derive ``hh_income`` (parsed from text for "2012-2020", otherwise
         sampled from a mapped band via ``euro_sampler``);
      4. relabel ``home_ownership`` and turn ``zone`` into ``urban/rural``;
      5. for "2012-2020", collapse the two weight columns into ``weight``;
      6. drop every row that still contains a null.

    Args:
        hhs: Raw household frame (polars).
        config: Household dictionary with the keys ``column_mappings``,
            ``hh_income``, ``home_ownership`` and ``zone``; each is resolved
            for ``year`` through ``default``.
        year: Survey release label, e.g. "2012-2020".

    Returns:
        The transformed household frame.
    """
    is_2012_2020 = year == "2012-2020"

    # Resolve every per-year lookup before touching the frame.
    renames = default(config["column_mappings"], year)
    income_bands = default(config["hh_income"], year)
    ownership_labels = default(config["home_ownership"], year)
    zone_labels = default(config["zone"], year)

    hhs = hhs.select(renames.keys()).rename(renames)

    survey_year = pl.col("year").str.slice(0, 4).cast(pl.Int32).alias("year")
    hhs = hhs.with_columns(survey_year)

    # income
    raw_income = pl.col("hh_income")
    if is_2012_2020:
        # Drop the first character and the thousands separators, then cast.
        weekly_amount = (
            raw_income.str.slice(1).str.replace_all(",", "").cast(pl.Int32)
        )
        # weekly to annual and AUS $ to EURO
        annual_amount = weekly_amount * 52 * 0.6
        income = (
            pl.when(raw_income.is_null())
            .then(0)
            .otherwise(annual_amount)
            .alias("hh_income")
        )
    else:
        # use mapping: category -> bounds (unmapped values get (0, 0)), then sample
        income_bounds = raw_income.replace_strict(
            income_bands, default=pl.lit((0, 0)), return_dtype=pl.List
        )
        income = income_bounds.map_elements(euro_sampler, return_dtype=pl.Float64)
    hhs = hhs.with_columns(income)

    # ownership
    ownership = (
        pl.col("home_ownership")
        .replace_strict(ownership_labels)
        .fill_null("unknown")
        .alias("home_ownership")
    )
    hhs = hhs.with_columns(ownership)

    # urban/rural: unmapped zones keep their original value
    urban_rural = (
        pl.col("zone")
        .replace_strict(zone_labels, default=pl.col("zone"))
        .fill_null("unknown")
        .alias("urban/rural")
    )
    hhs = hhs.with_columns(urban_rural).drop("zone")

    # weight: take wd_weight, or we_weight where wd_weight is null
    if is_2012_2020:
        weight = (
            pl.when(pl.col("wd_weight").is_null())
            .then(pl.col("we_weight"))
            .otherwise(pl.col("wd_weight"))
            .alias("weight")
        )
        hhs = hhs.with_columns(weight).drop("wd_weight", "we_weight")

    # remove rows with any nulls
    return hhs.drop_nulls()


# Try the household pipeline on the 2012-2020 release.
hhs = pl.read_csv(root / "2012-2020" / "households_vista_2012_2020_lga_v1.csv")

# Parse the dictionary once and reuse it; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open("vista/hh_dictionary.yaml") as config_file:
    config = yaml.safe_load(config_file)

hhs = preprocess_hhs(hhs, config, year="2012-2020")

hhs.head()

hid,year,day,month,home_ownership,hh_size,hh_income,num_vehicles,num_bikes,urban/rural,weight
str,i32,str,str,str,i64,f64,i64,i64,str,f64
"""Y12H0000101""",2012,"""Monday""","""May""","""owned""",4,38220.0,2,2,"""suburban""",87.77
"""Y12H0000102""",2012,"""Tuesday""","""May""","""owned""",4,53040.0,3,0,"""suburban""",85.93
"""Y12H0000103""",2012,"""Wednesday""","""May""","""owned""",4,31200.0,2,0,"""suburban""",87.12
"""Y12H0000104""",2012,"""Thursday""","""May""","""owned""",3,58500.0,3,2,"""suburban""",87.24
"""Y12H0000107""",2012,"""Sunday""","""May""","""owned""",3,54600.0,1,3,"""suburban""",261.58


In [None]:
def preprocess_persons(persons, config: dict, year: str):
    """Reshape a raw person table into the notebook's person schema.

    Keeps and renames the columns listed for ``year``, samples a concrete age
    from the age band for every release except "2012-2020", relabels ``sex``,
    ``relationship``, ``has_license`` and ``occupation`` through the
    dictionary, and replaces ``anywork`` / ``studying`` with a single
    ``employment_status`` column.

    Args:
        persons: Raw person frame (polars).
        config: Person dictionary with the keys ``column_mappings``, ``sex``,
            ``relationship``, ``has_licence`` and ``occupation``; each is
            resolved for ``year`` through ``default``.
        year: Survey release label, e.g. "2012-2020".

    Returns:
        The transformed person frame.
    """
    # Resolve every per-year lookup before touching the frame.
    renames = default(config["column_mappings"], year)
    sex_labels = default(config["sex"], year)
    relationship_labels = default(config["relationship"], year)
    licence_flags = default(config["has_licence"], year)
    occupation_labels = default(config["occupation"], year)

    persons = persons.select(renames.keys()).rename(renames)

    # age: bands look like "lo->hi"; "100+" is rewritten to a band first
    if year != "2012-2020":
        age_band = (
            pl.col("age")
            .replace_strict({"100+": "100->100"}, default=pl.col("age"))
            .str.split("->")
        )
        sampled_age = age_band.map_elements(sampler, return_dtype=pl.Float64)
        persons = persons.with_columns(sampled_age.alias("age"))

    # sex
    sex = pl.col("sex").replace_strict(sex_labels).alias("sex")
    persons = persons.with_columns(sex)

    # relationship
    relationship = (
        pl.col("relationship")
        .replace_strict(relationship_labels)
        .alias("relationship")
    )
    persons = persons.with_columns(relationship)

    # has_license: values missing from the mapping become null
    has_license = (
        pl.col("has_license")
        .replace_strict(licence_flags, default=None)
        .alias("has_license")
    )
    persons = persons.with_columns(has_license)

    # employment: working wins; otherwise the study flag decides
    status_when_not_working = (
        pl.when(pl.col("studying") == "No Study")
        .then(pl.lit("unemployed"))
        .otherwise(pl.lit("education"))
    )
    employment_status = (
        pl.when(pl.col("anywork") == "Y")
        .then(pl.lit("employed"))
        .otherwise(status_when_not_working)
        .alias("employment_status")
    )
    persons = persons.with_columns(employment_status).drop("anywork", "studying")

    # occupation
    occupation = (
        pl.col("occupation").replace_strict(occupation_labels).alias("occupation")
    )
    return persons.with_columns(occupation)


# Try the person pipeline on the 2012-2020 release.
year = "2012-2020"

# The context manager closes the dictionary file once it has been parsed.
with open("vista/person_dictionary.yaml") as config_file:
    config = yaml.safe_load(config_file)
columns = list(default(config["column_mappings"], year).keys())

# Use `year` everywhere so the directory, the column list and the
# preprocessing branch cannot drift apart.
persons = pl.read_csv(
    root / year / "persons_vista_2012_2020_lga_v1.csv", columns=columns
)

persons = preprocess_persons(persons, config, year=year)

persons.head()

pid,hid,age,sex,relationship,has_license,occupation,employment_status
str,str,i64,str,str,bool,str,str
"""Y12H0000101P01""","""Y12H0000101""",50,"""male""","""self""",True,"""community_personal_service""","""employed"""
"""Y12H0000101P02""","""Y12H0000101""",43,"""female""","""spouse/partner""",True,"""not_in_labor_force""","""unemployed"""
"""Y12H0000101P03""","""Y12H0000101""",11,"""female""","""child""",False,"""not_in_labor_force""","""education"""
"""Y12H0000101P04""","""Y12H0000101""",6,"""female""","""child""",False,"""not_in_labor_force""","""education"""
"""Y12H0000102P01""","""Y12H0000102""",57,"""female""","""spouse/partner""",True,"""not_in_labor_force""","""unemployed"""


In [6]:
def preprocess_trips(trips, config: dict, year: str):
    """Reshape a raw trip table into the notebook's trip schema.

    Keeps and renames the columns listed for ``year``, removes every person
    (``pid``) that has at least one trip row containing a null, and relabels
    ``mode``, ``oact`` and ``dact`` through the dictionary.

    The null filter is a window expression rather than a group-by followed by
    a join, so the surviving rows keep their original order without relying
    on join ordering.

    Args:
        trips: Raw trip frame (polars).
        config: Trip dictionary with the keys ``column_mappings``,
            ``mode_mappings`` and ``act_mappings``; each is resolved for
            ``year`` through ``default``.
        year: Survey release label, e.g. "2023-2024".

    Returns:
        The transformed trip frame.
    """
    column_mapping = default(config["column_mappings"], year)
    trips = trips.select(column_mapping.keys()).rename(column_mapping)

    # True for a row when any of its columns is null.
    row_has_null = pl.any_horizontal(pl.all().is_null())
    # Broadcast "this person has a bad row" to all of that person's rows and
    # drop them together. Rows with a null pid are themselves flagged, so
    # they are removed as well.
    trips = trips.filter(~row_has_null.any().over("pid"))

    # modes & acts
    mode_map = default(config["mode_mappings"], year)
    act_map = default(config["act_mappings"], year)
    trips = trips.with_columns(
        pl.col("mode").replace_strict(mode_map),
        pl.col("oact").replace_strict(act_map),
        pl.col("dact").replace_strict(act_map),
    )

    return trips


# Try the trip pipeline on the 2023-2024 release.
year = "2023-2024"

# The context manager closes the dictionary file once it has been parsed.
with open("vista/trip_dictionary.yaml") as config_file:
    config = yaml.safe_load(config_file)
columns = list(default(config["column_mappings"], year).keys())

# Use `year` everywhere so the directory, the column list and the
# preprocessing call stay in sync.
trips = pl.read_csv(
    root / year / "trips_vista_2023_2024.csv",
    columns=columns,
    null_values="Missing",
)
trips = preprocess_trips(trips, config, year=year)

trips.head()

hid,pid,seq,mode,distance,tst,tet,oact,dact,ozone,dzone
str,str,i64,str,f64,i64,i64,str,str,str,str
"""Y24H5740102""","""Y24H5740102P01""",1,"""car""",1.8012,809,813,"""home""","""shop""","""Casey (C)""","""Casey (C)"""
"""Y24H5740102""","""Y24H5740102P01""",2,"""car""",1.8012,823,828,"""shop""","""home""","""Casey (C)""","""Casey (C)"""
"""Y24H5740102""","""Y24H5740102P01""",3,"""car""",22.89185,965,1000,"""home""","""visit""","""Casey (C)""","""Frankston (C)"""
"""Y24H5740102""","""Y24H5740102P01""",4,"""car""",42.72692,1003,1056,"""visit""","""visit""","""Frankston (C)""","""Melbourne (C)"""
"""Y24H5740102""","""Y24H5740102P01""",5,"""car""",48.55273,1147,1189,"""visit""","""home""","""Melbourne (C)""","""Casey (C)"""


In [7]:
# Run all three pipelines for every release and report the resulting row counts.
years = ["2012-2020", "2022-2023", "2023-2024"]
hhs_names = [
    "households_vista_2012_2020_lga_v1.csv",
    "household_vista_2022_2023.csv",
    "household_vista_2023_2024.csv",
]
persons_names = [
    "persons_vista_2012_2020_lga_v1.csv",
    "person_vista_2022_2023.csv",
    "person_vista_2023_2024.csv",
]

trips_names = [
    "trips_vista_2012_2020_lga_v1.csv",
    "trips_vista_2022_2023.csv",
    "trips_vista_2023_2024.csv",
]


def _load_yaml(path):
    """Parse the YAML file at ``path``, closing the handle afterwards."""
    with open(path) as handle:
        return yaml.safe_load(handle)


# The dictionaries are the same for every release, so parse each file once
# instead of re-opening it on every loop iteration.
hh_config = _load_yaml("vista/hh_dictionary.yaml")
person_config = _load_yaml("vista/person_dictionary.yaml")
trips_config = _load_yaml("vista/trip_dictionary.yaml")

for year, hh_name, persons_name, trips_name in zip(
    years, hhs_names, persons_names, trips_names
):
    hh_columns = list(default(hh_config["column_mappings"], year).keys())
    person_columns = list(default(person_config["column_mappings"], year).keys())
    trips_columns = list(default(trips_config["column_mappings"], year).keys())

    print(year, ":")

    hhs = pl.read_csv(
        root / year / hh_name, columns=hh_columns, null_values="Missing/Refused"
    )
    hhs = preprocess_hhs(hhs, hh_config, year=year)

    persons = pl.read_csv(root / year / persons_name, columns=person_columns)
    persons = preprocess_persons(persons, person_config, year=year)

    trips = pl.read_csv(
        root / year / trips_name,
        columns=trips_columns,
        null_values="Missing",
    )
    # Pass the trip dictionary loaded in this cell. The previous code passed the
    # global `config` left over from an earlier cell, which only worked when
    # that cell had been run last (hidden kernel state).
    trips = preprocess_trips(trips, trips_config, year=year)

    print(
        "number of hhs: ",
        len(hhs),
        " & persons: ",
        len(persons),
        " & trips: ",
        len(trips),
    )

2012-2020 :
number of hhs:  30195  & persons:  77428  & trips:  206268
2022-2023 :
number of hhs:  3551  & persons:  9218  & trips:  26456
2023-2024 :
number of hhs:  2930  & persons:  8175  & trips:  24054
