In [2]:
from pathlib import Path
import random
import polars as pl
import yaml

In [3]:
root = Path("~/Data/foundata/LTDS2425")

In [9]:
def euro_sampler(bounds: tuple[int, int]) -> int:
    a, b = bounds
    return int(random.randint(int(a), int(b)) * 0.9)


def sampler(bounds: tuple[int, int]) -> int:
    a, b = bounds
    return random.randint(int(a), int(b))


def default(config, year):
    return config.get(year, config["default"])

In [8]:
def load_mapping(path: Path, k_name: str, v_name: str) -> dict:
    file = pl.read_csv(path)
    k_col, v_col = file[k_name], file[v_name]
    mapping = dict(zip(k_col, v_col))
    return mapping

In [15]:
def preprocess_hhs(hhs, config: dict, year: str):
    column_mapping = default(config["column_mappings"], year)
    income_mapping = default(config["hh_income"], year)
    struct_mapping = default(config["hh_structure"], year)
    zone_mapping = load_mapping(root / "HABORO_T.csv", "HABORO", "TYPE")

    hhs = hhs.select(column_mapping.keys()).rename(column_mapping)

    # year
    hhs = hhs.with_columns(pl.col("year") + 2000)

    # income
    hhs = hhs.with_columns(
        pl.col("hh_income")
        .replace_strict(income_mapping, default=pl.lit((0, 0)), return_dtype=pl.List)
        .map_elements(euro_sampler, return_dtype=pl.Float64)
    )

    # structure
    hhs = hhs.with_columns(
        pl.col("hh_structure").replace_strict(struct_mapping).fill_null("unknown")
    )

    # urban/rural
    hhs = hhs.with_columns(
        pl.col("zone")
        .replace_strict(zone_mapping, default=pl.col("zone"))
        .fill_null("unknown")
        .alias("urban/rural")
    )
    hhs = hhs.drop("zone")

    print(len(hhs))
    # remove rows with any nulls
    hhs = hhs.drop_nulls()
    print(len(hhs))

    return hhs


config = yaml.safe_load(open("ltds/hh_dictionary.yaml"))
columns = list(default(config["column_mappings"], year="2425").keys())
hhs = pl.read_csv(root / "Household.csv", columns=columns)


hhs = preprocess_hhs(hhs, config, year="2425")

hhs.head()

8208
8208


hid,year,day,hh_structure,hh_size,hh_income,num_vehicles,num_bikes,weight,urban/rural
i64,i64,i64,str,i64,f64,i64,i64,f64,str
24009181,2024,6,"""lone parent""",4,46404.0,1,0,251.969105,"""suburban"""
24010021,2024,2,"""couple""",6,57264.0,1,3,481.303435,"""suburban"""
24010041,2024,2,"""couple""",4,37287.0,1,0,481.303435,"""suburban"""
24010051,2024,7,"""lone parent""",3,137895.0,0,1,348.118159,"""suburban"""
24010091,2024,2,"""single adult""",1,103817.0,1,0,810.201572,"""suburban"""


In [34]:
def preprocess_persons(persons, config: dict, year: str):
    column_mapping = default(config["column_mappings"], year)
    sex_mapping = default(config["sex"], year)
    relationship_mapping = default(config["relationship"], year)
    race_mapping = default(config["race"], year)

    persons = persons.select(column_mapping.keys()).rename(column_mapping)

    # age
    persons = persons.with_columns(
        pl.col("age")
        .replace_strict({"65+": "65-100"}, default=pl.col("age"))
        .str.split("-")
        .map_elements(sampler, pl.Float64)
    )

    # sex
    persons = persons.with_columns(pl.col("sex").replace_strict(sex_mapping))

    # relationship
    persons = persons.with_columns(
        pl.col("relationship")
        .replace_strict(relationship_mapping)
        .alias("relationship")
    )

    # race
    persons = persons.with_columns(
        pl.col("race")
        .replace_strict(race_mapping, default=pl.col("race"))
        .fill_null("unknown")
    )

    return persons


year = "2425"
config = yaml.safe_load(open("ltds/person_dictionary.yaml"))
columns = list(default(config["column_mappings"], year).keys())

persons = pl.read_csv(root / "person.csv", columns=columns)

persons = preprocess_persons(persons, config, year=year)

persons.head()

pid,hid,age,sex,relationship,race
i64,i64,f64,str,str,str
2412723104,24127231,2.0,"""male""","""child""","""unknown"""
2405718303,24057183,4.0,"""male""","""child""","""unknown"""
2426119103,24261191,2.0,"""female""","""child""","""unknown"""
2437901111,24379011,3.0,"""female""","""child""","""unknown"""
2439211107,24392111,4.0,"""female""","""other""","""unknown"""


In [44]:
def preprocess_persons_data(persons, config: dict, year: str):
    column_mapping = default(config["column_mappings"], year)

    has_license_mapping = default(config["has_licence"], year)
    employment_mapping = default(config["employment_status"], year)
    # occupation_mapping = default(config["occupation"], year)

    persons = persons.select(column_mapping.keys()).rename(column_mapping)

    # has_license
    persons = persons.with_columns(
        pl.col("has_license").replace_strict(
            has_license_mapping, default=None, return_dtype=pl.String
        )
    )

    # employment
    persons = persons.with_columns(
        pl.col("employment_status").replace_strict(employment_mapping)
    )

    # occupation
    # persons = persons.with_columns(
    #     pl.col("occupation").replace_strict(occupation_mapping).alias("occupation")
    # )

    return persons


year = "2425"
config = yaml.safe_load(open("ltds/person_data_dictionary.yaml"))
columns = list(default(config["column_mappings"], year).keys())

persons = pl.read_csv(root / "person data.csv", columns=columns)

persons = preprocess_persons_data(persons, config, year=year)

persons.head()

pid,hid,has_license,employment_status
i64,i64,str,str
2400412104,24004121,"""no""","""student"""
2400412105,24004121,"""no""","""student"""
2400412106,24004121,"""no""","""student"""
2400410101,24004101,"""yes""","""employed"""
2400410102,24004101,"""yes""","""unemployed"""


In [85]:
def sample_minute(base: int) -> int:
    return random.randint(int(base), int(base) + 5)


def sample_tst(row) -> int:
    tst_hr, tet_hr, duration = row["tst"], row["tet"], row["duration"]
    # earliest start is max of tst_hr and tet_hr - duration
    earliest = max(tst_hr, tet_hr - duration)
    # latest start is min of tst_hr + 1 and tet_hr + 1 - duration
    latest = min(tst_hr + 60, tet_hr + 60 - duration)
    if latest < earliest:
        print("warning: bad overlap")
        return int((tst_hr + tet_hr + duration) / 2)
    return random.randint(earliest, latest)


print(sample_tst(0, 0, 60))
print(sample_tst(0, 60, 60))
print(sample_tst(0, 120, 60))
print(sample_tst(0, 121, 60))

TypeError: sample_tst() takes 1 positional argument but 3 were given

In [90]:
def preprocess_trips(trips, config: dict, year: str):

    column_mapping = default(config["column_mappings"], year)
    trips = trips.select(column_mapping.keys()).rename(column_mapping)

    mask = pl.any_horizontal(pl.all().is_null())
    print(len(trips))
    keep = (
        trips.group_by("pid")
        .agg(mask.any().alias("flag"))
        .filter(~pl.col("flag"))
        .select("pid")
    )
    trips = trips.join(keep, on="pid")
    print(len(trips))

    # modes & acts
    mode_map = default(config["mode"], year)
    act_map = default(config["act"], year)
    trips = trips.with_columns(
        pl.col("mode").replace_strict(mode_map),
        pl.col("oact").replace_strict(act_map),
        pl.col("dact").replace_strict(act_map),
    )

    # duration
    trips = trips.with_columns(pl.col("duration").map_elements(sample_minute))

    # time to minutes
    trips = trips.with_columns(
        pl.col("tst") * 60,
        pl.col("tet") * 60,
    )

    # sample trip times
    trips = trips.with_columns(
        pl.struct("tst", "tet", "duration")
        .map_elements(sample_tst, return_dtype=pl.Int32)
        .alias("tst")
    )
    trips = trips.with_columns((pl.col("tst") + pl.col("duration")).alias("tet"))

    return trips


year = "2425"
config = yaml.safe_load(open("ltds/trip_dictionary.yaml"))
columns = list(default(config["column_mappings"], year).keys())

trips = pl.read_csv(
    root / "Trip.csv",
    columns=columns,
    # null_values="Missing",
)
trips = preprocess_trips(trips, config, year="2023-2024")

trips.head()

36170
36170


hid,pid,mode,tst,tet,duration,oact,dact,ozone,dzone
i64,i64,str,i32,i64,i64,str,str,i64,i64
24007421,2400742102,"""car""",1112,1127,15,"""home""","""leisure""",15,15
24007421,2400742101,"""car""",1081,1094,13,"""home""","""leisure""",15,15
24007421,2400742101,"""car""",1360,1374,14,"""leisure""","""home""",15,15
24007421,2400742102,"""car""",1362,1372,10,"""leisure""","""home""",15,15
24007421,2400742105,"""car""",1118,1132,14,"""home""","""leisure""",15,15


In [64]:
trips["duration"].value_counts(sort=True).head(10)

duration,count
i64,u32
10,6212
15,5139
5,4379
30,4119
20,3627
60,2255
45,1620
0,1526
25,1512
40,1364


In [None]:
years = ["2012-2020", "2022-2023", "2023-2024"]
hhs_names = [
    "households_vista_2012_2020_lga_v1.csv",
    "household_vista_2022_2023.csv",
    "household_vista_2023_2024.csv",
]
persons_names = [
    "persons_vista_2012_2020_lga_v1.csv",
    "person_vista_2022_2023.csv",
    "person_vista_2023_2024.csv",
]

trips_names = [
    "trips_vista_2012_2020_lga_v1.csv",
    "trips_vista_2022_2023.csv",
    "trips_vista_2023_2024.csv",
]

for year, hh_name, persons_name, trips_name in zip(
    years, hhs_names, persons_names, trips_n        02 Education (inc escort education)
        06 Other (inc Escort/ Worship)
        06 Home
        03 Leisure
        04 Shopping and personal business
    ames
):

    hh_config = yaml.safe_load(open("vista/hh_dictionary.yaml"))
    hh_columns = list(default(hh_config["column_mappings"], year).keys())

    person_config = yaml.safe_load(open("vista/person_dictionary.yaml"))
    person_columns = list(default(person_config["column_mappings"], year).keys())

    trips_config = yaml.safe_load(open("vista/trip_dictionary.yaml"))
    trips_columns = list(default(trips_config["column_mappings"], year).keys())

    print(year, ":")

    hhs = pl.read_csv(
        root / year / hh_name, columns=hh_columns, null_values="Missing/Refused"
    )
    hhs = preprocess_hhs(hhs, hh_config, year=year)

    persons = pl.read_csv(root / year / persons_name, columns=person_columns)
    persons = preprocess_persons(persons, person_config, year=year)

    trips = pl.read_csv(
        root / year / trips_name,
        columns=trips_columns,
        null_values="Missing",
    )
    trips = preprocess_trips(trips, config, year=year)

    print(
        "number of hhs: ",
        len(hhs),
        " & persons: ",
        len(persons),
        " & trips: ",
        len(trips),
    )

2012-2020 :
number of hhs:  30195  & persons:  77428  & trips:  206268
2022-2023 :
number of hhs:  3551  & persons:  9218  & trips:  26456
2023-2024 :
number of hhs:  2930  & persons:  8175  & trips:  24054
