In [None]:
import random
from pathlib import Path

import polars as pl
import yaml

In [None]:
# Location of the NTS tab-separated extracts. pathlib does not expand "~" on
# its own, so expand it here; otherwise only readers that expand "~" themselves
# can open these paths, and checks such as ``.exists()`` look for a literal
# "~" directory.
root = Path("~/Data/foundata/NTS/tab").expanduser()
# NOTE: the files are tab-separated (".tab"); the "_csv" suffix is kept because
# these names are referenced by other cells.
trips_csv = root / "trip_eul_2002-2023.tab"
attributes_csv = root / "individual_eul_2002-2023.tab"
hhs_csv = root / "household_eul_2002-2023.tab"

In [15]:
def euro_sampler(bounds: tuple[int, int], rate: float = 1.15) -> int:
    """Draw an income uniformly from a GBP band and convert it to euros.

    Args:
        bounds: Two-element ``(low, high)`` pair giving the inclusive limits
            of the income band in GBP. Any two-element iterable works; both
            values are coerced with ``int()`` before sampling.
        rate: Multiplier applied to the sampled GBP amount. Defaults to an
            approximate GBP -> EUR rate of 1.15.

    Returns:
        The converted amount truncated towards zero to a whole number.

    Raises:
        ValueError: If ``low > high`` (raised by ``random.randint``).
    """
    low, high = bounds
    # NOTE(review): the factor used to be 0.15, which contradicts the stated
    # GBP -> EUR conversion and shrank every income by ~85%; 1.15 is assumed
    # to be the intended rate -- confirm against the project's reference rate.
    return int(random.randint(int(low), int(high)) * rate)

In [21]:
# Load the household recoding dictionary. The context manager closes the file
# handle, which a bare ``open(...)`` passed to ``safe_load`` would leave open.
with open("../configs/nts/hh_dictionary.yaml") as hh_config_file:
    hh_config = yaml.safe_load(hh_config_file)

# Mapping of raw NTS column name -> name used in the rest of the notebook.
hh_columns = hh_config["column_mappings"]

# Read only the mapped columns from the tab-separated household file and
# rename them in one go.
hhs = pl.read_csv(hhs_csv, separator="\t", columns=list(hh_columns.keys())).rename(
    hh_columns
)

# Per-column recoding tables taken from the YAML dictionary.
month_config = hh_config["month"]
day_config = hh_config["day"]
income_config = hh_config["income"]
ownership_config = hh_config["ownership"]
property_type_config = hh_config["property_type"]
area_config = hh_config["area"]

# Recode the coded columns. ``replace_strict`` raises if a value has no entry
# in its mapping, so gaps in the dictionary surface here rather than as
# silent nulls.
hhs = hhs.with_columns(
    pl.col("month").replace_strict(month_config),
    pl.col("day").replace_strict(day_config),
    pl.col("ownership").replace_strict(ownership_config),
    pl.col("property_type").replace_strict(property_type_config),
    pl.col("area").replace_strict(area_config),
)

# Sample income: map each income code to its band, then draw one value per
# household with ``euro_sampler``.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# incomes differ between runs.
hhs = hhs.with_columns(
    pl.col("income")
    .replace_strict(income_config)
    .map_elements(euro_sampler, return_dtype=pl.Int32)
)

hhs.head()

hid,month,day,property_type,ownership,hh_size,hh_cars,area,weight,income,year
i64,str,str,str,str,i64,i64,str,f64,i32,i64
2005003851,"""June""","""Wednesday""","""detached""","""owns""",4,2,"""rural""",0.994879,29781,2005
2007003259,"""May""","""Tuesday""","""semi_detached""","""rents""",2,0,"""rural""",0.848777,0,2007
2008004519,"""July""","""Thursday""","""flat""","""rents""",3,1,"""urban""",1.14176,4294,2008
2010003811,"""July""","""Thursday""","""semi_detached""","""owns""",5,2,"""urban""",0.829649,4498,2010
2012006270,"""September""","""Thursday""","""semi_detached""","""rents""",4,0,"""urban""",0.930369,1008,2012


In [None]:
# Load the persons recoding dictionary. The context manager closes the file
# handle, which a bare ``open(...)`` passed to ``safe_load`` would leave open.
with open("../configs/nts/persons_dictionary.yaml") as persons_config_file:
    persons_config = yaml.safe_load(persons_config_file)
# Mapping of raw NTS column name -> name used in the rest of the notebook.
persons_columns = persons_config["column_mappings"]