In [1]:
from pathlib import Path

import polars as pl

from foundata import cmap, nhts, verify
from foundata.utils import check_overlap, filter_time_consistent, table_joiner

In [2]:
# Folder holding the per-survey dictionary files; a relative path, so it is
# resolved against the notebook's working directory when used.
configs_root: Path = Path("configs")

# CMAP

In [3]:
# Dictionary files (YAML) handed to the CMAP household, person and trip
# loaders in the cells below.
hh_config = configs_root / "cmap" / "hh_dictionary.yaml"
person_config = configs_root / "cmap" / "person_dictionary.yaml"
trip_config = configs_root / "cmap" / "trip_dictionary.yaml"

# Location of the raw CMAP survey files. Built from the current user's home
# directory instead of a hard-coded "/home/fred" so the notebook is not tied
# to one machine; for the original user this resolves to the same path.
data_root = Path.home() / "Data" / "foundata" / "CMAP"

In [4]:
# Load the CMAP household table using the household dictionary, then preview
# the first rows (one row per household id `hid`, as far as the preview shows).
hhs = cmap.load_households(data_root, hh_config)
hhs.head()

hid,weight,hh_income,residence,ownership,vehicles,hh_size,year,month,day
i64,f64,i32,str,str,i64,i64,i32,i8,str
20000083,155.2391,78025,"""flat""","""rent""",1,1,2017,10,"""wednesday"""
20000136,319.6422,140560,"""flat""","""rent""",1,3,2017,10,"""monday"""
20000228,540.4961,192274,"""house""","""own""",1,1,2017,10,"""sunday"""
20000248,662.6029,6024,"""flat""","""other""",1,3,2017,9,"""thursday"""
20000300,68.701,60403,"""flat""","""rent""",0,1,2017,9,"""wednesday"""


In [5]:
# Load the CMAP person table using the person dictionary, then preview it.
# Each person row carries both its own `pid` and the household `hid`.
persons = cmap.load_persons(data_root, person_config)
persons.head()

hid,pid,sex,age,disability,education,can_wfh,industry,race,has_licence,relationship,employment
i64,i64,str,i64,bool,str,str,str,str,str,str,str
20000083,200000831,"""male""",45,False,"""undergraduate""","""yes""","""sales""","""white""","""yes""","""self""","""employed"""
20000136,200001361,"""female""",33,False,"""graduate""","""no""","""technical""","""white""","""yes""","""self""","""employed"""
20000136,200001362,"""male""",34,False,"""graduate""","""no""","""professional""","""white""","""yes""","""partner""","""employed"""
20000136,200001363,"""male""",1,False,"""none""","""n/a""","""n/a""","""white""","""n/a""","""child""","""n/a"""
20000228,200002281,"""female""",50,False,"""undergraduate""","""no""","""technical""","""multiracial""","""yes""","""self""","""employed"""


In [6]:
# Combine household and person records on the household id, then stamp every
# row with constant provenance columns (country first, then source).
attributes = table_joiner(hhs, persons, on="hid").with_columns(
    pl.lit("usa").alias("country"),
    pl.lit("cmap").alias("source"),
)

In [7]:
# Build the rurality lookup and pass it to the trip loader.
rurality = cmap.load_rurality(configs_root)
rurality_mapping = cmap.load_locations(data_root, rurality_table=rurality)
# NOTE(review): the ozone/dzone values seen in the trips preview (e.g. "urban")
# presumably come from this mapping — confirm in cmap.load_trips.
trips = cmap.load_trips(
    data_root, trip_config, rurality_mapping=rurality_mapping
)

In [8]:
# Drop time-inconsistent records; the helper prints how many trips, plans and
# attributes were removed. Both inputs are rebound to the filtered results, so
# re-running this cell operates on the already-filtered tables.
attributes, trips = filter_time_consistent(attributes, trips)

Total trips: 98758, Total plans: 25007, from 30683 attributes
Removed 79 trips or 13 plans and 13 attributes due to time inconsistency


In [9]:
# Preview the joined household + person attributes after filtering.
attributes.head()

hid,weight,hh_income,residence,ownership,vehicles,hh_size,year,month,day,pid,sex,age,disability,education,can_wfh,industry,race,has_licence,relationship,employment,country,source
i64,f64,i32,str,str,i64,i64,i32,i8,str,i64,str,i64,bool,str,str,str,str,str,str,str,str,str
20000083,155.2391,78025,"""flat""","""rent""",1,1,2017,10,"""wednesday""",200000831,"""male""",45,False,"""undergraduate""","""yes""","""sales""","""white""","""yes""","""self""","""employed""","""usa""","""cmap"""
20000136,319.6422,140560,"""flat""","""rent""",1,3,2017,10,"""monday""",200001361,"""female""",33,False,"""graduate""","""no""","""technical""","""white""","""yes""","""self""","""employed""","""usa""","""cmap"""
20000136,319.6422,140560,"""flat""","""rent""",1,3,2017,10,"""monday""",200001362,"""male""",34,False,"""graduate""","""no""","""professional""","""white""","""yes""","""partner""","""employed""","""usa""","""cmap"""
20000136,319.6422,140560,"""flat""","""rent""",1,3,2017,10,"""monday""",200001363,"""male""",1,False,"""none""","""n/a""","""n/a""","""white""","""n/a""","""child""","""n/a""","""usa""","""cmap"""
20000228,540.4961,192274,"""house""","""own""",1,1,2017,10,"""sunday""",200002281,"""female""",50,False,"""undergraduate""","""no""","""technical""","""multiracial""","""yes""","""self""","""employed""","""usa""","""cmap"""


In [10]:
# Preview the filtered trips table.
trips.head()

hid,pid,seq,dact,mode,tet,tst,distance,oact,year,month,ozone,dzone
i64,i64,i64,str,str,i32,i32,f64,str,i32,i8,str,str
20000083,200000831,1,"""work""","""car""",446,428,4.977,"""home""",2017,10,"""urban""","""urban"""
20000083,200000831,2,"""medical""","""car""",724,713,4.949,"""work""",2017,10,"""urban""","""urban"""
20000083,200000831,3,"""shop""","""car""",734,732,0.71,"""medical""",2017,10,"""urban""","""urban"""
20000083,200000831,4,"""other""","""car""",748,747,1.51,"""shop""",2017,10,"""urban""","""urban"""
20000083,200000831,5,"""work""","""car""",759,751,3.315,"""other""",2017,10,"""urban""","""urban"""


In [11]:
# Compare the person ids (`pid`) present in attributes against those in trips.
# A person with attributes but no trips is assumed to have stayed at home.
# NOTE(review): the empty set() shown below presumably means no mismatched
# pids were found — confirm against check_overlap's return contract.
check_overlap(attributes, trips, on="pid")



set()

In [12]:
# Run the project's column verification on both tables (returned True here).
verify.columns(attributes, trips)

True

# NHTS

In [13]:
# NHTS: dictionary files (YAML) handed to the NHTS loaders below.
# These rebind the names used for CMAP above, so cells must be run in order.
hh_config = configs_root / "nhts" / "hh_dictionary.yaml"
person_config = configs_root / "nhts" / "person_dictionary.yaml"
trip_config = configs_root / "nhts" / "trip_dictionary.yaml"

# Location of the raw NHTS survey files. Built from the current user's home
# directory instead of a hard-coded "/home/fred" so the notebook is not tied
# to one machine; for the original user this resolves to the same path.
data_root = Path.home() / "Data" / "foundata" / "NHTS"

In [14]:
# Load NHTS households; the loader logs one source file per survey year
# (2022, 2017, 2009 and 2001 in the output below).
# Unlike the CMAP `hhs` frame, this result is indexed by survey year
# (presumably a dict keyed by year — confirm), so preview a single year.
hhs = nhts.load_households(data_root, hh_config)
hhs[2001].head()

Loading households for 2022 from hhv2pub.csv...
Loading households for 2017 from hhpub.csv...
Loading households for 2009 from HHV2PUB.CSV...
Loading households for 2001 from HHPUB.csv...


hh_size,ownership,hid,num_workers,num_vehicles,hh_income,day,year,month,race
i64,str,i64,i64,i64,i32,str,i32,i32,str
2,"""owned""",2001010000018,2,3,166809,"""sunday""",2001,5,"""white"""
1,"""rented""",2001010000045,1,1,22996,"""tuesday""",2001,4,"""white"""
1,"""owned""",2001010000474,0,2,0,"""monday""",2001,4,"""white"""
2,"""owned""",2001010000577,2,3,113224,"""wednesday""",2001,12,"""white"""
2,"""owned""",2001010000652,0,1,25564,"""friday""",2001,4,"""white"""
