In [None]:
import json
from pathlib import Path

import polars as pl
import yaml
from pam import read

In [13]:
cmap_root = Path("~/Data/foundata/cmap")

hhs = (
    pl.read_csv(cmap_root / "household.csv", ignore_errors=True)
    .fill_nan(-9)
    .fill_null(-9)
)
persons = (
    pl.read_csv(cmap_root / "person.csv", ignore_errors=True).fill_nan(-9).fill_null(-9)
)
trips = (
    pl.read_csv(cmap_root / "place.csv", ignore_errors=True)
    .fill_nan(-9)
    .fill_null(-9)
)
vehicles = (
    pl.read_csv(cmap_root / "vehicle.csv", ignore_errors=True)
    .fill_nan(-9)
    .fill_null(-9)
)

In [3]:
hhs.tail()

sampno,travdate,travday,hhsize,hhveh,transprob,transprob_o,hlive,restylast,restylast_o,restylast_zip,hhinc2,homeown,homeown_o,futuresurvey,incendonate,resty,resty_o,hhinc,download,source,source_o,hhtrips,phase,wthhfin
i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64
70100991,"""2018-10-04""",5,3,3,1,-1,2,2,-1,60827,-1,3,-1,1,0,1,-1,2,1,2,-1,9,0,647.0892
70100992,"""2018-09-25""",3,3,3,1,-1,34,-1,-1,-1,-1,2,-1,1,0,2,-1,8,2,97,-9,4,0,305.085
70100993,"""2018-09-10""",2,2,3,3,-1,25,-1,-1,-1,-1,1,-1,1,-7,1,-1,7,2,7,-1,8,0,615.7184
70100998,"""2019-04-23""",3,1,0,97,-9,2,3,-1,60624,-1,3,-1,2,0,3,-1,2,2,97,-9,6,0,74.5194
70100999,"""2019-04-24""",4,1,1,3,-1,35,-1,-1,-1,-1,3,-1,1,0,1,-1,1,2,97,-9,5,0,313.6208


In [4]:
# extract year and month
hhs = hhs.with_columns(
    pl.col("travdate").str.split("-").list.get(0).str.to_integer().alias("year"),
    pl.col("travdate").str.split("-").list.get(1).str.to_integer().alias("month"),
)

# map
with open("cmap/hh_dictionary.json") as f:
    hh_mapper = json.load(f)
hh_mapper = {c: {int(k): v for k, v in d.items()} for c, d in hh_mapper.items()}

hhs = hhs.with_columns(
    pl.col(col).replace_strict(mapping, default=None).fill_null(pl.col(col))
    for col, mapping in hh_mapper.items()
)

# check for numerics
string_cols = [
    col for col in hhs.columns
    if hhs[col].dtype == pl.String
]
can_integer_cols = [
    col for col in string_cols
    if hhs[col].str.to_integer(strict=False).null_count() == 0
]
can_float_cols = [
    col for col in string_cols
    if hhs[col].cast(pl.Float64, strict=False).null_count() == 0
]

hhs = hhs.with_columns([
    pl.col(can_integer_cols).cast(pl.Int64)
])

hhs


sampno,travdate,travday,hhsize,hhveh,transprob,transprob_o,hlive,restylast,restylast_o,restylast_zip,hhinc2,homeown,homeown_o,futuresurvey,incendonate,resty,resty_o,hhinc,download,source,source_o,hhtrips,phase,wthhfin,year,month
i64,str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,f64,i64,i64
20000083,"""2017-10-11""","""Wednesday""",1,1,"""Not ascertained""","""Appropriate skip""","""2""","""An apartment or condo,""","""Appropriate skip""","""60194""","""Appropriate skip""","""Rent""","""Appropriate skip""","""Yes""","""Not ascertained""","""An apartment or condo,""","""Appropriate skip""","""$60,000 to $74,999""","""Receive in the mail""","""Appropriate skip""","""Appropriate skip""",9,"""Pilot 1""",155.2391,2017,10
20000136,"""2017-10-23""","""Monday""",3,1,"""Not ascertained""","""Appropriate skip""","""1""","""Single-family attached house (…","""Appropriate skip""","""2116""","""Appropriate skip""","""Rent""","""Appropriate skip""","""Yes""","""Not ascertained""","""An apartment or condo,""","""Appropriate skip""","""$100,000 to $149,999""","""Receive in the mail""","""Appropriate skip""","""Appropriate skip""",4,"""Pilot 1""",319.6422,2017,10
20000228,"""2017-10-01""","""Monday""",1,1,"""Not ascertained""","""Appropriate skip""","""10""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Own""","""Appropriate skip""","""No""","""Not ascertained""","""Single-family detached house,""","""Appropriate skip""","""$150,000 or more""","""Receive in the mail""","""Appropriate skip""","""Appropriate skip""",9,"""Pilot 1""",540.4961,2017,10
20000248,"""2017-09-21""","""Thursday""",3,1,"""Not ascertained""","""Appropriate skip""","""26""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Occupied without payment of re…","""Appropriate skip""","""Yes""","""Not ascertained""","""An apartment or condo,""","""Appropriate skip""","""Less than $15,000""","""Receive in the mail""","""Appropriate skip""","""Appropriate skip""",4,"""Pilot 1""",662.6029,2017,9
20000300,"""2017-09-13""","""Wednesday""",1,0,"""Not ascertained""","""Appropriate skip""","""8""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Rent""","""Appropriate skip""","""Yes""","""Not ascertained""","""An apartment or condo,""","""Appropriate skip""","""$50,000 to $59,999""","""Print""","""Appropriate skip""","""Appropriate skip""",2,"""Pilot 1""",68.701,2017,9
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
70100991,"""2018-10-04""","""Thursday""",3,3,"""Too much traffic""","""Appropriate skip""","""2""","""Single-family attached house (…","""Appropriate skip""","""60827""","""Appropriate skip""","""Rent""","""Appropriate skip""","""Yes""","""Send the gift to my household""","""Single-family detached house,""","""Appropriate skip""","""$15,000 to $24,999""","""Print""","""Facebook""","""Appropriate skip""",9,"""Main""",647.0892,2018,10
70100992,"""2018-09-25""","""Tuesday""",3,3,"""Too much traffic""","""Appropriate skip""","""34""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Own with mortgage""","""Appropriate skip""","""Yes""","""Send the gift to my household""","""Single-family attached house (…","""Appropriate skip""","""$75,000 to $99,999""","""Receive in the mail""","""Other""","""Not ascertained""",4,"""Main""",305.085,2018,9
70100993,"""2018-09-10""","""Monday""",2,3,"""Roads and bridges are in poor …","""Appropriate skip""","""25""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Own without mortgage""","""Appropriate skip""","""Yes""","""I prefer not to answer""","""Single-family detached house,""","""Appropriate skip""","""$60,000 to $74,999""","""Receive in the mail""","""Partner emails""","""Appropriate skip""",8,"""Main""",615.7184,2018,9
70100998,"""2019-04-23""","""Tuesday""",1,0,"""Other:""","""-9""","""2""","""An apartment or condo,""","""Appropriate skip""","""60624""","""Appropriate skip""","""Rent""","""Appropriate skip""","""No""","""Send the gift to my household""","""An apartment or condo,""","""Appropriate skip""","""$15,000 to $24,999""","""Receive in the mail""","""Other""","""Not ascertained""",6,"""Main""",74.5194,2019,4


In [5]:
# map
with open("cmap/person_dictionary.json") as f:
    person_mapper = json.load(f)
person_mapper = {c: {int(k): v for k, v in d.items()} for c, d in person_mapper.items()}

persons = persons.with_columns(
    pl.col(col).replace(mapping, default=None).fill_null(pl.col(col))
    for col, mapping in person_mapper.items()
)

# check for numerics
string_cols = [
    col for col in persons.columns
    if persons[col].dtype == pl.String
]
can_integer_cols = [
    col for col in string_cols
    if persons[col].str.to_integer(strict=False).null_count() == 0
]
can_float_cols = [
    col for col in string_cols
    if persons[col].cast(pl.Float64, strict=False).null_count() == 0
]

persons = persons.with_columns([
    pl.col(can_integer_cols).cast(pl.Int64)
])

persons


(Deprecated in version 1.0.0)
  pl.col(col).replace(mapping, default=None).fill_null(pl.col(col))


sampno,perno,date_completed,retmode_final,retmode,proxy,proxyperno,age,aage,age18,sex,relate,relate_o,lic,hisp,hisp_o,race,race_o,smrtphn,emply_ask,jobs,wkstat,wkstat_o,volun_freq,wplace,wmode,wmode_o,wparkride,pervh,wrkhrs,wtrav,occup,occup_o,indus,indus_o,emply_transit,emply_transit_o,…,nogowhy2_4,nogowhy2_5,nogowhy2_rf,nogowhy2_dk,traveldatause,traveldatause_pilot,traveldatadevice,traveldatadevice_1,traveldatadevice_2,traveldatadevice_3,traveldatadevice_se,traveldatadevice_rf,traveldatadevice_dk,traveldatadevice_o,traveldatamode,traveldatamode_1,traveldatamode_2,traveldatamode_3,traveldatamode_4,traveldatamode_5,traveldatamode_se,traveldatamode_rf,traveldatamode_dk,traveldatamode_o,dtype,dtype_1,dtype_2,dtype_3,dtype_4,dtype_5,dtype_6,dtype_se,dtype_rf,dtype_dk,qc_trip_person,pertrips,wtperfin
i64,i64,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,f64
20000083,1,"""2017-10-12""","""WEB""","""Smartphone App & Web""","""No""",1,"""45""","""Appropriate skip""","""Appropriate skip""","""Male""","""Self""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""White ""","""Appropriate skip""","""Yes""","""Yes""","""1""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Same work place every day""","""Auto / van / truck (as the dri…","""Appropriate skip""","""Appropriate skip""","""No""","""42""","""5 days a week""","""Business and Financial Operati…","""Appropriate skip""","""44-45""","""Appropriate skip""","""Something else""","""UNKNOWN SUBSIDY PERCENT""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Rarely""","""2;3""","""Appropriate skip""","""Smartphones""","""Navigation devices""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""1;2;3""","""Driving""","""Public Transportation - Bus Sc…","""Public Transportation - Train …","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",1,"""9""",155.2391
20000136,1,"""2017-10-24""","""WEB""","""Smartphone App & Web""","""No""",1,"""33""","""Appropriate skip""","""Appropriate skip""","""Female""","""Self""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""White ""","""Appropriate skip""","""Yes""","""Yes""","""1""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Same work place every day""","""Auto / van / truck (as the dri…","""Appropriate skip""","""Appropriate skip""","""No""","""20""","""2 days a week""","""Healthcare Practitioners and T…","""Appropriate skip""","""Professional, Scientific, and …","""Appropriate skip""","""No subsidy offered""","""Appropriate skip""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Daily""","""Smartphones""","""Appropriate skip""","""Smartphones""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""1;2""","""Driving""","""Public Transportation - Bus Sc…","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",0,"""2""",319.6422
20000136,2,"""2017-10-24""","""WEB""","""Web Only""","""No""",2,"""34""","""Appropriate skip""","""Appropriate skip""","""Male""","""Spouse/Unmarried partner""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""White ""","""Appropriate skip""","""No""","""Yes""","""1""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Same work place every day""","""Walk""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""100""","""7 days a week""","""Healthcare Practitioners and T…","""Appropriate skip""","""Health Care and Social Assista…","""Appropriate skip""","""I don't know""","""Appropriate skip""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Weekly""","""Smartphones""","""Appropriate skip""","""Smartphones""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""1;2""","""Driving""","""Public Transportation - Bus Sc…","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",0,"""2""",319.6422
20000136,3,"""2017-10-24""","""-1""","""-1""","""Yes""",1,"""1""","""Appropriate skip""","""Appropriate skip""","""Male""","""Son/Daughter""","""Appropriate skip""","""Appropriate skip""","""No""","""Appropriate skip""","""White ""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""-1""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",0,"""Appropriate skip""",319.6422
20000228,1,"""2017-10-02""","""WEB""","""Smartphone App & Web""","""No""",1,"""50""","""Appropriate skip""","""Appropriate skip""","""Female""","""Self""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""Multiracial""","""Appropriate skip""","""Yes""","""Yes""","""1""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Work from home""","""[$I_DO] not travel to work""","""Appropriate skip""","""Appropriate skip""","""No""","""35""","""Appropriate skip""","""Business and Financial Operati…","""Appropriate skip""","""Professional, Scientific, and …","""Appropriate skip""","""No subsidy offered""","""Appropriate skip""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Rarely""","""Smartphones""","""Appropriate skip""","""Smartphones""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Public Transportation - Bus Sc…","""Appropriate skip""","""Public Transportation - Bus Sc…","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",1,"""9""",540.4961
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
70100992,3,"""2018-09-27""","""WEB""","""Web Only""","""No""",3,"""26""","""Appropriate skip""","""Appropriate skip""","""Male""","""Son/Daughter""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""White ""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""Unemployed but looking for wor…","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",…,"""Appropriate skip""","""I did none of these""","""Appropriate skip""","""Appropriate skip""","""No""","""-1""","""1;2;97""","""Message signs""","""Smartphones""","""Appropriate skip""","""Other""","""Appropriate skip""","""Appropriate skip""","""Radio""","""None""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""None""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",0,"""0""",305.085
70100993,1,"""2018-09-11""","""WEB""","""Web Only""","""No""",1,"""66""","""Appropriate skip""","""Appropriate skip""","""Male""","""Self""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""African American, Black""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""Retired ""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Yes""","""-1""","""Other""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Other""","""Appropriate skip""","""Appropriate skip""","""radio""","""Driving""","""Driving""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",0,"""6""",615.7184
70100993,2,"""2018-09-11""","""WEB""","""Web Only""","""Yes""",1,"""53""","""Appropriate skip""","""Appropriate skip""","""Female""","""Spouse/Unmarried partner""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""African American, Black""","""Appropriate skip""","""Yes""","""Yes""","""1""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Same work place every day""","""Auto / van / truck (as the dri…","""Appropriate skip""","""Appropriate skip""","""No""","""42""","""5 days a week""","""Community and Social Service O…","""Appropriate skip""","""Health Care and Social Assista…","""Appropriate skip""","""No subsidy offered""","""Appropriate skip""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Yes""","""-1""","""1;2""","""Message signs""","""Smartphones""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Driving""","""Driving""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",0,"""2""",615.7184
70100998,1,"""2019-04-24""","""WEB""","""Web and CATI""","""No""",1,"""64""","""Appropriate skip""","""Appropriate skip""","""Female""","""Self""","""Appropriate skip""","""No""","""No""","""Appropriate skip""","""African American, Black""","""Appropriate skip""","""Yes""","""No""","""Appropriate skip""","""Disabled non-worker""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",…,"""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""No""","""-1""","""I don't know""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""I don't know""","""Not ascertained""","""2;3""","""Appropriate skip""","""Public Transportation - Bus Sc…","""Public Transportation - Train …","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Not ascertained""","""Cane or Walker""","""Appropriate skip""","""Appropriate skip""","""Cane or Walker""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""","""Appropriate skip""",1,"""6""",74.5194


In [15]:
print(trips.columns)
trips.head(10)

['sampno', 'perno', 'traveldayno', 'placeno', 'locno', 'arrtime', 'deptime', 'travtime', 'actdur', 'distance', 'mode', 'mode_o', 'vehno', 'cravl', 'fare', 'payf', 'payf_o', 'paypk', 'pkamt', 'pkbas', 'pkbas_o', 'prkty', 'prkty_o', 'rand_a', 'tpurp', 'tpurp_o', 'tpurp2', 'tpurp2_o', 'tpurp_check', 'trip_appt', 'trip_appt_why', 'trip_appt_why_1', 'trip_appt_why_2', 'trip_appt_why_3', 'trip_appt_why_4', 'trip_appt_why_5', 'trip_appt_why_6', 'trip_appt_why_se', 'trip_appt_why_rf', 'trip_appt_why_dk', 'trip_appt_why_o', 'trip_appt_why2', 'trip_appt_why2_1', 'trip_appt_why2_2', 'trip_appt_why2_3', 'trip_appt_why2_4', 'trip_appt_why2_5', 'trip_appt_why2_se', 'trip_appt_why2_rf', 'trip_appt_why2_dk', 'trip_appt_why2_o', 'perno_1', 'perno_2', 'perno_3', 'perno_4', 'perno_5', 'perno_6', 'perno_7', 'perno_8', 'perno_9', 'perno_10', 'perno_11', 'perno_12', 'companions', 'hhcount', 'nonhhcount', 'hhparty', 'party', 'transfer_count', 'qc_day', 'qc_transit', 'plaza_total', 'tollways_paid', 'tollway_1

sampno,perno,traveldayno,placeno,locno,arrtime,deptime,travtime,actdur,distance,mode,mode_o,vehno,cravl,fare,payf,payf_o,paypk,pkamt,pkbas,pkbas_o,prkty,prkty_o,rand_a,tpurp,tpurp_o,tpurp2,tpurp2_o,tpurp_check,trip_appt,trip_appt_why,trip_appt_why_1,trip_appt_why_2,trip_appt_why_3,trip_appt_why_4,trip_appt_why_5,trip_appt_why_6,…,transfer_count,qc_day,qc_transit,plaza_total,tollways_paid,tollway_1,tollway_2,tollway_3,tollway_4,tollway_5,tollway_6,plazas_paid,plaza_1,plaza_2,plaza_3,plaza_4,plaza_5,plaza_6,plaza_total_ipass,plaza_1_ipass,plaza_2_ipass,plaza_3_ipass,plaza_4_ipass,plaza_5_ipass,plaza_6_ipass,plaza_total_cash,plaza_1_cash,plaza_2_cash,plaza_3_cash,plaza_4_cash,plaza_5_cash,plaza_6_cash,placeGroup,transitPlaceno,hdist,Loop_Trip_definition,time_distance_flag
i64,i64,i64,i64,i64,str,str,i64,i64,f64,i64,str,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64
20000083,1,1,1,10000,"""2017-10-11 03:00:00""","""2017-10-11 07:08:00""",0,248,-1.0,-1,"""-1""",-1,-1,-1.0,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,-9,-1.0,0,0
20000083,1,1,2,10002,"""2017-10-11 07:26:00""","""2017-10-11 11:53:00""",18,267,4.977,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,3,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,-9,4.193,0,0
20000083,1,1,3,10000,"""2017-10-11 12:04:28""","""2017-10-11 12:12:30""",11,9,4.949,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,12,-1,17,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,3,-9,4.193,0,0
20000083,1,1,4,10406,"""2017-10-11 12:14:52""","""2017-10-11 12:27:06""",2,12,0.71,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,8,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,4,-9,0.506,0,0
20000083,1,1,5,2000060,"""2017-10-11 12:28:28""","""2017-10-11 12:31:00""",1,3,1.51,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,10,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,5,-9,1.114,0,0
20000083,1,1,6,10002,"""2017-10-11 12:39:00""","""2017-10-11 16:06:00""",8,207,3.315,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,3,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,6,-9,2.675,0,0
20000083,1,1,7,2000059,"""2017-10-11 16:53:00""","""2017-10-11 18:32:10""",47,99,17.437,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,15,-1,15,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,7,-9,11.234,0,0
20000083,1,1,8,2000062,"""2017-10-11 18:40:26""","""2017-10-11 19:09:22""",8,29,0.411,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,9,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,8,-9,0.341,0,302
20000083,1,1,9,2000064,"""2017-10-11 19:18:47""","""2017-10-11 19:23:44""",10,5,3.283,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,11,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,9,-9,1.617,0,0
20000083,1,1,10,10000,"""2017-10-11 19:40:00""","""2017-10-12 03:00:00""",16,440,9.098,202,"""-1""",1,-1,-1.0,-1,-1,-1,-1,-1,-1,1,-1,-9,1,-1,0,-1,-1,2,-1,-1,-1,-1,-1,-1,-1,…,-1,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,-9,6.711,0,0


In [61]:
trips = (
    pl.read_csv(cmap_root / "place.csv", ignore_errors=True)
    .fill_nan(-9)
    .fill_null(-9)
)

# select columns
column_mapping = {
    "sampno": "hid",
    "perno": "pid",
    "locno": "dzone",
    "placeno": "seq",
    "placeGroup": "seq2",
    "tpurp": "dact",
    "mode": "mode",
    "arrtime": "tet",
    "deptime": "tst",
    "distance": "distance",
}

trips = trips.select(column_mapping.keys()).rename(column_mapping)

# shift values from prev act
trips = trips.with_columns(
    tst = pl.col("tst").shift(1).over(["hid", "pid"]),
    oact = pl.col("dact").shift(1).over(["hid", "pid"]),
    ozone = pl.col("dzone").shift(1).over(["hid", "pid"]),
    seq = pl.col("seq") - 1
).filter(pl.col("seq") != 0)  # remove first act

# squash multi stage trips using seq2
trips = trips.with_columns(
    mode = pl.col("mode").first().over(["hid", "pid", "seq2"]),
    tst = pl.col("tst").first().over(["hid", "pid", "seq2"]),
    tet = pl.col("tet").last().over(["hid", "pid", "seq2"]),
    oact = pl.col("oact").first().over(["hid", "pid", "seq2"]),
    dact = pl.col("dact").last().over(["hid", "pid", "seq2"]),
    ozone = pl.col("ozone").first().over(["hid", "pid", "seq2"]),
    dzone = pl.col("dzone").last().over(["hid", "pid", "seq2"]),
    distance = pl.col("distance").sum().over(["hid", "pid", "seq2"]),
).unique(subset=["hid", "pid", "seq2"], keep="first", maintain_order=True)

# datetime
trips = trips.with_columns(
    tst = pl.col("tst").str.to_datetime("%Y-%m-%d %H:%M:%S"),
    tet = pl.col("tet").str.to_datetime("%Y-%m-%d %H:%M:%S"),
)

# add year and dow
day_map = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}
trips = trips.with_columns(
    year = pl.col("tst").dt.year(),
    day = pl.col("tst").dt.weekday().replace_strict(day_map),
)

# convert times to minutes since midnight (let pandas deal with overlap)
trips = trips.with_columns(
    tst = (pl.col("tst").dt.hour().cast(pl.Int32) * 60 + pl.col("tst").dt.minute()),
    tet = (pl.col("tet").dt.hour().cast(pl.Int32) * 60 + pl.col("tet").dt.minute())
)

# map
with open("cmap/trip_dictionary.yaml") as f:
    trip_mapper = yaml.safe_load(f)
mode_mapping = trip_mapper["mode"]
act_mapping = trip_mapper["purpose"]

trips = trips.with_columns(
    pid = (pl.col("hid").cast(pl.String) + pl.col("pid").cast(pl.String)).cast(pl.Int64),
    # map
    mode = pl.col("mode").replace_strict(mode_mapping),
    oact = pl.col("oact").replace_strict(act_mapping),
    dact = pl.col("dact").replace_strict(act_mapping),
)

trips

hid,pid,dzone,seq,seq2,dact,mode,tet,tst,distance,oact,ozone,year,day
i64,i64,i64,i64,i64,str,str,i32,i32,f64,str,i64,i32,str
20000083,200000831,10002,1,2,"""work""","""car""",446,428,4.977,"""home""",10000,2017,"""Thursday"""
20000083,200000831,10000,2,3,"""medical""","""car""",724,713,4.949,"""work""",10002,2017,"""Thursday"""
20000083,200000831,10406,3,4,"""shop""","""car""",734,732,0.71,"""medical""",10000,2017,"""Thursday"""
20000083,200000831,2000060,4,5,"""other""","""car""",748,747,1.51,"""shop""",10406,2017,"""Thursday"""
20000083,200000831,10002,5,6,"""work""","""car""",759,751,3.315,"""other""",2000060,2017,"""Thursday"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
70100999,701009991,10000,1,2,"""home""","""car""",225,181,25.823,"""work""",1000000,2019,"""Thursday"""
70100999,701009991,1000001,2,3,"""leisure""","""car""",790,770,7.753,"""home""",10000,2019,"""Thursday"""
70100999,701009991,10000,3,4,"""shop""","""car""",961,930,7.651,"""leisure""",1000001,2019,"""Thursday"""
70100999,701009991,1000000,4,5,"""work""","""car""",1109,1039,26.098,"""shop""",10000,2019,"""Thursday"""


In [62]:
pam_trips = read.load_travel_diary(
    trips=trips.to_pandas()
)

Using from-to activity parser using 'oact' and 'dact' columns

        Unable to load household area ('hzone') - not found in trips_diary or unable to build from attributes.
        Pam will try to infer home location from activities, but this behaviour is not recommended.
        
Using freq of 'None' for all trips.
 Person pid:200002281 hid:20000228 plan does not start with 'home' activity: other
 Person pid:200007524 hid:20000752 plan does not start with 'home' activity: education
 Person pid:200018891 hid:20001889 plan does not start with 'home' activity: other
 Person pid:200019651 hid:20001965 plan does not start with 'home' activity: shop
 Person pid:200023683 hid:20002368 plan does not start with 'home' activity: work
 Person pid:200025351 hid:20002535 plan does not start with 'home' activity: leisure
 Person pid:200032641 hid:20003264 plan does not start with 'home' activity: work
 Person pid:200041801 hid:20004180 plan does not start with 'home' activity: shop
 Person pid:200

In [None]:
pam_trips.random_person().print()

Person: 700044231
{}
0:	Activity(act:work, location:10000, time:00:00:00 --> 03:00:00, duration:3:00:00)
1:	Leg(mode:bus, area:10000 --> 10002, time:03:00:00 --> 09:00:00, duration:6:00:00)
2:	Activity(act:work, location:10002, time:09:00:00 --> 22:00:00, duration:13:00:00)
3:	Leg(mode:rail, area:10002 --> 10000, time:22:00:00 --> 22:25:00, duration:0:25:00)
4:	Activity(act:home, location:10000, time:22:25:00 --> 00:00:00, duration:1:35:00)


In [70]:
hb = [person.home_based for _, _, person in pam_trips.people()]
print(len(hb))
sum(hb) /  len(hb)

25007


0.8622385731995041

In [None]:


# map
with open("cmap/trip_dictionary.yaml") as f:
    trip_mapper = yaml.safe_load(f)
trip_mapper = {c: {int(k): v for k, v in d.items()} for c, d in trip_mapper.items()}

trips = trips.with_columns(
    pl.col(col).replace_strict(mapping, default=None).fill_null(pl.col(col))
    for col, mapping in trip_mapper.items()
)

# check for numerics
string_cols = [
    col for col in trips.columns
    if trips[col].dtype == pl.String
]
can_integer_cols = [
    col for col in string_cols
    if trips[col].str.to_integer(strict=False).null_count() == 0
]
can_float_cols = [
    col for col in string_cols
    if trips[col].cast(pl.Float64, strict=False).null_count() == 0
]

trips = trips.with_columns([
    pl.col(can_integer_cols).cast(pl.Int64)
])

trips

ColumnNotFoundError: unable to find column "purpose"; valid columns: ["sampno", "perno", "dayno", "placeno", "locno", "arrtime", "deptime", "travtime", "actdur", "distance", "mode", "mode_o", "mode_imputed", "vehno", "tpurp", "tpurp_o", "tpurp2", "tpurp2_o", "tpurp_imputed", "cravl", "paypk", "pkamt", "pkbas", "pkbas_o", "prkty", "prkty_o", "payf", "payf_o", "fare", "trip_appt", "trip_appt_why", "trip_appt_why_1", "trip_appt_why_2", "trip_appt_why_3", "trip_appt_why_4", "trip_appt_why_5", "trip_appt_why_6", "trip_appt_why_se", "trip_appt_why_rf", "trip_appt_why_dk", "trip_appt_why_o", "trip_appt_why2", "trip_appt_why2_1", "trip_appt_why2_2", "trip_appt_why2_3", "trip_appt_why2_4", "trip_appt_why2_5", "trip_appt_why2_se", "trip_appt_why2_rf", "trip_appt_why2_dk", "trip_appt_why2_o", "initiated_by", "perno_1", "perno_2", "perno_3", "perno_4", "perno_5", "perno_6", "perno_7", "perno_8", "perno_9", "perno_10", "perno_11", "perno_12", "companions", "hhcount", "nonhhcount", "hhparty", "party"]

In [8]:
len(trips)

98034

In [9]:
trips["mode"].value_counts()

mode,count
i64,u32
502,7
702,2
505,2811
203,3687
503,10
…,…
202,25701
703,1
201,11
601,111


In [None]:
# import json

# maps = pl.read_csv(cmap_root / "data_dictionary.csv")

# for table, name in zip(
#     ["HOUSEHOLD", "PERSON", "GPS_DAY", "VEHICLE"],
#     ["hh", "person", "trip", "veh"]
# ):
#     data = maps.filter(pl.col("TABLE") == table)
#     mapper = {}
#     for i, frame in data.group_by("NAME"):
#         column = str(i[0]).lower()
#         mapper[column] = {}
#         keys = frame.select(pl.col("VALUE")).rows()
#         values = frame.select(pl.col("LABEL")).rows()
#         for k, v in zip(keys, values):
#             mapper[column][int(k[0])] = str(v[0])

#     with open(f"cmap/{name}_dictionary.json", "w", encoding="utf-8") as f:
#         json.dump(mapper, f, ensure_ascii=False, indent=4)