In [40]:
from pathlib import Path
import pandas as pd

from pam.read.matsim import load_attributes_map_from_v12, stream_matsim_persons
from pam.utils import datetime_to_matsim_time
from pam.core import Population, Person
from pam.activity import Activity, Plan, Leg

In [59]:
# Root folder of the run being processed.
# NOTE(review): the name `dir` shadows Python's builtin `dir()`; it is left
# unchanged here in case other cells refer to it.
dir = Path("C:/Users/fred/Data/2019_baseline_re_run_20221209_low_cost_bus_walk")

# output paths -- everything is written below <dir>/processed
output_dir = dir.joinpath("processed")
output_dir.mkdir(exist_ok=True)  # no-op when the folder is already there
lhs_path = output_dir.joinpath("lhs.csv")
rhs_path = output_dir.joinpath("rhs.csv")
combined_path = output_dir.joinpath("combined.csv")
attributes_path = output_dir.joinpath("attributes.csv")

# input paths -- each experienced-plans file sits inside a folder that
# carries the same name as the file itself
input_plans = dir.joinpath("input_plans.xml")
iter_50 = dir.joinpath(
    "output_experienced_plans_ITER50.xml",
    "output_experienced_plans_ITER50.xml",
)
iter_100 = dir.joinpath(
    "output_experienced_plans_ITER100.xml",
    "output_experienced_plans_ITER100.xml",
)
iter_150 = dir.joinpath(
    "output_experienced_plans_ITER150.xml",
    "output_experienced_plans_ITER150.xml",
)
# The ITER200 names put the ".xml" before the "_ITER200" suffix, unlike the
# three entries above.
iter_200 = dir.joinpath(
    "output_experienced_plans.xml_ITER200",
    "output_experienced_plans.xml_ITER200",
)
# Shown as the cell result: whether the last input file can be found.
iter_200.exists()


True

In [42]:
# Keyword options shared by every plans reader below. Keeping them in one
# place guarantees that all five streams are read with identical settings.
stream_options = dict(
    simplify_pt_trips=True,
    # crop=True,
    keep_non_selected=True,
    leg_attributes=True,
    leg_route=True,
)

# streamer0 reads the input plans and is deliberately left out of
# `streamers`.
streamer0 = stream_matsim_persons(input_plans, **stream_options)

# One stream per experienced-plans file, in increasing iteration order.
streamer1 = stream_matsim_persons(iter_50, **stream_options)
streamer2 = stream_matsim_persons(iter_100, **stream_options)
streamer3 = stream_matsim_persons(iter_150, **stream_options)
streamer4 = stream_matsim_persons(iter_200, **stream_options)
streamers = [streamer1, streamer2, streamer3, streamer4]


In [43]:
def dt_to_min(dt) -> int:
    """Return `dt` expressed in whole minutes.

    `dt` is first formatted by `datetime_to_matsim_time`; the resulting string
    is split on ":" into exactly three fields (hours, minutes, seconds). The
    seconds field is discarded, so the result is truncated to the minute.
    """
    hours, minutes, _seconds = datetime_to_matsim_time(dt).split(":")
    return 60 * int(hours) + int(minutes)


def person_to_schedule(person: Person) -> tuple:
    """Flatten a person's plan into a `(pid, (score, [record]))` tuple.

    `record` is a list with one row per plan component, laid out as
    `[act, start_minutes, end_minutes, mode, distance]`. Components that are
    `Leg` instances contribute their own mode and euclidean distance; every
    other component gets the placeholders "NA" and 0. A plan score of `None`
    is reported as 0. The record is wrapped in a one-element list.
    """
    plan_score = person.plan.score
    if plan_score is None:
        plan_score = 0

    def as_row(component) -> list:
        # Only legs carry a travel mode and a distance.
        if isinstance(component, Leg):
            distance, mode = component.euclidean_distance, component.mode
        else:
            distance, mode = 0, "NA"
        return [
            component.act,
            dt_to_min(component.start_time),
            dt_to_min(component.end_time),
            mode,
            distance,
        ]

    rows = [as_row(component) for component in person.plan]
    return (person.pid, (plan_score, [rows]))


def pam_to_schedules(population: Population) -> dict:
    """Map each person's pid to the `(score, [record])` entry of their plan.

    `population` is only iterated over, one person at a time. If the same pid
    is yielded more than once, the last occurrence wins.
    """
    schedules = {}
    for person in population:
        pid, entry = person_to_schedule(person)
        schedules[pid] = entry
    return schedules


def add_pid(record, pid):
    """Return a copy of `record` in which every row starts with `pid`.

    `record` is a list of list rows; neither it nor its rows are modified.
    """
    prefixed = []
    for line in record:
        prefixed.append([pid] + line)
    return prefixed

In [44]:
# Starting point: pid -> (score, [record]) built from the input plans.
best = pam_to_schedules(streamer0)

lhss = []  # rows of the previously stored schedule of each pair
rhss = []  # rows of the newly streamed, higher-scoring schedule of each pair
mapper = {}  # pair id -> original pid
i = 0  # pairs written so far; also the id given to the next pair
j = 0  # candidate pairs skipped because the two schedules differ in length

for stream_no, streamer in enumerate(streamers):
    print("stream: ", stream_no)
    for person in streamer:
        pid, (score, (record,)) = person_to_schedule(person)
        best_score, earlier_records = best[pid]
        if not score > best_score:
            # Only a strictly higher score produces pairs or updates `best`.
            continue

        # Pair the new schedule with every schedule stored so far for this pid.
        for earlier in earlier_records:
            if len(record) != len(earlier):
                # protection against cropped plans
                j += 1
                continue
            lhss.extend(add_pid(earlier, i))
            rhss.extend(add_pid(record, i))
            mapper[i] = pid
            i += 1

        # Raise the bar and add the improving schedule to this pid's history.
        best[pid] = (score, earlier_records + [record])

print("j: ", j)
print("i: ", i)

stream:  0
stream:  1
stream:  2
stream:  3
j:  0
i:  621856


In [45]:
# Both tables share one layout: the pair id followed by the five schedule
# fields of a row. The pair id becomes the index.
schedule_columns = ["pid", "act", "start", "end", "mode", "distance"]
lhs = pd.DataFrame(lhss, columns=schedule_columns).set_index("pid")
rhs = pd.DataFrame(rhss, columns=schedule_columns).set_index("pid")


In [46]:
# Attributes as loaded from the input plans file, looked up by original pid.
attributes_by_pid = load_attributes_map_from_v12(input_plans)

# Re-key by pair id so the table lines up with the ids used in lhs and rhs.
attributes_by_pair = {
    pair_id: attributes_by_pid[pid] for pair_id, pid in mapper.items()
}

# Columns removed from the exported table.
dropped_columns = [
    "hid",
    "hid_old",
    "hzone",
    "householdid",
    "individualid",
    "surveyyear",
]

# The dict of dicts yields one column per pair id; transpose to get one row
# per pair id instead.
attributes = pd.DataFrame(attributes_by_pair).T.drop(columns=dropped_columns)
attributes.index.name = "pid"
attributes.head()


Unnamed: 0_level_0,hcounty,gender,age,workstatus,hasLicence,hasCar,hasBike,car_avail,subpopulation,hhincome,sex,age_group,CarType,CarCO2
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Barnsley,male,82,inactive,no,True,False,never,car_avail_no_low,low,m,80 to 84,,
1,Barnsley,female,82,inactive,no,True,False,never,car_avail_no_low,low,f,80 to 84,,
2,Doncaster,male,82,inactive,no,True,False,never,car_avail_no_low,low,m,80 to 84,,
3,Doncaster,female,82,inactive,no,True,False,never,car_avail_no_low,low,f,80 to 84,,
4,Doncaster,male,82,inactive,no,True,False,never,car_avail_no_low,low,m,80 to 84,,


In [47]:
# The number of distinct ids in lhs and in rhs must both equal the number of
# rows in the attributes table.
n_lhs_ids = len(lhs.index.unique())
n_rhs_ids = len(rhs.index.unique())
assert n_lhs_ids == n_rhs_ids == len(attributes)
len(attributes)


621856

In [50]:
# Write each table to its CSV file inside the output folder.
for table, destination in (
    (lhs, lhs_path),
    (rhs, rhs_path),
    (attributes, attributes_path),
):
    table.to_csv(destination)


In [51]:
# Summary statistics of lhs, the side holding the previously stored schedule
# of each pair.
lhs.describe()

Unnamed: 0,start,end,distance
count,4130846.0,4130846.0,4130846.0
mean,699.6928,916.4698,4.920237
std,381.1194,327.2063,18.43641
min,0.0,0.0,0.0
25%,495.0,634.0,0.0
50%,750.0,918.0,0.0
75%,995.0,1140.0,3.412645
max,1914.0,1914.0,473.7815


In [52]:
# Summary statistics of rhs, the side holding the higher-scoring schedule of
# each pair; compare against the lhs summary.
rhs.describe()

Unnamed: 0,start,end,distance
count,4130846.0,4130846.0,4130846.0
mean,700.9377,917.7148,4.920237
std,382.2562,327.7073,18.43641
min,0.0,0.0,0.0
25%,493.0,635.0,0.0
50%,751.0,919.0,0.0
75%,999.0,1144.0,3.412645
max,1919.0,1919.0,473.7815


In [62]:
# Give the rhs columns a "target_" prefix so they can sit next to the lhs
# columns in one table.
target_names = {
    "act": "target_act",
    "start": "target_start",
    "end": "target_end",
    "mode": "target_mode",
    "distance": "target_distance",
}
rhs_as_target = rhs.rename(columns=target_names)

# Place the two sides next to each other, column-wise, on the pid index.
combined = pd.concat([lhs, rhs_as_target], axis=1)
combined.to_csv(combined_path)


In [63]:
# Largest distance value on each side of the combined table.
combined[["distance", "target_distance"]].max()

distance           473.78146
target_distance    473.78146
dtype: float64