# Aggregate matched trips to links

In [1]:
import pandas as pd
import geopandas as gpd
from pathlib import Path
import pickle
import sys
from collections import Counter

from bikewaysim.paths import config
from bikewaysim.map_matching import post_process

# Combine the map match results into a single file before anything is loaded below.
# NOTE(review): presumably this writes the match_dict_full_<idx>.pkl read in the next cell -- confirm in post_process.
post_process.combine_results()

0


In [2]:
# Network: edges and nodes are two layers of the same GeoPackage
network_gpkg = config['network_fp'] / 'final_network.gpkg'
links = gpd.read_file(network_gpkg, layer='edges')
nodes = gpd.read_file(network_gpkg, layer='nodes')

# Trips
# NOTE pickle files execute arbitrary code on load; only open files produced by this project
trips_fp = config['cycleatl_fp'] / 'trips_4.pkl'
with open(trips_fp, 'rb') as trips_fh:
    trips_df = pickle.load(trips_fh)

# Map match results
# NOTE match_settings_idx selects which set of match results is loaded
match_settings_idx = 0
matches_fp = config['matching_fp'] / f"match_dict_full_{match_settings_idx}.pkl"
with open(matches_fp, 'rb') as matches_fh:
    match_dict = pickle.load(matches_fh)

In [3]:
# NOTE cutoff = share of GPS points that must be matched for a trip to count as an acceptable match
cutoff = 0.90
above_threshold, below_threshold, failed_matches, match_ratios = post_process.mapmatch_results(match_dict, cutoff)

# keep only the trips whose match ratio cleared the cutoff
match_dict = {tripid: result for tripid, result in match_dict.items() if tripid in above_threshold}

# Per-trip summaries, all keyed by tripid:
#   matched_edges_dict -> [(linkid, reverse_flag), ...] taken from the rows of item['edges']
#   match_lines        -> mean 'length' of the trip's match lines, truncated to an int
#   match_ratios       -> share of points matched, rounded to one decimal
#                         (replaces the match_ratios returned by mapmatch_results above)
# NOTE(review): with cutoff = 0.90 the one-decimal rounding collapses every ratio to 0.9 or 1.0 -- confirm intended
matched_edges_dict, match_lines, match_ratios = {}, {}, {}
for tripid, item in match_dict.items():
    matched_edges_dict[tripid] = [(int(linkid), bool(reverse)) for linkid, reverse in item['edges'].values]
    match_lines[tripid] = int(item['match_lines']['length'].mean())
    match_ratios[tripid] = round(item['match_ratio'], 1)

# add match distance and match ratio to the trips dataframe (trips without an accepted match get NaN)
trips_df['match_dist'] = trips_df['tripid'].map(match_lines)
trips_df['match_ratio'] = trips_df['tripid'].map(match_ratios)

597 / 682 (88%) successful matches
81 / 682 (12%) partial matches
4 / 682 (1%) failed matches


# Aggregate matches to links

In [5]:
from collections import Counter, defaultdict
import pandas as pd
# Build, for every link, the list of trips that traversed it (forward, reverse, and either direction).
# Feed in a subset of matched_edges_dict to get counts for a specific query,
# e.g. the gender split on a link.

links_fwd = defaultdict(list)
links_rev = defaultdict(list)
links_undirected = defaultdict(list)
for tripid, traversed_links in matched_edges_dict.items():
    for raw_linkid, is_reverse in traversed_links:
        link_key = int(raw_linkid)
        links_undirected[link_key].append(tripid)
        # the second tuple element is the direction flag: False -> forward, True -> reverse
        directed = links_rev if is_reverse else links_fwd
        directed[link_key].append(tripid)

links_fwd = pd.Series(links_fwd)
links_rev = pd.Series(links_rev)
links_undirected = pd.Series(links_undirected)


def _n_listed(cell):
    """Length of a trip list; links with no trips in that direction hold NaN and count as 0."""
    return len(cell) if isinstance(cell, list) else 0


# one row per link id; a direction a link was never travelled in is NaN after alignment
link_counts = pd.DataFrame({'fwd': links_fwd, 'rev': links_rev, 'both': links_undirected})
for total_col, list_col in (('total_fwd', 'fwd'), ('total_rev', 'rev'), ('total', 'both')):
    link_counts[total_col] = link_counts[list_col].apply(_n_listed)

# turn the index back to int and name it
link_counts.index = link_counts.index.astype(int).rename('linkid')

In [6]:
# Load the user table (indexed by userid) and the trip tables used for the per-link attributes.
users = pd.read_pickle(config['cycleatl_fp']/'users_4.pkl').set_index('userid')
trips = pd.read_pickle(config['cycleatl_fp']/'trips_4.pkl')
trips0 = pd.read_pickle(config['cycleatl_fp']/'trips_2.pkl')

# tripid -> userid lookup. Key it explicitly on the 'tripid' column: the per-link lists
# hold trip ids, so relying on whatever index the pickled frame happens to carry would
# silently map the wrong users (or raise KeyError) if that index is not the trip id.
userid = trips.set_index('tripid')['userid'].to_dict()

In [7]:
import numpy as np
def _users_on_link(trip_ids):
    """Return the distinct users behind a list of trip ids, or NaN when the cell is not a list."""
    if not isinstance(trip_ids, list):
        return np.nan
    return list(set(userid[trip_id] for trip_id in trip_ids))


def _n_users(user_list):
    """Number of users in the cell; non-list cells (no trips in that direction) count as 0."""
    if isinstance(user_list, list):
        return len(user_list)
    return 0


# distinct users per link and direction
for trips_col, users_col in (('fwd', 'fwd_users'), ('rev', 'rev_users'), ('both', 'both_users')):
    link_counts[users_col] = link_counts[trips_col].apply(_users_on_link)

# number of distinct users per link and direction
for users_col, total_col in (('fwd_users', 'total_fwd_users'),
                             ('rev_users', 'total_rev_users'),
                             ('both_users', 'total_users')):
    link_counts[total_col] = link_counts[users_col].apply(_n_users)

In [8]:
# Count, for every link, how many of its trips started in each calendar year.

# Index trips0 by trip id only if that has not been done yet, so re-running this cell
# does not fail with a KeyError on the already-consumed 'tripid' column.
if trips0.index.name != 'tripid':
    trips0 = trips0.set_index('tripid')

# tripid -> year of the trip's start_time
test = trips0['start_time'].dt.year.to_dict()

# one row per link, one column per year; trip ids missing from trips0 are tallied under None
years = pd.DataFrame.from_dict(
    link_counts['both']
    .dropna()
    .apply(lambda trip_ids: Counter([test.get(trip_id) for trip_id in trip_ids]))
    .to_dict(),
    orient='index',
)
# year labels as strings so they can be used as column names, e.g. link_counts['2014']
years.columns = [str(x) for x in years.columns]

In [9]:
# `years` has one row per link (it was built from link_counts['both']), so its index holds link ids
years.index.name = 'linkid'
# links that saw no trips in a given year get 0 instead of NaN
years = years.fillna(0).astype(int)

# Drop year columns left over from a previous run of this cell before merging; otherwise a
# re-run would produce duplicated '<year>_x' / '<year>_y' columns.
link_counts = (
    link_counts
    .drop(columns=years.columns, errors='ignore')
    .merge(years, left_index=True, right_index=True, how='left')
)
# NOTE this also replaces the NaN placeholders in the list columns ('fwd', 'rev', '*_users') with 0
link_counts = link_counts.fillna(0)

In [10]:
# TODO script this to get the attributes we generally want
# at some point look at more specific trip characteristics like average speed

# Trip attributes
# Take the commute trip ids from the 'tripid' column rather than from the frame's index:
# the per-link lists in link_counts['both'] hold trip ids, so the two must be the same key.
commute = set(trips.loc[trips['trip_type'] == 'Commute', 'tripid'])
link_counts['commute_trips'] = link_counts['both'].apply(
    lambda trip_ids: sum(trip_id in commute for trip_id in trip_ids) if isinstance(trip_ids, list) else 0
)

# User attributes (users is indexed by userid)
female = set(users.loc[users['gender'] == 'Female'].index.tolist())
male = set(users.loc[users['gender'] == 'Male'].index.tolist())
# NOTE(review): these are computed from 'both_users' (distinct users per link), so despite the
# '_trips' suffix they count distinct female / male users, not trips -- confirm which is intended
link_counts['female_trips'] = link_counts['both_users'].apply(
    lambda user_ids: sum(user_id in female for user_id in user_ids) if isinstance(user_ids, list) else 0
)
link_counts['male_trips'] = link_counts['both_users'].apply(
    lambda user_ids: sum(user_id in male for user_id in user_ids) if isinstance(user_ids, list) else 0
)


In [11]:
# Inspect the per-link counts of trips that started in 2014 (one of the year columns merged in from `years`)
link_counts['2014']

linkid
0        0
1        0
7        0
67       0
70       1
        ..
32905    1
32911    1
32978    2
32987    0
32994    6
Name: 2014, Length: 5711, dtype: int64

In [12]:
# Attach the counts to the link geometries; the default inner merge keeps only links
# whose linkid appears in link_counts.
new_links = pd.merge(links, link_counts, left_on='linkid', right_index=True)
new_links = new_links.fillna(0)

# these columns hold Python lists (or a 0 placeholder); store them as text for the export
list_columns = ['fwd', 'rev', 'both', 'fwd_users', 'rev_users', 'both_users']
new_links[list_columns] = new_links[list_columns].astype(str)

# NOTE(review): output location is hard-coded to the user's Downloads folder
output_fp = Path.home() / 'Downloads/link_counts.gpkg'
new_links.to_file(output_fp)