In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from simpledbf import Dbf5
import pyxlsb
import os
import time
import random
from shapely.geometry import MultiLineString, LineString, Point
import math
import shapely

import create_gtfs_from_basicinfo
import sys, os
import zipfile
import shutil



PyTables is not installed. No support for HDF output.
SQLalchemy is not installed. No support for SQL output.


In [2]:
# Wall-clock reference (seconds since the epoch) taken at the start of the run.
start_time = time.time()

In [3]:
# Path of a test shapefile.
test_file = 'tests/test.shp'
# Root folder; every sub-directory found here is treated as one model year.
rootdir = '/Users/frankiemacbook/OneDrive - VicGov/VITM/GTFS/Ref case'

# A real list (not a one-shot map iterator) so the paths can be inspected again.
fullpaths = [os.path.join(rootdir, name) for name in os.listdir(rootdir)]
dirs = [x for x in fullpaths if os.path.isdir(x)]
# os.path.basename honours the platform's path separator, unlike splitting on '/'.
years = sorted(os.path.basename(x) for x in dirs)
years

['2018', '2026', '2031', '2036', '2041', '2051']

In [4]:
# Read each year's PT link table and stack them into one frame.
# The per-year frames are collected and concatenated once: DataFrame.append
# re-copied the accumulated frame on every pass and was removed in pandas 2.0.
yearly_frames = []
for year in years:
    dbf = Dbf5(rootdir+'/'+year+'/PTLINK_ALL_Y'+year+'_VR19_Ref_C.DBF')
    temp = dbf.to_dataframe()
    temp.insert(0,'year',year)
    yearly_frames.append(temp)
data = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()
# .copy() so the key columns added below go onto an independent frame, not a slice.
data = data[['PERIOD','A','B','VEHNAME','LINENO','LINKSEQ','STOPA','STOPB','NAME','LONGNAME','year']].copy()
# Composite string keys used for grouping and joining further down.
data['YEAR_LINE_PERIOD'] = data['year'] + "_" + data['LINENO'].apply(str) + "_" + data['PERIOD']
data['YEAR_LINE'] = data['year'] + "_" + data['LINENO'].apply(str)
data['A_B'] = data['A'].apply(str) + "_" + data['B'].apply(str)

In [5]:
# Collapse the raw VEHNAME labels into a small set of mode names.
vehname = data['VEHNAME']

def _distinct_vehnames(mask):
    """Distinct VEHNAME values of the rows selected by `mask`, in order of appearance."""
    return data.loc[mask]['VEHNAME'].unique().tolist()

tram_names = _distinct_vehnames(vehname.str.contains("^(?:Tram)"))
# 'SP2' is added to the vline group by hand.
vline_names = _distinct_vehnames(vehname.str.contains("^(?:V[Ll]ine)")) + ['SP2']
train_mask = vehname.str.contains("^(?:Metro)")
for prefix_pattern in ("^(?:HCMT)", "^(?:COMENG)", "^(?:SRL)"):
    train_mask = train_mask | vehname.str.contains(prefix_pattern)
train_names = _distinct_vehnames(train_mask)

# Later groups overwrite earlier ones if a label were to appear in more than one.
replace_dict = {'Bus':'bus','SkyBus':'skybus'}
replace_dict.update(dict.fromkeys(tram_names, 'tram'))
replace_dict.update(dict.fromkeys(vline_names, 'vline'))
replace_dict.update(dict.fromkeys(train_names, 'train'))

data['VEHNAME'] = data['VEHNAME'].replace(replace_dict)
data = data.rename(columns = {'VEHNAME':'mode'})
all_modes = sorted(data['mode'].unique().tolist())
all_modes

['bus', 'skybus', 'train', 'tram', 'vline']

In [6]:
# Attribute columns of the loaded-network shapefile that are skipped on read;
# the list is passed as `ignore_fields` to gpd.read_file when the network is loaded.
ignore_fields_list = ['REGION','LINKC_IP','LINKC_PM','LINKC_OP', 'LANES_AM', 'LANES_IP', 'LANES_PM', 'LANES_OP', 'PSPD_AM', 'PSPD_IP', 'PSPD_PM', 'PSPD_OP', 'TOLLROAD', \
                      'TOLLGANT', 'TOLLENTRY', 'TOLLEXIT', 'TURN_BAN', 'CLEARWAY', 'ROADTYPE', 'RD_NAME', 'LASTPROJ', 'LX', 'LX_CODE', 'CROSSING', 'LXCLOSE', 'CTIME', \
                      'RAIL_SPD', 'TRAM_SPD3', 'BFLG_AM', 'BFLG_PM', 'SL', 'SL_CODE', 'TTR', 'TRN_CRDN', 'VLN_CRDN', 'TRM_CRDN', 'CRDN_DIR', 'FFSPD_AM', \
                      'FFSPD_IP', 'FFSPD_PM', 'FFSPD_OP', 'FFTIME_AM', 'FFTIME_IP', 'FFTIME_PM', 'FFTIME_OP', 'CSPD_AM', 'CSPD_IP', 'CSPD_PM', 'CSPD_OP', 'TIME_AM', \
                      'TIME_IP', 'TIME_PM', 'TIME_OP', 'TRSTIME_AM', 'TRSTIME_IP', 'TRSTIME_PM', 'TRSTIME_OP', 'TRMTIME_AM', 'TRMTIME_IP', 'TRMTIME_PM', 'TRMTIME_OP', \
                      'BUSTIME_AM', 'BUSTIME_IP', 'BUSTIME_PM', 'BUSTIME_OP', 'SMBTIME_AM', 'SMBTIME_IP', 'SMBTIME_PM', 'SMBTIME_OP', 'SKBTIME_AM', 'SKBTIME_IP', \
                      'SKBTIME_PM', 'SKBTIME_OP', 'SPBTIME_AM', 'SPBTIME_IP', 'SPBTIME_PM', 'SPBTIME_OP', 'VEH_AM', 'VEH_IP', 'VEH_PM', 'VEH_OP', 'VEH_WD', 'PVV_AM', \
                      'PVV_IP', 'PVV_PM', 'PVV_OP', 'PVV_WD', 'AP_EMP_AM', 'AP_EMP_IP', 'AP_EMP_PM', 'AP_EMP_OP', 'AP_EMP_WD', 'AP_PAS_AM', 'AP_PAS_IP', 'AP_PAS_PM', \
                      'AP_PAS_OP', 'AP_PAS_WD', 'TRUCK_AM', 'TRUCK_IP', 'TRUCK_PM', 'TRUCK_OP', 'TRUCK_WD', 'RIGID_AM', 'RIGID_IP', 'RIGID_PM', 'RIGID_OP', 'RIGID_WD', \
                      'ARTIC_AM', 'ARTIC_IP', 'ARTIC_PM', 'ARTIC_OP', 'ARTIC_WD', 'BDBLE_AM', 'BDBLE_IP', 'BDBLE_PM', 'BDBLE_OP', 'BDBLE_WD', 'HPFV_AM', 'HPFV_IP', \
                      'HPFV_PM', 'HPFV_OP', 'HPFV_WD', 'PCU_AM', 'PCU_IP', 'PCU_PM', 'PCU_OP', 'PCU_WD', 'VEH_VKT_AM', 'VEH_VKT_IP', 'VEH_VKT_PM', 'VEH_VKT_OP', \
                      'PVV_VKT_AM', 'PVV_VKT_IP', 'PVV_VKT_PM', 'PVV_VKT_OP', 'HCV_VKT_AM', 'HCV_VKT_IP', 'HCV_VKT_PM', 'HCV_VKT_OP', 'VEH_VHT_AM', 'VEH_VHT_IP', \
                      'VEH_VHT_PM', 'VEH_VHT_OP', 'PVV_VHT_AM', 'PVV_VHT_IP', 'PVV_VHT_PM', 'PVV_VHT_OP', 'HCV_VHT_AM', 'HCV_VHT_IP', 'HCV_VHT_PM', 'HCV_VHT_OP', \
                      'HYCAP_AM', 'HYCAP_IP', 'HYCAP_PM', 'HYCAP_OP', 'VC_AM', 'VC_IP', 'VC_PM', 'VC_OP', 'PT_AM', 'PT_IP', 'PT_PM', 'PT_OP', 'PT_WD', 'TRAIN_AM', \
                      'TRAIN_IP', 'TRAIN_PM', 'TRAIN_OP', 'TRAIN_WD', 'TRAM_AM', 'TRAM_IP', 'TRAM_PM', 'TRAM_OP', 'TRAM_WD', 'BUS_AM', 'BUS_IP', 'BUS_PM', 'BUS_OP', \
                      'BUS_WD', 'VLINE_AM', 'VLINE_IP', 'VLINE_PM', 'VLINE_OP', 'VLINE_WD', 'WLK_AE_AM', 'WLK_AE_IP', 'WLK_AE_PM', 'WLK_AE_OP', 'WLK_AE_WD', \
                      'PNR_AE_AM', 'PNR_AE_IP', 'PNR_AE_PM', 'PNR_AE_OP', 'PNR_AE_WD', 'PNR_VH_AM', 'PNR_VH_IP', 'PNR_VH_PM', 'PNR_VH_OP', 'PNR_VH_WD', 'TFTMBU_AM', \
                      'TFTMBU_IP', 'TFTMBU_PM', 'TFTMBU_OP', 'TFTMBU_WD', 'TFRLRL_AM', 'TFRLRL_IP', 'TFRLRL_PM', 'TFRLRL_OP', 'TFRLRL_WD', 'TRNNVEH_AM', 'TRNNVEH_IP', \
                      'TRNNVEH_PM', 'TRNNVEH_OP', 'TRMNVEH_AM', 'TRMNVEH_IP', 'TRMNVEH_PM', 'TRMNVEH_OP', 'BUSNVEH_AM', 'BUSNVEH_IP', 'BUSNVEH_PM', 'BUSNVEH_OP', \
                      'VLNNVEH_AM', 'VLNNVEH_IP', 'VLNNVEH_PM', 'VLNNVEH_OP', 'TRNCAP_AM', 'TRNCAP_IP', 'TRNCAP_PM', 'TRNCAP_OP', 'TRN_VC_AM', 'TRN_VC_IP', \
                      'TRN_VC_PM', 'TRN_VC_OP', 'TRMCAP_AM', 'TRMCAP_IP', 'TRMCAP_PM', 'TRMCAP_OP', 'TRM_VC_AM', 'TRM_VC_IP', 'TRM_VC_PM', 'TRM_VC_OP', 'BUSCAP_AM', \
                      'BUSCAP_IP', 'BUSCAP_PM', 'BUSCAP_OP', 'BUS_VC_AM', 'BUS_VC_IP', 'BUS_VC_PM', 'BUS_VC_OP', 'VLNCAP_AM', 'VLNCAP_IP', 'VLNCAP_PM', 'VLNCAP_OP', \
                      'VLN_VC_AM', 'VLN_VC_IP', 'VLN_VC_PM', 'VLN_VC_OP', 'PTCAP_AM', 'PTCAP_IP', 'PTCAP_PM', 'PTCAP_OP', 'PT_VC_AM', 'PT_VC_IP', 'PT_VC_PM', 'PT_VC_OP']

# NOTE(review): the commented-out lines in this cell look like an abandoned variant
# that accumulated links from several years; only one year's network is read now.
# network = pd.DataFrame()

#     temp = gpd.read_file(rootdir+'/'+year+'/SUMMARY_LOADED_NETWORK_LINKS_Y'+year+'_VR19_Ref_C.shp',ignore_fields=ignore_fields_list)
#     temp = temp.loc[~temp['LINKC_AM'].isin([1,-1])]
#     temp['A_B'] = temp['A'].apply(str) + "_" + temp['B'].apply(str)
#     temp = temp.set_index('A_B')
#     missing_index = temp.index.difference(network.index)
#     network = network.append(temp.loc[missing_index, :])

# for year in years[-2:]:
# The last (sorted) year's loaded network is used as the link set for every year.
network_year = years[-1]
#     network = network.reset_index().rename(columns = {'index':'A_B'})
network = gpd.read_file(rootdir+'/'+network_year+'/SUMMARY_LOADED_NETWORK_LINKS_Y'+network_year+'_VR19_Ref_C.shp',ignore_fields=ignore_fields_list)
# Drop links whose LINKC_AM is 1 or -1 (presumably centroid connectors - TODO confirm).
network = network.loc[~network['LINKC_AM'].isin([1,-1])]
#     network = gpd.GeoDataFrame(network, geometry='geometry')
# Assigns the CRS label to the frame; the coordinates themselves are not reprojected here.
network.crs=('EPSG:20255')
# Link length in the units of the geometry's coordinates.
network['DISTANCE'] = network['geometry'].length

# Same "A_B" string key as built on the PT link table, used for joins.
network['A_B'] = network['A'].apply(str) + "_" + network['B'].apply(str)

In [7]:
# Read the 'Line Summary' sheet of each year's PT reporting workbook and stack them.
# Frames are collected and concatenated once: DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration.
reporting_frames = []
for year in years:
    temp = pd.read_excel(rootdir+'/'+year+'/DetailedPTReporting_v200619_Y'+year+'_VR19_Ref_C.xlsb', sheet_name='Line Summary', engine='pyxlsb', skiprows=2)
    temp.insert(0,'year',year)
    reporting_frames.append(temp)
pt_reporting = pd.concat(reporting_frames, ignore_index=True) if reporting_frames else pd.DataFrame()

In [8]:
# Attach network length and geometry to every PT link row. reset_index() then
# exposes the row position as an 'index' column, which later cells use to look
# up neighbouring rows.
link_geometry = network[['DISTANCE','A_B','geometry']]
temp_data = data.merge(link_geometry, how='left', on='A_B')
temp_data = temp_data.reset_index()

In [9]:
# One row per distinct A_B link. GroupBy.first() takes the first non-null value
# of each column within the group.
seg_columns = ['index','mode','A','B','STOPA','STOPB','DISTANCE','geometry']
first_per_link = temp_data.groupby('A_B').first()
segs = first_per_link[seg_columns]

In [10]:
# PT links with no matching network link (no DISTANCE after the left merge).
# Explicit copy: this frame is modified in place later (insert / .at), which must
# not be done on a slice of `segs`.
missing_links = segs.loc[segs['DISTANCE'].isnull()].copy()

In [11]:
def find_shortest_path(row):
    """Search `network` for a chain of links joining row['A'] to row['B'].

    Written for ``DataFrame.apply(..., axis=1)``. Rows whose ``node_path`` is
    already set are returned untouched. Otherwise the network is expanded
    breadth-first from ``A``: rows with mode 'train' or 'vline' follow only
    links with ``LINKC_AM == 42``, all other modes only links with
    ``LINKC_AM != 42``. If ``row['next_nodes']`` is given, the search starts
    from that node chain appended to ``A`` and gets a longer time budget.

    From depth 5 onwards, a node with more than two successors is abandoned
    with probability 0.5 to limit growth of the search tree, so results are
    not deterministic unless the ``random`` module is seeded.

    Reads the globals ``network``, ``temp_data`` and ``total_rows``; increments
    ``rows_processed``; appends a filter string to ``segs_not_found`` when the
    search gives up.

    Returns the row with ``node_path`` set to a list of (from, to) node pairs,
    or to None when no path was found within the depth/size/time limits or the
    search ran out of nodes to expand.
    """
    global rows_processed, segs_not_found

    if row['node_path'] is not None:
        return row

    started = time.time()
    too_long = False
    # Nodes of the neighbouring rows in temp_data are excluded from the path.
    # The first/last row has no neighbour on one side, so look them up defensively.
    bounding_nodes = []
    for label, column in ((row['index'] - 1, 'A'), (row['index'] + 1, 'B')):
        if label in temp_data.index:
            bounding_nodes.append(temp_data.loc[label, column])
    mode = row['mode']
    start_node = row['A']
    end_node = row['B']
    depth = 1
    max_depth = 500
    max_time = 90
    node_tree = [[start_node]]
    rows_processed += 1
    print('Finding missing nodes: %.1f' % (100 * rows_processed / total_rows) + '% pairs processed                      ', end='\r')
    if row['next_nodes'] is not None:
        # Manually supplied first nodes of the path: start from them, allow more time.
        node_tree = [node_tree[0] + row['next_nodes']]
        max_time = 120
    while True:
        if (time.time() - started > 30) and not too_long:
            print('Node pair ('+str(start_node)+', '+str(end_node)+') is taking a long time to process', end='\r')
            too_long = True
        new_tree = []
        for nodes in node_tree:
            try:
                if mode in ['train','vline']:
                    new_nodes = network.loc[(network['A'] == nodes[-1]) & (network['LINKC_AM'] == 42), 'B'].to_list()
                else:
                    new_nodes = network.loc[(network['A'] == nodes[-1]) & (network['LINKC_AM'] != 42), 'B'].to_list()
                if any(x == end_node for x in new_nodes):
                    nodes.append(end_node)
                    row['node_path'] = split_list(nodes)
                    return row
                if (depth >= 5) and (len(new_nodes) > 2) and (random.random() < 0.5):
                    new_nodes = []
                for new_node in new_nodes:
                    if new_node not in nodes + bounding_nodes:
                        new_tree.append(nodes + [new_node])
            except Exception:
                # Best effort: a branch that cannot be expanded is skipped.
                # KeyboardInterrupt is deliberately not caught so the search
                # can still be interrupted.
                continue
        node_tree = new_tree
        depth += 1
        # Give up when nothing is left to expand or a limit is reached. Returning
        # here is essential: without it the loop never ends once a limit is hit.
        if (not node_tree) or (depth >= max_depth) or (len(node_tree) > 50000) or (time.time() - started > max_time):
            segs_not_found.append('"A"='+str(start_node)+' OR "B"='+str(end_node))
            row['node_path'] = None
            return row

def split_list(l):
    """Return the consecutive pairs of `l`: [(l[0], l[1]), (l[1], l[2]), ...]."""
    return list(zip(l[:-1], l[1:]))

# Working columns: 'node_path' receives the search result, 'next_nodes' an
# optional hand-written hint. Inserted at position 0 in this order, so
# 'next_nodes' ends up as the first column.
for working_column in ('node_path', 'next_nodes'):
    missing_links.insert(0, working_column, None)

# Hand-picked first nodes of the path for specific "A_B" links.
next_nodes = {
    '23237_23051': [23212, 23191, 23188],
    '14610_23262': [23377, 23375, 23376],
    '10322_19292': [30688, 30682, 30677],
    '220608_44422': [220609, 220610, 220612],
    '23051_23237': [23059, 23060, 23066],
    '23262_14610': [23257, 23253, 23252],
    '44422_220608': [220102, 220103, 220104],
    '19292_10322': [30935, 30937, 30909],
    '27950_26914': [27933, 27928, 27924],
    '18386_233825': [279141, 279140, 279139],
    '233825_18386': [279144, 279145, 279146],
    '223580_15575': [223581, 224430, 224431],
}

# Only links that are actually missing receive their hint.
for link_key, hint_nodes in next_nodes.items():
    if link_key in missing_links.index:
        missing_links.at[link_key, 'next_nodes'] = hint_nodes

# Progress counters and failure log read and updated by find_shortest_path.
total_rows = missing_links.shape[0]
rows_processed = 0
segs_not_found = []

missing_links = missing_links.apply(find_shortest_path, axis=1)
if segs_not_found:
    print("\nSegments not found:")
    for not_found in segs_not_found:
        print(not_found)

Finding missing nodes: 100.0% pairs processed                      

In [12]:
def pad_stops(group):
    """Zero STOPB on every row but the last and STOPA on every row but the first.

    Returns a modified copy; the input group is left untouched. Positional
    `.iloc` assignment on the frame replaces the chained
    ``group[col].iloc[...] = 0`` form, which is not guaranteed to write through
    to the frame.
    """
    group = group.copy()
    group.iloc[:-1, group.columns.get_loc('STOPB')] = 0
    group.iloc[1:, group.columns.get_loc('STOPA')] = 0
    return group

# Replace every PT link that had no network match by the chain of network links
# found for it, so each row of temp_data maps onto a real network link again.
if missing_links.size != 0:

    # One row per (from, to) pair of each found path; links without a path drop out.
    ml_expand = missing_links[['index','A','B','STOPA','STOPB']] \
        .merge(missing_links.node_path.explode(), right_index = True, left_index = True) \
        .dropna() \
        .sort_values(['index','A','B']) \
        .reset_index(drop = True)

    ml_expand['A_B'] = ml_expand['A'].apply(str) + "_" + ml_expand['B'].apply(str)

    ml_expand = ml_expand.sort_values(['A_B','index'])
    # group_keys=False keeps the original row index: from pandas 2.0 the group key
    # would otherwise be prepended to the index for this kind of apply.
    ml_expand = ml_expand.groupby('A_B', group_keys=False).apply(pad_stops)

    # Attach the expanded pairs to the PT rows and restore the route order.
    temp_data = temp_data.merge(ml_expand[['A_B','STOPA','STOPB','node_path']], how='left', on='A_B', suffixes=('','_y')).sort_values(['year','LINENO','PERIOD','LINKSEQ']).reset_index(drop=True)

    # Split each (from, to) tuple into two columns, aligned on the rows that have one.
    temp_data[['new_A','new_B']] = pd.DataFrame(temp_data.loc[temp_data['node_path'].notnull(), 'node_path'].tolist(), index=temp_data.loc[temp_data['node_path'].notnull()].index)

    # Rows with a replacement take the expanded nodes and stop flags.
    temp_data['STOPA'] = np.where(temp_data['node_path'].notnull(), temp_data['STOPA_y'], temp_data['STOPA'])
    temp_data['STOPB'] = np.where(temp_data['node_path'].notnull(), temp_data['STOPB_y'], temp_data['STOPB'])
    temp_data['A'] = np.where(temp_data['node_path'].notnull(), temp_data['new_A'], temp_data['A'])
    temp_data['B'] = np.where(temp_data['node_path'].notnull(), temp_data['new_B'], temp_data['B'])

    temp_data = temp_data.drop(['node_path', 'new_A', 'new_B', 'STOPA_y', 'STOPB_y'], axis=1)

    # Cast the node ids back to int before rebuilding the string key.
    temp_data['A'] = temp_data['A'].apply(int)
    temp_data['B'] = temp_data['B'].apply(int)

    temp_data['A_B'] = temp_data['A'].apply(str) + "_" + temp_data['B'].apply(str)

In [13]:
# Re-attach the network attributes now that every row refers to a real network
# link; remaining NaN values are replaced by the string "NaN".
network_attrs = network[['LINKC_AM','A_B','DISTANCE','BUS_SPD','geometry']]
df = (
    temp_data
    .drop(['index','DISTANCE','geometry'], axis=1)
    .merge(network_attrs, how='left', on='A_B')
    .replace(np.nan,"NaN")
)

# Renumber the links of each year/line/period from 0.
df['LINKSEQ'] = df.groupby('YEAR_LINE_PERIOD').cumcount()

# Signature of a route: its B nodes in order, comma separated.
route_signatures = (
    df.groupby('YEAR_LINE_PERIOD')['B']
    .apply(list)
    .apply(lambda x: ','.join([str(y) for y in x]))
    .reset_index()
    .rename(columns={'B':'ROUTE_NODES_SEQ'})
)
df = df.merge(route_signatures, how='left', on='YEAR_LINE_PERIOD')

# One id per distinct signature, plus one YEAR_LINE_PERIOD that carries it.
unique_routes = (
    df.groupby('ROUTE_NODES_SEQ').first()
    .reset_index()
    .reset_index()
    .rename(columns={'index':'UNIQUE_ROUTE'})
)
unique_routes = unique_routes[['ROUTE_NODES_SEQ','UNIQUE_ROUTE','YEAR_LINE_PERIOD']]

route_ids = unique_routes.drop('YEAR_LINE_PERIOD',axis=1)
df = df.merge(route_ids, how='left', on='ROUTE_NODES_SEQ').drop('ROUTE_NODES_SEQ',axis=1)
df['UNIQUE_ROUTE_SEQ'] = df['UNIQUE_ROUTE'].apply(str) + '_' + df['LINKSEQ'].apply(str)

In [14]:
# node -> list of distinct neighbouring nodes, filled by create_links_dict
all_links_dict = {}

def create_links_dict(feature):
    """Record the link feature['A'] - feature['B'] in both directions in all_links_dict."""
    global all_links_dict
    a_node, b_node = feature['A'], feature['B']
    for node, neighbour in ((a_node, b_node), (b_node, a_node)):
        neighbours = all_links_dict.setdefault(node, [])
        if neighbour not in neighbours:
            neighbours.append(neighbour)
    return
            
# Build the neighbour lists from each distinct link once.
distinct_links = df.drop_duplicates(subset=['A_B'])[['A','B']]
distinct_links.apply(create_links_dict,axis=1)

# Nodes with more than two distinct neighbours.
split_nodes = pd.Series([node for node, neighbours in all_links_dict.items() if len(neighbours) > 2])
# df = df.merge(split_nodes, how='left', on='B')

In [15]:
def assert_route_and_linkc_termini(group):
    """Flag segment breaks and terminal stops within one route's ordered links.

    Expects the columns LINKC_AM, A, B, STOPA, STOPB, split_line and first.
    Works on a copy of `group` and returns it with:

    * split_line = True where the link class moves into or out of the set
      {18, 19, 20, 25} between a row and the next one;
    * split_line = True where a row's A equals the next row's B;
    * first = True and STOPA = 1 on the first row;
    * split_line = True and STOPB = 1 on the last row.

    The helper columns LINKC_AM_next and B_next are left on the result.
    Positional `.iloc` assignment on the frame replaces the chained
    ``group[col].iloc[i] = value`` form, which is not guaranteed to write
    through to the frame.
    """
    group = group.copy()
    link_classes = [18,19,20,25]
    group['LINKC_AM_next'] = group['LINKC_AM'].shift(-1)
    in_set = group['LINKC_AM'].isin(link_classes)
    next_in_set = group['LINKC_AM_next'].isin(link_classes)
    group.loc[(group['LINKC_AM'] != group['LINKC_AM_next']) & \
              ((in_set & ~next_in_set) | (~in_set & next_in_set)),'split_line'] = True
    group['B_next'] = group['B'].shift(-1)
    group.loc[group['A'] == group['B_next'],'split_line'] = True
    column_at = group.columns.get_loc
    group.iloc[0, column_at('first')] = True
    group.iloc[-1, column_at('split_line')] = True
    group.iloc[0, column_at('STOPA')] = 1
    group.iloc[-1, column_at('STOPB')] = 1
    return group

# Keep only the links of the year/line/period that represents each unique route.
# Explicit copy: columns are added below, and doing that on a slice of `df`
# raises SettingWithCopyWarning.
df_unique = df.loc[df['YEAR_LINE_PERIOD'].isin(unique_routes['YEAR_LINE_PERIOD'].to_list())].copy()
df_unique['split_line'] = False
df_unique['first'] = False

# group_keys=False keeps the plain row index; from pandas 2.0 the group key is
# otherwise prepended, which makes 'UNIQUE_ROUTE' ambiguous in the sort that follows.
df_unique = df_unique.sort_values(['UNIQUE_ROUTE','LINKSEQ']) \
    .groupby('UNIQUE_ROUTE', group_keys=False).apply(assert_route_and_linkc_termini).sort_values(['UNIQUE_ROUTE','LINKSEQ']) \
    .reset_index(drop=True)

df_unique = df_unique[['A_B','A','B','STOPA','STOPB','LINKC_AM','BUS_SPD','first','split_line','DISTANCE','UNIQUE_ROUTE','UNIQUE_ROUTE_SEQ','LINKSEQ','mode','geometry']]

# Extra split nodes: B of flagged bus/tram links and of LINKC_AM == 42 links that
# end at a stop, plus the A node of each route's first link.
temp_split_nodes_B = df_unique.loc[((df_unique['split_line'] == True) & (df_unique['mode'].isin(['bus','tram']))) | ((df_unique['STOPB'] == 1) & (df_unique['LINKC_AM'] == 42)),'B']
temp_split_nodes_A = df_unique.loc[df_unique['first'] == True,'A']
# pd.concat replaces Series.append, which was removed in pandas 2.0.
split_nodes = pd.concat([split_nodes, temp_split_nodes_A.loc[~temp_split_nodes_A.isin(split_nodes.to_list())]])
split_nodes = pd.concat([split_nodes, temp_split_nodes_B.loc[~temp_split_nodes_B.isin(split_nodes.to_list())]])
# Every link that ends on a split node closes a segment.
df_unique.loc[df_unique['B'].isin(split_nodes.to_list()),'split_line'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['split_line'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['first'] = False


In [16]:
# For every segment-closing row, record the route and B node of the previous and
# next segment-closing row.
split_line_rows = df_unique.loc[df_unique['split_line'] == True].copy()
split_line_rows.loc[:,'next_UNIQUE_ROUTE'] = split_line_rows.loc[:,'UNIQUE_ROUTE'].shift(-1)
split_line_rows.loc[:,'prev_UNIQUE_ROUTE'] = split_line_rows.loc[:,'UNIQUE_ROUTE'].shift(1)
split_line_rows.loc[:,'next_B'] = split_line_rows.loc[:,'B'].shift(-1)
split_line_rows.loc[:,'prev_B'] = split_line_rows.loc[:,'B'].shift(1)

# State shared between successive calls of check_start_end_nodes.
found_A_equals_B = False
first_links = False
unique_A_equals_B = 0
def check_start_end_nodes(row):
    """Assign a run id in 'A_equals_B' to rows, based on repeated split nodes.

    Must be applied row by row in frame order: it keeps state in the globals
    found_A_equals_B, first_links and unique_A_equals_B.

    * While found_A_equals_B is set, each row is given the current id.
    * A split row whose B equals the previous split row's B on the same route
      (or any split row while first_links is set) is given the current id,
      clears both flags and advances the id.
    * A split row whose B equals the next split row's B on the same route is
      reset to -1 and sets found_A_equals_B, so the rows after it are labelled.
    * A non-split first row whose A equals the B of the route's next split row
      is given the current id and sets both flags.

    NOTE(review): this appears to label runs of links that start and end on the
    same node (loops) - confirm against the intended use of 'A_equals_B'.
    """
    global found_A_equals_B, unique_A_equals_B, first_links
    if found_A_equals_B:
        row['A_equals_B'] = unique_A_equals_B
    if row['split_line'] == True:
        if (row['B'] == row['prev_B'] and \
            row['UNIQUE_ROUTE'] == row['prev_UNIQUE_ROUTE']) or \
            first_links:
            row['A_equals_B'] = unique_A_equals_B
            found_A_equals_B = False
            first_links = False
            unique_A_equals_B += 1
        if row['B'] == row['next_B'] and \
            row['UNIQUE_ROUTE'] == row['next_UNIQUE_ROUTE']:
            row['A_equals_B'] = -1
            found_A_equals_B = True
    elif row['first'] == True:
        if row['A'] == row['B_first_split'] and \
            row['UNIQUE_ROUTE'] == row['UNIQUE_ROUTE_first_split']:
            row['A_equals_B'] = unique_A_equals_B
            found_A_equals_B = True
            first_links = True
    return row

# Index-aligned join: only split rows receive values. The overlapping columns
# UNIQUE_ROUTE and B from the right side get the '_first_split' suffix.
df_unique = df_unique.merge(split_line_rows[['UNIQUE_ROUTE','B','next_UNIQUE_ROUTE','prev_UNIQUE_ROUTE','next_B','prev_B']], \
                            how='left', left_index=True, right_index=True, suffixes=('','_first_split'))
# Back-fill so every row sees the route and B of the next split row at or after it.
# bfill() replaces the deprecated DataFrame.backfill() alias.
df_unique[['UNIQUE_ROUTE_first_split','B_first_split']] = df_unique[['UNIQUE_ROUTE_first_split','B_first_split']].bfill()
# -1 means "not part of a labelled run".
df_unique['A_equals_B'] = -1
df_unique = df_unique.apply(check_start_end_nodes, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split_line_rows['next_UNIQUE_ROUTE'] = split_line_rows['UNIQUE_ROUTE'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split_line_rows['prev_UNIQUE_ROUTE'] = split_line_rows['UNIQUE_ROUTE'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split_line_rows['next_B'] = split_line_rows[

In [17]:
def add_nodes_in_loop(group):
    """Force a split at the middle row of every labelled group.

    Groups whose first 'A_equals_B' value is -1 are returned unchanged. For any
    other group, 'split_line' is set to True on the row at position
    ``len(group) // 2`` of a copy, which is returned. Positional `.iloc`
    assignment on the frame replaces the chained
    ``group['split_line'].iloc[i] = True`` form, which is not guaranteed to
    write through to the frame.
    """
    if group['A_equals_B'].iloc[0] != -1:
        group = group.copy()
        add_node_at = group.shape[0] // 2
        group.iloc[add_node_at, group.columns.get_loc('split_line')] = True
    return group

# Add a mid-run split to every labelled run. group_keys=False keeps the plain row
# index; from pandas 2.0 the group key would otherwise be prepended to the index.
df_unique = df_unique.groupby('A_equals_B', group_keys=False).apply(add_nodes_in_loop)

In [18]:
# Accumulators for the segment being built; reset each time a segment is closed.
links = []
stops = []
nodes = []
# Progress counters for the status line.
total_rows = df_unique.shape[0]
rows_processed = 0

def initial_consolidate_links_and_nodes(row):
    """Accumulate one link into the current segment; emit the segment when it closes.

    Must be applied row by row in route order: it keeps the running coordinates,
    end points and node ids in the globals links, stops and nodes.

    Returns [stopa, stopb, dist, geometry, stops, nodes]:
    * stopa is 1 on the row that opens a segment, otherwise 0;
    * on a row with split_line == True, stopb is 1 and dist, geometry, stops and
      nodes describe the finished segment (length, LineString, [start Point,
      end Point], node ids);
    * on every other row stopb and dist are 0 and the last three values are NaN.
    """
    global links, stops, nodes, rows_processed
    rows_processed += 1
    print('Consolidating links and nodes: %.1f' % (100 * rows_processed / total_rows) + '% processed', end='\r')

    vertices = row['geometry'].coords
    # When continuing a segment, skip the link's first vertex (presumably the
    # same point as the previous link's last vertex).
    links.extend(vertices[1:] if links else vertices)

    stopa = stopb = dist = 0
    alllinks = allstops = allnodes = np.nan

    if not nodes:
        # This row opens a new segment.
        stops.append(Point(links[0]))
        nodes.append(row['A'])
        stopa = 1
    nodes.append(row['B'])

    if row['split_line'] == True:
        # Close the segment and start afresh for the next row.
        alllinks = LineString(links)
        dist = alllinks.length
        stops.append(Point(links[-1]))
        stopb = 1
        allstops = list(stops)
        allnodes = list(nodes)
        links, stops, nodes = [], [], []
    return [stopa, stopb, dist, alllinks, allstops, allnodes]

consolidated_columns = ['STOPA','STOPB','SEGDIST','geometry','SEGSTOPS','SEGNODES']
df_unique[consolidated_columns] = df_unique.apply(lambda row: pd.Series(initial_consolidate_links_and_nodes(row)), axis=1)

print('Consolidating links and nodes: finished           ')

Consolidating links and nodes: finished           


In [19]:
# Only segment-closing rows carry a consolidated segment; keep just those.
has_segment = df_unique['SEGSTOPS'].notnull()
df_unique = df_unique.loc[has_segment]

# First/last node id and first/second stop point of each segment.
df_unique['start_node'] = df_unique['SEGNODES'].map(lambda seg_nodes: seg_nodes[0])
df_unique['end_node'] = df_unique['SEGNODES'].map(lambda seg_nodes: seg_nodes[-1])
df_unique['start_point'] = df_unique['SEGSTOPS'].map(lambda seg_stops: seg_stops[0])
df_unique['end_point'] = df_unique['SEGSTOPS'].map(lambda seg_stops: seg_stops[1])
# Segment key: its node ids joined by commas.
df_unique['unique_seg'] = df_unique['SEGNODES'].map(lambda seg_nodes: ','.join(str(node) for node in seg_nodes))

In [20]:
# Displayed check: segments that start and end on the same node (the saved output lists none).
df_unique.loc[df_unique['start_node'] == df_unique['end_node'],['UNIQUE_ROUTE','SEGNODES','LINKSEQ']]

Unnamed: 0,UNIQUE_ROUTE,SEGNODES,LINKSEQ


In [21]:
# gpd.GeoDataFrame(df_unique[['start_node','end_node','geometry','mode','LINKC_AM','UNIQUE_ROUTE','LINKSEQ']],geometry='geometry').to_file('tests/df_unique.gpkg', driver='GPKG', crs='EPSG:20255')

In [22]:
# Order the segments by route, then by position along the route.
df_unique = df_unique.sort_values(['UNIQUE_ROUTE','LINKSEQ'])

In [23]:
# Candidate segments for extra stops: bus/tram segments whose link class is not
# one of 18, 19, 20, 25 and whose BUS_SPD is not 80.
# NOTE(review): the meaning of these class and speed codes is not shown here - confirm.
is_bus_or_tram = df_unique['mode'].isin(['bus','tram'])
outside_link_classes = ~df_unique['LINKC_AM'].isin([18,19,20,25])
not_bus_spd_80 = df_unique['BUS_SPD'] != 80
split_columns = ['start_node','end_node','start_point','end_point','unique_seg','split_line','LINKC_AM','SEGDIST','BUS_SPD','geometry']
# One row per distinct segment key.
unique_seg_to_split = (
    df_unique.loc[is_bus_or_tram & outside_link_classes & not_bus_spd_80]
    .groupby('unique_seg').first()
    .reset_index()
)[split_columns]

In [24]:
# Distinct end points of the candidate segments as (node, geometry) rows:
# start nodes first, then end nodes not already listed.
original_nodes = unique_seg_to_split.drop_duplicates(subset=['start_node'])[['start_node','start_point']].rename(columns={'start_node':'node','start_point':'geometry'})
temp_B_nodes = unique_seg_to_split.drop_duplicates(subset=['end_node'])[['end_node','end_point']].rename(columns={'end_node':'node','end_point':'geometry'})
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
original_nodes = pd.concat([original_nodes, temp_B_nodes.loc[~temp_B_nodes['node'].isin(original_nodes['node'].to_list())]])
original_nodes = gpd.GeoDataFrame(original_nodes.reset_index(drop=True), geometry='geometry')

original_nodes.shape[0]

4628

In [25]:
# original_nodes.to_file('tests/original_nodes.gpkg', driver='GPKG', crs='EPSG:20255')

In [26]:
# Progress counters for the status line printed by add_stops.
total_rows = unique_seg_to_split.shape[0]
rows_processed = 0
# Half-width of the square box built around each split point, in coordinate units.
offset = 5
# Number of segments that were actually split.
count = 0

def split_line(line, distance, segments):
    """Cut `line` into up to `segments` pieces of length `distance` each.

    The line is cut repeatedly at `distance` along the remaining part. A cut
    falls on an existing vertex when one lies exactly at that distance;
    otherwise a new point is interpolated.

    Returns a list of ``(coords, (start_box, end_box))`` tuples, one per piece.
    ``coords`` is a list of coordinate tuples. Each box is the square
    ``(x - offset, y - offset, x + offset, y + offset)`` around a cut point
    (`offset` is a module-level global), or None at the original ends of the
    line.

    Changes from the previous version: the projected distance is no longer
    stored in a local called `pd` (which shadowed the pandas alias), the inner
    loop no longer reuses the outer loop variable, and a remainder on which no
    cut position can be found is kept whole instead of being dropped.
    """
    def _box_around(pt):
        return (pt[0] - offset, pt[1] - offset, pt[0] + offset, pt[1] + offset)

    line_segs = [list(line.coords)]
    bboxes = [None]
    for _ in range(segments - 1):
        # Always cut the last piece, i.e. the part not yet shortened to `distance`.
        coords = line_segs.pop()
        remainder = LineString(coords)
        for j, vertex in enumerate(coords):
            along = remainder.project(Point(vertex))
            if along == distance:
                # The cut falls on an existing vertex: both pieces share it.
                cut = coords[j]
                line_segs += [coords[:j+1], coords[j:]]
            elif along > distance:
                # The cut falls before this vertex: insert an interpolated point.
                cut_point = remainder.interpolate(distance)
                cut = (cut_point.x, cut_point.y)
                line_segs += [coords[:j] + [cut], [cut] + coords[j:]]
            else:
                continue
            box = _box_around(cut)
            bboxes[-1] = (bboxes[-1], box)
            bboxes.append(box)
            break
        else:
            # No vertex at or beyond `distance`: keep the remainder whole and stop.
            line_segs.append(coords)
            break
    bboxes[-1] = (bboxes[-1], None)
    return list(zip(line_segs, bboxes))
        
def add_stops(row):
    """Split row['geometry'] into pieces when it is longer than 300.

    The target piece length depends on the total length: 1000 above 5000, 400
    above 1500, otherwise 250; the line is then cut into equal pieces. Lines of
    300 or less are returned as a single piece with no boxes. Updates the
    globals rows_processed and count and prints a progress line.

    Returns the list of (coords, (start_box, end_box)) tuples from split_line.
    """
    global rows_processed, count
    rows_processed += 1
    print('Splitting long segments: %.1f' % (100 * rows_processed / total_rows) + '% processed', end='\r')
    geom = row['geometry']
    total_length = geom.length
    if not total_length > 300:
        return [(list(geom.coords), (None, None))]
    if total_length > 5000:
        target_spacing = 1000
    elif total_length > 1500:
        target_spacing = 400
    else:
        target_spacing = 250
    n_pieces = math.ceil(total_length / target_spacing)
    count += 1
    return split_line(geom, total_length / n_pieces, n_pieces)

# Per segment: list of (coords, (start_box, end_box)) pieces produced by add_stops.
unique_seg_to_split['new_segments'] = unique_seg_to_split.apply(add_stops, axis=1)

# The trailing spaces overwrite what is left of the '\r' progress line.
print('Splitting long segments: finished           ')
print('Split '+str(count)+' of '+str(total_rows)+' segments')

Splitting long segments: finished           
Split 9482 of 12675 segments


In [27]:
# gpd.GeoDataFrame(unique_seg_to_split[['start_node','end_node','geometry']],geometry='geometry',crs='EPSG:20255').to_file('tests/unique_seg_to_split.gpkg',driver='GPKG')

In [28]:
# Work on a copy so unique_seg_to_split keeps its per-segment 'new_segments' lists.
temp_unique_seg_to_split = unique_seg_to_split.copy()

# Flag both ends of every row as a stop.
temp_unique_seg_to_split['STOPA'] = pd.Series([1]*temp_unique_seg_to_split.shape[0],index=temp_unique_seg_to_split.index)
temp_unique_seg_to_split['STOPB'] = pd.Series([1]*temp_unique_seg_to_split.shape[0],index=temp_unique_seg_to_split.index)

# Explode 'new_segments' so every piece gets its own row, joined back to the
# segment attributes on the shared index.
temp_unique_seg_to_split = temp_unique_seg_to_split.drop(['new_segments'],axis=1) \
    .merge(temp_unique_seg_to_split['new_segments'].explode(), right_index = True, left_index = True) \
    .reset_index(drop = True)

In [29]:
# Position of each piece within its original segment (0-based, in current row order).
temp_unique_seg_to_split['temp_seq'] = temp_unique_seg_to_split.groupby('unique_seg').cumcount()
temp_unique_seg_to_split = temp_unique_seg_to_split.sort_values(['unique_seg','temp_seq'])

# The exploded piece replaces the whole-segment geometry.
temp_unique_seg_to_split = temp_unique_seg_to_split.drop(['geometry'], axis=1).rename(columns={'new_segments':'geometry'})

In [30]:
# Unpack each (coords, (start_box, end_box)) tuple: first into coordinates and
# the box pair, then the pair into one column per end.
temp_unique_seg_to_split[['geometry','bboxes']] = pd.DataFrame(temp_unique_seg_to_split['geometry'].tolist(), index=temp_unique_seg_to_split.index)
temp_unique_seg_to_split[['A_bbox','B_bbox']] = pd.DataFrame(temp_unique_seg_to_split.bboxes.tolist(), index=temp_unique_seg_to_split.index)
temp_unique_seg_to_split = temp_unique_seg_to_split.drop(['bboxes'],axis=1)

In [31]:
# Dummy nodes
# New node ids are numbered upwards from the largest id found in columns A and B of df.

max_node = df[['A','B']].max().max()

def insert_nodes(group):
    """Give the interior joints of a split segment fresh node ids.

    `group` holds the pieces of one segment in order. For n > 1 pieces, n - 1
    new ids are taken from the global counter `max_node`: each becomes the
    end_node of one piece and the start_node of the next. The outer start and
    end nodes are kept. Single-piece groups are returned unchanged.

    Returns a modified copy. Positional `.iloc` assignment on the frame replaces
    the chained ``group[col].iloc[...] = values`` form, which is not guaranteed
    to write through to the frame.
    """
    global max_node
    n_pieces = group.shape[0]
    if n_pieces > 1:
        group = group.copy()
        new_nodes = list(range(max_node+1, max_node+n_pieces))
        group.iloc[:-1, group.columns.get_loc('end_node')] = new_nodes
        group.iloc[1:, group.columns.get_loc('start_node')] = new_nodes
        max_node = max_node + n_pieces - 1
    return group

# Number the interior joints of every split segment. group_keys=False keeps the
# plain row index; from pandas 2.0 the group key would otherwise be prepended.
temp_unique_seg_to_split = temp_unique_seg_to_split.groupby('unique_seg', group_keys=False).apply(insert_nodes)

In [32]:
# Node table after splitting: the nodes created at cut points (with their search
# box) followed by the original segment end nodes.
# Start nodes of pieces that begin at a cut point; geometry = first coordinate.
expanded_nodes = temp_unique_seg_to_split.loc[temp_unique_seg_to_split['A_bbox'].notnull()].drop_duplicates(subset=['start_node'])[['start_node','geometry','A_bbox']].rename(columns={'start_node':'node','A_bbox':'bbox'})
expanded_nodes.geometry = expanded_nodes.geometry.apply(lambda line: Point(line[0]))
# End nodes of pieces that end at a cut point; geometry = last coordinate.
temp_B_nodes = temp_unique_seg_to_split.loc[temp_unique_seg_to_split['B_bbox'].notnull()].drop_duplicates(subset=['end_node'])[['end_node','geometry','B_bbox']].rename(columns={'end_node':'node','B_bbox':'bbox'})
temp_B_nodes.geometry = temp_B_nodes.geometry.apply(lambda line: Point(line[-1]))
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
expanded_nodes = pd.concat([expanded_nodes, temp_B_nodes.loc[~temp_B_nodes['node'].isin(expanded_nodes['node'].to_list())]])
expanded_nodes = pd.concat([expanded_nodes, original_nodes.loc[~original_nodes['node'].isin(expanded_nodes['node'].to_list())]])
expanded_nodes = gpd.GeoDataFrame(expanded_nodes, geometry='geometry')
expanded_nodes['node'] = expanded_nodes['node'].apply(int)

# Sorted with a fresh 0..n-1 index, so positional and label indexing coincide.
expanded_nodes = expanded_nodes.sort_values('node').reset_index(drop=True)

In [33]:
# gpd.GeoDataFrame(expanded_nodes[['node','geometry']],geometry='geometry',crs='EPSG:20255').to_file('tests/expanded_nodes.gpkg',driver='GPKG')

In [34]:
# Nodes that are not among the original segment end nodes.
new_nodes = expanded_nodes.loc[~expanded_nodes['node'].isin(original_nodes['node'].to_list())]

In [35]:
# For every new node: sorted positions (in expanded_nodes) of the nodes that the
# spatial index reports as intersecting the node's search box.
tmp = new_nodes.bbox.apply(
    lambda search_box: sorted(list(expanded_nodes.sindex.intersection(search_box)))
)

In [36]:
# One row per (new node, nearby candidate position) pair.
dup_nodes = new_nodes.merge(tmp.explode().rename('near_node_idx'), how='left', left_index=True, right_index=True)
# Look up the candidate's node id and geometry; the '_nearby' suffix marks them.
dup_nodes = dup_nodes.merge(expanded_nodes[['node','geometry']], how='left', left_on='near_node_idx', right_index=True, suffixes=('','_nearby')).drop(['bbox','geometry','near_node_idx'],axis=1)
# Drop self-matches.
dup_nodes = dup_nodes.loc[dup_nodes['node'] != dup_nodes['node_nearby']].sort_values(['node','node_nearby'])
# Per node keep the first candidate in that sort order (the lowest nearby node id).
replace_nodes = dup_nodes.groupby('node').first().reset_index().rename(columns={'node_nearby':'replace_with_node','geometry_nearby':'replace_with_geometry'})

# Displayed check: whether any row is a full duplicate (the saved output is False).
replace_nodes.duplicated().any()

False

In [37]:
# node -> node it has been replaced with, filled by replaced()
has_been_replaced = {}

def replaced(row):
    """Resolve one replacement through earlier replacements and record it.

    If the row's target node has itself already been replaced, the row is
    redirected to that node's replacement. The mapping row.node -> final
    target is stored in has_been_replaced. Returns the (possibly updated) row.
    """
    global has_been_replaced
    target = row.replace_with_node
    if target in has_been_replaced:
        target = has_been_replaced[target]
        row.replace_with_node = target
    has_been_replaced[row.node] = target
    return row
    
# Resolve replacements in ascending node order, so each row can follow the
# replacements already recorded for lower-numbered nodes.
replace_nodes = replace_nodes.sort_values('node').apply(replaced, axis=1)
# Discard rows that ended up pointing at themselves.
replace_nodes = replace_nodes.loc[replace_nodes['node'] != replace_nodes['replace_with_node']]

In [38]:
# gpd.GeoDataFrame(replace_nodes,geometry='replace_with_geometry',crs='EPSG:20255').to_file('tests/replace_nodes.gpkg',driver='GPKG')

In [39]:
# Turn each replacement point into a plain (x, y) tuple.
replace_nodes.replace_with_geometry = replace_nodes.replace_with_geometry.apply(lambda geom: (geom.x,geom.y))

In [40]:
def update_geometry(row, node):
    """Overwrite one end of ``row.geometry`` with ``row.replace_with_geometry``.

    Args:
        row: object with an index-assignable ``geometry`` sequence and a
            ``replace_with_geometry`` value.
        node: ``'start_node'`` overwrites the first element, ``'end_node'``
            overwrites the last; any other value leaves the geometry untouched.

    Returns:
        ``row.geometry`` (the same sequence object, modified in place).
    """
    position = {'start_node': 0, 'end_node': -1}.get(node)
    if position is not None:
        row.geometry[position] = row.replace_with_geometry
    return row.geometry

# --- Start nodes ---
# Left-join the replacement table on the segment's start node; rows without a replacement
# get NaN in 'replace_with_node' / 'replace_with_geometry'.
temp_unique_seg_to_split = temp_unique_seg_to_split.merge(replace_nodes, how='left', left_on='start_node', right_on='node').drop(['node'],axis=1)
# Where a replacement exists, re-point start_node at it ...
temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull(),'start_node'] = temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull(),'replace_with_node']
# ... and overwrite the first element of the geometry with the replacement value.
# update_geometry assigns by index, so 'geometry' must be an index-assignable sequence here.
temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull(),'geometry'] = temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull()].apply(lambda row: update_geometry(row, 'start_node'), axis=1)

# --- End nodes ---
# Drop the start-node replacement columns first so the second merge recreates them
# cleanly for the end node.
temp_unique_seg_to_split = temp_unique_seg_to_split.drop(['replace_with_node','replace_with_geometry'],axis=1).merge(replace_nodes, how='left', left_on='end_node', right_on='node').drop(['node'],axis=1)
temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull(),'end_node'] = temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull(),'replace_with_node']
temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull(),'geometry'] = temp_unique_seg_to_split.loc[temp_unique_seg_to_split['replace_with_node'].notnull()].apply(lambda row: update_geometry(row, 'end_node'), axis=1)

# Pass every geometry value through LineString().
temp_unique_seg_to_split.geometry = temp_unique_seg_to_split.geometry.apply(LineString)

# Remove helper columns that are no longer needed after the substitution.
temp_unique_seg_to_split = temp_unique_seg_to_split.drop(['start_point','end_point','A_bbox','B_bbox','replace_with_node','replace_with_geometry'], axis=1)

In [41]:
# gpd.GeoDataFrame(temp_unique_seg_to_split,geometry='geometry',crs='EPSG:20255').to_file('tests/temp_unique_seg_to_split.gpkg',driver='GPKG')

In [42]:
# Left-join the rebuilt split segments onto the link table. Columns that exist on both
# sides keep their name on the left and get a '_merged' suffix on the right.
df_unique = (
    df_unique
    .merge(temp_unique_seg_to_split, how='left', on='unique_seg', suffixes=('','_merged'))
    .reset_index(drop=True)
)

In [43]:
# Fold the '_merged' values back into the main columns. Each pair is
# (column to overwrite, '_merged' column whose non-null rows mark where to overwrite);
# the new value always comes from '<column>_merged'.
for _column, _presence in [
    ('start_node', 'start_node_merged'),
    ('end_node', 'end_node_merged'),
    ('STOPA', 'start_node_merged'),
    ('STOPB', 'end_node_merged'),
    ('geometry', 'geometry_merged'),
]:
    _matched = df_unique[_presence].notnull()
    df_unique.loc[_matched, _column] = df_unique.loc[_matched, _column + '_merged']
del _column, _presence, _matched

df_unique = df_unique.drop(columns=['start_node_merged','end_node_merged','STOPA_merged','STOPB_merged','geometry_merged'])

# Renumber links 0..n-1 within each route, ordered by the old LINKSEQ and then temp_seq.
# The result is aligned back onto df_unique by index.
df_unique['LINKSEQ'] = (
    df_unique
    .sort_values(['UNIQUE_ROUTE','LINKSEQ','temp_seq'])
    .groupby('UNIQUE_ROUTE')
    .cumcount()
)

In [44]:
# gpd.GeoDataFrame(df_unique[['A','B','geometry']],geometry='geometry',crs='EPSG:20255').to_file('tests/df_unique.gpkg',driver='GPKG')

In [45]:
# Find the index label of the final row (in current frame order) of every route.
# groupby().last() takes the last non-null value per column, and the 'index' column
# produced by reset_index() is never null, so it carries the last row's label.
lasts = df_unique.reset_index().groupby('UNIQUE_ROUTE').last().reset_index().set_index('index')
lasts['last'] = True

# Attach the marker: 'last' is True on each route's final row and NaN everywhere else.
df_unique = df_unique.merge(lasts['last'], how='left', left_index=True, right_index=True)

# A link ends a stop-to-stop segment when ANY of these hold:
#   1. STOPB is 1, LINKC_AM is not one of 18/19/20/25 and BUS_SPD is not 80;
#   2. its B node is one of the original nodes;
#   3. it is the final link of its route.
# The third test is parenthesised explicitly. `|` binds tighter than `==`, so the unparenthesised
# form `a | b | last == True` is evaluated as `(a | b | last) == True` and only gives the
# right answer because pandas coerces the NaNs in 'last' to False inside the logical op.
df_unique['stop_flag'] = False
df_unique.loc[
    ((df_unique['STOPB'] == 1) & (~df_unique['LINKC_AM'].isin([18,19,20,25])) & (df_unique['BUS_SPD'] != 80))
    | df_unique['B'].isin(original_nodes['node'].to_list())
    | (df_unique['last'] == True),
    'stop_flag'
] = True

In [46]:
# Running state shared across successive calls of final_consolidate_links_and_nodes:
links = []           # coordinates gathered since the last flagged stop
nodes = []           # (node id, Point) pairs gathered since the last flagged stop
total_rows = df_unique.shape[0]
rows_processed = 0   # progress counter for the status line

def final_consolidate_links_and_nodes(row):
    """Accumulate one link into the running segment; emit the segment at a stop.

    Must be called on rows in route/sequence order, because it relies on the
    module-level ``links`` / ``nodes`` accumulators, which are reset each time a
    row with ``stop_flag == True`` is processed.

    Args:
        row: mapping with ``'start_node'``, ``'end_node'``, ``'geometry'``
            (an object exposing ``.coords`` and ``.length``) and ``'stop_flag'``.

    Returns:
        A 7-element list
        ``[stopa, stopb, totaldist, alllinks, allnodes, endpoints, new_dist]``:
        for a flagged row ``stopa``/``stopb`` are 1 and the next three items are
        the accumulated length, LineString and node list; otherwise they are
        0, 0, NaN, NaN, NaN. ``endpoints`` holds the first and last Point of
        this row's own geometry and ``new_dist`` is its own length.
    """
    global links, nodes, rows_processed

    rows_processed += 1
    print('Consolidating links and nodes: %.1f' % (100 * rows_processed / total_rows) + '% processed', end='\r')

    segment = row['geometry']
    # When continuing a run, skip this link's first vertex
    # (presumably it repeats the previous link's last vertex - TODO confirm).
    fresh_coords = segment.coords[1:] if len(links) > 0 else segment.coords
    links += list(fresh_coords)
    if len(nodes) == 0:
        # First link of a run: remember where the run starts.
        nodes.append((int(row['start_node']), Point(links[0])))

    endpoints = [Point(segment.coords[0]), Point(segment.coords[-1])]
    own_length = segment.length

    if not (row['stop_flag'] == True):
        return [0, 0, np.nan, np.nan, np.nan, endpoints, own_length]

    # Flagged stop: close the run, hand it back, and reset the accumulators.
    merged_line = LineString(links)
    nodes.append((int(row['end_node']), Point(links[-1])))
    run_nodes = nodes.copy()
    links = []
    nodes = []
    return [1, 1, merged_line.length, merged_line, run_nodes, endpoints, own_length]

# Walk the links in route/sequence order (the consolidator is stateful) and write the
# seven returned values back; assignment aligns on the original index.
df_unique[['STOPA','STOPB','SEGDIST','geometry','SEGNODES','SEGSTOPS','shape_dist_traveled']] = (
    df_unique[['UNIQUE_ROUTE','LINKSEQ','UNIQUE_ROUTE_SEQ','start_node','end_node','geometry','stop_flag']]
    .sort_values(['UNIQUE_ROUTE','LINKSEQ'])
    .apply(lambda link: pd.Series(final_consolidate_links_and_nodes(link)), axis=1)
)

print('Consolidating links and nodes: finished           ')

Consolidating links and nodes: finished           


In [47]:
# Keep only rows that closed a segment (the consolidator left SEGDIST as NaN on the rest).
df_unique = df_unique[df_unique['SEGDIST'].notna()]

In [48]:
# Sanity check: list any remaining rows whose STOPA is not 1.
df_unique[df_unique['STOPA'].ne(1)]

Unnamed: 0,A_B,A,B,STOPA,STOPB,LINKC_AM,BUS_SPD,first,split_line,DISTANCE,...,end_point,unique_seg,split_line_merged,LINKC_AM_merged,SEGDIST_merged,BUS_SPD_merged,temp_seq,last,stop_flag,shape_dist_traveled


In [49]:
# SEGNODES holds (node id, Point) pairs: entry 0 is the segment's start, entry 1 its end.
df_unique['A'] = df_unique['SEGNODES'].apply(lambda seg_nodes: int(seg_nodes[0][0]))
df_unique['B'] = df_unique['SEGNODES'].apply(lambda seg_nodes: int(seg_nodes[1][0]))

In [50]:
def expand_points(group):
    """Turn a route's per-link endpoint pairs into a sequence of shape points.

    Every input row carries a two-element ``SEGSTOPS`` value (start, end) and the
    link's own length in ``shape_dist_traveled``. The output has one extra row:
    a leading row for the start of the first link at distance 0, followed by one
    row per link holding that link's end, the cumulative distance (rounded to
    2 decimals) and ``LINKSEQ`` shifted up by one.

    The input frame is left unmodified. Rows are stacked with ``pd.concat``
    because ``DataFrame.append`` was removed in pandas 2.0.

    Args:
        group: non-empty DataFrame with ``SEGSTOPS``, ``shape_dist_traveled``
            and ``LINKSEQ`` columns, already in link order.

    Returns:
        DataFrame with ``len(group) + 1`` rows and the same columns.
    """
    # Work on copies so neither the caller's frame nor a view of it is mutated.
    group = group.copy()
    first_line = group.iloc[0].copy()
    first_line['SEGSTOPS'] = first_line['SEGSTOPS'][0]
    first_line['shape_dist_traveled'] = 0

    group['SEGSTOPS'] = group['SEGSTOPS'].apply(lambda pair: pair[1])
    group['shape_dist_traveled'] = (group['shape_dist_traveled'].cumsum()*100).apply(round)/100
    group['LINKSEQ'] = group['LINKSEQ'] + 1
    return pd.concat([first_line.to_frame().T, group])

df_unique = df_unique.sort_values(['UNIQUE_ROUTE','LINKSEQ'])

# One row per shape point, per route.
df_shapes = (
    df_unique[['UNIQUE_ROUTE','LINKSEQ','SEGSTOPS','shape_dist_traveled']]
    .groupby('UNIQUE_ROUTE')
    .apply(expand_points)
    .reset_index(drop=True)
)

# Reproject the points from EPSG:20255 to EPSG:4326 and split them into lon/lat columns.
df_shapes = gpd.GeoDataFrame(df_shapes, geometry='SEGSTOPS', crs='epsg:20255').to_crs('epsg:4326')
df_shapes['shape_pt_lon'], df_shapes['shape_pt_lat'] = zip(*df_shapes['SEGSTOPS'].apply(lambda pt: [pt.x, pt.y]))

# 'LINKDIST' is not among the columns selected above, so that rename entry has no effect;
# only LINKSEQ is actually renamed.
df_shapes = (
    df_shapes
    .rename(columns={'LINKDIST':'shape_dist_traveled','LINKSEQ':'shape_pt_sequence'})
    .drop(columns=['SEGSTOPS'])
)

In [51]:
# Swap the per-link attributes in df for the consolidated ones from df_unique,
# matched on UNIQUE_ROUTE_SEQ.
df = (
    df
    .sort_values(['year','LINENO','PERIOD','LINKSEQ'])
    .drop(['A','B','STOPA','STOPB','LINKSEQ','geometry'], axis=1)
    .merge(
        df_unique[['UNIQUE_ROUTE_SEQ','A','B','STOPA','STOPB','SEGDIST','geometry','SEGNODES','LINKSEQ']],
        how='left',
        on='UNIQUE_ROUTE_SEQ',
    )
)

In [52]:
# Row count after the merge.
len(df)

3152530

In [53]:
# Keep rows that start or end at a stop.
df = df[df['STOPA'].eq(1) | df['STOPB'].eq(1)]

# Previous row's STOPA / A, taken across the whole frame (not per route).
df['STOPA_shift'] = df['STOPA'].shift()
df['A_shift'] = df['A'].shift()

In [54]:
# Sanity check: services whose last row has STOPB == 0.
lasts = df.groupby(['year','NAME','PERIOD']).last()
lasts[lasts['STOPB'].eq(0)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mode,LINENO,LONGNAME,YEAR_LINE_PERIOD,YEAR_LINE,A_B,LINKC_AM,DISTANCE,BUS_SPD,UNIQUE_ROUTE,...,A,B,STOPA,STOPB,SEGDIST,geometry,SEGNODES,LINKSEQ,STOPA_shift,A_shift
year,NAME,PERIOD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1


In [55]:
# Sanity check: services whose first row has STOPA == 0.
firsts = df.groupby(['year','NAME','PERIOD']).first()
firsts[firsts['STOPA'].eq(0)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mode,LINENO,LONGNAME,YEAR_LINE_PERIOD,YEAR_LINE,A_B,LINKC_AM,DISTANCE,BUS_SPD,UNIQUE_ROUTE,...,A,B,STOPA,STOPB,SEGDIST,geometry,SEGNODES,LINKSEQ,STOPA_shift,A_shift
year,NAME,PERIOD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1


In [56]:
# Rows with STOPA == 0 inherit A and STOPA from the previous row.
# 'A' must be patched first: it tests STOPA before STOPA itself is overwritten.
df['A'] = np.where(df['STOPA'].eq(0), df['A_shift'], df['A'])
df['STOPA'] = np.where(df['STOPA'].eq(0), df['STOPA_shift'], df['STOPA'])

# Keep rows ending at a stop, in service order, with node ids coerced through int().
df = df[df['STOPB'].eq(1)].sort_values(['year','LINENO','PERIOD','LINKSEQ'])
df['A'] = df['A'].apply(int)
df['B'] = df['B'].apply(int)

df = df.drop(columns=['LINKSEQ','UNIQUE_ROUTE_SEQ'])

In [57]:
# Sanity check: segments that start and end at the same node.
df[df['A'].eq(df['B'])]

Unnamed: 0,PERIOD,mode,LINENO,NAME,LONGNAME,year,YEAR_LINE_PERIOD,YEAR_LINE,A_B,LINKC_AM,...,UNIQUE_ROUTE,A,B,STOPA,STOPB,SEGDIST,geometry,SEGNODES,STOPA_shift,A_shift


In [58]:
# gpd.GeoDataFrame(df[['A','B','geometry']],geometry='geometry',crs='EPSG:20255').to_file('tests/df.gpkg',driver='GPKG')

In [59]:
# Build the same year_name_period key on both tables.
# Only the first space-separated token of 'Time Period' is used.
pt_reporting['Time Period'] = pt_reporting['Time Period'].apply(lambda label: label.split(' ')[0])
pt_reporting['YEAR_NAME_PERIOD'] = pt_reporting['year'] + '_' + pt_reporting['Short Name'] + "_" + pt_reporting['Time Period']

df['YEAR_NAME_PERIOD'] = df['year'] + '_' + df['NAME'] + "_" + df['PERIOD']

# One row per year/name/period, with headway and average speed attached from pt_reporting.
unique_speeds_headways = (
    df
    .groupby('YEAR_NAME_PERIOD')
    .first()
    .reset_index()
    .merge(pt_reporting[['YEAR_NAME_PERIOD','Headway','Average Speed']], how='left', on='YEAR_NAME_PERIOD')
    .drop('YEAR_NAME_PERIOD', axis=1)
    .loc[:, ['YEAR_LINE','year','LINENO','PERIOD','NAME','LONGNAME','mode','Headway','Average Speed','UNIQUE_ROUTE']]
    .rename({'LINENO':'id','NAME':'route','LONGNAME':'name','PERIOD':'period','Average Speed':'speed','Headway':'headway'}, axis=1)
)

unique_speeds_headways

Unnamed: 0,YEAR_LINE,year,id,period,route,name,mode,headway,speed,UNIQUE_ROUTE
0,2018_305,2018,305,AM,109,BOX HILL - PORT MELBOURNE,tram,7.5,15.42,1454
1,2018_305,2018,305,IP,109,BOX HILL - PORT MELBOURNE,tram,10.0,16.24,1454
2,2018_305,2018,305,OP,109,BOX HILL - PORT MELBOURNE,tram,12.0,16.54,1454
3,2018_305,2018,305,PM,109,BOX HILL - PORT MELBOURNE,tram,7.5,15.98,1454
4,2018_2,2018,2,AM,11011,Mernda - Flinders Street - All Stations,train,13.3,36.16,782
...,...,...,...,...,...,...,...,...,...,...
31625,2051_1476,2051,1476,PM,WredstoneR,Sunbury - Redstone Hill,bus,40.0,20.09,424
31626,2051_1475,2051,1475,AM,Wredstone,Sunbury - Redstone Hill,bus,40.0,21.36,1304
31627,2051_1475,2051,1475,IP,Wredstone,Sunbury - Redstone Hill,bus,40.0,21.54,1304
31628,2051_1475,2051,1475,OP,Wredstone,Sunbury - Redstone Hill,bus,60.0,21.72,1304


In [60]:
def fix_missing_speeds(row):
    """Fill zero speeds in ``row`` from the preceding entry of ``speed_cols``.

    Columns are visited in ``speed_cols`` order; a column equal to 0 is
    overwritten with the value of the column at position ``(i + 3) % 4``,
    i.e. the previous one with wrap-around (the modulus is hard-coded for
    four speed columns). Fills are sequential, so a value filled earlier in
    the pass can be picked up by a later column.

    Args:
        row: mapping indexable and assignable by the names in ``speed_cols``.

    Returns:
        The same ``row`` object after in-place updates.
    """
    for position, column in enumerate(speed_cols):
        if row[column] == 0:
            row[column] = row[speed_cols[(position + 3) % 4]]
    return row

# Spread headway/speed across periods: one row per YEAR_LINE, one column per
# (measure, period) pair. aggfunc is given as the string 'mean' - same result as
# np.mean, without the pandas >= 2.1 FutureWarning for numpy callables.
pivot = pd.pivot_table(unique_speeds_headways, index=['YEAR_LINE'], columns=["period"], values=["headway","speed"], aggfunc='mean')
# Flatten the (measure, period) column MultiIndex to names such as 'headway_am'.
pivot.columns = ['_'.join(col).strip().lower() for col in pivot.columns.values]

speed_cols = ['speed_am','speed_ip','speed_pm','speed_op']
headway_cols = ['headway_am','headway_ip','headway_pm','headway_op']
# Collapse to one row per YEAR_LINE carrying the per-period columns; missing values become 0
# so fix_missing_speeds and the headway filter below can test against 0.
unique_speeds_headways = unique_speeds_headways.merge(pivot.reset_index(), how='left', on='YEAR_LINE').groupby('YEAR_LINE').first().reset_index().replace(np.nan,0).drop('period',axis=1)
unique_speeds_headways = unique_speeds_headways.apply(fix_missing_speeds, axis=1)
# Drop lines with no headway in any period. `axis` is passed by keyword because
# DataFrame.any() no longer accepts it positionally in pandas >= 2.0.
unique_speeds_headways = unique_speeds_headways.loc[(unique_speeds_headways[headway_cols] != 0).any(axis=1)]

In [61]:
def stop_nodes(group):
    """Collect the segments, line geometry and stop sequence for one group of rows.

    Resets the module-level accumulators, lets ``get_nodes`` fill them row by
    row, then packages the result.

    Args:
        group: DataFrame whose rows are accepted by ``get_nodes``.

    Returns:
        A 3-element list: ``[(geometry, 'A_B', SEGDIST), ...]`` per row, a
        ``MultiLineString`` built from the row geometries, and the list of
        stops gathered from ``SEGNODES``.
    """
    global stop_list, link_list, dist_list, node_list, counter
    counter = 0
    link_list, stop_list, dist_list, node_list = [], [], [], []

    group.apply(get_nodes, axis=1)

    segments = list(zip(link_list, node_list, dist_list))
    return [segments, MultiLineString(link_list), stop_list]
    
def get_nodes(row):
    """Append one row's geometry, 'A_B' label, distance and stops to the accumulators.

    Relies on the module-level lists and ``counter`` initialised by
    ``stop_nodes``. The first row processed contributes both entries of its
    ``SEGNODES``; every later row contributes only the second entry.

    Args:
        row: mapping with ``'geometry'``, ``'A'``, ``'B'``, ``'SEGDIST'`` and a
            two-element ``'SEGNODES'``.
    """
    global stop_list, link_list, dist_list, node_list, counter
    link_list.append(row['geometry'])
    node_list.append('_'.join((str(row['A']), str(row['B']))))
    dist_list.append(row['SEGDIST'])

    seg_nodes = row['SEGNODES']
    if counter == 0:
        stop_list.append(seg_nodes[0])
    stop_list.append(seg_nodes[1])
    counter += 1
    
def number_segments(l):
    """Return ``'0,1,...,n-1'`` for a sequence of length ``n`` (``''`` when empty)."""
    return ','.join(str(position) for position in range(len(l)))

# Build one stop pattern per UNIQUE_ROUTE, using only services listed in unique_routes.
stop_patterns = (
    df.loc[df['YEAR_LINE_PERIOD'].isin(unique_routes['YEAR_LINE_PERIOD'].to_list())]
    .groupby('UNIQUE_ROUTE')
    .apply(stop_nodes)
    .reset_index()
)
# stop_nodes returns a 3-element list per route; unpack it into named columns.
stop_patterns[['SEGMENTS','LINE','STOPPATTERN']] = pd.DataFrame(stop_patterns[0].tolist(), index=stop_patterns.index)
stop_patterns = stop_patterns.drop(0, axis=1)
stop_patterns['segments'] = stop_patterns['SEGMENTS'].apply(number_segments)

stop_patterns = unique_speeds_headways.merge(stop_patterns, how='left', on='UNIQUE_ROUTE')

# Replace non-ASCII characters
table = {0x2013: '-', 0x2014: '--', 0x00a7: 'sect. ', 0x00A0: ' '}
stop_patterns['name'] = stop_patterns['name'].apply(lambda text: text if text.isascii() else text.translate(table))
# Show any names that are still non-ASCII after translation (expected: none).
stop_patterns.loc[~stop_patterns['name'].apply(lambda text: text.isascii()),'name']

Series([], Name: name, dtype: object)

In [62]:
# One row per YEAR_LINE with its line geometry, renamed to the route-shapefile columns.
route_output = (
    stop_patterns
    .groupby('YEAR_LINE')
    .first()
    .reset_index()
    .loc[:, ['year','id','route','mode','name','LINE']]
    .rename({'LINE':'geometry','mode':'OPERATOR_N','route':'ROUTE_SHORT','name':'ROUTE_LONG','id':'ROUTE_ID'}, axis=1)
)
route_output['SHAPE_ID'] = route_output['ROUTE_ID']

route_output = gpd.GeoDataFrame(
    route_output[['year','SHAPE_ID','ROUTE_ID','ROUTE_SHORT','ROUTE_LONG','OPERATOR_N','geometry']],
    geometry='geometry',
    crs='epsg:20255',
).to_crs('epsg:4326')

In [63]:
# Route table: identifiers, per-period headways and speeds, and the segment index list.
routes = stop_patterns[['year','route','id','mode','name', *headway_cols, *speed_cols, 'segments']]

In [64]:
# One row per (route, segment): explode the per-route SEGMENTS list and join it back by index.
seg_expand = (
    stop_patterns[['id','year','mode','route','name','UNIQUE_ROUTE']]
    .merge(stop_patterns.SEGMENTS.explode(), right_index=True, left_index=True)
    .dropna()
    .sort_values(['year','id'])
    .reset_index(drop=True)
)

In [65]:
# Each SEGMENTS entry is a (geometry, 'A_B', distance) tuple; unpack it into columns.
seg_expand[['SEGMENT','NODES','route_dist']] = pd.DataFrame(seg_expand['SEGMENTS'].tolist(), index=seg_expand.index)
# Promote the segment geometry to the active geometry column and reproject to EPSG:4326.
seg_expand = gpd.GeoDataFrame(
    seg_expand.drop('SEGMENTS', axis=1).rename(columns={'SEGMENT':'geometry'}),
    geometry='geometry',
    crs='epsg:20255',
).to_crs('epsg:4326')

In [66]:
# Split the 'A_B' label into its two stop ids, then discard the label.
seg_expand[['stop1','stop2']] = pd.DataFrame(
    seg_expand['NODES'].apply(lambda label: label.split('_')).tolist(),
    index=seg_expand.index,
)
seg_expand = seg_expand.drop('NODES', axis=1)
# String copies of the stop ids.
seg_expand['stop1N'] = seg_expand['stop1'].apply(str)
seg_expand['stop2N'] = seg_expand['stop2'].apply(str)
# The route id moves to 'routes'; 'id' becomes the segment's 0-based position within its route/year.
seg_expand = seg_expand.rename({'id':'routes'}, axis=1)
seg_expand['id'] = seg_expand.groupby(['year','routes']).cumcount()

In [67]:
# Sanity check: count segments by whether both ends are the same stop (expect only False).
seg_expand['stop1'].eq(seg_expand['stop2']).to_frame().groupby(0)[0].count()

0
False    483512
Name: 0, dtype: int64

In [68]:
# One row per (route, stop): explode the per-route STOPPATTERN list and join it back by index.
sp_expand = (
    stop_patterns[['id','year','mode','route','name']]
    .merge(stop_patterns.STOPPATTERN.explode(), right_index=True, left_index=True)
    .dropna()
    .sort_values(['year','id'])
    .reset_index(drop=True)
)

In [69]:
# Each STOPPATTERN entry is a (node id, Point) pair. The node id overwrites the route id
# in 'id', and 'name' becomes the node id as a string.
sp_expand[['id','STOPPATTERN']] = pd.DataFrame(sp_expand['STOPPATTERN'].tolist(), index=sp_expand.index)
sp_expand['name'] = sp_expand['id'].apply(str)
sp_expand = gpd.GeoDataFrame(
    sp_expand.rename(columns={'STOPPATTERN':'geometry'}),
    geometry='geometry',
    crs='epsg:20255',
).to_crs('epsg:4326')
# Keep one row per stop id and drop the route-level columns.
sp_expand = (
    sp_expand
    .drop_duplicates(subset=['id'])
    .drop(['year','route','mode'], axis=1)
    .reset_index(drop=True)
)

In [70]:
end_processing_time = time.time()
pre_processing_time = end_processing_time - start_time

# from importlib import reload
# import route_segs
# reload(create_gtfs_from_basicinfo)
# reload(route_segs)

# print('Saving network shapefiles')
# if not os.path.isdir('output/Network'):
#     os.mkdir('output/Network')
# for year in years:
#     route_output.loc[route_output['year'] == year].to_file('output/Network/VITM_'+year+'_segments.shp')
    
#     stop_ids = seg_expand.loc[seg_expand['year'] == year, ['stop1','stop2']].values
#     stop_ids = list(dict.fromkeys([int(val) for sublist in stop_ids for val in sublist]))
    
#     sp_expand.loc[sp_expand['id'].isin(stop_ids)].to_file('output/Network/VITM_'+year+'_stops.shp')

# makedirs(exist_ok=True) also creates a missing 'input' parent and is a no-op when the
# directory already exists; os.mkdir would raise if 'input' were absent.
os.makedirs('input/temp', exist_ok=True)

# Mode name -> name of the per-mode folder under 'output/'.
modes = {'vline':'1','train':'2','tram':'3','bus':'4','skybus':'11'}

for year in years:
    print('Generating GTFS outputs for '+year+' reference case\n')
    for mode in all_modes:
        if mode not in modes:
            print("'"+mode+"' not a valid mode")
            continue
#         if mode not in seg_expand['mode'].unique().to_list():
#             continue
    #     if mode != 'train':
    #         continue
        # Arguments are handed over via sys.argv
        # (presumably parsed inside create_gtfs_from_basicinfo.run() - TODO confirm).
        sys.argv = ['', '--year', year, '--service', mode]

        os.makedirs('output/'+modes[mode], exist_ok=True)

        # Segments of this mode and year; computed once and reused below.
        mode_year_segs = seg_expand.loc[(seg_expand['mode'] == mode) & (seg_expand['year'] == year)]

        # Distinct stop ids used by those segments, in first-seen order.
        stop_ids = mode_year_segs[['stop1','stop2']].values
        stop_ids = list(dict.fromkeys([int(val) for sublist in stop_ids for val in sublist]))
        
        unique_route_ids = list(mode_year_segs['UNIQUE_ROUTE'].unique())

        # Write the four temp inputs for this mode/year, overwriting the previous iteration's files.
        sp_expand.loc[sp_expand['id'].isin(stop_ids)].to_file('input/temp/VITM_stops.shp')
        mode_year_segs.drop('UNIQUE_ROUTE',axis=1).to_file('input/temp/VITM_segments.shp')
        routes.loc[(routes['mode'] == mode) & (routes['year'] == year)].to_csv('input/temp/VITM_routes.csv', index=False, sep=';')
        df_shapes.loc[df_shapes['UNIQUE_ROUTE'].isin(unique_route_ids)].drop('UNIQUE_ROUTE',axis=1).to_csv('input/temp/shapes.txt', index=False)

        create_gtfs_from_basicinfo.run()

    print('\nCreating output/VITM_'+year+'_GTFS.zip...\n')
    # Bundle every per-mode output folder into one zip for the year, then delete the folders.
    with zipfile.ZipFile('output/VITM_'+year+'_GTFS.zip', 'w', zipfile.ZIP_DEFLATED) as zipObj:
        for folder_id in list(modes.values()):
            if os.path.isdir('output/'+folder_id):
                for folderName, subfolders, filenames in os.walk('output/'+folder_id):
                    for filename in filenames:
                        # Full path of the file on disk
                        filePath = os.path.join(folderName, filename)
                        # Stored as <folder_id>/<filename>: any nested sub-folders are flattened.
                        zipObj.write(filePath, os.path.join(folder_id, filename))
                shutil.rmtree('output/'+folder_id)
    
print('Cleaning up temp files...')
shutil.rmtree('input/temp')

print('Done')

total_time = time.time() - start_time

print('Pre-processing time: %.1f minutes' % (pre_processing_time / 60))
print('Total time: %.1f minutes' % (total_time / 60))

# Ring the terminal bell to signal completion.
os.system("printf '\a'")

Generating GTFS outputs for 2018 reference case

Creating GTFS feed for bus
Processing routes 0 to 907
About to save complete timetable to file output/4/google_transit.zip ...
...finished writing to file output/4/google_transit.zip.
Creating GTFS feed for skybus
Processing routes 0 to 17
About to save complete timetable to file output/11/google_transit.zip ...
...finished writing to file output/11/google_transit.zip.
Creating GTFS feed for train
Processing routes 0 to 267
About to save complete timetable to file output/2/google_transit.zip ...
...finished writing to file output/2/google_transit.zip.
Creating GTFS feed for tram
Processing routes 0 to 45
About to save complete timetable to file output/3/google_transit.zip ...
...finished writing to file output/3/google_transit.zip.
Creating GTFS feed for vline
Processing routes 0 to 95
About to save complete timetable to file output/1/google_transit.zip ...
...finished writing to file output/1/google_transit.zip.

Creating output/VITM_20

0