In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import geoplot
from pysal.lib import weights
import networkx as nx
from scipy.spatial import distance
import momepy
import pickle
import math

In [2]:
# Root folder of the externally stored data; get_specific_df() appends
# '/SA data/dataSA/...' to this string to locate the Google API pickle.
# NOTE(review): machine-specific absolute path - adjust for your environment.
#mount_path = "/Users/shenhaowang/Dropbox (MIT)/project_media_lab_South_Australia"
mount_path = "/mnt/c/Users/jason/Dropbox (MIT)/"

In [3]:
## read files
# All inputs below are loaded from the pre-processed "data_process" folder,
# addressed relative to the notebook's working directory.

# read dfs (pickled pandas objects: transaction tables and the OD flow table)
trans_mcc_df = pd.read_pickle("../../data_process/trans_mcc_df.pkl")
trans_age_df = pd.read_pickle("../../data_process/trans_age_df.pkl")
flow_df = pd.read_pickle("../../data_process/flow_df.pkl")

# read spatial files (SA2 polygons for South Australia and for Adelaide)
sa2_south_au = gpd.read_file("../../data_process/shapefiles/sa2_south_au.shp")
sa2_adelaide = gpd.read_file('../../data_process/shapefiles/sa2_adelaide.shp')

# read road networks
sa2_roads = gpd.read_file("../../data_process/shapefiles/sa2_roads.shp")

# read job and income data
jobs_all=pd.read_pickle("../../data_process/jobs_all.pkl")
jobs_industries=pd.read_pickle("../../data_process/jobs_industries.pkl")

In [4]:
def shortest_path(shp_file):
    """
    Compute all-pairs shortest paths over the Queen-contiguity network of the polygons.

    Two polygons are connected when they are Queen neighbours; each edge is weighted
    by the distance recovered from a kernel-weights object built on the EPSG:3112
    projection of the polygons (presumably centroid-to-centroid - confirm against
    libpysal's Kernel.from_dataframe).

        Inputs:
            shp_file - a shp file (GeoDataFrame of polygons). Node ids are the
                       integers 0..n-1, so the frame is expected to carry the
                       default integer index.
        Outputs:
            A dictionary that maps (o,d) pairs to its shortest path, given as the
            list of node ids from o to d (inclusive). Pairs that involve an island
            (a polygon without Queen neighbours) and pairs with no connecting path
            are left out of the dictionary.
    """

    print("=====Running shortest_path=====")

    # convert to the Australia-wide projection used for the distance computation
    shp_file_proj = shp_file.to_crs("epsg:3112")

    # Step 1. Queen net
    shp_file_queen = weights.contiguity.Queen.from_dataframe(shp_file)
    n_nodes = shp_file_queen.n
    # islands have no neighbours, hence no path to or from them
    islands = set(shp_file_queen.islands)

    # Step 2. Kernel net with the right euclidean weighting
    # Use all K nearest neighbors
    shp_file_kernel = weights.distance.Kernel.from_dataframe(shp_file_proj, k=n_nodes - 1)
    # turn the default kernel weights back into euclidean distances.
    # This inversion assumes the default (triangular) kernel, w = 1 - d / bandwidth.
    for i in shp_file_kernel.neighbors.keys():
        for j_idx in range(len(shp_file_kernel.neighbors[i])):
            # note that the weights are indexed by
            # i (node index), j_idx (position in i's neighbour list - not node index!)
            kernel_weight = shp_file_kernel.weights[i][j_idx]
            euclidean = (1 - kernel_weight) * shp_file_kernel.bandwidth[i]
            # bandwidth[i] is array-like, so take the scalar out of the result
            shp_file_kernel.weights[i][j_idx] = euclidean[0]

    # Step 3. assign euclidean weights to Queen net
    for o in shp_file_queen.neighbors.keys():
        for d_idx, d in enumerate(shp_file_queen.neighbors[o]):
            # look up the kernel-derived distance between o and d (by node id)
            shp_file_queen.weights[o][d_idx] = shp_file_kernel[o][d]

    # create the queen network in nx
    shp_file_nx = shp_file_queen.to_networkx()

    # assign the distance weights to the nx edges explicitly
    for o, d in shp_file_nx.edges:
        shp_file_nx.edges[o, d]['weight'] = shp_file_queen[o][d]

    # full paths.
    # return: (node, (distance, path))
    path = dict(nx.all_pairs_dijkstra(shp_file_nx, weight='weight'))

    # create a OD dictionary.
    OD_full_path = {}

    for o in range(n_nodes):
        if o in islands:
            continue
        paths_from_o = path[o][1]
        for d in range(n_nodes):
            # skip islands and destinations that are not reachable from o
            if d in islands or d not in paths_from_o:
                continue
            OD_full_path[(o, d)] = paths_from_o[d]

    print("=====DONE shortest_path=====")

    return OD_full_path

In [5]:
# All-pairs shortest paths over the Adelaide SA2 contiguity network.
OD_full_path = shortest_path(sa2_adelaide)

=====Running shortest_path=====


 There are 2 disconnected components.
 There is 1 island with id: 103.


=====DONE shortest_path=====


In [6]:
# Inspect the (o, d) -> shortest-path mapping.
# NOTE(review): this displays the whole dictionary, which is very large.
OD_full_path

{(0, 0): [0],
 (0, 1): [0, 1],
 (0, 2): [0, 18, 17, 14, 51, 2],
 (0, 3): [0, 12, 11, 3],
 (0, 4): [0, 12, 11, 3, 4],
 (0, 5): [0, 16, 10, 9, 5],
 (0, 6): [0, 12, 11, 3, 4, 6],
 (0, 7): [0, 12, 11, 3, 4, 7],
 (0, 8): [0, 16, 10, 9, 5, 8],
 (0, 9): [0, 16, 10, 9],
 (0, 10): [0, 16, 10],
 (0, 11): [0, 12, 11],
 (0, 12): [0, 12],
 (0, 13): [0, 18, 17, 14, 13],
 (0, 14): [0, 18, 17, 14],
 (0, 15): [0, 18, 17, 15],
 (0, 16): [0, 16],
 (0, 17): [0, 18, 17],
 (0, 18): [0, 18],
 (0, 19): [0, 21, 19],
 (0, 20): [0, 1, 20],
 (0, 21): [0, 21],
 (0, 22): [0, 22],
 (0, 23): [0, 23],
 (0, 24): [0, 1, 20, 35, 48, 41, 44, 29, 28, 31, 24],
 (0, 25): [0, 21, 37, 36, 39, 40, 45, 30, 27, 25],
 (0, 26): [0, 1, 20, 102, 38, 34, 26],
 (0, 27): [0, 21, 37, 36, 39, 40, 45, 30, 27],
 (0, 28): [0, 1, 20, 35, 48, 41, 44, 29, 28],
 (0, 29): [0, 1, 20, 35, 48, 41, 44, 29],
 (0, 30): [0, 21, 37, 36, 39, 40, 45, 30],
 (0, 31): [0, 1, 20, 35, 48, 41, 44, 29, 28, 31],
 (0, 32): [0, 21, 37, 36, 39, 40, 45, 32],
 (0, 33):

In [7]:
def union_road_land_shp(shp, road_shp):
    """
    Tag every road with the SA2 polygon containing its centroid and attach
    per-polygon road-class counts to the polygons.

        Inputs:
            shp - a shp file (polygon GeoDataFrame; rows are addressed by the
                  labels 0..n-1)
            road_shp - the shp file containing road information (must have a
                       'class' column)
        Outputs:
            shp_proj - shp projected to EPSG:3112 and inner-merged (on index) with
                       the road-class dummy sums of the roads located in each polygon
            sa2_roads_proj - road_shp projected to EPSG:3112 with an extra 'SA2_loc'
                             column (-1 for roads whose centroid is in no polygon)
    """

    print("=====Running union_road_land_shp=====")

    # bring both layers into the same projected CRS
    target_crs = "epsg:3112"
    shp_proj = shp.to_crs(target_crs)
    sa2_roads_proj = road_shp.to_crs(target_crs)

    # each road is represented by its centroid for the containment test
    road_centroid = sa2_roads_proj.centroid

    # -1 means "not (yet) located in any polygon"
    sa2_roads_proj['SA2_loc'] = -1

    for SA2_idx in range(len(shp_proj)):
        polygon = shp_proj.loc[SA2_idx, 'geometry']
        inside_polygon = road_centroid.within(polygon)
        sa2_roads_proj.loc[inside_polygon, 'SA2_loc'] = SA2_idx

    # Only the 'class' attribute is used for now; one-hot encode it.
    road_class_dummies = pd.get_dummies(sa2_roads_proj[['class', 'geometry', 'SA2_loc']])

    # keep located roads only, then count each road class per polygon
    located = road_class_dummies['SA2_loc'] > -1
    road_class_dummies = road_class_dummies.loc[located]
    sa2_road_class_agg = road_class_dummies.groupby(by='SA2_loc').sum()

    # attach the per-polygon road-class counts to the polygons
    shp_proj = shp_proj.merge(sa2_road_class_agg, how='inner', left_index=True, right_index=True)

    print("=====DONE union_road_land_shp=====")

    return shp_proj, sa2_roads_proj

In [8]:
# elt1: Adelaide SA2 polygons merged with per-SA2 road-class counts.
# elt2: projected road segments tagged with the SA2 index ('SA2_loc') they fall in.
elt1, elt2 = union_road_land_shp(sa2_adelaide, sa2_roads)

=====Running union_road_land_shp=====
=====DONE union_road_land_shp=====


In [9]:
# Preview the SA2 polygons with the merged class_* road columns.
elt1.head()

Unnamed: 0,SA2_MAIN16,SA2_5DIG16,SA2_NAME16,SA3_CODE16,SA3_NAME16,SA4_CODE16,SA4_NAME16,GCC_CODE16,GCC_NAME16,STE_CODE16,...,class_ART,class_BUS,class_COLL,class_FREE,class_HWY,class_LOCL,class_SUBA,class_TRK2,class_TRK4,class_UND
0,401011001,41001,Adelaide,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,208.0,13.0,402.0,0.0,0.0,1089.0,501.0,0.0,0.0,10.0
1,401011002,41002,North Adelaide,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,59.0,3.0,221.0,0.0,0.0,248.0,108.0,0.0,0.0,1.0
2,401021003,41003,Adelaide Hills,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,286.0,0.0,244.0,0.0,0.0,1096.0,188.0,2737.0,37.0,0.0
3,401021004,41004,Aldgate - Stirling,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,90.0,0.0,107.0,88.0,0.0,1360.0,421.0,700.0,0.0,19.0
4,401021005,41005,Hahndorf - Echunga,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,87.0,0.0,49.0,24.0,0.0,438.0,73.0,423.0,0.0,0.0


In [10]:
def get_degree_df(shp_proj, road_proj):
    """
    Count road-network nodes by degree inside every SA2.

    For each SA2 index found in road_proj['SA2_loc'] (ignoring the -1 "unassigned"
    marker) the roads of that SA2 are converted to a primal momepy graph and its
    nodes are tallied by degree.

        Inputs:
            shp_proj - a shp projection merged with road info; the first output union_road_land_shp(shp, road_shp)
            road_proj - a shp projection that has only road information; the second output of union_road_land_shp(shp, road_shp)
        Outputs:
            degree_df - a df with one row per SA2 (sorted by SA2 index) holding the
                        SA2 code ('SA2_MAIN16'), the number of graph nodes
                        ('num_nodes'), the number of nodes of degree 1, 2, 3 and 4,
                        and the number of nodes of degree 5 or more
                        ('num_greater5degree')
    """

    print("=====Running get_degree_df=====")

    columns = ["SA2_MAIN16", "num_nodes",
               "num_1degree", "num_2degree",
               "num_3degree", "num_4degree",
               "num_greater5degree"]

    # SA2 indices that own at least one road. -1 marks roads that were not
    # assigned to any SA2; filter it by value rather than dropping the first
    # sorted entry, which would lose SA2 0 whenever no road is unassigned.
    sa_idxs = sorted(idx for idx in road_proj["SA2_loc"].unique() if idx > -1)

    # collect one record per SA2 and build the frame once at the end
    records = []
    for sa_idx in sa_idxs:
        within = road_proj[road_proj["SA2_loc"] == sa_idx]
        graph = momepy.gdf_to_nx(within, approach='primal')
        # stores every node's degree in the 'degree' node attribute
        graph = momepy.node_degree(graph, name='degree')
        # only the node table is needed; spatial weights are not requested
        # because they were never used and are costly to build
        node_df, _ = momepy.nx_to_gdf(graph, points=True, lines=True)
        node_degrees = node_df["degree"]

        records.append({
            # 'SA2_loc' holds index labels of shp_proj, so look up by label
            "SA2_MAIN16": shp_proj.loc[sa_idx, "SA2_MAIN16"],
            # nodes are the intersections / end points of the road segments
            "num_nodes": len(node_df),
            "num_1degree": int((node_degrees == 1).sum()),
            "num_2degree": int((node_degrees == 2).sum()),
            "num_3degree": int((node_degrees == 3).sum()),
            "num_4degree": int((node_degrees == 4).sum()),
            "num_greater5degree": int((node_degrees >= 5).sum()),
        })

    degree_df = pd.DataFrame(records, columns=columns)

    print("=====DONE degree df=====")

    return degree_df

In [11]:
# Per-SA2 counts of road-network nodes by degree.
degree_df = get_degree_df(elt1, elt2)

=====Running get_degree_df=====


 There are 2 disconnected components.
 There are 3 disconnected components.
 There are 58 disconnected components.
 There are 21 disconnected components.
 There are 12 disconnected components.
 There are 18 disconnected components.
 There are 29 disconnected components.
 There are 47 disconnected components.
 There are 9 disconnected components.
 There are 23 disconnected components.
 There are 4 disconnected components.
 There are 5 disconnected components.
 There are 3 disconnected components.
 There are 3 disconnected components.
 There are 4 disconnected components.
 There are 4 disconnected components.
 There are 2 disconnected components.
 There are 3 disconnected components.
 There are 5 disconnected components.
 There are 7 disconnected components.
 There are 4 disconnected components.
 There are 10 disconnected components.
 There are 8 disconnected components.
 There are 4 disconnected components.
 There are 5 disconnected components.
 There are 4 disconnected components.
 The

 There are 6 disconnected components.
 There are 8 disconnected components.
 There are 10 disconnected components.
 There are 8 disconnected components.
 There are 10 disconnected components.
 There are 7 disconnected components.
 There are 9 disconnected components.
 There are 8 disconnected components.
 There are 8 disconnected components.
 There are 24 disconnected components.
 There are 8 disconnected components.
 There are 4 disconnected components.
 There are 5 disconnected components.
 There are 4 disconnected components.
 There are 5 disconnected components.
 There are 2 disconnected components.
 There are 6 disconnected components.
 There are 2 disconnected components.
 There are 12 disconnected components.
 There are 3 disconnected components.
 There are 8 disconnected components.
 There are 3 disconnected components.
 There are 3 disconnected components.
 There are 16 disconnected components.
 There are 6 disconnected components.
 There are 3 disconnected components.
 There 

 There are 12 disconnected components.
 There are 4 disconnected components.
 There are 3 disconnected components.
 There are 3 disconnected components.
 There are 3 disconnected components.
 There are 2 disconnected components.
 There are 5 disconnected components.
 There are 3 disconnected components.
 There are 5 disconnected components.
 There are 3 disconnected components.
 There are 4 disconnected components.
 There are 2 disconnected components.
 There are 7 disconnected components.
 There are 2 disconnected components.
 There are 8 disconnected components.
 There are 3 disconnected components.
 There are 2 disconnected components.


=====DONE degree df=====


In [12]:
# Inspect the per-SA2 degree counts.
degree_df

Unnamed: 0,SA2_MAIN16,num_nodes,num_1degree,num_2degree,num_3degree,num_4degree,num_greater5degree
0,401011001,1642,291,199,865,276,11
1,401011002,517,149,54,239,69,6
2,401021003,4034,1174,731,1982,141,6
3,401021004,2455,710,502,1126,110,7
4,401021005,989,286,245,421,36,1
...,...,...,...,...,...,...,...
105,404031105,157,28,19,90,20,0
106,404031106,530,102,39,308,81,0
107,404031107,1036,170,105,616,143,2
108,404031108,887,125,120,410,231,1


In [13]:
def get_specific_df(OD_full_path, shp, shp_proj, sa4_set=['401','402','403','404']):
    """
    Build the edge-level (OD flow) and node-level (SA2) feature tables.

    Besides its arguments, this function reads the notebook-level globals
    ``mount_path`` (location of the Google API pickle), ``flow_df`` and ``jobs_all``.

        Inputs:
            OD_full_path - output from shortest_path(): maps (o_idx, d_idx) to the
                           list of node indices on the shortest path
            shp - original shp file (needs an 'SA2_MAIN16' column)
            shp_proj - the shp file merged with road attributes; the first output of
                       union_road_land_shp(shp, road_shp)
            sa4_set - 3-character code prefixes; a flow is kept only when both its
                      home code and its destination code start with one of them.
                      The default list is never mutated.

        Outputs:
            edge_specific_df - one row per OD flow with the road-class counts summed
                               along the shortest path, job/income data of origin and
                               destination, Google travel duration/distance and the
                               total road count
            node_specific_df - shp_proj merged with job/income data, origin and
                               destination flow totals and flow entropies
    """

    print("=====Running get_specific_df=====")

    # read google api info.
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    with open(mount_path + '/SA data/dataSA/OD_Google_API_With_Map_Info.pickle', 'rb') as f:
        OD_google_with_map = pickle.load(f)

    jobs_all_sub = jobs_all[['num_jobs_000_persons', 'sa2_code16', 'median_income_per_job_aud_persons']]

    # keep the flows whose home AND destination SA2 codes start with a prefix in sa4_set
    home_in_region = np.array([x[:3] in sa4_set for x in flow_df.agent_home_sa2])
    flow_adelaide_df = flow_df.loc[home_in_region]
    destination_in_region = np.array([x[:3] in sa4_set for x in flow_adelaide_df.sa2])
    flow_adelaide_df = flow_adelaide_df.loc[destination_in_region]

    # rename() returns a new frame, so the column assignments below do not write
    # into a slice of flow_df
    flow_adelaide_df = flow_adelaide_df.rename(columns={'agent_home_sa2':'origin','sa2':'destination'})
    # OD key: origin code concatenated with destination code
    flow_adelaide_df['OD'] = flow_adelaide_df['origin'] + flow_adelaide_df['destination']

    # reindex to 0..n-1 so that the positional loops below can use .loc
    flow_adelaide_df.index = np.arange(flow_adelaide_df.shape[0])

    # create ten columns here.
    road_attribute_names_list = ['class_ART', 'class_BUS', 'class_COLL',
                                 'class_FREE', 'class_HWY', 'class_LOCL', 'class_SUBA', 'class_TRK2',
                                 'class_TRK4', 'class_UND']
    flow_adelaide_df[road_attribute_names_list] = 0.0

    # add the road attributes on the shortest path to the flow_adelaide_df.
    # time cost: 3-5 mins?
    for idx in np.arange(flow_adelaide_df.shape[0]):
        origin = flow_adelaide_df.loc[idx, 'origin']
        destination = flow_adelaide_df.loc[idx, 'destination']
        # translate the SA2 codes into the row indices used as graph node ids
        o_idx = shp.index[shp.SA2_MAIN16==origin].tolist()[0]
        d_idx = shp.index[shp.SA2_MAIN16==destination].tolist()[0]

        try:
            # OD_full_path might not have all the shortest path...
            idx_list_on_shortest_path = OD_full_path[(o_idx, d_idx)]
            for node_on_shortest_path in idx_list_on_shortest_path:
                flow_adelaide_df.loc[idx, road_attribute_names_list] += shp_proj.loc[node_on_shortest_path, road_attribute_names_list]
        except KeyError:
            # best effort: OD pairs without a path (or nodes missing from shp_proj)
            # keep whatever has been accumulated so far
            pass

    # add the job information to flow dataframe.
    # origin
    flow_adelaide_df=flow_adelaide_df.merge(jobs_all_sub, left_on='origin', right_on='sa2_code16', how = 'left')
    flow_adelaide_df=flow_adelaide_df.rename(columns={'num_jobs_000_persons':'num_jobs_000_persons_origin', 'median_income_per_job_aud_persons':'median_income_per_job_aud_origin'})

    # destination
    flow_adelaide_df=flow_adelaide_df.merge(jobs_all_sub, left_on='destination', right_on='sa2_code16', how = 'left')
    flow_adelaide_df=flow_adelaide_df.rename(columns={'num_jobs_000_persons':'num_jobs_000_persons_destination', 'median_income_per_job_aud_persons':'median_income_per_job_aud_destination'})

    # augment the travel time and distance information to flow_adelaide_df
    flow_adelaide_df['od_duration_value']=0.0
    flow_adelaide_df['od_distance_value']=0.0

    # loop-invariant lookups of the Google OD table
    google_origin = OD_google_with_map.loc[:, 'o_sa2_idx']
    google_destination = OD_google_with_map.loc[:, 'd_sa2_idx']

    for idx in range(flow_adelaide_df.shape[0]):
        if idx%100 == 0:
            print(idx)

        # idx is the index in flow_adelaide_df
        origin_sa2_idx = flow_adelaide_df.loc[idx,'origin']
        destination_sa2_idx = flow_adelaide_df.loc[idx,'destination']

        # return the corresponding idx from OD_Google_API.
        # '&' combines the boolean masks without the numexpr warning that
        # np.multiply on boolean Series triggers.
        filter_idx = (google_origin == origin_sa2_idx) & (google_destination == destination_sa2_idx)
        idx_google_api = OD_google_with_map.index[filter_idx].tolist()[0] # this is the index in OD_google_with_map

        flow_adelaide_df.loc[idx, 'od_duration_value'] = OD_google_with_map.loc[idx_google_api, 'od_duration_value']
        flow_adelaide_df.loc[idx, 'od_distance_value'] = OD_google_with_map.loc[idx_google_api, 'od_distance_value']

    # replace 0.0 values by 1.0
    # (presumably so that later log transforms are defined - TODO confirm)
    cols = ['sum_stay_duration','unique_agents','total_stays',
            'class_ART', 'class_BUS', 'class_COLL', 'class_FREE', 'class_HWY', 'class_LOCL',
            'class_SUBA', 'class_TRK2', 'class_TRK4', 'class_UND',
            'od_duration_value', 'od_distance_value']

    for col in cols:
        flow_adelaide_df.loc[flow_adelaide_df.loc[:,col] == 0.0, col] = 1.0

    # dropped 433 observations. The df has nan.
    flow_adelaide_df.dropna(how = 'any', inplace = True)

    # add total road count as a variable
    flow_adelaide_df['road_counts'] = flow_adelaide_df[road_attribute_names_list].sum(axis=1)
    edge_specific_df = flow_adelaide_df.copy()

    print("=====DONE EDGE=====")

    flow_count_cols = ['unique_agents','sum_stay_duration','total_stays']

    # origin and destination flow counts
    # (select the numeric columns before summing so that string columns are not aggregated)
    origin_flow_counts = flow_adelaide_df.groupby(by="origin",as_index=False,sort=False)[flow_count_cols].sum()
    destination_flow_counts = flow_adelaide_df.groupby(by="destination",as_index=False,sort=False)[flow_count_cols].sum()

    # compute origin and destination entropy (w.r.t. location). flow location diversity.
    # Each flow contributes -p*log(p), with p its share of the origin's (or
    # destination's) total; the contributions are summed per location.
    # origin
    origin_flow_count_n = flow_adelaide_df.groupby('origin')[flow_count_cols].transform('sum')
    values = flow_adelaide_df[flow_count_cols]/origin_flow_count_n
    flow_adelaide_df[['unique_agents_origin_entropy','sum_stay_duration_origin_entropy','total_stays_origin_entropy']] = \
        -(values*np.log(values))
    origin_flow_entropy=flow_adelaide_df.groupby('origin',as_index=False,sort=False)[['unique_agents_origin_entropy','sum_stay_duration_origin_entropy','total_stays_origin_entropy']].sum()

    # destination
    destination_flow_count_n = flow_adelaide_df.groupby('destination')[flow_count_cols].transform('sum')
    values = flow_adelaide_df[flow_count_cols]/destination_flow_count_n
    flow_adelaide_df[['unique_agents_destination_entropy','sum_stay_duration_destination_entropy','total_stays_destination_entropy']] = \
        -(values*np.log(values))
    destination_flow_entropy=flow_adelaide_df.groupby('destination',as_index=False,sort=False)[['unique_agents_destination_entropy','sum_stay_duration_destination_entropy','total_stays_destination_entropy']].sum()

    # merge data to sa2_adelaide_road_proj
    # augment income and jobs
    sa2_data_prep=pd.merge(shp_proj, jobs_all_sub, left_on='SA2_MAIN16', right_on='sa2_code16', how = 'inner')
    sa2_data_prep=pd.merge(sa2_data_prep, origin_flow_counts, left_on='SA2_MAIN16', right_on='origin', how='inner', suffixes=[None,'_origin_counts'])
    sa2_data_prep=pd.merge(sa2_data_prep, destination_flow_counts, left_on='SA2_MAIN16', right_on='destination', how='inner', suffixes=[None,'_destination_counts'])
    sa2_data_prep=pd.merge(sa2_data_prep, origin_flow_entropy, left_on='SA2_MAIN16', right_on='origin', how='inner')
    sa2_data_prep=pd.merge(sa2_data_prep, destination_flow_entropy, left_on='SA2_MAIN16', right_on='destination', how='inner')

    # rename the '_origin_counts'
    # (the origin totals kept their bare names through the merges above)
    sa2_data_prep = sa2_data_prep.rename(columns={'unique_agents':'unique_agents_origin_counts',
                                          'sum_stay_duration':'sum_stay_duration_origin_counts',
                                          'total_stays':'total_stays_origin_counts'})

    node_specific_df = sa2_data_prep.copy()

    print("=====DONE get_specific_df=====")

    return edge_specific_df, node_specific_df

In [14]:
# Build the OD-flow (edge) and SA2 (node) feature tables for Adelaide.
# Prints a progress counter every 100 rows while attaching the Google travel data.
edge_specific_df, node_specific_df = get_specific_df(OD_full_path, sa2_adelaide, elt1)

=====Running get_specific_df=====
0


  f"evaluating in Python space because the {repr(op_str)} "


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
=====DONE EDGE=====
=====DONE get_specific_df=====


In [15]:
# Inspect the edge-level (one row per OD flow) table.
edge_specific_df

Unnamed: 0,timestamp,destination,state,origin,unique_agents,sum_stay_duration,total_stays,OD,class_ART,class_BUS,...,class_UND,num_jobs_000_persons_origin,sa2_code16_x,median_income_per_job_aud_origin,num_jobs_000_persons_destination,sa2_code16_y,median_income_per_job_aud_destination,od_duration_value,od_distance_value,road_counts
0,2018-02-01T00:00:00.000+11:00,401011001,4,401011001,19863,268779414,681217,401011001401011001,208.0,13.0,...,10.0,14.057,401011001,17265.0,14.057,401011001,17265.0,1.0,1.0,2227.0
1,2018-02-01T00:00:00.000+11:00,401011002,4,401011001,6911,4857456,29322,401011001401011002,267.0,16.0,...,11.0,14.057,401011001,17265.0,5.459,401011002,24773.0,565.0,2897.0,2867.0
2,2018-02-01T00:00:00.000+11:00,401021003,4,401011001,404,253764,1334,401011001401021003,838.0,82.0,...,14.0,14.057,401011001,17265.0,5.373,401021003,24830.0,3293.0,39614.0,10415.0
3,2018-02-01T00:00:00.000+11:00,401021004,4,401011001,1184,903377,5487,401011001401021004,430.0,13.0,...,33.0,14.057,401011001,17265.0,14.156,401021004,28246.0,1603.0,20319.0,6590.0
4,2018-02-01T00:00:00.000+11:00,401021005,4,401011001,700,189721,1278,401011001401021005,517.0,13.0,...,33.0,14.057,401011001,17265.0,3.395,401021005,25031.0,2156.0,34698.0,7684.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10668,2018-02-01T00:00:00.000+11:00,404031105,4,404031109,2304,5876916,12172,404031109404031105,59.0,1.0,...,1.0,3.909,404031109,28657.0,1.867,404031105,26115.0,345.0,3289.0,520.0
10669,2018-02-01T00:00:00.000+11:00,404031106,4,404031109,2967,5413932,16935,404031109404031106,105.0,1.0,...,1.0,3.909,404031109,28657.0,9.750,404031106,25913.0,528.0,5066.0,1025.0
10670,2018-02-01T00:00:00.000+11:00,404031107,4,404031109,1811,937583,8366,404031109404031107,262.0,1.0,...,18.0,3.909,404031109,28657.0,18.962,404031107,23052.0,572.0,6601.0,1992.0
10671,2018-02-01T00:00:00.000+11:00,404031108,4,404031109,2464,1334692,11329,404031109404031108,182.0,1.0,...,21.0,3.909,404031109,28657.0,13.439,404031108,23003.0,695.0,6931.0,1849.0


In [16]:
# Inspect the node-level (one row per SA2) table.
node_specific_df

Unnamed: 0,SA2_MAIN16,SA2_5DIG16,SA2_NAME16,SA3_CODE16,SA3_NAME16,SA4_CODE16,SA4_NAME16,GCC_CODE16,GCC_NAME16,STE_CODE16,...,sum_stay_duration_destination_counts,total_stays_destination_counts,origin_y,unique_agents_origin_entropy,sum_stay_duration_origin_entropy,total_stays_origin_entropy,destination_y,unique_agents_destination_entropy,sum_stay_duration_destination_entropy,total_stays_destination_entropy
0,401011001,41001,Adelaide,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,1274311971,6577944,401011001,3.987705,1.588336,2.324296,401011001,4.460324,3.949646,4.237396
1,401011002,41002,North Adelaide,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,236898517,1123442,401011002,3.855433,1.944331,2.609353,401011002,4.391005,3.152188,3.975034
2,401021003,41003,Adelaide Hills,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,132330008,304810,401021003,3.956555,1.266519,2.456366,401021003,4.007255,0.948900,2.042530
3,401021004,41004,Aldgate - Stirling,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,339128660,922067,401021004,3.879316,1.355832,2.310259,401021004,3.942693,1.099284,2.075796
4,401021005,41005,Hahndorf - Echunga,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,99670729,281143,401021005,3.732677,1.404100,2.330624,401021005,4.017761,1.654530,2.549629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,404031105,41105,Fulham,40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,41857389,121431,404031105,3.607086,1.765320,2.501771,404031105,3.689014,1.975506,2.784143
99,404031106,41106,Lockleys,40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,252973587,697096,404031106,3.813245,1.737626,2.658701,404031106,4.044919,1.416921,2.608267
100,404031107,41107,Plympton,40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,533857644,1527065,404031107,3.897496,1.747409,2.601546,404031107,4.083340,1.695853,2.732940
101,404031108,41108,Richmond (SA),40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,422944715,1549687,404031108,3.944291,1.692916,2.532070,404031108,4.279049,2.308701,3.390496


In [22]:
def union_degree(node_df, edge_df, degree_df, OD_full_path):
    """
        Attach road-network degree counts to the node and edge dataframes.

        Inputs:
            node_df, edge_df - the node and edge specific df from get_specific_df.
                node_df is merged on "SA2_MAIN16"; edge_df must provide
                "origin", "destination" and the merge keys
                "sa2_code16_x" / "sa2_code16_y".
            degree_df - output of get degree df. Must provide "SA2_MAIN16" and the
                six count columns num_nodes, num_1degree, num_2degree, num_3degree,
                num_4degree, num_greater5degree. SA2_MAIN16 is assumed to be unique.
            OD_full_path - output of shortest_path. Maps
                (origin index, destination index) to the sequence of indices on
                the path; indices are the index labels of degree_df.
        Outputs:
            edge_degree_df, node_degree_df - respective dfs merged with degree df.
                Edge rows gain "<count>_x" (origin), "<count>_y" (destination) and
                "<count>_pth" (summed over every area on the shortest path).
        Raises:
            KeyError - if an origin/destination is missing from degree_df, or an
                (origin, destination) index pair is missing from OD_full_path.
    """

    print("=====Running union_degree=====")

    degree_cols = ["num_nodes", "num_1degree", "num_2degree", "num_3degree",
                   "num_4degree", "num_greater5degree"]

    node_degree_df = node_df.merge(degree_df, how="left", on="SA2_MAIN16")

    # Lookup tables between SA2 codes, degree_df index labels and the counts.
    sa_to_i = {}
    i_to_sa = {}
    sa_to_data = {}
    for row in degree_df[["SA2_MAIN16"] + degree_cols].itertuples(index=True, name=None):
        idx, sa = row[0], row[1]
        i_to_sa[idx] = sa
        sa_to_i[sa] = idx
        sa_to_data[sa] = [float(value) for value in row[2:]]

    # Collect one record per distinct OD pair and build the frame once at the
    # end: growing a DataFrame row by row is quadratic, and DataFrame.append
    # no longer exists in pandas >= 2.0.
    records = []
    seen_pairs = set()
    for o, d in zip(edge_df["origin"].values, edge_df["destination"].values):
        # A repeated pair would duplicate lookup rows and multiply the rows of
        # the left merge below, so each pair is emitted only once.
        if (o, d) in seen_pairs:
            continue
        seen_pairs.add((o, d))

        origin_counts = sa_to_data[o]
        dest_counts = sa_to_data[d]

        if o != d:
            # Sum the counts of every area on the shortest path.
            path_counts = [0.0] * len(degree_cols)
            for idx in OD_full_path[(sa_to_i[o], sa_to_i[d])]:
                area_counts = sa_to_data[i_to_sa[idx]]
                path_counts = [total + count
                               for total, count in zip(path_counts, area_counts)]
        else:
            # Intra-zone flow: the "path" is the zone itself.
            path_counts = origin_counts

        record = {"sa2_code16_x": o, "sa2_code16_y": d}
        for col, x_val, y_val, pth_val in zip(degree_cols, origin_counts,
                                              dest_counts, path_counts):
            record[col + "_x"] = x_val
            record[col + "_y"] = y_val
            record[col + "_pth"] = pth_val
        records.append(record)

    # Explicit column order; the *_pth columns keep the order seen in the
    # notebook's recorded output (num_nodes_pth last).
    lookup_cols = (["sa2_code16_x", "sa2_code16_y"]
                   + [col + "_x" for col in degree_cols]
                   + [col + "_y" for col in degree_cols]
                   + [col + "_pth" for col in degree_cols[1:] + degree_cols[:1]])
    degree_lookup = pd.DataFrame(records, columns=lookup_cols)

    edge_degree_df = edge_df.merge(degree_lookup, how="left",
                                   on=["sa2_code16_x", "sa2_code16_y"])

    print("=====DONE union_degree=====")

    return edge_degree_df, node_degree_df

In [19]:
# Merge the road-network degree counts into the node-level and edge-level frames.
edge_degree_df, node_degree_df = union_degree(node_specific_df, edge_specific_df, degree_df, OD_full_path)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109


In [20]:
# Inspect the edge-level frame returned by union_degree (origin *_x, destination *_y and path *_pth counts).
edge_degree_df

Unnamed: 0,timestamp,destination,state,origin,unique_agents,sum_stay_duration,total_stays,OD,class_ART,class_BUS,...,num_2degree_y,num_3degree_y,num_4degree_y,num_greater5degree_y,num_1degree_pth,num_2degree_pth,num_3degree_pth,num_4degree_pth,num_greater5degree_pth,num_nodes_pth
0,2018-02-01T00:00:00.000+11:00,401011001,4,401011001,19863,268779414,681217,401011001401011001,208.0,13.0,...,199.0,865.0,276.0,11.0,291.0,199.0,865.0,276.0,11.0,1642.0
1,2018-02-01T00:00:00.000+11:00,401011002,4,401011001,6911,4857456,29322,401011001401011002,267.0,16.0,...,54.0,239.0,69.0,6.0,440.0,253.0,1104.0,345.0,17.0,2159.0
2,2018-02-01T00:00:00.000+11:00,401021003,4,401011001,404,253764,1334,401011001401021003,838.0,82.0,...,731.0,1982.0,141.0,6.0,2073.0,1220.0,4337.0,801.0,19.0,8450.0
3,2018-02-01T00:00:00.000+11:00,401021004,4,401011001,1184,903377,5487,401011001401021004,430.0,13.0,...,502.0,1126.0,110.0,7.0,1222.0,853.0,2602.0,587.0,18.0,5282.0
4,2018-02-01T00:00:00.000+11:00,401021005,4,401011001,700,189721,1278,401011001401021005,517.0,13.0,...,245.0,421.0,36.0,1.0,1508.0,1098.0,3023.0,623.0,19.0,6271.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,2018-02-01T00:00:00.000+11:00,404031105,4,404031109,2304,5876916,12172,404031109404031105,59.0,1.0,...,19.0,90.0,20.0,0.0,65.0,39.0,241.0,39.0,1.0,385.0
10236,2018-02-01T00:00:00.000+11:00,404031106,4,404031109,2967,5413932,16935,404031109404031106,105.0,1.0,...,39.0,308.0,81.0,0.0,139.0,59.0,459.0,100.0,1.0,758.0
10237,2018-02-01T00:00:00.000+11:00,404031107,4,404031109,1811,937583,8366,404031109404031107,262.0,1.0,...,105.0,616.0,143.0,2.0,269.0,172.0,850.0,194.0,7.0,1492.0
10238,2018-02-01T00:00:00.000+11:00,404031108,4,404031109,2464,1334692,11329,404031109404031108,182.0,1.0,...,120.0,410.0,231.0,1.0,224.0,187.0,644.0,282.0,6.0,1343.0


In [21]:
# Inspect the node-level frame returned by union_degree (one row per SA2 with its degree counts).
node_degree_df

Unnamed: 0,SA2_MAIN16,SA2_5DIG16,SA2_NAME16,SA3_CODE16,SA3_NAME16,SA4_CODE16,SA4_NAME16,GCC_CODE16,GCC_NAME16,STE_CODE16,...,destination_y,unique_agents_destination_entropy,sum_stay_duration_destination_entropy,total_stays_destination_entropy,num_nodes,num_1degree,num_2degree,num_3degree,num_4degree,num_greater5degree
0,401011001,41001,Adelaide,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,401011001,4.460324,3.949646,4.237396,1642,291,199,865,276,11
1,401011002,41002,North Adelaide,40101,Adelaide City,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,401011002,4.391005,3.152188,3.975034,517,149,54,239,69,6
2,401021003,41003,Adelaide Hills,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,401021003,4.007255,0.948900,2.042530,4034,1174,731,1982,141,6
3,401021004,41004,Aldgate - Stirling,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,401021004,3.942693,1.099284,2.075796,2455,710,502,1126,110,7
4,401021005,41005,Hahndorf - Echunga,40102,Adelaide Hills,401,Adelaide - Central and Hills,4GADE,Greater Adelaide,4,...,401021005,4.017761,1.654530,2.549629,989,286,245,421,36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,404031105,41105,Fulham,40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,404031105,3.689014,1.975506,2.784143,157,28,19,90,20,0
99,404031106,41106,Lockleys,40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,404031106,4.044919,1.416921,2.608267,530,102,39,308,81,0
100,404031107,41107,Plympton,40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,404031107,4.083340,1.695853,2.732940,1036,170,105,616,143,2
101,404031108,41108,Richmond (SA),40403,West Torrens,404,Adelaide - West,4GADE,Greater Adelaide,4,...,404031108,4.279049,2.308701,3.390496,887,125,120,410,231,1


In [34]:
def union_poi(node_degree_df, edge_degree_df, poi_df=None):
    """
        Add point-of-interest (POI) counts and POI-type entropy to the node and
        edge dataframes.

        Inputs:
            node_degree_df, edge_degree_df - outputs of union degree
            poi_df - optional POI dataframe with columns "SA2_MAIN16", "type" and
                "geometry". When omitted it is read from
                "../../data_process/poi_df_cleaned.pickle". That file was built
                from "../../data_process/poi_df.pickle" by tagging every POI with
                the SA2_MAIN16 of the sa2_south_au polygon that contains it, or
                "0" when no polygon contains it.
        Outputs:
            edge_degree_df, node_degree_entropy_df - inputs merged with poi df.
                Nodes gain "poi_count" and "poi_count_entropy" (NaN for areas
                without POIs). Edges are restricted to rows whose origin and
                destination both have POIs and gain "poi_entropy_x/_y" and
                "poi_count_x/_y". The edge_degree_df passed in is not modified.
    """

    print("=====Running union_poi=====")

    if poi_df is None:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        poi_df = pd.read_pickle("../../data_process/poi_df_cleaned.pickle")
    # "0" marks POIs that were not matched to any SA2 polygon.
    poi_df = poi_df[poi_df["SA2_MAIN16"] != "0"]

    # Non-null geometries per area, and per (area, POI type).
    total_counts = poi_df.groupby("SA2_MAIN16")["geometry"].count()
    type_counts = poi_df.groupby(["SA2_MAIN16", "type"])["geometry"].count()

    # Shannon entropy of the POI-type shares within each area: -sum(p * ln p).
    shares = type_counts.div(total_counts, level="SA2_MAIN16")
    # A zero share contributes 0 (the usual 0 * ln 0 convention) instead of NaN.
    terms = -(shares * np.log(shares.where(shares > 0, 1.0)))
    entropy = terms.groupby(level="SA2_MAIN16").sum(min_count=1)

    poi_summary = pd.DataFrame({
        "SA2_MAIN16": total_counts.index.values,
        "poi_count": total_counts.values,
        # Areas whose POIs all lack a type get NaN entropy.
        "poi_count_entropy": entropy.reindex(total_counts.index).values,
    })

    node_degree_entropy_df = node_degree_df.merge(poi_summary, how="left", on="SA2_MAIN16")

    # Keep only edges whose origin and destination both have POI statistics.
    sa_ids_poi = set(poi_summary["SA2_MAIN16"].values)
    has_poi = (edge_degree_df["sa2_code16_x"].isin(sa_ids_poi)
               & edge_degree_df["sa2_code16_y"].isin(sa_ids_poi))
    # Copy so the new columns are not written onto a slice of the caller's frame.
    edge_poi_df = edge_degree_df[has_poi].copy()

    stats_by_sa = poi_summary.set_index("SA2_MAIN16")
    entropy_by_sa = stats_by_sa["poi_count_entropy"]
    count_by_sa = stats_by_sa["poi_count"].astype(float)

    # Mapping by key keeps every value aligned with its own row.
    edge_poi_df["poi_entropy_x"] = edge_poi_df["sa2_code16_x"].map(entropy_by_sa)
    edge_poi_df["poi_entropy_y"] = edge_poi_df["sa2_code16_y"].map(entropy_by_sa)

    edge_poi_df["poi_count_x"] = edge_poi_df["sa2_code16_x"].map(count_by_sa)
    edge_poi_df["poi_count_y"] = edge_poi_df["sa2_code16_y"].map(count_by_sa)

    print("=====DONE union_poi=====")

    return edge_poi_df, node_degree_entropy_df

In [35]:
# Add POI counts and POI-type entropy to the degree-augmented node and edge frames.
# NOTE(review): the progress counters recorded under this cell (0, 100, 200, ...) are not
# printed by any active line of the current union_poi - presumably stale output; re-run to refresh.
edge_degree_poi_df, node_degree_poi_df = union_poi(node_degree_df, edge_degree_df)

=====Running union_poi=====
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
1800