In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Read the trip table; its `road` column stores each path as a bracketed,
# comma-separated string of node ids (it is split into a list in a later cell).
data = pd.read_csv(filepath_or_buffer="../data/dataviz.csv")
data.head(n=5)

Unnamed: 0,plat,plng,dlat,dlng,t,duration,distance,road
0,46.517116,6.630342,46.508528,6.627598,11:15:00,132.0,1336.0,"[311463563, 2940544168, 311463564, 561463459, ..."
1,46.517116,6.630342,46.521429,6.656347,20:00:00,426.0,3702.0,"[311463563, 2940544168, 311463564, 561463459, ..."
2,46.517116,6.630342,46.558792,6.680104,18:15:00,707.0,8354.0,"[311463563, 2940544168, 311463564, 561463459, ..."
3,46.520649,6.635173,46.51959,6.634718,18:45:00,163.0,1521.0,"[2223091605, 3786724165, 3786724164, 302530573..."
4,46.510731,6.630553,46.51595,6.657525,19:00:00,437.0,3818.0,"[3516999773, 3817926827, 567951313, 567951312,..."


In [3]:
# Node lookup table keyed by node id (JSON object keys are always strings);
# later cells read the 'lat' and 'lon' entries of each value.
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, so the notebook loads the file the same way everywhere.
with open('../data/nodes_dict.json', encoding='utf-8') as json_file:
    nodes_dict = json.load(json_file)

In [4]:
def _parse_road(road_repr):
    """Turn one serialised path such as "[1, 2, 3]" into ['1', '2', '3']."""
    # Drop the enclosing brackets, then cut on the ", " separator.
    inner = road_repr[1:-1]
    return inner.split(", ")

paths_nodes = data.road.apply(_parse_road)

In [5]:
# Compute number of passages on each edge
nonvalid_nodes = set()
def update_path_heat(edges_heat, edges_id, path_nodes, nodes_dict, driver_id):
    """Accumulate traversal counts and trip ids for every edge along one path.

    Consecutive known nodes of ``path_nodes`` form an edge. Nodes missing from
    ``nodes_dict`` are recorded in the module-level ``nonvalid_nodes`` set and
    skipped, so the known nodes on either side of them are joined directly.
    Edges are undirected: the key is the two node ids sorted (as strings) and
    joined with '-'.

    The walk is iterative, so path length is not limited by Python's
    recursion depth and no intermediate list slices are created.

    Parameters
    ----------
    edges_heat : dict
        Edge key -> number of traversals. Updated in place.
    edges_id : dict
        Edge key -> list of ``driver_id`` values that used the edge.
        Updated in place.
    path_nodes : sequence of str
        Ordered node ids of one path.
    nodes_dict : mapping
        Known nodes; only membership of the node id is tested here.
    driver_id : hashable
        Identifier appended to ``edges_id`` for each edge of this path.

    Returns
    -------
    tuple
        ``(edges_heat, edges_id)``, the same two objects that were passed in.
    """
    n_nodes = len(path_nodes)

    # Skip unknown leading nodes. A node is only inspected while at least one
    # more node follows it, so a lone remaining node is never flagged.
    start = 0
    while n_nodes - start > 1 and path_nodes[start] not in nodes_dict:
        nonvalid_nodes.add(path_nodes[start])
        start += 1
    if n_nodes - start <= 1:
        return edges_heat, edges_id

    # `previous` is always a known node from here on.
    previous = path_nodes[start]
    for position in range(start + 1, n_nodes):
        current = path_nodes[position]
        if current not in nodes_dict:
            # Unknown node: remember it and bridge over it.
            nonvalid_nodes.add(current)
            continue

        dic_key = '-'.join(sorted((previous, current)))
        edges_heat[dic_key] = 1 + edges_heat.get(dic_key, 0)
        edges_id.setdefault(dic_key, []).append(driver_id)
        previous = current

    return edges_heat, edges_id

# Fresh accumulators for this pass over all paths.
edges_heat = {}
edges_id = {}
nonvalid_nodes = set()
# Series.items() yields (index label, value) pairs; it replaces
# Series.iteritems(), which was removed in pandas 2.0.
for i, path_nodes in paths_nodes.items():
    edges_heat, edges_id = update_path_heat(edges_heat, edges_id, path_nodes, nodes_dict, i)

len(edges_heat)

12660

In [6]:
# Build one row per edge key: its traversal count ('heat') and the list of
# trip ids that used it ('id').
heat_df = pd.DataFrame(list(edges_heat.items()), columns=['edge', 'heat'])
id_df = pd.DataFrame(list(edges_id.items()), columns=['edge', 'id'])
heat_df = heat_df.merge(id_df)

# Edge keys have the form "<node>-<node>"; recover the two endpoints.
endpoints = [edge_key.split('-') for edge_key in heat_df['edge']]
heat_df['node1'] = [pair[0] for pair in endpoints]
heat_df['node2'] = [pair[1] for pair in endpoints]

# Hottest edges first, with a clean 0..n-1 index.
heat_df = (
    heat_df[['node1', 'node2', 'heat', 'id']]
    .sort_values(['heat'], ascending=False)
    .reset_index(drop=True)
)
heat_df.head()

Unnamed: 0,node1,node2,heat,id
0,2657469978,3786724226,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
1,2223091607,3787835432,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
2,3025305734,3786724164,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
3,2223091605,3786724165,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
4,3786724164,3786724165,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."


In [7]:
# Shape check: the row count should equal len(edges_heat) reported above.
heat_df.shape

(12660, 4)

In [8]:
# Verify that all nodes are valid: keep only the edges whose two endpoints
# both appear in nodes_dict.
both_known = pd.Series(
    [
        first in nodes_dict and second in nodes_dict
        for first, second in zip(heat_df.node1, heat_df.node2)
    ],
    index=heat_df.index,
    dtype=bool,
)

# .copy() so later column assignments act on an independent frame.
heat_df = heat_df[both_known].copy()
heat_df.shape

(12660, 4)

In [9]:
# Attach the latitude & longitude of both endpoints of every edge.
# Each node id is looked up once; an id missing from nodes_dict raises KeyError.
node1_coords = [nodes_dict[node_id] for node_id in heat_df.node1]
node2_coords = [nodes_dict[node_id] for node_id in heat_df.node2]

heat_df['lat1'] = [coords['lat'] for coords in node1_coords]
heat_df['lon1'] = [coords['lon'] for coords in node1_coords]
heat_df['lat2'] = [coords['lat'] for coords in node2_coords]
heat_df['lon2'] = [coords['lon'] for coords in node2_coords]

# Put each node's coordinates right after its id.
ordered_columns = ['node1', 'lat1', 'lon1', 'node2', 'lat2', 'lon2', 'heat', 'id']
heat_df = heat_df[ordered_columns]
print(heat_df.shape)
heat_df.head()

(12660, 8)


Unnamed: 0,node1,lat1,lon1,node2,lat2,lon2,heat,id
0,2657469978,46.5209304,6.6358808,3786724226,46.5208706,6.6357914,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
1,2223091607,46.5207321,6.6355847,3787835432,46.5207605,6.6356271,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
2,3025305734,46.5206458,6.6353659,3786724164,46.5206459,6.6353559,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
3,2223091605,46.5206503,6.6351191,3786724165,46.5206473,6.6352327,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
4,3786724164,46.5206459,6.6353559,3786724165,46.5206473,6.6352327,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."


In [10]:
# Get starting edges (from pickup location to 1st node)
# Strip BOTH brackets before splitting: with only the leading '[' removed, a
# single-node road such as "[123]" would yield the bogus node id "123]".
init_edges = data.apply(
    lambda row: [row['plat'], row['plng'], row['road'][1:-1].split(", ")[0]],
    axis=1,
)

# Group trips by (pickup lat, pickup lng, first node). A dict lookup replaces
# the linear scan over all edges found so far; insertion order keeps edges in
# first-seen order.
edges_by_key = {}
# Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
for i, (plat, plng, node2) in init_edges.items():
    key = (plat, plng, node2)
    e = edges_by_key.get(key)

    if e is not None:
        # edge is already present
        e['heat'] += 1
        e['id'].append(i)
    else:
        # need to create new edge; read coordinates by key rather than
        # relying on the order of the node's dict values.
        # An unknown first node raises KeyError here.
        node = nodes_dict[node2]
        edges_by_key[key] = {
            'lat1': plat, 'lon1': plng, 'node2': node2,
            'lat2': node['lat'], 'lon2': node['lon'],
            'heat': 1, 'id': [i],
        }

new_edges = list(edges_by_key.values())
len(new_edges)

43

In [11]:
# Add starting edges to the dataframe.
# The extra rows are built in one shot and concatenated once: DataFrame.append
# was removed in pandas 2.0, and calling it in a loop copied the whole frame
# on every iteration. Starting edges carry no 'node1' key, so that column is
# NaN for those rows.
heat_df_final = (
    pd.concat([heat_df, pd.DataFrame(new_edges)], ignore_index=True)
    .sort_values(['heat'], ascending=False)
    .reset_index(drop=True)
)
heat_df_final.shape

(12703, 8)

In [13]:
# Persist the combined edge table without the index; missing values
# (e.g. 'node1' on starting edges) are written as 'NA'.
heat_df_final.to_csv(path_or_buf="../data/heatmap_data.csv", na_rep='NA', index=False)

In [None]:
# Scratch view of the starting edges alone, laid out with heat_df's columns
# (columns absent from the edge dicts, such as 'node1', come out as NaN).
# Built with a single constructor call because DataFrame.append was removed
# in pandas 2.0.
h_df = pd.DataFrame(new_edges, columns=heat_df.columns)
h_df