In [19]:
import numpy as np
import pandas as pd
import json

In [20]:
data = pd.read_csv("../data/dataviz.csv")
data.head()

Unnamed: 0,plat,plng,dlat,dlng,t,road
0,46.517116,6.630342,46.508528,6.627598,11:15:00,"[311463563, 2940544168, 311463564, 561463459, ..."
1,46.517116,6.630342,46.521429,6.656347,20:00:00,"[311463563, 2940544168, 311463564, 561463459, ..."
2,46.517116,6.630342,46.558792,6.680104,18:15:00,"[311463563, 2940544168, 311463564, 561463459, ..."
3,46.520649,6.635173,46.51959,6.634718,18:45:00,"[2223091605, 3786724165, 3786724164, 302530573..."
4,46.510731,6.630553,46.51595,6.657525,19:00:00,"[3516999773, 3817926827, 567951313, 567951312,..."


In [21]:
with open('../data/nodes_dict.json') as json_file:  
    nodes_dict = json.load(json_file)

In [22]:
paths_nodes = (data.road.apply(lambda x: x[1:-1].split(", ")))

In [23]:
# Compute number of passages on each edge
nonvalid_nodes = set()
def update_path_heat(edges_heat, edges_id, path_nodes, nodes_dict, driver_id):
    if len(path_nodes) <=1:
        return edges_heat, edges_id
    elif path_nodes[0] not in nodes_dict:
        nonvalid_nodes.add(path_nodes[0])
        return update_path_heat(edges_heat, edges_id, path_nodes[1:], nodes_dict, driver_id)
    elif path_nodes[1] not in nodes_dict:
        nonvalid_nodes.add(path_nodes[1])
        return update_path_heat(edges_heat, edges_id, path_nodes[:1]+path_nodes[2:], nodes_dict, driver_id)
    else:
        edge = path_nodes[:2]            
        dic_key = '-'.join(sorted(edge))

        edges_heat[dic_key] = 1 + edges_heat.get(dic_key, 0)
        if edges_id.get(dic_key, None) == None:
            edges_id[dic_key] = []
        edges_id.get(dic_key).append(driver_id)
        return update_path_heat(edges_heat, edges_id, path_nodes[1:], nodes_dict, driver_id)

edges_heat = {}
edges_id = {}
nonvalid_nodes = set()
for i, path_nodes in paths_nodes.iteritems():
    edges_heat, edges_id = update_path_heat(edges_heat, edges_id, path_nodes, nodes_dict, i)

len(edges_heat)

12660

In [24]:
edges_heat

{'266636001-3127737964': 9,
 '312441284-35297498': 1,
 '258755890-565109269': 2,
 '266877375-567834691': 6,
 '253487784-5331033802': 14,
 '1755832994-252684258': 18,
 '330910115-571534549': 2,
 '309393913-3276659405': 1,
 '3396495050-5340501095': 32,
 '3143676850-3672033571': 1,
 '1094605091-1278435054': 25,
 '310281130-4272866481': 3,
 '288295782-580046870': 1,
 '330912224-641405884': 13,
 '2940544166-311463540': 1,
 '598646752-598646756': 1,
 '1041724052-1895885159': 2,
 '268426413-567137596': 1,
 '310281133-310281137': 3,
 '258755804-677515257': 5,
 '3111114493-337938579': 10,
 '2261189527-2261189530': 1,
 '320018632-3276628623': 1,
 '1444680818-2295333658': 4,
 '414572548-5340246267': 2,
 '265744956-415963417': 2,
 '256362036-583293919': 169,
 '308684455-5136211347': 2,
 '617578136-617578137': 1,
 '2296782366-2322313468': 8,
 '1475388276-1485676762': 1,
 '1050101515-330920216': 5,
 '3102623133-3102623134': 2,
 '282645328-574495036': 10,
 '269946389-5280238449': 12,
 '1449084751-504

In [30]:
# Make a dataframes that has a heat value for each edge (pair of nodes)
heat_df = pd.DataFrame(list(edges_heat.items()), columns=['edge','heat'])
id_df = pd.DataFrame(list(edges_id.items()), columns=['edge','id'])
heat_df = heat_df.merge(id_df)

heat_df['node1'] = heat_df.apply (lambda row: row.edge.split('-')[0],axis=1)
heat_df['node2'] = heat_df.apply (lambda row: row.edge.split('-')[1],axis=1)

heat_df = heat_df[['node1', 'node2', 'heat', 'id']].sort_values(['heat'], ascending=False)
heat_df = heat_df.reset_index(drop=True)
heat_df.head()

Unnamed: 0,node1,node2,heat,id
0,3786724226,4601360744,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
1,3786724164,3786724165,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
2,3787835432,4601360744,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
3,2657469978,3786724226,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
4,3025305734,3786724164,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."


In [31]:
heat_df.shape

(12660, 4)

In [32]:
# filter out non-valid nodes ids
heat_df['valid'] = heat_df.apply (lambda row: row.node1 in nodes_dict and row.node2 in nodes_dict,axis=1)

heat_df=heat_df[heat_df.valid==True]
heat_df = heat_df.drop('valid', axis=1)
heat_df.shape

(12660, 4)

In [33]:
# Add the corresponding latitude & longitude for each node
heat_df['lat1'] = heat_df.apply (lambda row: nodes_dict[row.node1]['lat'],axis=1)
heat_df['lon1'] = heat_df.apply (lambda row: nodes_dict[row.node1]['lon'],axis=1)
heat_df['lat2'] = heat_df.apply (lambda row: nodes_dict[row.node2]['lat'],axis=1)
heat_df['lon2'] = heat_df.apply (lambda row: nodes_dict[row.node2]['lon'],axis=1)
heat_df = heat_df[['node1','lat1','lon1','node2','lat2','lon2','heat', 'id']]
print(heat_df.shape)
heat_df.head()

(12660, 8)


Unnamed: 0,node1,lat1,lon1,node2,lat2,lon2,heat,id
0,3786724226,46.5208706,6.6357914,4601360744,46.5208475,6.6357465,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
1,3786724164,46.5206459,6.6353559,3786724165,46.5206473,6.6352327,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
2,3787835432,46.5207605,6.6356271,4601360744,46.5208475,6.6357465,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
3,2657469978,46.5209304,6.6358808,3786724226,46.5208706,6.6357914,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."
4,3025305734,46.5206458,6.6353659,3786724164,46.5206459,6.6353559,995,"[3, 5, 7, 10, 13, 15, 17, 24, 25, 27, 28, 30, ..."


In [123]:
# Save to csv
heat_df.to_csv("../data/heatmap_data.csv", index=False)