In [1]:
import pandas as pd
import networkx as nx
import geopandas as gpd
import shapely.geometry

import matplotlib
%matplotlib inline

import glob

In [2]:
# Record the networkx version in use; later cells embed it in output filenames.
nx.__version__

'2.1'

In [30]:
# Collect the cleaned origin/destination trip pickles. The list is sorted
# because glob gives no ordering guarantee, so "the first file" would otherwise
# depend on the filesystem.
files = sorted(glob.glob("data/taxi_clean/*_od_v2.pkl"))
# Only the first matching file is loaded.
files = files[0:1]

# NOTE(review): unpickling runs arbitrary code -- only load trusted files.
# NOTE(review): the cells below operate on `df`, which is not assigned anywhere
# in this notebook; presumably this frame is what they expect -- confirm.
df3 = pd.concat(pd.read_pickle(f) for f in files)

In [32]:
# Preview the first rows of the loaded trip records.
df3.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_distance,NODEID_O,NODEID_D
0,2016-04-01 00:00:00,2016-04-01 00:01:59,-73.976883,40.758495,-73.977669,40.753902,0.5,22520,22499
7,2016-04-01 00:00:01,2016-04-01 00:03:46,-73.988899,40.745426,-73.991821,40.738445,0.6,21375,20746
10,2016-04-01 00:00:02,2016-04-01 00:06:56,-73.979752,40.780949,-73.966621,40.802837,1.8,21737,9051086
13,2016-04-01 00:00:02,2016-04-01 00:04:42,-74.00473,40.719166,-74.002861,40.723362,0.62,20217,20377
15,2016-04-01 00:00:03,2016-04-01 00:17:55,-73.984787,40.768509,-73.936722,40.813862,4.8,21488,42515


In [20]:
def uniform_str(x):
    """Return ``str(x)`` left-padded with ``'0'`` to at least 7 characters.

    Values whose string form already has 7 or more characters are returned
    unchanged (nothing is ever truncated).
    """
    return str(x).rjust(7, '0')
# NOTE(review): `df` is not assigned anywhere earlier in this notebook (only
# `df3` is) -- presumably it holds the loaded trip records; confirm.

# Normalise both node-id columns to zero-padded strings.
for node_col in ("NODEID_O", "NODEID_D"):
    df[node_col] = df[node_col].apply(uniform_str)

# Parse the timestamp columns into datetimes under shorter names, then drop
# the raw columns.
df["pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
df["dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
for raw_col in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
    del df[raw_col]

mn_nodes = gpd.read_file("data/mn_nodes.shp")
init_graph = nx.read_gpickle("data/final_graph_1st_pass_nx_2.1.pkl")

# Seed every edge with a flat speed of 25 and derive its traversal weight
# (dist / speed * 3600 -- presumably seconds, if speed is per hour).
for _u, _v, attrs in init_graph.edges(data=True):
    attrs['speed'] = 25.
    attrs['weight'] = (attrs['dist'] / attrs['speed']) * 3600

In [27]:
def first_average(g):
    """Collapse one group of trips into a single summary row.

    Parameters
    ----------
    g : pandas.DataFrame
        Trips belonging to one group. Must contain the datetime columns
        ``pickup_datetime`` and ``dropoff_datetime``.

    Returns
    -------
    pandas.Series
        A copy of the first row of ``g`` extended with ``tt_avg`` (mean trip
        duration in seconds) and ``n_trips`` (number of rows in ``g``).
    """
    travel_time = (g["dropoff_datetime"] - g["pickup_datetime"]).mean().total_seconds()
    # Copy the row before adding fields: writing into the object returned by
    # ``iloc`` is chained assignment (SettingWithCopyWarning on many pandas
    # versions) and is not guaranteed to be independent of ``g``.
    res = g.iloc[0].copy()
    res["tt_avg"] = travel_time
    res["n_trips"] = len(g)
    return res

# Collapse the trips to one summary row per (origin, destination) node pair.
pair_keys = ["NODEID_O", "NODEID_D"]
df = df.groupby(pair_keys).apply(first_average)
# The key columns are removed as ordinary columns first so that reset_index
# can re-create them from the group index without a name clash.
for key_col in pair_keys:
    del df[key_col]
df.reset_index(inplace=True)

In [28]:
# first trip filtering - > 2 mins, < 1 hour
# Also discards pairs whose origin and destination node coincide.
distinct_endpoints = df["NODEID_O"] != df["NODEID_D"]
longer_than_2_min = df["tt_avg"] > 120
shorter_than_1_hour = df["tt_avg"] < 3600
df = df[distinct_endpoints & longer_than_2_min & shorter_than_1_hour].reset_index(drop=True)

In [35]:
!pip install toolz

Collecting toolz
  Downloading https://files.pythonhosted.org/packages/14/d0/a73c15bbeda3d2e7b381a36afb0d9cd770a9f4adc5d1532691013ba881db/toolz-0.9.0.tar.gz (45kB)
    100% |████████████████████████████████| 51kB 1.7MB/s eta 0:00:01
Building wheels for collected packages: toolz
  Running setup.py bdist_wheel for toolz ... done
  Stored in directory: /home/christian/.cache/pip/wheels/f4/0c/f6/ce6b2d1aa459ee97cc3c0f82236302bd62d89c86c700219463
Successfully built toolz
Installing collected packages: toolz
Successfully installed toolz-0.9.0
You are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [36]:
import numpy as np
import toolz


# Module-level state filled in by trip_path() and re-initialised at the start
# of every pass of the iterative calibration loop further down.
S_trip = None  # set of (node, node) edges touched by at least one routed trip
T_s = None  # maps each touched edge to the set of trip row labels routed over it
O_s = None  # maps each touched edge to the summed travel-time offset of its trips

def trip_dist(trip, weight=None):
    """Return the length of the routed path between a trip's end nodes.

    Parameters
    ----------
    trip : mapping
        Row-like object with ``"NODEID_O"`` and ``"NODEID_D"`` entries naming
        the origin and destination nodes in the module-level ``init_graph``.
    weight : str or None, optional
        Edge attribute handed to ``shortest_path``. The default ``None`` keeps
        the original behaviour, in which the call is made without a weight.
        NOTE(review): without a weight networkx presumably minimises the number
        of edges rather than ``dist`` -- confirm whether that is intended.

    Returns
    -------
    float
        Sum of the ``dist`` attribute over the path's edges, or ``np.nan`` when
        no path could be computed (the error is printed, not raised).
    """
    try:
        p = nx.algorithms.shortest_path(init_graph,
                                        trip["NODEID_O"],
                                        trip["NODEID_D"],
                                        weight=weight)
    except Exception as e:
        # Best-effort: report the failure and return NaN so the caller can drop
        # the row. The original formatted two undefined names (n1, n2) here,
        # which turned every routing failure into a NameError.
        print(e)
        print("=================")
        return np.nan
    # Walk consecutive node pairs of the path and add up their lengths.
    return sum(init_graph.get_edge_data(u, v)['dist'] for u, v in zip(p, p[1:]))

@toolz.curry
def trip_path(trip, graph=None, weight=None):
    """Route one trip over ``graph`` and record which edges it uses.

    Despite the wording of the original docstring ("Compute trip time (in
    hours)"), this function returns the routed path, not a duration.

    Side effects: every edge of the path is added to the module-level set
    ``S_trip`` and ``trip.name`` is added to ``T_s[edge]``. Both containers
    must be initialised by the caller beforehand.

    Parameters
    ----------
    trip : row-like
        Must provide ``"NODEID_O"``, ``"NODEID_D"`` and a ``name`` attribute.
    graph : graph object
        Graph to route on; supplied by keyword so that the curried function can
        be handed to ``DataFrame.apply``.
    weight : str or None, optional
        Edge attribute handed to ``shortest_path``. The default ``None`` keeps
        the original behaviour, in which the call is made without a weight.
        NOTE(review): without a weight the route presumably ignores the edge
        ``weight`` values that the calibration loop adjusts -- confirm.

    Returns
    -------
    str or float
        Node ids of the path joined with ``","``, or ``np.nan`` when routing or
        bookkeeping failed (the failure is printed, not raised).
    """
    global S_trip
    global T_s
    # Read the end points up front so the error report below does not have to
    # parse them back out of an exception message.
    origin, destination = trip["NODEID_O"], trip["NODEID_D"]
    try:
        p = nx.algorithms.shortest_path(graph, origin, destination, weight=weight)
        for edge in zip(p, p[1:]):
            S_trip.add(edge)
            T_s[edge].add(trip.name)
    except Exception as e:
        # The original split str(e) on spaces and picked words by position,
        # which raises IndexError or prints unrelated words for any message
        # that is not shaped like "... <a> and <b>.".
        print("error on nodes {}, {}: {}".format(origin, destination, e))
        return np.nan
    return ",".join(p)

# Holds the path on which trip_time() last failed to look up an edge weight.
bad = None

@toolz.curry
def trip_time(p, graph=None):
    """Sum the ``weight`` attribute of ``graph`` along the node sequence ``p``.

    ``p`` is an indexable sequence of node ids; consecutive entries are looked
    up as edges of ``graph``. If a lookup fails, the offending path is stored
    in the module-level ``bad`` and the exception is re-raised.

    Returns the accumulated weight as a float (``0.`` for paths with fewer than
    two nodes).
    """
    global bad
    total = 0.
    for i in range(len(p) - 1):
        hop = (p[i], p[i+1])
        try:
            hop_weight = graph.get_edge_data(*hop)['weight']
        except Exception as err:
            # Keep the failing path around for inspection, then propagate.
            bad = p
            raise err
        total += hop_weight
    return total

In [None]:
# second trip filtering
# Street distance of each origin/destination pair along its routed path;
# trip_dist returns NaN when routing fails, and those rows are dropped.
df["dist"] = df.apply(trip_dist, axis=1)
df.dropna(subset=["dist"], inplace=True)
# Average speed in distance units per second (tt_avg is in seconds).
df["speed"] = df["dist"] / (df["tt_avg"])
# Plot the distribution of the computed speeds.
df["speed"].hist()

In [41]:
# Keep only pairs whose average speed lies strictly between 1 and 65 distance
# units per hour (the stored `speed` column is per second).
above_min = (df["speed"] * 3600.) > 1
below_max = df["speed"] < (65 / (3600.))
df = df[above_min & below_max].reset_index(drop=True)

In [42]:
# Summary statistics of the retained speeds, scaled from per-second to per-hour.
(df["speed"] * 3600).describe()

count    465110.000000
mean         14.490035
std           5.981888
min           1.001195
25%          10.359372
50%          13.539822
75%          17.621062
max          64.894304
Name: speed, dtype: float64

In [None]:
from collections import defaultdict

# iterative steps
# Each outer pass re-routes every trip, compares the estimated travel time
# (`et`) with the observed average (`tt_avg`) and scales the weight of every
# touched street by a factor k: up where the summed offset is negative
# (estimate too short), down otherwise. A step is accepted only if it lowers
# the total absolute relative error; otherwise k is shrunk towards 1 and the
# step is retried. The process stops once no k >= 1.0001 yields an improvement.
again = True
done = False
base_graph = init_graph.copy()

while again:
    tt = trip_time(graph=base_graph)
    path = trip_path(graph=base_graph)
    again = False
    # trip_path() fills S_trip and T_s as a side effect of the apply below.
    S_trip = set() # all touched streets
    T_s = defaultdict(set) # basically trips_by_street
    O_s = defaultdict(np.float64) # offset_by_street
    df["path"] = df.apply(path, axis=1).str.split(",")
    df["et"] = df["path"].apply(tt)
    df["rel_err"] = (df["et"] - df["tt_avg"]) / df["tt_avg"]
    if not done:
        # Plot the error distribution once, before any weight is adjusted.
        df["rel_err"].hist()
    done = True
    for street, trips in T_s.items():
        # `trips` is a set; convert it because recent pandas versions reject
        # set indexers in .loc.
        trips_df = df.loc[list(trips)]
        O_s[street] = ((trips_df["et"] - trips_df["tt_avg"]) * trips_df["n_trips"]).sum()
    k = 1.2
    print("rel_err sum is {}".format(df["rel_err"].sum()))
    while True:
        # Try the step on a copy so a rejected step leaves base_graph untouched.
        g_c = base_graph.copy()
        tt2 = trip_time(graph=g_c)
        for street in S_trip:
            a, b = street # street connects nodes a and b
            e = base_graph.edges[street]
            if O_s[street] < 0:
                g_c[a][b]['weight'] = e["weight"] * k
            else:
                g_c[a][b]['weight'] = e["weight"] / k
        df["et_new"] = df["path"].apply(tt2)
        df["new_rel_err"] = (df["et_new"] - df["tt_avg"]) / df["tt_avg"]
        print("new_rel_err sum is {}".format(df["new_rel_err"].sum()))
        if np.abs(df["new_rel_err"]).sum() < np.abs(df["rel_err"]).sum():
            # Improvement: adopt the adjusted graph and run another outer pass.
            df["et"] = df["et_new"]
            df["rel_err"] = df["new_rel_err"]
            again = True
            base_graph = g_c
            break
        else:
            # No improvement: take a smaller step. The message is printed after
            # the update so that it reports the new k (it used to print the
            # value that had just been rejected).
            k = 1 + (k - 1) * .75
            print("k updated to {}".format(k))
            if k < 1.0001:
                break

df["rel_err"].hist()

rel_err sum is -195526.78005809945
new_rel_err sum is -142514.0988678577
rel_err sum is -142514.0988678577
new_rel_err sum is -88124.66426537318
rel_err sum is -88124.66426537318
new_rel_err sum is -35872.74192097382
rel_err sum is -35872.74192097382
new_rel_err sum is -19504.80044316536
rel_err sum is -19504.80044316536
new_rel_err sum is 10727.225747638988


KeyboardInterrupt: 

In [None]:
# Persist the graph produced by the calibration loop; the filename embeds the
# networkx version.
nx.write_gpickle(base_graph,"data/base_graph_1st_pass_nx_{}.pkl".format(nx.__version__))

In [None]:
import networkx as nx
# Scratch: the return value is not stored or used by any other cell.
# NOTE(review): networkx 2.x presumably returns a lazy iterator here, in which
# case this cell computes nothing -- confirm or remove.
nx.algorithms.all_pairs_dijkstra_path_length(init_graph)

In [None]:
# Unfinished scratch loop. It had no body (a SyntaxError) and iterated the graph
# itself, which yields nodes rather than (edge, attrs) pairs; iterate the edge
# view instead so the two-name unpacking is valid.
for e, attrs in base_graph.edges.items():
    pass  # TODO: the loop body was never written

In [None]:
# Derive a per-edge speed from the calibrated traversal weights.
final_graph = base_graph.copy()
# .items() works on Python 2 and 3; .iteritems() exists on Python 2 only.
for e, attrs in final_graph.edges.items():
    # `weight` is a traversal time (seeded as dist / speed * 3600), so speed is
    # dist / weight; the original weight / dist was inverted. The result is in
    # distance units per unit of weight, which matches the later cell that
    # recomputes weight as dist / speed.
    attrs["speed"] = attrs["dist"] / attrs["weight"]
    

In [None]:
# Peek at one (edge, attrs) pair; next(iter(...)) replaces the Python-2-only
# .iteritems().next().
next(iter(final_graph.edges.items()))

In [None]:
# Spatial fill-in: give every street that no trip touched the mean speed of its
# neighbouring streets whose speed is already estimated.
S = set(final_graph.edges)  # all streets (edges)
# Streets that already have an estimated speed. This has to be an independent
# copy of S_trip: the original `ES = S` aliased the set of *all* streets, so
# every neighbour counted as estimated and ES.add() was a no-op on S itself.
ES = set(S_trip)
NS = S - S_trip  # streets still lacking an estimate
# Edges incident to either end node of each street.
# NOTE(review): the intersections below compare edge tuples by exact
# orientation; on an undirected graph (u, v) and (v, u) would not match --
# confirm the graph type.
N_S = nodes_by_street = {s: set(final_graph.edges(s)) for s in S}
# Unestimated streets, ordered by how many trip-covered neighbours they have.
n_s_i = n_by_street = sorted(
    {s: len(N_S[s].intersection(S_trip)) for s in NS}.items(),
    key=lambda x: x[1],
    reverse=True)

for s, n in n_s_i:
    if n == 0:
        continue
    intersecting_speeds = [final_graph.edges[(e1,e2)]['speed'] for e1, e2 in N_S[s].intersection(ES)]
    final_e = final_graph.edges[s]
    # Divide by the number of speeds actually summed: `n` was counted before
    # the loop and goes stale as ES grows.
    v_s_i = sum(intersecting_speeds) / float(len(intersecting_speeds))
    t_s_i = final_e["dist"] / v_s_i
    final_e["speed"] = v_s_i
    final_e["weight"] = t_s_i
    ES.add(s)
    NS.discard(s)

In [None]:
# Persist the filled-in graph; the filename embeds the networkx version.
# NOTE(review): with networkx 2.1 this path is identical to the file that is
# loaded into `init_graph` near the top of the notebook, so running this cell
# overwrites the notebook's own input -- confirm that is intended.
nx.write_gpickle(final_graph,"data/final_graph_1st_pass_nx_{}.pkl".format(nx.__version__))

In [None]:
# Scratch: tag the attribute dict of the first edge in iteration order.
old = 9285
thing = (u'9000428', u'0021076')
# next(iter(...)) replaces the Python-2-only .iteritems().next().
next(iter(base_graph.edges.items()))[1]['ix'] = 9285

In [None]:
# Peek at one (edge, attrs) pair; next(iter(...)) replaces the Python-2-only
# .iteritems().next().
next(iter(base_graph.edges.items()))

In [None]:
# Peek at one touched street; next(iter(...)) replaces the Python-2-only
# .__iter__().next().
next(iter(S_trip))

In [None]:
# Graph subscripting replaces `G.edge[u][v]`, which networkx 2.x no longer
# provides (this notebook loads a graph pickled for networkx 2.1).
init_graph[u'0066086']['0087506']['speed'] = 25.0

In [None]:
# Graph subscripting replaces `G.edge[u][v]`, which networkx 2.x no longer
# provides (this notebook loads a graph pickled for networkx 2.1).
init_graph[u'0066086']['0087506']

In [None]:
# Scratch arithmetic. NOTE(review): 25 matches the flat speed seeded on every
# edge earlier -- presumably a sanity check of one edge; confirm or remove.
.0056284915 * 25

In [None]:
# Record the pandas version in use.
pd.__version__

In [None]:
import functools

In [None]:
import toolz

In [None]:
# Scratch: trip_time is already wrapped by @toolz.curry where it is defined.
# This also rebinds `tt`, a name the calibration loop uses for its own helper.
tt = toolz.curry(trip_time)

In [None]:
# Scratch call. NOTE(review): 3 is not a node sequence, so this presumably
# raises inside trip_time -- confirm or remove.
tt(3)

In [None]:
# Histogram of the natural log of every edge speed. .items() works on Python 2
# and 3; .iteritems() exists on Python 2 only.
pd.Series(np.log(e["speed"]) for _, e in final_graph.edges.items()).hist()

In [None]:
# Scratch: e**5. NOTE(review): presumably converts a value read off the
# log-speed histogram back to a speed -- confirm or remove.
np.e**5