In [None]:
import pandas as pd
import networkx as nx
import geopandas as gpd
import shapely.geometry

import matplotlib
%matplotlib inline

import glob

In [None]:
# Load the cleaned origin/destination trip tables and keep a random sample.
# glob() returns paths in arbitrary, filesystem-dependent order, so sort first;
# otherwise "the first file" can be a different file from one run to the next.
files = sorted(glob.glob("data/taxi_clean/*_od.pkl"))
files = files[0:1]  # only the first file is used

# NOTE: unpickling can execute code embedded in the file -- only load trusted data.
df = pd.concat([pd.read_pickle(f) for f in files])
# A fixed seed makes Restart & Run All draw the same trips every time, and the
# cap keeps sample() from raising when the file holds fewer than 50000 rows.
df = df.sample(min(50000, len(df)), random_state=0)

def uniform_str(x):
    """Return ``str(x)`` left-padded with ``'0'`` to at least 7 characters.

    Strings that are already 7 or more characters long are returned as-is
    (nothing is ever truncated).
    """
    return str(x).rjust(7, '0')
# Bring both node-id columns to the zero-padded string form of uniform_str().
df["NODEID_O"] = df["NODEID_O"].apply(uniform_str)
df["NODEID_D"] = df["NODEID_D"].apply(uniform_str)

# Replace each raw tpep_* timestamp column with a parsed datetime column
# (pop() removes the raw column while handing its values to to_datetime).
df["pickup_datetime"] = pd.to_datetime(df.pop("tpep_pickup_datetime"))
df["dropoff_datetime"] = pd.to_datetime(df.pop("tpep_dropoff_datetime"))

# Read the node shapefile and the pickled initial graph.
mn_nodes = gpd.read_file("data/mn_nodes.shp")
init_graph = nx.read_gpickle("data/init_graph.pkl")

In [None]:
def first_average(g):
    """Collapse one group of trips into a single summary row.

    Parameters
    ----------
    g : pandas.DataFrame
        The trips of one group. Must contain the datetime columns
        ``pickup_datetime`` and ``dropoff_datetime`` and at least one row.

    Returns
    -------
    pandas.Series
        A copy of the group's first row, extended with ``tt_avg`` (mean of
        dropoff minus pickup, in minutes) and ``n_trips`` (number of rows in
        the group).
    """
    mean_duration = (g["dropoff_datetime"] - g["pickup_datetime"]).mean()
    # Copy the row before adding fields: the Series returned by ``iloc[0]`` can
    # still be tied to ``g``, which triggers SettingWithCopyWarning and leaves
    # it unclear whether the writes below touch the group itself.
    res = g.iloc[0].copy()
    res["tt_avg"] = mean_duration.total_seconds() / 60.
    res["n_trips"] = len(g)
    return res

# One row per (origin, destination) pair: first_average() keeps the group's
# first trip and attaches the mean travel time and the trip count. The group
# keys end up in the index, so the copies carried over from the first row are
# dropped before the index is turned back into ordinary columns.
df = df.groupby(["NODEID_O", "NODEID_D"]).apply(first_average)
df = df.drop(["NODEID_O", "NODEID_D"], axis=1)
df = df.reset_index()

In [None]:
# first trip filtering
# Drop pairs that start and end on the same node, and keep only average travel
# times strictly between 120/60 and 3600/60 (tt_avg is expressed in minutes).
df = df.loc[
    df["NODEID_O"].ne(df["NODEID_D"])
    & df["tt_avg"].gt(120 / 60.)
    & df["tt_avg"].lt(3600 / 60.)
].reset_index(drop=True)

In [None]:
import numpy as np
import toolz


# Module-level accumulators; they stay None until the iterative cell rebinds them.
#   S_trip: streets (edges) touched by at least one trip
#   T_s:    street -> identifiers of the trips routed over it
#   O_s:    street -> offset accumulated over those trips
S_trip = T_s = O_s = None

def trip_dist(trip):
    """Street distance of the route between a trip's origin and destination.

    The route comes from ``nx.algorithms.shortest_path`` on the module-level
    ``init_graph``; the result is the sum of the ``'dist'`` edge attribute
    over the consecutive node pairs of that route.

    Parameters
    ----------
    trip : mapping (e.g. one DataFrame row)
        Must provide the node ids under ``"NODEID_O"`` and ``"NODEID_D"``.

    Returns
    -------
    number
        Summed ``'dist'`` along the route, or ``np.nan`` when networkx cannot
        route between the two nodes (a message is printed in that case).
    """
    origin, destination = trip["NODEID_O"], trip["NODEID_D"]
    try:
        # NOTE(review): no ``weight=`` is passed, so this is the fewest-edges
        # route, not the shortest by 'dist' -- confirm that is intended.
        p = nx.algorithms.shortest_path(init_graph, origin, destination)
    except (nx.NetworkXException, KeyError):
        # Only routing failures (no path / unknown node) are treated as
        # "no distance". The nodes are taken from the trip itself instead of
        # being parsed out of the exception text, which broke on any message
        # that did not have the expected wording.
        print("error on nodes {}, {}".format(origin, destination))
        return np.nan
    return sum(init_graph.get_edge_data(u, v)['dist']
               for u, v in zip(p[:-1], p[1:]))

@toolz.curry
def trip_path(trip, graph=None):
    """Route one trip on ``graph`` and record which streets it uses.

    Side effects on module-level state (which the caller must have set up):
    every consecutive node pair ``(u, v)`` of the route is added to ``S_trip``
    and ``trip.name`` is added to ``T_s[(u, v)]``. The globals are only
    mutated, never rebound, so no ``global`` declaration is needed.

    Parameters
    ----------
    trip : pandas.Series
        A row providing ``"NODEID_O"`` and ``"NODEID_D"``; its ``name`` is
        used as the trip identifier.
    graph : networkx graph
        Graph to route on. Because the function is curried,
        ``trip_path(graph=g)`` yields a callable that still expects ``trip``.

    Returns
    -------
    str or float
        The route's node ids joined with ``","``, or ``np.nan`` when networkx
        cannot route between the two nodes (a message is printed).
    """
    origin, destination = trip["NODEID_O"], trip["NODEID_D"]
    try:
        p = nx.algorithms.shortest_path(graph, origin, destination)
    except (nx.NetworkXException, KeyError):
        # Only routing failures are swallowed; the nodes are reported from the
        # trip rather than parsed out of the exception message.
        print("error on nodes {}, {}".format(origin, destination))
        return np.nan
    # Recording happens outside the try block so that a set-up mistake (for
    # example S_trip still being None) surfaces instead of being reported as a
    # routing error.
    for street in zip(p[:-1], p[1:]):
        S_trip.add(street)
        T_s[street].add(trip.name)
    return ",".join(p)

@toolz.curry
def trip_time(p, graph=None):
    """Sum the ``'weight'`` edge attribute of ``graph`` along a node path.

    Parameters
    ----------
    p : sequence of nodes, or str
        The route as consecutive node ids. A comma-joined string, the form
        ``trip_path`` returns, is split on ``","`` first.
    graph : networkx graph
        Graph whose edge weights are summed. Because the function is curried,
        ``trip_time(graph=g)`` yields a callable that still expects ``p``.

    Returns
    -------
    float
        Total weight along the path; ``0.0`` for a path with fewer than two
        nodes; NaN when ``p`` is ``None`` or NaN (``trip_path`` uses NaN to
        mark a trip that could not be routed).
    """
    if p is None or (isinstance(p, float) and p != p):
        return float("nan")
    if hasattr(p, "split"):
        # Comma-joined node ids -> list of node ids.
        p = p.split(",")
    return sum((graph.get_edge_data(u, v)['weight']
                for u, v in zip(p[:-1], p[1:])), 0.)

# second trip filtering
# Attach the routed street distance to every trip; trips that could not be
# routed come back as NaN and are discarded.
df["dist"] = df.apply(trip_dist, axis=1)
df.dropna(subset=["dist"], inplace=True)
# tt_avg is in minutes, so dividing it by 60 makes this distance per hour.
df["speed"] = df["dist"].div(df["tt_avg"].div(60.))
df["speed"].hist()
# Keep only trips whose implied speed lies strictly between 1 and 65.
df = df.loc[df["speed"].gt(1) & df["speed"].lt(65)].reset_index(drop=True)


In [None]:
from collections import defaultdict

# iterative steps
#
# Each outer pass
#   1. routes every trip on the current graph, which records the streets used
#      (S_trip) and the trips per street (T_s),
#   2. estimates each trip's time as the sum of edge weights along its route,
#   3. accumulates, per street, O_s = sum((et - tt_avg) * n_trips) over the
#      trips using that street,
#   4. scales the weight of every touched street by k: up when the street's
#      offset is negative (estimates too low), down otherwise.
# A scaled graph is accepted only if it lowers |sum of relative errors|;
# otherwise k is shrunk towards 1 and the scaling is retried. The loop ends
# once no k >= 1.0001 gives an improvement.
again = True
done = False
base_graph = init_graph.copy()
while again:
    again = False
    tt = trip_time(graph=base_graph)
    S_trip = set() # all touched streets
    T_s = defaultdict(set) # basically trips_by_street
    O_s = defaultdict(np.float64) # offset_by_street
    # trip_path() fills S_trip / T_s as a side effect, so the routes have to be
    # computed after the accumulators above are reset; this is also the only
    # place df["path"] is produced.
    df["path"] = df.apply(trip_path(graph=base_graph), axis=1)
    # trip_path() stores a route as comma-joined node ids, while the travel
    # time is summed node by node, so split the strings back into lists.
    node_paths = df["path"].str.split(",")
    df["et"] = node_paths.apply(tt)
    df["rel_err"] = (df["et"] - df["tt_avg"]) / df["tt_avg"]
    if not done:
        df["rel_err"].hist()  # error distribution of the very first pass only
    done = True
    for street, trips in T_s.items():
        # .loc needs a list-like indexer; a set is not accepted by newer pandas.
        trips_df = df.loc[list(trips)]
        O_s[street] = ((trips_df["et"] - trips_df["tt_avg"]) * trips_df["n_trips"]).sum()
    k = 1.2
    print("rel_err sum is {}".format(df["rel_err"].sum()))
    while True:
        g_c = base_graph.copy()
        tt2 = trip_time(graph=g_c)
        for street in S_trip:
            a, b = street # street connects nodes a and b
            # NOTE(review): the scaling starts from init_graph's weight, not
            # from base_graph's, so accepted adjustments do not compound
            # across outer passes -- confirm this is intended.
            e = init_graph[a][b]
            if O_s[street] < 0:
                g_c[a][b]['weight'] = e["weight"] * k
            else:
                g_c[a][b]['weight'] = e["weight"] / k
        df["et_new"] = node_paths.apply(tt2)
        df["new_rel_err"] = (df["et_new"] - df["tt_avg"]) / df["tt_avg"]
        print("new_rel_err sum is {}".format(df["new_rel_err"].sum()))
        # NOTE(review): this compares signed sums, so positive and negative
        # errors cancel -- confirm a sum of absolute errors is not wanted.
        if np.abs(df["new_rel_err"].sum()) < np.abs(df["rel_err"].sum()):
            df["et"] = df["et_new"]
            df["rel_err"] = df["new_rel_err"]
            again = True
            base_graph = g_c
            break
        else:
            k = 1 + (k - 1) * .75
            print("k updated to {}".format(k))  # report the value after the update
            if k < 1.0001:
                break

df["rel_err"].hist()

In [None]:
import networkx as nx
# Scratch: Dijkstra path lengths between all pairs of nodes of the initial
# graph (the same function, reached through the top-level networkx namespace).
nx.all_pairs_dijkstra_path_length(init_graph)

In [None]:
# Scratch: rank the streets that no trip touched by how many of their
# neighbouring streets were touched, then average the neighbours' weights.
# NOTE(review): `g` is not defined in any visible cell -- presumably
# base_graph or init_graph; confirm before running.
S = set(g.edges_iter())
# NOTE(review): ES is the very same set object as S, so ES.add(s) below adds
# nothing new and the ES intersection is the whole neighbourhood -- presumably
# ES was meant to start from the touched streets; confirm.
ES = S
NS = S - S_trip
# g.edges(s) receives the edge tuple s as a bunch of nodes, i.e. this collects
# the edges incident to either endpoint of s.
N_S = nodes_by_street = {s: set(g.edges(s)) for s in S}
n_s_i = n_by_street = sorted(
    {s: len(N_S[s].intersection(S_trip)) for s in NS}.items(),
    key=lambda x: x[1],
    reverse=True)

for s, n in n_s_i:
    intersecting_weights = [g.edge[e1][e2]['weight'] for e1, e2 in N_S[s].intersection(ES)]
    # NOTE(review): v_s_i is computed but never stored or used, and n == 0
    # (no touched neighbour) raises ZeroDivisionError here.
    v_s_i = sum(intersecting_weights) / float(n)
    ES.add(s)
    NS = NS - {s}  # s is a single edge tuple, so wrap it to remove that one street

In [None]:
# Scratch: show the trips recorded for one street (the key is a pair of node
# ids). If T_s is the defaultdict(set) built by the iterative cell, looking up
# a key that is absent inserts an empty set for it.
T_s[(u'0043417', u'9048165')]

In [None]:
# Scratch: peek at one arbitrary element of S_trip. The built-in next()/iter()
# pair works on Python 2.6+ and 3, unlike the Python-2-only .next() method.
next(iter(S_trip))

In [None]:
# Scratch: overwrite the 'speed' attribute of one edge. This mutates
# init_graph in place, so every later cell that reads the graph sees it.
# NOTE(review): looks like a manual experiment -- confirm it should stay.
init_graph.edge[u'0066086']['0087506']['speed'] = 25.0

In [None]:
# Scratch: display the attribute dict of the edge between these two nodes.
init_graph.edge[u'0066086']['0087506']

In [None]:
# Scratch arithmetic. NOTE(review): the meaning of the two constants is not
# stated anywhere visible (25 matches the 'speed' value set on an edge in an
# earlier scratch cell) -- document or delete.
.0056284915 * 25

In [None]:
# Show the installed pandas version.
pd.__version__

In [None]:
import functools

In [None]:
import toolz

In [None]:
# Scratch: wrap trip_time in toolz.curry once more (its definition is already
# decorated with @toolz.curry) and rebind `tt`, the name the iterative cell
# also uses for its travel-time function.
tt = toolz.curry(trip_time)

In [None]:
# Scratch: call the curried function with only `p` supplied, so `graph` keeps
# its default of None. trip_time treats `p` as a sequence of nodes, so the
# integer 3 is expected to raise TypeError.
# NOTE(review): this looks like a leftover probe of the currying -- remove it
# so that Restart & Run All does not stop here.
tt(3)