In [1]:
import pandas as pd
# from sklearn.neighbors import NearestNeighbors
import numpy as np

# The NNM is a Generative Model, hence no split of train and test dataset

In [2]:
data = pd.read_csv("./data/tt_data_for_ml.csv")[:10000]

In [3]:
data.head()


Unnamed: 0,id,t_travtime,lat_d,lon_d,lat_o,lon_o,od_dist,year,t_time_start,o_type_ageb,o_type_dist,d_type_ageb,d_type_dist,taxi,wt_tottrips,ageb_eod1990,t_distr_o,t_distr_d
0,1,40.0,19.378813,-99.119835,19.436481,-99.160004,7.732718,1994,1600.0,0,1,1,0,0,281.0,9007032,2,13
1,2,30.0,19.436481,-99.160004,19.378813,-99.119835,7.732718,1994,730.0,1,0,0,1,0,257.0,9007032,13,2
2,4,5.0,19.378813,-99.119835,19.376709,-99.117508,1.423484,1994,805.0,0,1,1,0,0,324.0,9007032,13,13
3,5,10.0,19.378813,-99.119835,19.376709,-99.117508,1.423484,1994,1350.0,0,1,1,0,0,498.0,9007032,13,13
4,7,60.0,19.35512,-99.067688,19.378813,-99.119835,6.098191,1994,700.0,1,0,0,1,0,319.0,9007032,13,39


In [4]:
data = data[["t_travtime", "lat_d", "lon_d", "lat_o", "lon_o", "od_dist"]]
data.head()


Unnamed: 0,t_travtime,lat_d,lon_d,lat_o,lon_o,od_dist
0,40.0,19.378813,-99.119835,19.436481,-99.160004,7.732718
1,30.0,19.436481,-99.160004,19.378813,-99.119835,7.732718
2,5.0,19.378813,-99.119835,19.376709,-99.117508,1.423484
3,10.0,19.378813,-99.119835,19.376709,-99.117508,1.423484
4,60.0,19.35512,-99.067688,19.378813,-99.119835,6.098191


# Basic Agenda
+ Find k nearest neighbors whose the sum of the ordinary and destination distances is smaller than the threshold
+ Using these neightbors' travel time information and use some smoothing techniques to fit
+ techs include spatial decay (weight): 
    * Option1: w = softmax of 1/distance
    * Option2: w = oddistance(i)/oddistance(j)
    * Option3: w = 1/numsOfNeighbors Average of travel time
    * Option2: EMA smoothing 
+ Use these weights * traveltime = prediction and output the prediction



In [5]:
data_distance_inversion = data.copy()

In [6]:
def euclidean_distance(route1, route2):
    # Similarities between two routes

    o_distance = np.sqrt((route1['lat_o'] - route2["lat_o"])**2 + (route1['lon_o'] - route2["lon_o"])**2)
    d_distance = np.sqrt((route1['lat_d'] - route2["lat_d"])**2 + (route1['lon_d'] - route2["lon_d"])**2)
    
    return o_distance + d_distance

In [7]:
def softmax(x):
    # print(x)
    # print(np.max(x))
    exp_x = np.exp(x - np.max(x)) # stability of softmax
    return exp_x / exp_x.sum()


In [8]:
def calculate_relevance(row):
    if row['distance'] != 0:
        return 1 / row['distance']
    else:
        return 1

In [9]:
## This method's weight utilize the inversion of the sum of distances between the o,d pairs 
## And utilize the softmax to calculate the weight

def find_neighbors(dataframe, target_route, numsOfneighbors, max_distance = 10, min_distance = 0, coefficient = 100):
    N = len(dataframe)
    for i in range(N):
        distance = euclidean_distance(target_route, dataframe.loc[i])
        dataframe.loc[i, 'distance'] = distance * 100

    #  Problems: The nums of the neighbors may not be sufficient enough Solved by assigning the relavance to be 1
    neighbors = dataframe.nsmallest(numsOfneighbors + 1, 'distance')
    # print(neighbors)
    neighbors = neighbors[neighbors['distance'] < max_distance] 
    # Exclude the route itself
    # Why distances * 10? because the inversion of distances are too large in softmax method

    
    neighbors['relevance'] = neighbors.apply(calculate_relevance, axis=1)
    neighbors['weight'] = softmax(neighbors['relevance'].values)
    pred = sum(np.array(neighbors["t_travtime"]) * np.array(neighbors["weight"]))
    # print(pred)
    return neighbors

find_neighbors(data_distance_inversion, data_distance_inversion.loc[3], 10)
# What if there exist some same data except the travel time data?

# O -> D
# One point

# 

Unnamed: 0,t_travtime,lat_d,lon_d,lat_o,lon_o,od_dist,distance,relevance,weight
2,5.0,19.378813,-99.119835,19.376709,-99.117508,1.423484,0.0,1.0,0.094544
3,10.0,19.378813,-99.119835,19.376709,-99.117508,1.423484,0.0,1.0,0.094544
10,5.0,19.376394,-99.114906,19.376709,-99.117508,1.367435,0.549059,1.821297,0.214941
65,10.0,19.37195,-99.125443,19.376709,-99.117508,1.734485,0.886287,1.128303,0.107487
3670,15.0,19.383644,-99.12883,19.376709,-99.117508,1.931292,1.021022,0.979411,0.092618
103,30.0,19.376709,-99.117508,19.366385,-99.115265,1.838063,1.3702,0.72982,0.07216
32,30.0,19.374498,-99.132889,19.376709,-99.117508,2.012927,1.374868,0.727343,0.071982
3646,30.0,19.376709,-99.117508,19.389099,-99.11927,1.953199,1.565182,0.638903,0.065889
41,30.0,19.370321,-99.134109,19.376709,-99.117508,2.231685,1.660907,0.602081,0.063507
89,15.0,19.364767,-99.13063,19.376709,-99.117508,2.344453,1.771503,0.564493,0.061164


In [11]:
# Decoration Function 
def predict_traveltime(dataframe, numsOfneighbors, max_distance = 10, min_distance = 0, coefficient = 10):
    N = len(dataframe)
    mse = 0 
    p = []
    for i in range(N):
        # Time Complexity O(n * 2)
        target_route = dataframe.loc[i]
        gt = target_route["t_travtime"]
        neighbors = find_neighbors(dataframe, target_route, numsOfneighbors, max_distance, min_distance, coefficient)
        prediction = sum(np.array(neighbors["t_travtime"]) * np.array(neighbors["weight"]))
        # print(f"GT: {gt}, pred: {prediction}")
        mse += (gt - prediction) ** 2
        p.append(prediction)
        # print(mse)

    return mse / N, p

mse = predict_traveltime(data_distance_inversion[:1000], 10)[0]
y_pred = predict_traveltime(data_distance_inversion[:1000], 10)[1]
# MSE = 270000 To be modified.
mse # 269.87


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


269.87266132819235

# Method 2
+ Use the cosine function to calculate the similarities between pairs 
+ Use these similarities to linear combination of the travel time

In [None]:
cos_data = data.copy()
cos_data.head()
# minimum o#  normalized 
# more attributes

Unnamed: 0,t_travtime,lat_d,lon_d,lat_o,lon_o,od_dist
0,40.0,19.378813,-99.119835,19.436481,-99.160004,7.732718
1,30.0,19.436481,-99.160004,19.378813,-99.119835,7.732718
2,5.0,19.378813,-99.119835,19.376709,-99.117508,1.423484
3,10.0,19.378813,-99.119835,19.376709,-99.117508,1.423484
4,60.0,19.35512,-99.067688,19.378813,-99.119835,6.098191


In [None]:
def cosine_similarity(route1, route2):
    # drop traveltime 

    q = np.array(route1.drop(['t_travtime']))
    k = np.array(route2.drop(['t_travtime']))
    similarity = np.dot(q,k) / ((np.linalg.norm(q) * np.linalg.norm(k)))
    # o_distance = np.sqrt((route1['lat_o'] - route2["lat_o"])**2 + (route1['lon_o'] - route2["lon_o"])**2)
    # d_distance = np.sqrt((route1['lat_d'] - route2["lat_d"])**2 + (route1['lon_d'] - route2["lon_d"])**2)
    
    return similarity

cosine_similarity(cos_data.loc[1], cos_data.loc[3] )


20436.272203125
0.99902736777309


In [None]:
def find_neighbors(dataframe, target_route, numsOfneighbors, max_distance = 10, min_distance = 0, coefficient = 100):
    N = len(dataframe)
    for i in range(N):
        similarity = cosine_similarity(target_route, dataframe.loc[i])
        dataframe.loc[i, 'similarity'] = similarity

    #  Problems: The nums of the neighbors may not be sufficient enough Solved by assigning the relavance to be 1
    neighbors = dataframe.nsmallest(numsOfneighbors + 1, 'distance')
    # print(neighbors)
    neighbors = neighbors[neighbors['distance'] < max_distance] 
    # Exclude the route itself
    # Why distances * 10? because the inversion of distances are too large in softmax method

    neighbors['relevance'] = neighbors.apply(calculate_relevance, axis=1)
    neighbors['weight'] = softmax(neighbors['relevance'].values)
    pred = sum(np.array(neighbors["t_travtime"]) * np.array(neighbors["weight"]))
    # print(pred)
    return neighbors