In [1]:
import pandas as pd
import os
from datetime import date, datetime
import json
import requests
from geopy.distance import distance 
from math import radians, cos, sin, asin, sqrt

In [2]:
# OS dependent path
# load bog, mex, and uio csv
bog_path = os.path.join("dataset", "bog_clean.csv")
mex_path = os.path.join("dataset", "mex_clean.csv")
equ_path = os.path.join("dataset", "uio_clean.csv")
# read dataset
bog_df = pd.read_csv(bog_path)
mex_df = pd.read_csv(mex_path)
equ_df = pd.read_csv(equ_path)
dataset_df = pd.concat([bog_df, mex_df, equ_df], ignore_index=True)
dataset_df = dataset_df.drop(["store_and_fwd_flag", "id"], axis=1)
dataset_df = dataset_df.drop_duplicates()
# change date and time format
dataset_df["pickup_datetime"] = pd.to_datetime(dataset_df["pickup_datetime"], format="%Y/%m/%d %H:%M:%S")
dataset_df["dropoff_datetime"] = pd.to_datetime(dataset_df["dropoff_datetime"], format="%Y/%m/%d %H:%M:%S")
# strip time from datetime
dataset_df["pickup_datetime"] = pd.to_datetime(dataset_df["pickup_datetime"]).dt.date
dataset_df["dropoff_datetime"] = pd.to_datetime(dataset_df["dropoff_datetime"]).dt.date
# sort by pickup date time
dataset_df = dataset_df.sort_values(by=["pickup_datetime"])
dataset_df = dataset_df.reset_index(drop=True)

In [154]:
def osrm_api(p_lon, p_lat, d_lon, d_lat) :
    # request API to get driving distance and duration between 2 locations
    # http://project-osrm.org/
    # distance in meters and duration in seconds
    r = requests.get(
            f"""http://router.project-osrm.org/route/v1/driving/{p_lon},{p_lat};{d_lon},{d_lat}?overview=false"""
    )
    route = json.loads(r.content)["routes"][0]
    map_distance = route["distance"]
    map_duration = route["duration"]
    return map_distance, map_duration

def haversine(p_lon, p_lat, d_lon, d_lat) :
    # calculate distance using the great circle method
    # R = earth radius in KM
    R = 6372.8 
    dLat = radians(d_lat - p_lat)
    dLon = radians(d_lon - d_lon)
    lat1 = radians(p_lat)
    lat2 = radians(d_lat)
    a = sin(dLat/2)**2 + cos(lat1)*cos(lat2)*sin(dLon/2)**2
    c = 2*asin(sqrt(a))
    return R * c

def geodesic(p_lon, p_lat, d_lon, d_lat) :
    # calculate distance using geodesic method
    pickup = (p_lat, p_lon)
    dropoff = (d_lat, d_lon)
    result = distance(pickup, dropoff).km
    return result * 1000 * 1.5165198608676824

def distance_estimator(df) :
    # calculate the duration then append to est_duration
    # using geodesic
    distance = []
    for i in range(len(df)) :
        PICKUP_LONG = df.pickup_longitude[i]
        PICKUP_LAT = df.pickup_latitude[i]
        DROPOFF_LONG = df.dropoff_longitude[i]
        DROPOFF_LAT = df.dropoff_latitude[i]
        result = geodesic(PICKUP_LONG, PICKUP_LAT, DROPOFF_LONG, DROPOFF_LAT) *1000
        distance.append(result)
    return distance

def time_calc(meters, kmph) :
    time =  meters/ (kmph * (1000/3600))
    return time

def duration_estimator(df) :
    # dalam kota limit 50 km/h
    # pemukiman limit 30 km/h
    # ambil rata-rata kecepatan jadi 40 km/h
    duration = []
    for i in range(len(df)) :
        d = df.est_meters[i]
        v = 40 # average speed in km/h
        t = time_calc(d, v) # time travel in seconds
        duration.append(t)
    return duration

def calibration(df) : 
    # calibrate geodesic to osrm with an coeficient
    # pass 100 samples of shuffled data
    SPEED = 40
    time_coef = []
    dist_coef = []
    for i in range(len(df)) : 
        PICKUP_LONG = df.pickup_longitude[i]
        PICKUP_LAT = df.pickup_latitude[i]
        DROPOFF_LONG = df.dropoff_longitude[i]
        DROPOFF_LAT = df.dropoff_latitude[i]
        osrm_data = osrm_api(PICKUP_LONG, PICKUP_LAT, DROPOFF_LONG, DROPOFF_LAT)
        geodesic_data = geodesic(PICKUP_LONG, PICKUP_LAT, DROPOFF_LONG, DROPOFF_LAT) * 1000
        geodesic_time_data = time_calc(geodesic_data, SPEED)
        dc = osrm_data[0] / geodesic_data
        tc = osrm_data[1] / geodesic_time_data
        dist_coef.append(dc)
        time_coef.append(tc)
    result_dist_coef = sum(dist_coef) / len(df)
    result_time_coef = sum(time_coef) / len(df)
    return result_dist_coef, result_time_coef

In [155]:
print(osrm_api(-74.05888051, 4.711039752, -74.11598336, 4.66310966))
print(geodesic(-74.05888051, 4.711039752, -74.11598336, 4.66310966))
print(time_calc(geodesic(-74.05888051, 4.711039752, -74.11598336, 4.66310966), 40))

print(haversine(-74.05888051, 4.711039752, -74.11598336, 4.66310966) * 1000)
print(time_calc(haversine(-74.05888051, 4.711039752, -74.11598336, 4.66310966) * 1000, 40))

(12704.3, 926.2)
12526.778764998448
2.732952293981997
5331.088832256043
1.1630772544850452


In [113]:
# iterating the calibration 10 times and get the average
coefs = []
for i in range(10) :
    sample_df = dataset_df.sample(30)
    sample_df = sample_df.sort_values(by=["pickup_datetime"])
    sample_df = sample_df.reset_index(drop=True)
    coefs.append(calibration(sample_df))

In [120]:
dist_c = [1.5812027456359385, 1.775139120811639, 1.4392439755714015, 1.3834894548884848, 1.4078692995501634, 1.383879117744345, 1.4912986182103856, 1.4227422885901597, 1.7266688709667026, 1.553665116707606]
dist_t = [1.783073914785963, 2.397118283815813, 1.592531898189508, 1.4400788390020063, 1.5754468084440547, 1.5953026198694815, 1.7228758377662727, 1.495075463027367, 2.083137218285952, 1.6176344445670887]

In [123]:
print(sum(dist_c) / len(dist_c))
print(sum(dist_t) / len(dist_t))

1.5165198608676824
1.7302275327753507


In [153]:
print(osrm_api(-74.05888051, 4.711039752, -74.11598336, 4.66310966))
print(geodesic(-74.05888051, 4.711039752, -74.11598336, 4.66310966)) 
print(time_calc(geodesic(-74.05888051, 4.711039752, -74.11598336, 4.66310966), 40)) 

(12704.3, 926.2)
12526.778764998448
1127.4100888498604


the distance seems reliable with coefs, but the duration not... lets calibrate the duration only but after adding the distance coeficient

In [148]:
t_coefs = []
for i in range(10) :
    sample_df = dataset_df.sample(30)
    sample_df = sample_df.sort_values(by=["pickup_datetime"])
    sample_df = sample_df.reset_index(drop=True)
    t_coefs.append(calibration(sample_df)[1])

In [149]:
t_coefs

[0.0016042374342462877,
 0.001626591171339282,
 0.0011225784683371136,
 0.0011129368256387238,
 0.008220444363866484,
 0.000998268400835961,
 0.0012050853179174517,
 0.005805988925102309,
 0.0010993998361858589,
 0.0014454470605348218]

In [151]:
print(sum(t_coefs) / len(t_coefs))

0.0024240977804004292


no just dont use its bad, only use the distance!