In [5]:
import pandas as pd
import os
from datetime import date, datetime
import json
import requests

In [6]:
# Build the paths to the cleaned bog, mex and uio CSV files in an
# OS-independent way.
bog_path, mex_path, equ_path = (
    os.path.join("dataset", csv_name)
    for csv_name in ("bog_clean.csv", "mex_clean.csv", "uio_clean.csv")
)
# Load one frame per city.
bog_df, mex_df, equ_df = map(pd.read_csv, (bog_path, mex_path, equ_path))
# Combine the cities, drop the unused columns and exact duplicate rows, reduce
# both timestamps to plain calendar dates (time of day is discarded), then
# order the trips by pickup date with a fresh 0..n-1 index.
dataset_df = (
    pd.concat([bog_df, mex_df, equ_df], ignore_index=True)
    .drop(columns=["store_and_fwd_flag", "id"])
    .drop_duplicates()
    .assign(
        pickup_datetime=lambda trips: pd.to_datetime(
            trips["pickup_datetime"], format="%Y/%m/%d %H:%M:%S"
        ).dt.date,
        dropoff_datetime=lambda trips: pd.to_datetime(
            trips["dropoff_datetime"], format="%Y/%m/%d %H:%M:%S"
        ).dt.date,
    )
    .sort_values(by="pickup_datetime")
    .reset_index(drop=True)
)

In [7]:
def distance_duration_estimator(df):
    """Estimate the driving distance and duration of every trip in ``df``.

    Each row's pickup and dropoff coordinates are passed to ``osrm_api``, which
    queries the OSRM (Open Source Routing Machine) API. A routed estimate is
    used instead of the haversine formula because the length of a straight
    line between two points is not a realistic measure of a road trip.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``. Any index is accepted.

    Returns
    -------
    tuple of (list, list)
        ``(distance, duration)`` with one entry per row of ``df``, in row
        order. Both lists are empty when ``df`` has no rows.
    """
    distance = []
    duration = []
    # Walk the coordinate columns positionally. A ``column[i]`` lookup is
    # label-based, so it only works when the index is exactly 0..n-1 and
    # fails (or picks the wrong row) on a filtered or re-ordered frame.
    trips = zip(
        df["pickup_longitude"],
        df["pickup_latitude"],
        df["dropoff_longitude"],
        df["dropoff_latitude"],
    )
    for pickup_long, pickup_lat, dropoff_long, dropoff_lat in trips:
        trip_distance, trip_duration = osrm_api(
            pickup_long, pickup_lat, dropoff_long, dropoff_lat
        )
        distance.append(trip_distance)
        duration.append(trip_duration)
    return distance, duration

def osrm_api(p_lon, p_lat, d_lon, d_lat, timeout=30):
    """Request the driving distance and duration between two locations.

    Sends one GET request to the OSRM routing service at
    router.project-osrm.org (see http://project-osrm.org/) and reads the first
    route of the response.

    Parameters
    ----------
    p_lon, p_lat
        Longitude and latitude of the pickup location.
    d_lon, d_lat
        Longitude and latitude of the dropoff location.
    timeout : float, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block forever.

    Returns
    -------
    tuple
        ``(map_distance, map_duration)``: distance in meters and duration in
        seconds.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an HTTP error status.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    r = requests.get(
            f"""http://router.project-osrm.org/route/v1/driving/{p_lon},{p_lat};{d_lon},{d_lat}?overview=false""",
            timeout=timeout,
    )
    # Fail with the HTTP status and URL instead of an opaque JSON/Key error
    # when the service rejects the request.
    r.raise_for_status()
    route = r.json()["routes"][0]
    map_distance = route["distance"]
    map_duration = route["duration"]
    return map_distance, map_duration

In [None]:
# Estimate the driving distance and duration for every row of dataset_df.
# NOTE: distance_duration_estimator calls osrm_api once per row, and each call
# sends an HTTP request, so this cell needs network access and its run time
# grows with the number of rows.
est_dist, est_duration = distance_duration_estimator(dataset_df)

In [None]:
# Collect the per-trip estimates into a two-column frame (one row per trip,
# in the same order as the lists) and write it to disk without the index.
estimation_path = os.path.join("dataset", "trip_estimated.csv")
estimation = pd.DataFrame(dict(est_dist=est_dist, est_duration=est_duration))
estimation.to_csv(estimation_path, index=False)