In [26]:
import pandas as pd
import os
from datetime import date, datetime
import re
from geopy.distance import distance

In [33]:
# Build the paths with os.path.join so the separators fit the host OS.
bog_path, mex_path, equ_path = (
    os.path.join("dataset", file_name)
    for file_name in ("bog_clean.csv", "mex_clean.csv", "uio_clean.csv")
)
# Read the three cleaned trip files.
bog_df, mex_df, equ_df = (
    pd.read_csv(csv_path) for csv_path in (bog_path, mex_path, equ_path)
)
# Tag every row with its country so the sources stay distinguishable after merging.
# NOTE(review): these labels are compared with price_df.country in the pricing
# cell -- confirm the price table uses the same spelling (e.g. "equador").
for frame, country_name in (
    (bog_df, "colombia"),
    (mex_df, "mexico"),
    (equ_df, "equador"),
):
    frame["country"] = country_name
# Merge the sources, drop the unused columns, then remove exact duplicate rows.
dataset_df = (
    pd.concat([bog_df, mex_df, equ_df], ignore_index=True)
    .drop(columns=["store_and_fwd_flag", "id"])
    .drop_duplicates()
)
# Parse both timestamps and keep only the calendar date (the time of day is discarded).
for stamp_col in ("pickup_datetime", "dropoff_datetime"):
    parsed = pd.to_datetime(dataset_df[stamp_col], format="%Y/%m/%d %H:%M:%S")
    dataset_df[stamp_col] = parsed.dt.date
# Order the trips by pickup date and renumber the rows 0..n-1.
dataset_df = dataset_df.sort_values(by=["pickup_datetime"]).reset_index(drop=True)

In [136]:
def geodesic(p_lon, p_lat, d_lon, d_lat):
    """Return a calibrated pickup-to-dropoff distance in meters.

    The geodesic distance between the two points is scaled by ``COEF``, a
    coefficient that calibrates the geodesic result to nearly match OSRM
    (see trip_estimator_methods.ipynb for how it was obtained).

    Parameters
    ----------
    p_lon, p_lat : pickup longitude and latitude.
    d_lon, d_lat : dropoff longitude and latitude.

    Returns
    -------
    The calibrated distance in meters (not rounded).
    """
    COEF = 1.5165
    # Points are handed to ``distance`` as (latitude, longitude) pairs.
    kilometers = distance((p_lat, p_lon), (d_lat, d_lon)).km
    return kilometers * COEF * 1000

def distance_estimator(df):
    """Estimate the travelled distance of every trip in ``df``.

    Each row's pickup and dropoff coordinates are passed to ``geodesic`` and the
    result is truncated to whole meters. The caller is expected to store the
    returned list in a column named ``est_meters`` (``duration_estimator`` and
    ``labeler`` read that name).

    Rows are walked by position, so the frame may carry any index (for example
    after filtering) rather than only the default 0..n-1 one.

    Parameters
    ----------
    df : DataFrame with ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns.

    Returns
    -------
    list of int, one estimated distance in meters per row, in row order.
    """
    coordinates = zip(
        df["pickup_longitude"],
        df["pickup_latitude"],
        df["dropoff_longitude"],
        df["dropoff_latitude"],
    )
    return [
        int(geodesic(p_lon, p_lat, d_lon, d_lat))
        for p_lon, p_lat, d_lon, d_lat in coordinates
    ]

def duration_estimator(df, speed_kmh=40):
    """Estimate the duration of every trip from its estimated distance.

    Speed assumption behind the default: the in-city limit is 50 km/h and the
    residential limit is 30 km/h, so their average, 40 km/h, is used.
    For the tolerances applied later: 40 km/h versus 60 km/h is a 0.6667 scale
    difference, and 40 km/h versus 15 km/h is a 2.6667 scale difference.

    Rows are walked by position, so the frame may carry any index.

    Parameters
    ----------
    df : DataFrame with an ``est_meters`` column (estimated distance in meters).
        The column must carry exactly that name.
    speed_kmh : assumed average speed in km/h (default 40).

    Returns
    -------
    list of int, the estimated travel time in seconds per row (truncated).
    """
    # Convert km/h to m/s once instead of on every row.
    meters_per_second = speed_kmh * (1000/3600)
    return [int(meters / meters_per_second) for meters in df["est_meters"]]

def labeler(df):
    """Label every trip as plausible (1) or not (0).

    A trip is labelled 1 only when none of the rejection rules fire and both the
    recorded distance and the recorded duration fall inside their tolerance
    windows around the estimates:

    * rejected when the dropoff date is more than 1 day after the pickup date;
    * rejected when ``est_meters`` is below 700 m;
    * rejected when ``wait_sec`` is above 7 minutes (420 s);
    * ``dist_meters`` must lie strictly between 0.8x and 1.5x ``est_meters``
      (shortcut versus longer route);
    * ``trip_duration`` must lie strictly between 0.6667x ``est_duration``
      (faster, ~60 km/h instead of 40 km/h) and 2.6667 * 1.5x ``est_duration``
      (slower, ~15 km/h instead of 40 km/h, plus 50 % for congestion).

    Rows are walked by position, so the frame may carry any index.

    Parameters
    ----------
    df : DataFrame with ``pickup_datetime``, ``dropoff_datetime``,
        ``est_meters``, ``est_duration``, ``dist_meters``, ``trip_duration``
        and ``wait_sec`` columns.

    Returns
    -------
    list of int (0 or 1), one label per row, in row order.
    """
    MIN_DISTANCE = 700   # meters
    MAX_WAIT = 7 * 60    # seconds
    labels = []
    rows = zip(
        df["pickup_datetime"],
        df["dropoff_datetime"],
        df["est_meters"],
        df["est_duration"],
        df["dist_meters"],
        df["trip_duration"],
        df["wait_sec"],
    )
    for pickup, dropoff, est_m, est_s, dist_m, trip_s, wait_s in rows:
        day_delta = (dropoff - pickup).days
        # Hard rejections, checked in the same order as before.
        if day_delta > 1 or est_m < MIN_DISTANCE or wait_s > MAX_WAIT:
            labels.append(0)
            continue
        distance_ok = (dist_m > est_m * 0.8) and (dist_m < est_m * 1.5)
        duration_ok = (trip_s > est_s * 0.6667) and (trip_s < est_s * 2.6667 * 1.5)
        labels.append(1 if (distance_ok and duration_ok) else 0)
    return labels

def services_extractor(df):
    """Derive the 2022 service name of every trip from its ``vendor_id``.

    The first known service keyword found in the lower-cased vendor id is
    mapped onto the services that exist in 2022:

    * ``ubervan`` and ``ubersuv`` become ``uberxl``;
    * ``uberangel`` (exclusive to Colombia) becomes ``uberblack``;
    * ``taxi``, ``uberx``, ``uberxl`` and ``uberblack`` are kept as they are;
    * a vendor id with no known keyword, or a missing one, becomes ``taxi``.

    Side effect: ``df["vendor_id"]`` is lower-cased in place, as before.
    Rows are walked by position, so the frame may carry any index.

    Parameters
    ----------
    df : DataFrame with a string ``vendor_id`` column.

    Returns
    -------
    list of str, one service name per row, in row order.
    """
    # "uberxl" is listed before "uberx" so the longer keyword wins.
    SERVICE_NAME = re.compile(
        r"taxi|uberxl|uberx|uberblack|ubervan|uberangel|ubersuv"
    )
    REMAPPED = {"ubervan": "uberxl", "ubersuv": "uberxl", "uberangel": "uberblack"}
    df["vendor_id"] = df["vendor_id"].str.lower()
    services = []
    for vendor in df["vendor_id"]:
        # Missing vendor ids are not strings; treat them like an unknown vendor.
        found = SERVICE_NAME.search(vendor) if isinstance(vendor, str) else None
        name = found.group() if found is not None else "taxi"
        services.append(REMAPPED.get(name, name))
    return services

def price_estimator(df):
    """Placeholder for the 2022 Uber price estimate; not implemented yet.

    Planned rule, per the original notes: the price is in US dollars and equals
    base fare + per-minute rate + per-kilometer rate, replaced by the minimum
    fare whenever the calculated price is lower.

    For now ``df`` is ignored and 0 is always returned.
    """
    return 0

In [124]:
# Work on a real copy: plain assignment only aliases dataset_df, so the column
# writes made on df_cp further down would also modify dataset_df.
df_cp = dataset_df.copy()

In [121]:
# Scratch check of the service-mapping rules on one row (index label 165).
# Inspect that row with: df_cp.loc[[165]]
df_cp["vendor_id"] = df_cp["vendor_id"].str.lower()
SERVICE_NAME = re.compile(r"taxi|uberxl|uberx|uberblack|ubervan|uberangel|ubersuv")
service = []
extract = SERVICE_NAME.search(df_cp.vendor_id[165])
if extract is None:
    # No known keyword in the vendor id: fall back to taxi.
    service.append("taxi")
else:
    ext_group = extract.group()
    if ext_group in ("ubervan", "ubersuv"):
        service.append("uberxl")
    elif ext_group == "uberangel":
        service.append("uberblack")
    else:
        service.append(ext_group)

In [122]:
# Show the service name resolved for the single sample row checked in the previous cell.
print(service)

['uberxl']


In [138]:
# Add the derived columns one after another. The order matters:
# duration_estimator reads est_meters, and labeler reads est_meters and est_duration.
derived_columns = (
    ("service", services_extractor),
    ("est_meters", distance_estimator),
    ("est_duration", duration_estimator),
    ("label", labeler),
)
for column_name, build_column in derived_columns:
    df_cp[column_name] = build_column(df_cp)

In [37]:
# Load the 2022 price table from the dataset folder. The pricing loop further
# down reads its country, name, min_fare, base_fare, cancel_fee, per_min and
# per_km columns.
price_path = os.path.join("dataset", "uber_prices_2022.csv")
price_df = pd.read_csv(price_path)

In [132]:
# Sanity check: list the distinct service names stored in the "service" column.
df_cp["service"].unique()

array(['taxi', 'uberx', 'uberxl', 'uberblack'], dtype=object)

In [11]:
# p_service_data = price_df[(price_df.country == "colombia") & (price_df.name == "taxi")]

In [16]:
# for j in range(len(p_service_data)) :
#     print(p_service_data.min_fare[j])

1.056


In [18]:
# type(p_service_data["min_fare"][0])

numpy.float64

If the label is 1, calculate the price from the recorded trip data (`trip_duration`, `dist_meters`). If the label is 0, calculate it from the estimated data (`est_duration`, `est_meters`).

In [150]:
# Price every trip (the price_estimator notes state the prices are in US dollars):
#   fare = base_fare + per_min * minutes + per_km * kilometers, never below min_fare.
# Label 1 rows are priced from the recorded duration/distance, label 0 rows from
# the estimated ones. Rows shorter than 60 seconds, and label 0 rows that waited
# more than 7 minutes, are charged the cancel fee instead.
MIN_TRIP_SEC = 60
MAX_WAIT_SEC = 7 * 60

# Index the price table once by (country, service) instead of filtering it for
# every trip; when a pair occurs more than once, its first row is used.
fare_lookup = {}
for fare_row in price_df.to_dict("records"):
    fare_lookup.setdefault((fare_row["country"], fare_row["name"]), fare_row)

total = []
# Walk the rows by position so the result does not depend on df_cp's index labels.
trip_rows = zip(
    df_cp["service"],
    df_cp["country"],
    df_cp["label"],
    df_cp["trip_duration"],
    df_cp["dist_meters"],
    df_cp["est_duration"],
    df_cp["est_meters"],
    df_cp["wait_sec"],
)
for service, country, label, trip_sec, trip_m, est_sec, est_m, wait_sec in trip_rows:
    fares = fare_lookup.get((country, service))
    if fares is None:
        # Fail with a readable message instead of an opaque conversion error.
        raise KeyError(f"no price row for service {service!r} in country {country!r}")
    MIN_FARE = round(float(fares["min_fare"]), 1)
    BASE_FARE = float(fares["base_fare"])
    CANCEL_FEE = round(float(fares["cancel_fee"]), 1)
    if label == 1:
        # Valid row: use the recorded trip.
        seconds, meters = trip_sec, trip_m
        is_charged = trip_sec >= MIN_TRIP_SEC
    else:
        # Not valid but still priced: use the estimates. Only waits longer than
        # 7 minutes count as a cancel, so exactly 7 minutes is still charged
        # (the same boundary labeler uses).
        seconds, meters = est_sec, est_m
        is_charged = (est_sec >= MIN_TRIP_SEC) and (wait_sec <= MAX_WAIT_SEC)
    if is_charged:
        PER_MIN = float(fares["per_min"]) * seconds / 60
        PER_KM = float(fares["per_km"]) * meters / 1000
        calc = round(BASE_FARE + PER_MIN + PER_KM, 1)
        total.append(calc if calc >= MIN_FARE else MIN_FARE)
    else:
        total.append(CANCEL_FEE)
print(total)

[0.0, 0.0, 0.0, 0.0, 6.6, 0.0, 10.3, 2.8, 2.0, 0.0, 0.0, 0.0, 0.0, 4.1, 0.0, 0.0, 3.6, 1.9, 1.7, 1.7, 0.0, 6.5, 3.7, 1.5, 2.3, 0.0, 2.5, 2.7, 0.0, 1.6, 0.0, 0.0, 3.6, 12.8, 0.0, 0.0, 3.1, 4.2, 0.0, 0.0, 2.2, 3.0, 6.6, 2.0, 1.7, 11.4, 1.3, 2.2, 0.0, 1.3, 0.0, 1.3, 3.1, 7.3, 0.0, 0.0, 4.8, 7.0, 0.0, 0.0, 0.0, 3.5, 0.0, 2.6, 0.0, 0.0, 2.4, 0.0, 0.0, 0.0, 3.9, 1.4, 2.9, 3.3, 1.9, 2.3, 6.0, 0.0, 0.0, 1.3, 1.7, 0.0, 3.1, 0.0, 3.5, 1.9, 4.0, 2.7, 2.4, 1.3, 2.0, 3.1, 1.3, 3.4, 4.8, 2.4, 2.2, 2.9, 4.3, 0.0, 1.6, 6.3, 1.3, 1.3, 1.9, 5.8, 0.0, 1.7, 1.3, 2.0, 0.0, 1.7, 1.8, 0.0, 3.9, 0.0, 0.0, 0.0, 0.0, 4.5, 0.0, 6.8, 3.9, 1.3, 0.0, 0.0, 0.0, 2.3, 0.0, 2.3, 0.0, 1.9, 0.0, 0.0, 1.6, 2.4, 0.0, 2.1, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 2.2, 0.0, 2.7, 8.9, 0.0, 2.5, 0.0, 4.6, 1.3, 0.0, 1.4, 3.7, 2.3, 0.0, 1.7, 1.8, 2.9, 2.5, 0.0, 0.0, 1.1, 0.0, 0.0, 1.5, 0.0, 5.8, 0.0, 1.3, 3.2, 3.6, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 7.9, 0.0, 6.9, 3.5, 1.6, 3.8, 1.4, 0.0, 5.7, 5.3, 2.0, 0.0, 2.8, 3.0, 3.0, 0.0, 1.4, 0.0, 0

In [None]:
        # NOTE(review): this cell was never executed (In [None]) and is not valid
        # Python on its own -- it opens with an `else` that has no matching `if`.
        # It looks like a leftover paste of the pricing loop's branches, with the
        # label-0 branch lacking the wait_sec check; confirm it can be deleted.
        else : 
            # for valid but cancel
            calc = CANCEL_FEE
            total.append(calc)



else :
        # NOT VALID DATA CALCULATION
        if df_cp.est_duration[i] >= 60 :
            # for NOT valid but NOT cancel
            PER_MIN = float(p_service_data["per_min"]) * df_cp.est_duration[i] / 60
            PER_KM = float(p_service_data["per_km"]) * df_cp.est_meters[i] / 1000
            calc = BASE_FARE + PER_MIN + PER_KM
            calc = round(calc, 1)
            if calc >= MIN_FARE :
                total.append(calc)
            else :
                total.append(MIN_FARE)
        else :
            # for NOT valid but cancel
            calc = CANCEL_FEE
            total.append(calc)