In [121]:
import numpy as np
import pandas as pd
import pickle
from sklearn.neighbors import NearestCentroid
from geopy.distance import geodesic
# from geopy.geocoders import Nominatim
from datetime import datetime

In [123]:
#maps module
import googlemaps

In [165]:
with open('../models/xgb_model.pkl', 'rb') as x:
    xgb_model = pickle.load(x)

In [167]:
with open('../models/dbs_pickups.pkl', 'rb') as p:
    dbs1_model = pickle.load(p)

In [169]:
with open('../models/dbs_dropoffs.pkl', 'rb') as d:
    dbs2_model = pickle.load(d)

In [171]:
# confirm
print(type(xgb_model))
print(type(dbs1_model))
print(type(dbs2_model))

<class 'sklearn.model_selection._search.GridSearchCV'>
<class 'sklearn.cluster._dbscan.DBSCAN'>
<class 'sklearn.cluster._dbscan.DBSCAN'>


In [133]:
original_pickup_coordinates = pd.read_csv('../data/clean_data/taxi_clean_set_v3.csv', usecols = ['pickup_longitude', 'pickup_latitude'])
print(original_pickup_coordinates.shape)
original_pickup_coordinates.head(2)

(53351, 2)


Unnamed: 0,pickup_longitude,pickup_latitude
0,-73.993578,40.721298
1,-73.985358,40.744602


In [135]:
original_dropoff_coordinates = pd.read_csv('../data/clean_data/taxi_clean_set_v3.csv', usecols = ['dropoff_longitude', 'dropoff_latitude'])
print(original_dropoff_coordinates.shape)
original_dropoff_coordinates.head(2)

(53351, 2)


Unnamed: 0,dropoff_longitude,dropoff_latitude
0,-73.967109,40.756657
1,-73.956073,40.778083


In [174]:
api_key = 'AIzaSyAtqQc3fGOEPHdXUN_OQaQ_bE54ogYcSvk'
gmaps = googlemaps.Client(key = api_key)

In [176]:
address = 'Strandtorget 1, 9008 Tromsø, Norway'

In [178]:
def get_coordinates(address):
    location = gmaps.geocode(address)
    if location:
        return location[0]['geometry']['location']['lat'], location[0]['geometry']['location']['lng']
    else:
        raise ValueError(f'Address {address} is invalid. Try again')

In [180]:
get_coordinates(address)  # <<---- this function works!

(69.64735879999999, 18.9561025)

In [208]:
def pickup_cluster_mapping(dbs1_model, original_pickup_coordinates):
   
    cluster_centers = []
    cluster_labels = []

    # compute the centroid of each cluster (excluding noise points)
    for cluster_id in np.unique(dbs1_model.labels_):
        
        if cluster_id != -1:
            points_in_cluster = original_pickup_coordinates[dbs1_model.labels_ == cluster_id]
            cluster_centers.append(points_in_cluster.mean(axis = 0))
            cluster_labels.append(cluster_id)

    cluster_centers = np.array(cluster_centers)
    cluster_labels = np.array(cluster_labels)

    nearest_centroid = NearestCentroid()
    nearest_centroid.fit(cluster_centers, cluster_labels)

    return nearest_centroid

In [210]:
def dropoff_cluster_mapping(dbs2_model, original_dropoff_coordinates):
    
    cluster_centers = []
    cluster_labels = []

    # compute the centroid of each cluster (excluding noise points)
    for cluster_id in np.unique(dbs2_model.labels_):
        
        if cluster_id != -1:
            points_in_cluster = original_dropoff_coordinates[dbs2_model.labels_ == cluster_id]
            cluster_centers.append(points_in_cluster.mean(axis = 0))
            cluster_labels.append(cluster_id)

    cluster_centers = np.array(cluster_centers)
    cluster_labels = np.array(cluster_labels)

    nearest_centroid = NearestCentroid()
    nearest_centroid.fit(cluster_centers, cluster_labels)

    return nearest_centroid

In [212]:
def is_within_cluster(coordinates, cluster, cluster_mapping_model):

    predicted_cluster = cluster_mapping_model.predict([coordinates])[0]

    if predicted_cluster == cluster:
        return 1
    else:
        return 0

In [214]:
def prepare_features(pickup_address,
                     dropoff_address, 
                     pickup_cluster_mapping_model = pickup_cluster_mapping(dbs1_model, original_pickup_coordinates), 
                     dropoff_cluster_mapping_model = dropoff_cluster_mapping(dbs2_model, original_dropoff_coordinates)):

    # parse coordinates ------------------------------------------------------------------------------------------
    pickup_coordinates = get_coordinates(pickup_address)
    dropoff_coordinates = get_coordinates(dropoff_address)
    #-------------------------------------------------------------------------------------------------------------

    
    # determine geodesic distance --------------------------------------------------------------------------------
    geodesic_distance = geodesic(pickup_coordinates, dropoff_coordinates).kilometers

    # estimate actual ride distance
    if geodesic_distance < 10:
        estimated_distance = geodesic_distance * 1.15
    else:
        estimated_distance = geodesic_distance * 1.2
    #-------------------------------------------------------------------------------------------------------------


    
    # chronological variables-------------------------------------------------------------------------------------
    now = datetime.now()
    hour = now.hour
    day = now.weekday()
    month = now.month

    # weekend rides
    if day in [5, 6]:
        weekend_rides = 1
    else:
        weekend_rides = 0    

    # xmas holiday rides
    if month in [11, 12]:
        holiday_rides = 1
    else:
        holiday_rides = 0

    # distance_hour interaction
    if estimated_distance * hour == 0:
        distance_hour = estimated_distance
    else:
        distance_hour = estimated_distance * hour
    #------------------------------------------------------------------------------------------------------------

    
    # clusters---------------------------------------------------------------------------------------------------
    # pickup clusters
    p_0 = is_within_cluster(pickup_coordinates, cluster = 0, cluster_mapping_model = pickup_cluster_mapping_model)
    p_1 = is_within_cluster(pickup_coordinates, cluster = 1, cluster_mapping_model = pickup_cluster_mapping_model)
    p_3 = is_within_cluster(pickup_coordinates, cluster = 3, cluster_mapping_model = pickup_cluster_mapping_model)
    p_4 = is_within_cluster(pickup_coordinates, cluster = 4, cluster_mapping_model = pickup_cluster_mapping_model)

    # dropoff clusters
    d_0 = is_within_cluster(dropoff_coordinates, cluster = 0, cluster_mapping_model = dropoff_cluster_mapping_model)
    d_1 = is_within_cluster(dropoff_coordinates, cluster = 1, cluster_mapping_model = dropoff_cluster_mapping_model)
    d_2 = is_within_cluster(dropoff_coordinates, cluster = 2, cluster_mapping_model = dropoff_cluster_mapping_model)
    d_3 = is_within_cluster(dropoff_coordinates, cluster = 3, cluster_mapping_model = dropoff_cluster_mapping_model)
    #-------------------------------------------------------------------------------------------------------------
    


    # JFK and LGA rides-------------------------------------------------------------------------------------------
    if p_3 or d_2:
        JFK = 1
    else:
        JFK = 0

    if p_1 or d_1:
        LGA = 1
    else:
        LGA = 0
    #-------------------------------------------------------------------------------------------------------------



    
    # gather and organize all features----------------------------------------------------------------------------
    # feature vector
    features = {
        'p_0': p_0,
        'p_1': p_1,
        'p_3': p_3,
        'p_4': p_4,
        'd_0': d_0,
        'd_1': d_1,
        'd_2': d_2,
        'd_3': d_3,
        'estimated_distance': estimated_distance,
        'distance_hour': distance_hour,
        'JFK': JFK,
        'LGA': LGA,
        'weekend_rides': weekend_rides,
        'holiday_rides': holiday_rides
    }
    #--------------------------------------------------------------------------------------------------------------

    

    return pd.DataFrame([features])

In [249]:
pickup_address = '1 E 161st St, Bronx, NY 10451'
dropoff_address = 'LaGuardia Airport, TERMINAL C GATES C 12, C 44, NY 11371'

In [251]:
print(get_coordinates(pickup_address))
print(get_coordinates(dropoff_address))

(40.8292979, -73.9278437)
(40.77029, -73.864372)


In [253]:
output = prepare_features(pickup_address, dropoff_address)
output

Unnamed: 0,p_0,p_1,p_3,p_4,d_0,d_1,d_2,d_3,estimated_distance,distance_hour,JFK,LGA,weekend_rides,holiday_rides
0,0,0,1,0,0,0,1,0,9.732957,87.596613,1,0,0,1


In [255]:
xgb_model.predict(output)[0]

28.191748