In [1]:
import pandas as pd
import numpy as np
from geopy.distance import distance
import pydeck as pdk
import pickle

In [2]:
model = pickle.load(open('random_forest.pickle', 'rb'))

In [3]:
data = pd.read_csv('../../../capstone_other/files_too_large_for_github/ML_table_with_availability_clusters.csv')

In [4]:
may_day = data[(data['month'] == 7) & (data['num_day'] == 7) & (data['hour'] == 9)]

In [5]:
may_day = may_day.groupby('dock_id').apply(pd.DataFrame.sample, random_state = 5, n=1).reset_index(drop=True)

In [6]:
station_ids = may_day['dock_id']

In [7]:
may_day = may_day[['month', 'num_day', 'hour', 'latitude', 'longitude', 'tot_docks', 
          'tmp', 'rain', 'weekday_cluster', 'weekend_cluster']]

In [8]:
predictions = model.best_estimator_.predict(may_day)

In [9]:
may_day['predictions'] = predictions

In [10]:
may_day['dock_id'] = station_ids

In [11]:
may_day['avail_bikes_proportion'] = may_day['predictions']/may_day['tot_docks']

In [12]:
may_day

Unnamed: 0,month,num_day,hour,latitude,longitude,tot_docks,tmp,rain,weekday_cluster,weekend_cluster,predictions,dock_id,avail_bikes_proportion
0,7,7,9,40.683826,-73.976323,62,17.2,0,2,3,24.129655,83,0.389188
1,7,7,9,40.696089,-73.978034,19,17.2,0,2,0,10.640033,119,0.560002
2,7,7,9,40.686768,-73.959282,19,17.2,0,1,0,10.317241,120,0.543013
3,7,7,9,40.692395,-73.993379,24,17.2,0,2,3,14.432032,143,0.601335
4,7,7,9,40.698399,-73.980689,19,17.2,0,2,0,8.264377,144,0.434967
...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,7,7,9,40.805726,-73.936322,33,17.2,0,2,2,11.410936,3505,0.345786
303,7,7,9,40.801307,-73.939817,29,17.2,0,0,2,7.117525,3506,0.245432
304,7,7,9,40.804555,-73.939686,36,17.2,0,0,2,10.120374,3507,0.281122
305,7,7,9,40.801194,-73.950074,31,17.2,0,2,0,12.102409,3509,0.390400


In [13]:
def manhattan_distance(start_lat, start_lon, end_lat, end_lon):
    dist = distance((start_lat, start_lon), (start_lat, end_lon)).miles + \
           distance((end_lat, end_lon), (start_lat, end_lon)).miles
    return dist

In [14]:
data_low = may_day[may_day['avail_bikes_proportion'] <= 1/3]

In [15]:
data_high = may_day[may_day['avail_bikes_proportion'] >= 2/3]

In [16]:
data_low['deficit'] = round((1/3 - data_low['avail_bikes_proportion']) * data_low['tot_docks']).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_low['deficit'] = round((1/3 - data_low['avail_bikes_proportion']) * data_low['tot_docks']).astype('int')


In [17]:
data_high['surplus'] = round((data_high['avail_bikes_proportion'] - 2/3) * data_high['tot_docks']).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_high['surplus'] = round((data_high['avail_bikes_proportion'] - 2/3) * data_high['tot_docks']).astype('int')


In [18]:
data_low = data_low.sort_values(by = 'deficit', ascending = False)

In [19]:
data_high = data_high.sort_values(by = 'surplus', ascending = False)

In [20]:
rebalancing_dict = {}

In [21]:
low_copy = data_low.copy()
high_copy = data_high.copy()

In [22]:
bikes_rebalanced = 0
for low in low_copy.index:
    if low_copy.loc[low, 'deficit'] == 0:
        continue
    for high in high_copy.index:
        if high_copy.loc[high, 'surplus'] == 0:
            continue
        
        if manhattan_distance(low_copy.loc[low, 'latitude'], low_copy.loc[low, 'longitude'], 
                              high_copy.loc[high, 'latitude'], high_copy.loc[high, 'longitude']) < 2:
            stations_key = (low_copy.loc[low, 'dock_id'], high_copy.loc[high, 'dock_id'])
            change = min(low_copy.loc[low, 'deficit'], high_copy.loc[high, 'surplus'])
            low_copy.loc[low, 'deficit'] -= change
            high_copy.loc[high, 'surplus'] -= change
            bikes_rebalanced += change
            if stations_key in rebalancing_dict.keys():
                rebalancing_dict[stations_key] += change
            else:
                rebalancing_dict[stations_key] = change
            if low_copy.loc[low, 'deficit'] == 0:
                break

In [23]:
rebalancing_dict

{(469, 3162): 6,
 (469, 3171): 6,
 (469, 3176): 4,
 (469, 3226): 2,
 (3141, 3305): 4,
 (3141, 3226): 2,
 (3141, 3369): 1,
 (3141, 3137): 1,
 (3134, 3175): 1,
 (253, 295): 3,
 (253, 331): 2,
 (253, 408): 2,
 (253, 400): 1,
 (3153, 3320): 5,
 (3375, 3301): 6,
 (3375, 3328): 4,
 (3452, 3067): 5,
 (3452, 3063): 2,
 (3452, 2002): 2,
 (3294, 3437): 9,
 (3294, 3350): 1,
 (3336, 3350): 7,
 (3336, 3307): 3,
 (316, 3254): 7,
 (3362, 3307): 1,
 (3362, 3391): 3,
 (3362, 3374): 2,
 (3362, 3390): 1,
 (337, 3315): 1,
 (3418, 3409): 4,
 (3418, 3249): 2,
 (406, 3371): 2,
 (406, 3373): 1,
 (406, 3332): 1,
 (406, 241): 1,
 (397, 3044): 3,
 (3493, 3496): 4,
 (3493, 3497): 1,
 (3503, 3497): 3,
 (3503, 3491): 1,
 (3310, 3477): 2}

In [24]:
sorted_rebalancing = dict(sorted(rebalancing_dict.items(), key=lambda x: x[1], reverse = True))

In [25]:
filtered_rebalancing = {key: value for key, value in sorted_rebalancing.items() if value >= 1}

In [26]:
filtered_rebalancing

{(3294, 3437): 9,
 (3336, 3350): 7,
 (316, 3254): 7,
 (469, 3162): 6,
 (469, 3171): 6,
 (3375, 3301): 6,
 (3153, 3320): 5,
 (3452, 3067): 5,
 (469, 3176): 4,
 (3141, 3305): 4,
 (3375, 3328): 4,
 (3418, 3409): 4,
 (3493, 3496): 4,
 (253, 295): 3,
 (3336, 3307): 3,
 (3362, 3391): 3,
 (397, 3044): 3,
 (3503, 3497): 3,
 (469, 3226): 2,
 (3141, 3226): 2,
 (253, 331): 2,
 (253, 408): 2,
 (3452, 3063): 2,
 (3452, 2002): 2,
 (3362, 3374): 2,
 (3418, 3249): 2,
 (406, 3371): 2,
 (3310, 3477): 2,
 (3141, 3369): 1,
 (3141, 3137): 1,
 (3134, 3175): 1,
 (253, 400): 1,
 (3294, 3350): 1,
 (3362, 3307): 1,
 (3362, 3390): 1,
 (337, 3315): 1,
 (406, 3373): 1,
 (406, 3332): 1,
 (406, 241): 1,
 (3493, 3497): 1,
 (3503, 3491): 1}

In [27]:
rebalancing_df = pd.DataFrame(filtered_rebalancing.items(), columns = ['dock_ids', 'num_bikes'])

In [28]:
rebalancing_df[['dock_id_receive', 'dock_id_give']] = rebalancing_df['dock_ids'].tolist()
rebalancing_df.drop(['dock_ids'], axis = 1, inplace = True)

In [29]:
data_df = may_day[['dock_id', 'latitude', 'longitude']]

In [30]:
rebalancing_df = rebalancing_df.merge(data_df, how = 'left', left_on = 'dock_id_receive', right_on = 'dock_id').rename(
    columns = {'latitude': 'latitude_receive', 'longitude': 'longitude_receive'})
rebalancing_df = rebalancing_df.merge(data_df, how = 'left', left_on = 'dock_id_give', right_on = 'dock_id').rename(
    columns = {'latitude': 'latitude_give', 'longitude': 'longitude_give'})
rebalancing_df.drop(['dock_id_x', 'dock_id_y'], axis = 1, inplace = True)

In [31]:
GREEN_RGB = [0, 255, 0, 150]
RED_RGB = [240, 100, 0, 150]

# Specify a deck.gl ArcLayer
arc_layer = pdk.Layer(
    "ArcLayer",
    data = rebalancing_df,
    get_width="num_bikes",
    get_source_position=["longitude_give", "latitude_give"],
    get_target_position=["longitude_receive", "latitude_receive"],
    get_tilt=15,
    get_source_color=RED_RGB,
    get_target_color=GREEN_RGB,
    pickable=True,
    auto_highlight=True,
)

view_state = pdk.ViewState(latitude=40.74, longitude=-74, bearing=45, pitch=50, zoom=8,)


TOOLTIP_TEXT = {"html": "{num_bikes} rebalanced"}
r = pdk.Deck(arc_layer, initial_view_state=view_state, tooltip=TOOLTIP_TEXT, map_style = 'light')
r

In [32]:
bikes_rebalanced

119

In [33]:
data_low['deficit'].sum()

711

In [34]:
data_high['surplus'].sum()

119

In [35]:
data_low.head(5)

Unnamed: 0,month,num_day,hour,latitude,longitude,tot_docks,tmp,rain,weekday_cluster,weekend_cluster,predictions,dock_id,avail_bikes_proportion,deficit
74,7,7,9,40.763441,-73.982681,57,17.2,0,0,1,1.312517,469,0.023027,18
135,7,7,9,40.765005,-73.958185,59,17.2,0,5,1,3.398841,3141,0.057607,16
6,7,7,9,40.753231,-73.970325,47,17.2,0,5,1,0.703201,164,0.014962,15
129,7,7,9,40.763126,-73.965269,51,17.2,0,0,1,1.895574,3134,0.037168,15
77,7,7,9,40.755273,-73.983169,57,17.2,0,5,1,4.707953,524,0.082596,14


In [36]:
data_high.head(5)

Unnamed: 0,month,num_day,hour,latitude,longitude,tot_docks,tmp,rain,weekday_cluster,weekend_cluster,predictions,dock_id,avail_bikes_proportion,surplus
267,7,7,9,40.793135,-73.977004,39,17.2,0,1,2,35.133469,3437,0.900858,9
206,7,7,9,40.797372,-73.970412,39,17.2,0,1,2,33.539651,3350,0.859991,8
160,7,7,9,40.692317,-74.014866,29,17.2,0,3,3,25.931811,3254,0.8942,7
171,7,7,9,40.791956,-73.968087,39,17.2,0,1,2,32.198181,3301,0.825594,6
141,7,7,9,40.7834,-73.980931,39,17.2,0,1,2,31.934161,3162,0.818825,6
