In [1]:
#import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt
import requests
import collections
import re
import json
import glob
import math
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
from pandas.io.json import json_normalize
from scipy.spatial.distance import cdist, pdist, squareform
import pandas.tseries.holiday as hol
from sklearn.linear_model import LinearRegression
from pygam import LinearGAM, LogisticGAM, s, f
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

%matplotlib inline

In [2]:
# Read in raw and live station data
bike_raw=requests.get("https://member.bluebikes.com/data/stations.json")
bike_data=bike_raw.json()
station=json_normalize(bike_data["stations"])

# Clean column names
station=station[["id", "s", "la", "lo", "d", "ba","da"]]
station["#_of_Docks"] = station.ba + station.da
#station=station.drop(["da"], axis=1)
station.columns=["ID", "Station", "Latitude", "Longitude", "Municipality", "Current_Bikes","Empty_Docks","Num_of_Docks"]


In [8]:
# Some stations are not yet operating
station[station.Num_of_Docks==0]

Unnamed: 0,ID,Station,Latitude,Longitude,Municipality,Current_Bikes,Empty_Docks,Num_of_Docks
27,32,Landmark Center - Brookline Ave at Park Dr,42.345194,-71.101697,Boston,0,0,0
37,44,Congress St at North St,42.360418,-71.057522,Boston,0,0,0
38,45,Jersey St. at Boylston St.,42.344681,-71.097853,Boston,0,0,0
182,207,Faneuil St at Market St,42.35484,-71.150226,Boston,0,0,0
212,268,Centre St at Knoll St,42.2938,-71.136941,Boston,0,0,0


In [9]:
# save the dataframe after sorting the ID
station = station.sort_values(by=['ID'])

# remove any stations that are not in operation yet
station=station[station.Num_of_Docks!=0]

station = station.reset_index()
station = station.drop("index", axis=1)

In [12]:
# 305 stations 
station.tail()

Unnamed: 0,ID,Station,Latitude,Longitude,Municipality,Current_Bikes,Empty_Docks,Num_of_Docks
300,413,Kennedy-Longfellow School 158 Spring St,42.369553,-71.08579,Cambridge,0,19,19
301,414,Discovery Park - 30 Acorn Park Drive,42.397908,-71.147971,Cambridge,3,20,23
302,415,Stuart St at Berkeley St,42.349544,-71.072421,Boston,1,18,19
303,416,Blossom St at Charles St,42.364356,-71.069594,Boston,1,14,15
304,417,Columbus Ave at W. Canton St,42.344742,-71.076482,Boston,2,17,19


In [13]:
station.to_csv("../Data_processed/station.csv")

In [14]:
API_KEY=""
zipcodelist=list()

# Use Reverse geolocation feature of Google API to get the zipcode for each bike station. 
for i in range(len(station)):
    testlat = str(station.iloc[i].Latitude)
    testlong= str(station.iloc[i].Longitude)
    rev_geo = requests.get("https://maps.googleapis.com/maps/api/geocode/json?latlng="+testlat+","+testlong+"&key="+API_KEY)
    rev_geo_json = rev_geo.json()
    zipcode=rev_geo_json["results"][0]["formatted_address"].split()[-2][:-1]
    zipcodelist.append(zipcode)
    
station_zip=pd.concat([station, pd.Series(zipcodelist).rename("zip")], axis=1)
station_zip.head(3)

Unnamed: 0,ID,Station,Latitude,Longitude,Municipality,Current_Bikes,Empty_Docks,Num_of_Docks,zip
0,3,Colleges of the Fenway - Fenway at Avenue Loui...,42.340115,-71.100619,Boston,11,0,11,2115
1,4,Tremont St at E Berkeley St,42.345392,-71.069616,Boston,8,5,13,2116
2,5,Northeastern University - North Parking Lot,42.341814,-71.090179,Boston,4,5,9,2115


In [19]:
station_zip[station_zip.zip=="Unite"]

Unnamed: 0,ID,Station,Latitude,Longitude,Municipality,Current_Bikes,Empty_Docks,Num_of_Docks,zip
20,24,Seaport Square - Seaport Blvd at Northern Ave,42.351482,-71.044361,Boston,7,9,16,Unite
59,70,Harvard Kennedy School at Bennett St / Eliot St,42.372217,-71.121881,Cambridge,8,13,21,Unite
87,100,Davis Square,42.396969,-71.123024,Somerville,15,10,25,Unite
147,174,Washington St at Brock St,42.348953,-71.160317,Boston,7,5,12,Unite
209,272,Shawmut T Stop,42.292917,-71.06575,Boston,12,5,17,Unite
210,273,Forest Hills,42.300923,-71.114249,Boston,35,2,37,Unite
247,357,Centre St at Seaverns Ave,42.31212,-71.114298,Boston,9,5,14,Unite


In [31]:
fixindex=station_zip[station_zip.zip=="Unite"].index.values
print(fixindex)

[ 20  59  87 147 209 210 247]


In [30]:
# Cleaning stationwithzipcode[stationwithzipcode.zip=="Unite"]
for i in fixindex:
    testlat = str(station.iloc[i].Latitude)
    testlong= str(station.iloc[i].Longitude)
    rev_geo = requests.get("https://maps.googleapis.com/maps/api/geocode/json?latlng="+testlat+","+testlong+"&key="+API_KEY)
    rev_geo_json = rev_geo.json()
    print(rev_geo_json["results"][0]["formatted_address"])

BOST-0602640020, Boston, MA 02210, United States
Bennett St & Eliot, Cambridge, MA 02138, United States
Near, Somerville Community Path, Somerville, MA 02144, United States
BOST-2204104000, Boston, MA 02135, United States
Shawmut, Dayton St &, Clementine Park, Dorchester, MA 02124, United States
Forest Hills, Washington St &, Hyde Park Ave, Jamaica Plain, MA 02130, United States
Box 301209, Centre Street, Jamaica Plain, MA 02130, United States


In [33]:
station_zip.loc[20, "zip"] = "02210"
station_zip.loc[59, "zip"] = "02138"
station_zip.loc[87, "zip"] = "02144"
station_zip.loc[147, "zip"] = "02135"
station_zip.loc[209, "zip"] = "02124"
station_zip.loc[210, "zip"] = "02130"
station_zip.loc[247, "zip"]= "02130"

In [44]:
station_zip.zip.sort_values().unique()

array(['02108', '02109', '02110', '02111', '02113', '02114', '02115',
       '02116', '02118', '02119', '02120', '02121', '02122', '02124',
       '02125', '02126', '02127', '02128', '02129', '02130', '02131',
       '02134', '02135', '02138', '02139', '02140', '02141', '02142',
       '02143', '02144', '02145', '02149', '02163', '02199', '02210',
       '02215', '02445', '02446'], dtype=object)

In [45]:
# https://www.arcgis.com/home/webmap/viewer.html?webmap=8f7e2d1a2b264a589cd45c235c9e90a4&extent=-71.2228,42.2538,-71.0089,42.3472
ziptozone={'02108': 0, '02109': 0, '02110':0, '02111':0, '02113':0, '02114':0, '02115':0, '02116':0,
       '02118':0, '02119':1, '02120':1, '02121':1, '02122':1, '02124':1, '02125':1, '02126':1,
       '02127':0, '02128':2, '02129':2, '02130':1, '02131':1, '02134':3, '02135':3, '02138':4,
       '02139':4, '02140':4, '02141':4, '02142':4, '02143':5, '02144':5, '02145':5, '02149':6,
        '02163':3,'02199':0, '02210':0, '02215':0, '02445':3, '02446':3}

In [46]:
station_zip["zone"] = station_zip.zip.map(ziptozone)

In [49]:
station_zip.tail()

Unnamed: 0,ID,Station,Latitude,Longitude,Municipality,Current_Bikes,Empty_Docks,Num_of_Docks,zip,zone
300,413,Kennedy-Longfellow School 158 Spring St,42.369553,-71.08579,Cambridge,0,19,19,2141,4
301,414,Discovery Park - 30 Acorn Park Drive,42.397908,-71.147971,Cambridge,3,20,23,2140,4
302,415,Stuart St at Berkeley St,42.349544,-71.072421,Boston,1,18,19,2116,0
303,416,Blossom St at Charles St,42.364356,-71.069594,Boston,1,14,15,2114,0
304,417,Columbus Ave at W. Canton St,42.344742,-71.076482,Boston,2,17,19,2116,0


In [51]:
station_zip.to_csv("../Data_processed/station_with_zipzone.csv")