# Load the modules

In [1]:
import cudf
from collections import OrderedDict
import numpy as np
import datetime as dt

In [2]:
# Show which cuDF release this kernel is running.
installed_version = cudf.__version__
print(installed_version)

0.16.0


In [7]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9681 sha256=1e065de8c4ab51a5c659ec29020b3a1c0f112312106d821f2908f97baa1c4c63
  Stored in directory: /root/.cache/pip/wheels/bd/a8/c3/3cf2c14a1837a4e04bd98631724e81f33f462d86a1d895fae0
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


# Download the data
If necessary, download the data archive from my website and unpack it.

In [5]:
import os

# Single source of truth for where the dataset lives; the read cells further
# down load '../../data/parking_MayJun2019.csv', so every check uses that folder.
data_dir  = '../../data'

# Boolean flags describing what is already on disk.
# (The archive path must be built from the directory *path*, not from the
# boolean result of the directory check.)
directory = os.path.exists(data_dir)
archive   = os.path.exists(f'{data_dir}/parking_MayJun2019.tar.gz')
file      = os.path.exists(f'{data_dir}/parking_MayJun2019.csv')

print(f'--->>> Directory exists?: {directory}')
print(f'--->>> File exists?: {file}')
print(f'--->>> Archive exists?: {archive}')

--->>> Directory exists?: True
--->>> File exists?: False
--->>> Archive exists?: False


In [8]:
import os

# Fetch and unpack the dataset only when the CSV is not already in place.
# Existence is re-checked here instead of reusing flags computed in an earlier
# cell, so the cell gives the right result when it is re-run on its own.
data_dir     = '../../data'
archive_name = 'parking_MayJun2019.tar.gz'
archive_path = os.path.join(data_dir, archive_name)
csv_path     = os.path.join(data_dir, 'parking_MayJun2019.csv')

# Create the target folder if needed (no error when it already exists).
os.makedirs(data_dir, exist_ok=True)

if not os.path.exists(csv_path):
    if not os.path.exists(archive_path):
        import wget, shutil

        # wget saves into the current working directory; move the archive
        # next to where the CSV is expected.
        wget.download('http://tomdrabas.com/data/seattle_parking/parking_MayJun2019.tar.gz')
        shutil.move(archive_name, archive_path)

    import tarfile

    # NOTE(review): the archive is fetched over plain HTTP and extractall()
    # trusts the member paths stored in it; only use with a trusted source.
    with tarfile.open(archive_path) as tf:
        tf.extractall(path=data_dir)

KeyboardInterrupt: 

# Read the data

In [3]:
# Print the first ten lines of the raw CSV file.
!head -n 10 ../../data/parking_MayJun2019.csv

head: cannot open '../../data/parking_MayJun2019.csv' for reading: No such file or directory


In [2]:
# Column name -> dtype; insertion order is the column order handed to the reader.
dtypes = OrderedDict()
dtypes['OccupancyDateTime'] = 'date'
dtypes['PaidOccupancy'] = 'int64'
dtypes['BlockfaceName'] = 'str'
dtypes['SideOfStreet'] = 'str'
dtypes['SourceElementKey'] = 'int64'
dtypes['ParkingTimeLimitCategory'] = 'int64'
dtypes['ParkingSpaceCount'] = 'int64'
dtypes['PaidParkingArea'] = 'str'
dtypes['PaidParkingSubArea'] = 'str'
dtypes['PaidParkingRate'] = 'int8'
dtypes['ParkingCategory'] = 'str'
dtypes['Location'] = 'str'
dtypes['dow'] = 'int8'

column_names = list(dtypes.keys())
column_types = list(dtypes.values())

# The first row of the file is skipped (presumably the header row — confirm)
# because the column names are supplied explicitly.
df = cudf.read_csv(
    '../../data/parking_MayJun2019.csv',
    names=column_names,
    dtype=column_types,
    skiprows=1,
)

# Replacement values for missing entries, per column.
missing_defaults = {'PaidOccupancy': 0, 'ParkingSpaceCount': 999, 'PaidParkingSubArea': 'UKN'}
df = df.fillna(missing_defaults)

In [17]:
# Row and column counts are read straight from the cuDF frame; no SQL
# context object is defined anywhere in this notebook.
print(f'The dataset has {df.shape[0]:,} records and {df.shape[1]} columns.')

The dataset has 48,675,039 records and 4 columns.


In [12]:
# List the column labels of the loaded frame.
column_index = df.columns
print(column_index)

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea', 'PaidParkingRate',
       'ParkingCategory', 'Location', 'dow'],
      dtype='object')
time: 2.7 ms


In [18]:
# Copy the first five rows to the host for display.
df.head(5).to_pandas()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,SourceElementKey,dow
0,2019-05-24 10:57:00,13,35889,4
1,2019-05-24 17:53:00,2,89525,4
2,2019-05-24 10:51:00,2,8374,4
3,2019-05-24 09:37:00,1,58449,4
4,2019-05-24 14:09:00,4,39337,4
5,2019-05-24 16:04:00,2,35889,4
6,2019-06-10 11:54:00,2,35429,0
7,2019-06-10 11:42:00,7,70866,0
8,2019-06-10 10:42:00,16,31509,0
9,2019-06-10 16:27:00,6,70866,0


# Extract date information

In [14]:
# Split the timestamp into calendar/time components through the public `.dt`
# accessor rather than the private `Series._column` attribute.
occupancy_dt = df['OccupancyDateTime'].dt

df['year'] = occupancy_dt.year
df['month'] = occupancy_dt.month
df['day'] = occupancy_dt.day

df['hour'] = occupancy_dt.hour
df['minute'] = occupancy_dt.minute

df[['OccupancyDateTime','year','month','day','hour', 'minute']].head().to_pandas()

Unnamed: 0,OccupancyDateTime,year,month,day,hour,minute
0,2019-05-24 08:35:00,2019,5,24,8,35
1,2019-05-24 19:57:00,2019,5,24,19,57
2,2019-05-24 11:21:00,2019,5,24,11,21
3,2019-05-24 11:56:00,2019,5,24,11,56
4,2019-05-24 10:57:00,2019,5,24,10,57


time: 16.4 ms


In [15]:
# Number of records per calendar day.
calendar_day = ['year', 'month', 'day']
counts = df.groupby(calendar_day).agg({'OccupancyDateTime': 'count'})
counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OccupancyDateTime
year,month,day,Unnamed: 3_level_1
2019,5,1,959957
2019,5,2,960058
2019,5,3,959713
2019,5,4,970780
2019,5,6,953855
2019,5,7,953749
2019,5,8,959130
2019,5,9,954661
2019,5,10,959962
2019,5,11,972658


time: 68.1 ms


In [16]:
# Mean of the per-day record counts computed above.
mean_daily = counts['OccupancyDateTime'].mean()
print('Average number of transactions per day: {0:.0f}'.format(mean_daily))

Average number of transactions per day: 954413
time: 110 ms


# All parking locations

In [17]:
# Columns that describe a parking location; distinct combinations of these
# make up the locations table.
location_columns = [
    'SourceElementKey', 'BlockfaceName', 'SideOfStreet',
    'ParkingTimeLimitCategory', 'ParkingSpaceCount',
    'PaidParkingArea', 'PaidParkingSubArea',
    'ParkingCategory', 'Location',
]
locations = df[location_columns].drop_duplicates()

locations.head().to_pandas()

Unnamed: 0,SourceElementKey,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory,Location
4080,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,120,4.0,Pioneer Square,Core,Paid Parking,POINT (-122.33469356 47.6028728)
1336,1002,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,NE,120,8.0,Pioneer Square,Core,Paid Parking,POINT (-122.33451266 47.60294861)
4455,1006,1ST AVE BETWEEN COLUMBIA ST AND MARION ST,NE,120,7.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33514326 47.60367439)
1026,1009,1ST AVE BETWEEN MADISON ST AND SPRING ST,SW,30,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.3366575 47.60501765)
3208,1010,1ST AVE BETWEEN MADISON ST AND SPRING ST,NE,120,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33644748 47.6051007)


time: 8.73 s


In [18]:
# Row count of the de-duplicated locations table.
location_count = locations.shape[0]
print('Number of parking locations in Seattle: {0}'.format(location_count))

Number of parking locations in Seattle: 1528
time: 407 µs


In [19]:
def extractLon(location):
    """Return the longitude parsed from 'POINT (<lon> <lat>)' strings.

    Parameters
    ----------
    location : string Series
        Values such as 'POINT (-122.33469356 47.6028728)'.

    Returns
    -------
    Series of float64 holding the first captured number (the longitude).
    """
    # Raw string: '\.' and '\-' are regex escapes, not Python string escapes.
    lon = location.str.extract(r'([0-9\.\-]+) ([0-9\.]+)')[0]
    # astype replaces the legacy nvstrings-only `stod` conversion.
    return lon.astype('float64')

def extractLat(location):
    """Return the latitude parsed from 'POINT (<lon> <lat>)' strings.

    Parameters
    ----------
    location : string Series
        Values such as 'POINT (-122.33469356 47.6028728)'.

    Returns
    -------
    Series of float64 holding the second captured number (the latitude).
    """
    # Raw string: '\.' and '\-' are regex escapes, not Python string escapes.
    lat = location.str.extract(r'([0-9\.\-]+) ([0-9\.]+)')[1]
    # astype replaces the legacy nvstrings-only `stod` conversion.
    return lat.astype('float64')
    
# Parse both coordinates out of the 'Location' strings and show the result
# next to the source column.
raw_points = locations['Location']
locations['longitude'] = extractLon(raw_points)
locations['latitude'] = extractLat(raw_points)

coordinate_preview = locations[['Location', 'longitude', 'latitude']].head()
coordinate_preview.to_pandas()

Unnamed: 0,Location,longitude,latitude
4080,POINT (-122.33469356 47.6028728),-122.334694,47.602873
1336,POINT (-122.33451266 47.60294861),-122.334513,47.602949
4455,POINT (-122.33514326 47.60367439),-122.335143,47.603674
1026,POINT (-122.3366575 47.60501765),-122.336658,47.605018
3208,POINT (-122.33644748 47.6051007),-122.336447,47.605101


time: 37.6 ms


# Average occupancy

In [20]:
def avgOccupancy(PaidOccupancy, ParkingSpaceCount, AvgOccupancy):
    """Row-wise occupancy ratio, used as the function passed to `apply_rows`.

    For every row position, writes ``min(1.0, paid / available)`` into the
    output column.

    Parameters
    ----------
    PaidOccupancy : input column, the numerator of the ratio.
    ParkingSpaceCount : input column, the denominator of the ratio.
    AvgOccupancy : output column, filled in place; nothing is returned.
    """
    for i, (paid, available) in enumerate(zip(PaidOccupancy, ParkingSpaceCount)):
        AvgOccupancy[i] = min(1.0, paid / available) # cap it at 100%, sometimes we see more paid occupancy than spaces available
        
# Keep only the columns used downstream and append the capped occupancy ratio.
# 'ParkingSpaceCount' is listed once; it was previously selected twice.
df = (
    df[['OccupancyDateTime', 'PaidOccupancy', 'ParkingSpaceCount'
              , 'SourceElementKey', 'BlockfaceName', 'SideOfStreet'
              , 'ParkingTimeLimitCategory'
              , 'PaidParkingArea', 'PaidParkingSubArea', 'ParkingCategory', 'dow', 'year', 'month'
              , 'day', 'hour', 'minute']]
    .apply_rows(
        avgOccupancy
        , incols=['PaidOccupancy', 'ParkingSpaceCount']
        , outcols={'AvgOccupancy': np.float64}
        , kwargs={}
    )
)
df.head()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,ParkingSpaceCount,SourceElementKey,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,PaidParkingArea,PaidParkingSubArea,ParkingCategory,dow,year,month,day,hour,minute,AvgOccupancy
0,2019-05-24 08:35:00,1.0,4.0,35438,SENECA ST BETWEEN BOYLSTON AVE AND HARVARD AVE,SE,120,First Hill,UKN,Paid Parking,4,2019,5,24,8,35,0.25
1,2019-05-24 19:57:00,3.0,3.0,57862,REPUBLICAN ST BETWEEN QUEEN ANNE AVE N AND 1ST...,S,120,Uptown,Core,Paid Parking,4,2019,5,24,19,57,1.0
2,2019-05-24 11:21:00,1.0,4.0,53542,8TH AVE BETWEEN PINE ST AND OLIVE WAY,NE,120,Commercial Core,Retail,Paid Parking,4,2019,5,24,11,21,0.25
3,2019-05-24 11:56:00,2.0,4.0,9354,BLANCHARD ST BETWEEN 1ST AVE AND 2ND AVE,SE,120,Belltown,South,Paid Parking,4,2019,5,24,11,56,0.5
4,2019-05-24 10:57:00,13.0,7.0,35889,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,600,South Lake Union,North,Paid Parking,4,2019,5,24,10,57,1.0


time: 444 ms


In [21]:
# Spot-check one location (SourceElementKey 35889) over a one-hour window.
timestamp_format = '%Y-%m-%dT%H:%M:%S'
search_date_f = dt.datetime.strptime('2019-05-24T10:00:00', timestamp_format)
search_date_t = dt.datetime.strptime('2019-05-24T10:59:59', timestamp_format)

# The bounds are referenced from the query string via the @name syntax.
matching = df.query('''SourceElementKey == 35889 and OccupancyDateTime >= @search_date_f and OccupancyDateTime <= @search_date_t''')
matching.sort_values(by='OccupancyDateTime').head(5).to_pandas()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,ParkingSpaceCount,SourceElementKey,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,PaidParkingArea,PaidParkingSubArea,ParkingCategory,dow,year,month,day,hour,minute,AvgOccupancy
1454442,2019-05-24 10:00:00,13.0,7.0,35889,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,600,South Lake Union,North,Paid Parking,4,2019,5,24,10,0,1.0
1658508,2019-05-24 10:01:00,13.0,7.0,35889,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,600,South Lake Union,North,Paid Parking,4,2019,5,24,10,1,1.0
2273492,2019-05-24 10:02:00,13.0,7.0,35889,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,600,South Lake Union,North,Paid Parking,4,2019,5,24,10,2,1.0
636682,2019-05-24 10:03:00,13.0,7.0,35889,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,600,South Lake Union,North,Paid Parking,4,2019,5,24,10,3,1.0
2022358,2019-05-24 10:04:00,13.0,7.0,35889,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,600,South Lake Union,North,Paid Parking,4,2019,5,24,10,4,1.0


time: 168 ms


In [22]:
def calcMean(AvgOccupancy, ParkingSpaceCount, MeanOccupancy):
    '''
        Row-wise mean, used as the function passed to `apply_rows`.

        For every row position, writes ``float(AvgOccupancy) / ParkingSpaceCount``
        into MeanOccupancy. At the call site in this notebook the inputs are
        group aggregates: AvgOccupancy holds the per-group sum and
        ParkingSpaceCount holds the per-group record count, so the result is
        the group's mean occupancy.

        AvgOccupancy      -- input column, numerator (sum)
        ParkingSpaceCount -- input column, denominator (count)
        MeanOccupancy     -- output column, filled in place; nothing is returned
    '''
    for i, (avgOccSum, avgCnt) in enumerate(zip(AvgOccupancy, ParkingSpaceCount)):
        MeanOccupancy[i] = float(avgOccSum) / avgCnt

# Per (location, day-of-week, hour) group: the number of records (counted via
# the 'ParkingSpaceCount' column) and the sum of the occupancy ratios.
df_agg_dt = (
    df
    .groupby(['SourceElementKey', 'dow','hour'])
    .agg({
          'ParkingSpaceCount': 'count'
        , 'AvgOccupancy': 'sum'
    })
    .reset_index()
)

# MeanOccupancy = sum / count for each group.
df_agg_dt = df_agg_dt.apply_rows(
    calcMean
    , incols=['AvgOccupancy', 'ParkingSpaceCount']
    , outcols={'MeanOccupancy':np.float64}
    , kwargs={}
)

# Drop the intermediate aggregates with the non-mutating `drop` API instead of
# the deprecated in-place `drop_column`.
df_agg_dt = df_agg_dt.drop(columns=['AvgOccupancy', 'ParkingSpaceCount'])

df_agg_dt.head().to_pandas()

Unnamed: 0,SourceElementKey,dow,hour,MeanOccupancy
0,1001,0,9,0.0625
1,1001,0,10,0.319643
2,1001,0,11,0.405357
3,1001,0,12,0.625595
4,1001,0,13,0.553571


time: 193 ms


# Find the best parking

In [23]:
from geopy.geocoders import Nominatim

# Look up the coordinates of the destination address.
destination_address = "400 Broad St, Seattle, WA 98109" # SPACE NEEDLE

geolocator = Nominatim(user_agent="todrabas_test")
location = geolocator.geocode(destination_address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the lines below.
if location is None:
    raise ValueError(f'Could not geocode address: {destination_address}')

# Store the reference point on every row so it can be fed to a row-wise function.
locations['LON_Ref'] = location.longitude
locations['LAT_Ref'] = location.latitude

time: 903 ms


In [24]:
from math import sin, cos, sqrt, atan2, pi

def calculateDistance(latitude, longitude, LAT_Ref, LON_Ref, Distance):
    """Row-wise haversine distance, used as the function passed to `apply_rows`.

    For every row position, computes the great-circle distance between
    (latitude, longitude) and (LAT_Ref, LON_Ref) and writes it, in feet, into
    Distance.

    Parameters
    ----------
    latitude, longitude : input columns, point coordinates in degrees.
    LAT_Ref, LON_Ref : input columns, reference-point coordinates in degrees.
    Distance : output column, filled in place; nothing is returned.
    """
    R = 3958.8 # Earth's radius in miles

    for i, (lt, ln, lt_r, ln_r) in enumerate(zip(latitude, longitude, LAT_Ref, LON_Ref)):
        lt_rad = lt / 180.0 * pi
        lt_r_rad = lt_r / 180.0 * pi

        dlon = (ln_r - ln) / 180.0 * pi
        dlat = (lt_r - lt) / 180.0 * pi
        # Haversine: the cosine term is the product of the cosines of BOTH
        # latitudes (point and reference), not the point's latitude squared.
        a = (sin(dlat/2.0))**2 + cos(lt_rad) * cos(lt_r_rad) * (sin(dlon/2.0))**2
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        distance = R * c
        Distance[i] = distance * 5280 # in feet
        
# Add a 'Distance' column: distance from each location to the reference point.
distance_inputs = ['latitude', 'longitude', 'LAT_Ref', 'LON_Ref']
locations = locations.apply_rows(
    calculateDistance,
    incols=distance_inputs,
    outcols={'Distance': np.float64},
    kwargs={},
)

time: 301 ms


In [25]:
# Parking locations with Distance < 1000, joined to their mean occupancy for
# dow == 3 at hour == 13, ordered from least to most occupied.
closest = (
    locations
    .query('Distance < 1000')
    .merge(df_agg_dt, how='inner', on=['SourceElementKey'])
    .query('dow == 3 and hour == 13')
    .sort_values(by='MeanOccupancy')
)

# Columns to bring back to the host for display and mapping.
report_columns = [
    'BlockfaceName', 'SideOfStreet',
    'ParkingTimeLimitCategory', 'ParkingSpaceCount', 'PaidParkingArea',
    'PaidParkingSubArea', 'ParkingCategory', 'Location', 'Distance',
    'dow', 'hour', 'MeanOccupancy', 'longitude', 'latitude',
]
closest_host = closest[report_columns].head().to_pandas()
closest_host

Unnamed: 0,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory,Location,Distance,dow,hour,MeanOccupancy,longitude,latitude
1196,5TH AVE N BETWEEN JOHN ST AND BROAD ST,W,240,3.0,Uptown Triangle,UKN,Paid Parking,POINT (-122.34771286 47.62015028),412.953708,3,13,0.433951,-122.347713,47.62015
289,2ND AVE N BETWEEN JOHN ST AND THOMAS ST,W,240,9.0,Uptown,Edge,Paid Parking,POINT (-122.35293883 47.62036489),895.541027,3,13,0.513426,-122.352939,47.620365
803,3RD AVE BETWEEN CLAY ST AND BROAD ST,NE,240,8.0,Belltown,North,Paid Parking,POINT (-122.35022717 47.61801873),937.902276,3,13,0.518519,-122.350227,47.618019
1641,3RD AVE BETWEEN CLAY ST AND BROAD ST,SW,240,8.0,Belltown,North,Paid Parking,POINT (-122.35037128 47.61790727),986.241548,3,13,0.55162,-122.350371,47.617907
875,CLAY ST BETWEEN 3RD AVE AND 4TH AVE,NW,240,9.0,Belltown,North,Paid Parking,POINT (-122.34947863 47.61806027),895.868223,3,13,0.596914,-122.349479,47.61806


time: 265 ms


# Plot the parking spots on the map

We're using the gmaps Python package, which can be found here: https://github.com/pbugnion/gmaps. Follow the installation instructions in its README.md so that the map renders properly in JupyterLab.

In [26]:
# HTML snippet shown in the pop-up of each parking marker.
info_box_template = """
<dl>
<dt>Name</dt><dd>{BlockfaceName}</dd>
<dt>Distance</dt><dd>{Distance:.0f}</dd>
<dt>Occupancy (AVG)</dt><dd>{MeanOccupancy:.3f}</dd>
</dl>
"""

# One dict per parking spot, carrying exactly the fields the template needs
# (computed once; the earlier discarded duplicate expression is gone).
parking_records = closest_host[['BlockfaceName', 'Distance', 'MeanOccupancy']].to_dict('records')
parking_info = [info_box_template.format(**parking) for parking in parking_records]

time: 10.8 ms


In [27]:
# Latitude of the first (least occupied) parking record.
first_parking = closest_host.to_dict('records')[0]
first_parking['latitude']

47.62015028000004

time: 10 ms


In [37]:
import gmaps
import gmaps.datasets

# Read the Google Maps API key from a local file that is NOT committed with
# the notebook; never paste the key into a cell. To obtain a key, go to
# https://console.developers.google.com
with open('gmaps_api_key.txt') as key_file:
    gmaps.configure(api_key=key_file.read().strip())

# Green markers: the candidate parking spots, each with its info box.
parking_layer = gmaps.symbol_layer(
    closest_host[['latitude', 'longitude']],
    fill_color="green",
    stroke_color="green",
    scale=3,
    info_box_content=parking_info,
)

# Red marker: the geocoded destination.
destinations_layer = gmaps.symbol_layer(
    [[location.latitude, location.longitude]],
    info_box_content=['DESTINATION'],
    scale=5,
    fill_color="red",
    stroke_color="red",
)

parkings = closest_host.to_dict('records')

# One red line from every parking spot to the destination.
route_lines = [
    gmaps.Line(
        start=(parking['latitude'], parking['longitude']),
        end=(location.latitude, location.longitude),
        stroke_weight=2,
        stroke_color="red",
    )
    for parking in parkings
]
lines_layer = gmaps.drawing_layer(features=route_lines)

# Stack the three layers on one figure and display it.
fig = gmaps.figure(layout={'height': '500px'})
for layer in (parking_layer, destinations_layer, lines_layer):
    fig.add_layer(layer)
fig

Figure(layout=FigureLayout(height='500px'))

time: 43.7 ms


In [33]:
# Rebuild the JupyterLab assets (presumably needed so the gmaps widget
# extension renders — confirm against the gmaps README).
!jupyter lab build

[LabBuildApp] JupyterLab 1.2.16
[LabBuildApp] Building in /opt/conda/envs/rapids/share/jupyter/lab
[LabBuildApp] Building jupyterlab assets (build:prod:minimize)
time: 48.3 s
