# Load the modules

In [1]:
import cudf
import nvcategory
import nvtext
import nvstrings
from collections import OrderedDict
import numpy as np
import datetime as dt

%load_ext autotime

# Download the data
If necessary, download the data from my website and unpack.

In [32]:
import os

directory = os.path.exists('../data')
archive   = os.path.exists('../data/parking_MayJun2019.tar.gz')
file      = os.path.exists('../data/parking_MayJun2019.csv')

if not directory:
    os.mkdir('../data')

if not archive:
    import wget, shutil
    
    def bar_custom(current, total, width=80):
            print('Downloading: %d%% [%d / %d] bytes' % (current / total * 100.0, current, total))
        
    wget.download('http://tomdrabas.com/data/seattle_parking/parking_MayJun2019.tar.gz')
    shutil.move('parking_MayJun2019.tar.gz', '../data/parking_MayJun2019.tar.gz')
    
if not file:
    import tarfile

    tf = tarfile.open('../data/parking_MayJun2019.tar.gz')
    tf.extractall(path='../data/')

# Read the data

In [2]:
dtypes = OrderedDict([
    ('OccupancyDateTime', 'date'),
    ('PaidOccupancy', 'int64'),
    ('BlockfaceName', 'str'),
    ('SideOfStreet', 'str'),
    ('SourceElementKey', 'int64'),
    ('ParkingTimeLimitCategory', 'int64'),
    ('ParkingSpaceCount', 'int64'),
    ('PaidParkingArea', 'str'),
    ('PaidParkingSubArea', 'str'),
    ('PaidParkingRate', 'int8'),
    ('ParkingCategory', 'str'),
    ('Location', 'str'),
    ('DayOfWeek', 'int8')
])

df = cudf.read_csv(
    '../data/parking_MayJun2019.csv'
    , skiprows=1
    , dtype=list(dtypes.values())
    , names=list(dtypes.keys())
)

time: 7.76 s


In [3]:
!head -n 10 ../data/parking_Jun2019_check.csv

OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,dow
2019-05-24T08:35:00,1,SENECA ST BETWEEN BOYLSTON AVE AND HARVARD AVE,SE,35438,120,4,First Hill,,,Paid Parking,POINT (-122.32245377 47.61236006),4
2019-05-24T19:57:00,3,REPUBLICAN ST BETWEEN QUEEN ANNE AVE N AND 1ST AVE N,S,57862,120,3,Uptown,Core,,Paid Parking,POINT (-122.35604157 47.62320827),4
2019-05-24T11:21:00,1,8TH AVE BETWEEN PINE ST AND OLIVE WAY,NE,53542,120,4,Commercial Core,Retail,,Paid Parking,POINT (-122.33353011 47.61360677),4
2019-05-24T11:56:00,2,BLANCHARD ST BETWEEN 1ST AVE AND 2ND AVE,SE,9354,120,4,Belltown,South,,Paid Parking,POINT (-122.3446913 47.61292255),4
2019-05-24T10:57:00,13,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7,South Lake Union,North,,Paid Parking,POINT (-122.33168043 47.62602813),4
2019-05-24T09:23:00,1,PIKE ST BETWEEN 4TH AVE AND 5TH AVE,NW

In [25]:
# size of the file
import os
print('Filesize: {0:.2f}GB'.format(os.path.getsize('../data/parking_Jun2019_check.csv') / (1024 ** 3)))

Filesize: 6.78GB
time: 784 µs


In [4]:
df['PaidOccupancy']     = df['PaidOccupancy'].astype('float64')
df['ParkingSpaceCount'] = df['ParkingSpaceCount'].astype('float64')

time: 150 ms


In [5]:
df.dtypes

OccupancyDateTime           datetime64[ms]
PaidOccupancy                      float64
BlockfaceName                       object
SideOfStreet                        object
SourceElementKey                     int64
ParkingTimeLimitCategory             int64
ParkingSpaceCount                  float64
PaidParkingArea                     object
PaidParkingSubArea                  object
PaidParkingRate                       int8
ParkingCategory                     object
Location                            object
DayOfWeek                             int8
dtype: object

time: 7.51 ms


In [6]:
df.shape

(48675039, 13)

time: 5.95 ms


In [7]:
print(df.columns)

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea', 'PaidParkingRate',
       'ParkingCategory', 'Location', 'DayOfWeek'],
      dtype='object')
time: 3.45 ms


In [8]:
df.head().to_pandas()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,DayOfWeek
0,2019-05-24 08:35:00,1.0,SENECA ST BETWEEN BOYLSTON AVE AND HARVARD AVE,SE,35438,120,4.0,First Hill,,-1,Paid Parking,POINT (-122.32245377 47.61236006),4
1,2019-05-24 19:57:00,3.0,REPUBLICAN ST BETWEEN QUEEN ANNE AVE N AND 1ST...,S,57862,120,3.0,Uptown,Core,-1,Paid Parking,POINT (-122.35604157 47.62320827),4
2,2019-05-24 11:21:00,1.0,8TH AVE BETWEEN PINE ST AND OLIVE WAY,NE,53542,120,4.0,Commercial Core,Retail,-1,Paid Parking,POINT (-122.33353011 47.61360677),4
3,2019-05-24 11:56:00,2.0,BLANCHARD ST BETWEEN 1ST AVE AND 2ND AVE,SE,9354,120,4.0,Belltown,South,-1,Paid Parking,POINT (-122.3446913 47.61292255),4
4,2019-05-24 10:57:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4


time: 407 ms


# Extract date information

In [9]:
df['year'] = df['OccupancyDateTime']._column.year
df['month'] = df['OccupancyDateTime']._column.month
df['day'] = df['OccupancyDateTime']._column.day

df['hour'] = df['OccupancyDateTime']._column.hour
df['minute'] = df['OccupancyDateTime']._column.minute

df[['OccupancyDateTime','year','month','day','hour', 'minute']].head().to_pandas()

Unnamed: 0,OccupancyDateTime,year,month,day,hour,minute
0,2019-05-24 08:35:00,2019,5,24,8,35
1,2019-05-24 19:57:00,2019,5,24,19,57
2,2019-05-24 11:21:00,2019,5,24,11,21
3,2019-05-24 11:56:00,2019,5,24,11,56
4,2019-05-24 10:57:00,2019,5,24,10,57


time: 19.9 ms


In [10]:
df.dtypes

OccupancyDateTime           datetime64[ms]
PaidOccupancy                      float64
BlockfaceName                       object
SideOfStreet                        object
SourceElementKey                     int64
ParkingTimeLimitCategory             int64
ParkingSpaceCount                  float64
PaidParkingArea                     object
PaidParkingSubArea                  object
PaidParkingRate                       int8
ParkingCategory                     object
Location                            object
DayOfWeek                             int8
year                                 int16
month                                int16
day                                  int16
hour                                 int16
minute                               int16
dtype: object

time: 2.5 ms


In [11]:
df.shape

(48675039, 18)

time: 3.5 ms


# All parking locations

In [12]:
locations = df[['SourceElementKey', 'BlockfaceName', 'SideOfStreet',
       'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea',
       'ParkingCategory', 'Location']].drop_duplicates()

locations.head().to_pandas()

Unnamed: 0,SourceElementKey,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory,Location
4080,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,120,4.0,Pioneer Square,Core,Paid Parking,POINT (-122.33469356 47.6028728)
1336,1002,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,NE,120,8.0,Pioneer Square,Core,Paid Parking,POINT (-122.33451266 47.60294861)
4455,1006,1ST AVE BETWEEN COLUMBIA ST AND MARION ST,NE,120,7.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33514326 47.60367439)
1026,1009,1ST AVE BETWEEN MADISON ST AND SPRING ST,SW,30,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.3366575 47.60501765)
3208,1010,1ST AVE BETWEEN MADISON ST AND SPRING ST,NE,120,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33644748 47.6051007)


time: 8.49 s


In [13]:
print('Number of parking locations in Seattle: {0}'.format(locations.shape[0]))

Number of parking locations in Seattle: 1528
time: 597 µs


In [14]:
def extractLon(location):
    lon = location.str.extract('([0-9\.\-]+) ([0-9\.]+)')[0].str.slice(0,10) # hack for now -- fixed in v 0.9
    return lon.str.stod()

def extractLat(location):
    lon = location.str.extract('([0-9\.\-]+) ([0-9\.]+)')[1].str.slice(0,8)
    return lon.str.stod()
    
locations['longitude'] = extractLon(locations['Location'])
locations['latitude'] = extractLat(locations['Location'])

locations[['Location', 'longitude', 'latitude']].head().to_pandas()

Unnamed: 0,Location,longitude,latitude
4080,POINT (-122.33469356 47.6028728),-122.33469,47.60287
1336,POINT (-122.33451266 47.60294861),-122.33451,47.60294
4455,POINT (-122.33514326 47.60367439),-122.33514,47.60367
1026,POINT (-122.3366575 47.60501765),-122.33665,47.60501
3208,POINT (-122.33644748 47.6051007),-122.33644,47.6051


time: 95.4 ms


# Average occupancy

In [15]:
def avgOccupancy(PaidOccupancy, ParkingSpaceCount, AvgOccupancy):
    for i, (paid, available) in enumerate(zip(PaidOccupancy, ParkingSpaceCount)):
        AvgOccupancy[i] = min(1.0, paid / available) # cap it at 100%, sometimes we see more paid occupancy than spaces available
        
df = df.apply_rows(
    avgOccupancy
    , incols=['PaidOccupancy', 'ParkingSpaceCount']
    , outcols={'AvgOccupancy': np.float64}
    , kwargs={}
)

df.head().to_pandas()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,DayOfWeek,year,month,day,hour,minute,AvgOccupancy
0,2019-05-24 08:35:00,1.0,SENECA ST BETWEEN BOYLSTON AVE AND HARVARD AVE,SE,35438,120,4.0,First Hill,,-1,Paid Parking,POINT (-122.32245377 47.61236006),4,2019,5,24,8,35,0.25
1,2019-05-24 19:57:00,3.0,REPUBLICAN ST BETWEEN QUEEN ANNE AVE N AND 1ST...,S,57862,120,3.0,Uptown,Core,-1,Paid Parking,POINT (-122.35604157 47.62320827),4,2019,5,24,19,57,1.0
2,2019-05-24 11:21:00,1.0,8TH AVE BETWEEN PINE ST AND OLIVE WAY,NE,53542,120,4.0,Commercial Core,Retail,-1,Paid Parking,POINT (-122.33353011 47.61360677),4,2019,5,24,11,21,0.25
3,2019-05-24 11:56:00,2.0,BLANCHARD ST BETWEEN 1ST AVE AND 2ND AVE,SE,9354,120,4.0,Belltown,South,-1,Paid Parking,POINT (-122.3446913 47.61292255),4,2019,5,24,11,56,0.5
4,2019-05-24 10:57:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,57,1.0


time: 837 ms


In [16]:
search_date_f = dt.datetime.strptime('2019-05-24T10:00:00', '%Y-%m-%dT%H:%M:%S')
search_date_t = dt.datetime.strptime('2019-05-24T10:59:59', '%Y-%m-%dT%H:%M:%S')
df.query('''SourceElementKey == 35889 and OccupancyDateTime >= @search_date_f and OccupancyDateTime <= @search_date_t'''
).sort_values(by='OccupancyDateTime').head(5).to_pandas()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,DayOfWeek,year,month,day,hour,minute,AvgOccupancy
1454442,2019-05-24 10:00:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,0,1.0
1658508,2019-05-24 10:01:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,1,1.0
2273492,2019-05-24 10:02:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,2,1.0
636682,2019-05-24 10:03:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,3,1.0
2022358,2019-05-24 10:04:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,4,1.0


time: 7.68 s


In [17]:
def calcMean(AvgOccupancy, ParkingSpaceCount, MeanOccupancy):
    '''
        Calculate mean
    '''
    for i, (avgOccSum, avgCnt) in enumerate(zip(AvgOccupancy, ParkingSpaceCount)):
        MeanOccupancy[i] = float(avgOccSum) / avgCnt

df_agg_dt = (
    df
    .groupby(['SourceElementKey', 'DayOfWeek','hour'])
    .agg({
          'ParkingSpaceCount': 'count'
        , 'AvgOccupancy': 'sum'
    })
    .reset_index()
)

df_agg_dt = df_agg_dt.apply_rows(
    calcMean
    , incols=['AvgOccupancy', 'ParkingSpaceCount']
    , outcols={'MeanOccupancy':np.float64}
    , kwargs={}
)

df_agg_dt.drop_column('AvgOccupancy')
df_agg_dt.drop_column('ParkingSpaceCount')

df_agg_dt.head().to_pandas()

Unnamed: 0,SourceElementKey,DayOfWeek,hour,MeanOccupancy
0,1001,0,9,0.0625
1,1001,0,10,0.319643
2,1001,0,11,0.405357
3,1001,0,12,0.625595
4,1001,0,13,0.553571


time: 306 ms


# Find the best parking

In [18]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="todrabas_test")
location = geolocator.geocode("2300 7th Ave Seattle")

locations['LON_Ref'] = location.longitude
locations['LAT_Ref'] = location.latitude

time: 936 ms


In [19]:
from math import sin, cos, sqrt, atan2, pi

def degToRad(deg):
    return deg / 180.0 * pi

def calculateDistance(latitude, longitude, LAT_Ref, LON_Ref, Distance):
    R = 3958.8 # Earth's radius in miles
    
    for i, (lt, ln, lt_r, ln_r) in enumerate(zip(latitude, longitude, LAT_Ref, LON_Ref)):
        lt_rad = lt / 180.0 * pi
        ln_rad = ln / 180.0 * pi
        
        dlon = (ln_r - ln) / 180.0 * pi
        dlat = (lt_r - lt) / 180.0 * pi
        a = (sin(dlat/2.0))**2 + cos(lt_rad) * cos(lt_rad) * (sin(dlon/2.0))**2
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        distance = R * c
        Distance[i] = distance * 5280 # in feet
        
locations = locations.apply_rows(
    calculateDistance
    , incols=['latitude', 'longitude', 'LAT_Ref', 'LON_Ref']
    , outcols={'Distance':np.float64}
    , kwargs={}
)

locations.head().to_pandas()

Unnamed: 0,SourceElementKey,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory,Location,longitude,latitude,LON_Ref,LAT_Ref,Distance
4080,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,120,4.0,Pioneer Square,Core,Paid Parking,POINT (-122.33469356 47.6028728),-122.33469,47.60287,-122.341789,47.617858,5739.827725
1336,1002,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,NE,120,8.0,Pioneer Square,Core,Paid Parking,POINT (-122.33451266 47.60294861),-122.33451,47.60294,-122.341789,47.617858,5729.188039
4455,1006,1ST AVE BETWEEN COLUMBIA ST AND MARION ST,NE,120,7.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33514326 47.60367439),-122.33514,47.60367,-122.341789,47.617858,5428.150694
1026,1009,1ST AVE BETWEEN MADISON ST AND SPRING ST,SW,30,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.3366575 47.60501765),-122.33665,47.60501,-122.341789,47.617858,4854.505004
3208,1010,1ST AVE BETWEEN MADISON ST AND SPRING ST,NE,120,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33644748 47.6051007),-122.33644,47.6051,-122.341789,47.617858,4836.605946


time: 1.25 s


In [20]:
# get only meters within 1000 ft
closest = locations.query('Distance < 1000')

closest = closest.merge(df_agg_dt, how='inner', on=['SourceElementKey']).query('DayOfWeek == 3 and hour == 13')
closest = closest.sort_values(by='MeanOccupancy')

closest_host = closest[['BlockfaceName', 'SideOfStreet',
       'ParkingTimeLimitCategory', 'ParkingSpaceCount', 'PaidParkingArea',
       'PaidParkingSubArea', 'ParkingCategory', 'Location', 'Distance', 'DayOfWeek', 'hour', 'MeanOccupancy', 'longitude', 'latitude']].head().to_pandas()
closest_host

Unnamed: 0,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory,Location,Distance,DayOfWeek,hour,MeanOccupancy,longitude,latitude
2994,BELL ST BETWEEN 7TH AVE AND 8TH AVE,NW,240,8.0,Denny Triangle,North,Paid Parking,POINT (-122.34098431 47.61773608),204.278015,3,13,0.0,-122.34098,47.61773
2140,BELL ST BETWEEN 6TH AVE AND 7TH AVE,SE,240,5.0,Denny Triangle,North,Paid Parking,POINT (-122.34168549 47.61701942),310.41047,3,13,0.008333,-122.34168,47.61701
2437,DEXTER AVE N BETWEEN JOHN ST AND THOMAS ST,W,120,7.0,South Lake Union,,Paid Parking,POINT (-122.3425489 47.62030601),909.938923,3,13,0.03869,-122.34254,47.6203
2929,6TH AVE BETWEEN BATTERY ST AND WALL ST,NE,240,6.0,Denny Triangle,North,Paid Parking,POINT (-122.34401361 47.61786447),546.222296,3,13,0.066667,-122.34401,47.61786
2076,JOHN ST BETWEEN 8TH AVE N AND 9TH AVE N,S,600,21.0,South Lake Union,South,Paid Parking,POINT (-122.34052184 47.6196361),717.908349,3,13,0.114815,-122.34052,47.61963


time: 313 ms


# Plot the point on the map

In [21]:
closest_host[['BlockfaceName', 'Distance', 'MeanOccupancy']].to_dict('records')
info_box_template = """
<dl>
<dt>Name</dt><dd>{BlockfaceName}</dd>
<dt>Distance</dt><dd>{Distance:.0f}</dd>
<dt>Occupancy (AVG)</dt><dd>{MeanOccupancy:.3f}</dd>
</dl>
"""

parking_info = [info_box_template.format(**parking) for parking in closest_host[['BlockfaceName', 'Distance', 'MeanOccupancy']].to_dict('records')]

time: 2.71 ms


In [22]:
import gmaps
import gmaps.datasets
gmaps.configure(api_key="..YOUR KEY HERE") # Your Google API key

parking_layer = gmaps.symbol_layer(
    closest_host[['latitude', 'longitude']], fill_color="green", stroke_color="green", scale=3, info_box_content=parking_info
)

destination = gmaps.symbol_layer(
    [[location.latitude, location.longitude]]
    , info_box_content=['DESTINATION']
    , scale=3
    , fill_color="red"
    , stroke_color="red"
)

fig = gmaps.figure()
fig.add_layer(parking_layer)
fig.add_layer(destination)
fig

Figure(layout=FigureLayout(height='420px'))

time: 46.1 ms
