# Load the modules

In [1]:
import cudf
import nvcategory
import nvtext
import nvstrings
from collections import OrderedDict
import numpy as np
import datetime as dt

%load_ext autotime

In [2]:
# Record the cuDF version this notebook was executed with.
print(cudf.__version__)

0.9.0
time: 1.42 ms


In [3]:
# Shell escape: list the GPUs visible on this machine, with driver and memory usage.
!nvidia-smi

Mon Sep  2 18:38:13 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.48                 Driver Version: 410.48                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Graphics Device     Off  | 00000000:01:00.0 Off |                  N/A |
| 41%   51C    P2    70W / 280W |      0MiB / 24187MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:03:00.0 Off |                  N/A |
| 41%   40C    P0    54W / 260W |      0MiB / 10989MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

# Download the data
If the data is not already present in `../data`, download the archive from my website and unpack it.

In [4]:
import os

# Existence flags are sampled once, up front: each step below only runs when
# its artifact was missing at the time the cell started.
directory = os.path.exists('../data')
archive   = os.path.exists('../data/parking_MayJun2019.tar.gz')
file      = os.path.exists('../data/parking_MayJun2019.csv')

if not directory:
    # exist_ok avoids a failure if the directory appears between check and create
    os.makedirs('../data', exist_ok=True)

if not archive:
    import wget, shutil

    def bar_custom(current, total, width=80):
        """Build the progress message shown by wget.download.

        current -- bytes downloaded so far
        total   -- expected size in bytes (may be unknown / non-positive)
        width   -- console width offered by wget; unused here
        """
        # Guard the percentage: the total size is not always known.
        percent = current / total * 100.0 if total > 0 else 0.0
        return 'Downloading: %d%% [%d / %d] bytes' % (percent, current, total)

    # The progress callback was previously defined but never handed to wget.
    wget.download(
        'http://tomdrabas.com/data/seattle_parking/parking_MayJun2019.tar.gz',
        bar=bar_custom,
    )
    shutil.move('parking_MayJun2019.tar.gz', '../data/parking_MayJun2019.tar.gz')

if not file:
    import tarfile

    # NOTE: extractall() trusts the member paths stored in the archive; only
    # use it on archives from a source you trust.
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open('../data/parking_MayJun2019.tar.gz') as tf:
        tf.extractall(path='../data/')

time: 1.46 ms


# Read the data

In [5]:
# Shell escape: show the first 10 raw lines of the CSV (header row + sample records).
!head -n 10 ../data/parking_MayJun2019.csv

OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,dow
2019-05-24T08:35:00,1,SENECA ST BETWEEN BOYLSTON AVE AND HARVARD AVE,SE,35438,120,4,First Hill,,,Paid Parking,POINT (-122.32245377 47.61236006),4
2019-05-24T19:57:00,3,REPUBLICAN ST BETWEEN QUEEN ANNE AVE N AND 1ST AVE N,S,57862,120,3,Uptown,Core,,Paid Parking,POINT (-122.35604157 47.62320827),4
2019-05-24T11:21:00,1,8TH AVE BETWEEN PINE ST AND OLIVE WAY,NE,53542,120,4,Commercial Core,Retail,,Paid Parking,POINT (-122.33353011 47.61360677),4
2019-05-24T11:56:00,2,BLANCHARD ST BETWEEN 1ST AVE AND 2ND AVE,SE,9354,120,4,Belltown,South,,Paid Parking,POINT (-122.3446913 47.61292255),4
2019-05-24T10:57:00,13,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7,South Lake Union,North,,Paid Parking,POINT (-122.33168043 47.62602813),4
2019-05-24T09:23:00,1,PIKE ST BETWEEN 4TH AVE AND 5TH AVE,NW

In [6]:
# Column name -> dtype, listed in the same order as the columns in the file.
dtypes = OrderedDict()
dtypes['OccupancyDateTime'] = 'date'
dtypes['PaidOccupancy'] = 'int64'
dtypes['BlockfaceName'] = 'str'
dtypes['SideOfStreet'] = 'str'
dtypes['SourceElementKey'] = 'int64'
dtypes['ParkingTimeLimitCategory'] = 'int64'
dtypes['ParkingSpaceCount'] = 'int64'
dtypes['PaidParkingArea'] = 'str'
dtypes['PaidParkingSubArea'] = 'str'
dtypes['PaidParkingRate'] = 'int8'
dtypes['ParkingCategory'] = 'str'
dtypes['Location'] = 'str'
dtypes['DayOfWeek'] = 'int8'

# skiprows=1 drops the file's own header line; the names supplied here are
# used instead (the file calls the last column `dow`).
df = cudf.read_csv(
    '../data/parking_MayJun2019.csv',
    skiprows=1,
    names=list(dtypes.keys()),
    dtype=list(dtypes.values()),
)

time: 6.39 s


In [7]:
# Report the on-disk size of the CSV (bytes divided by 1024**3).
import os

csv_size_bytes = os.path.getsize('../data/parking_MayJun2019.csv')
print('Filesize: {0:.2f}GB'.format(csv_size_bytes / (1024 ** 3)))

Filesize: 6.78GB
time: 551 µs


In [8]:
# Both count columns are divided by one another further down
# (paid / available), so store them as float64.
for count_col in ('PaidOccupancy', 'ParkingSpaceCount'):
    df[count_col] = df[count_col].astype('float64')

time: 11.8 ms


In [9]:
# Inspect the column dtypes after loading and casting.
df.dtypes

OccupancyDateTime           datetime64[ms]
PaidOccupancy                      float64
BlockfaceName                       object
SideOfStreet                        object
SourceElementKey                     int64
ParkingTimeLimitCategory             int64
ParkingSpaceCount                  float64
PaidParkingArea                     object
PaidParkingSubArea                  object
PaidParkingRate                       int8
ParkingCategory                     object
Location                            object
DayOfWeek                             int8
dtype: object

time: 14.5 ms


In [12]:
# df.shape is (rows, columns); unpack it straight into the message.
print('The dataset has {0} records and {1} columns.'.format(*df.shape))

The dataset has 48675039 records and 13 columns.
time: 2.22 ms


In [11]:
# List the column names assigned when the CSV was read.
print(df.columns)

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea', 'PaidParkingRate',
       'ParkingCategory', 'Location', 'DayOfWeek'],
      dtype='object')
time: 1.56 ms


In [13]:
# Copy only the first rows to the host as a pandas frame for display.
df.head().to_pandas()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,DayOfWeek
0,2019-05-24 08:35:00,1.0,SENECA ST BETWEEN BOYLSTON AVE AND HARVARD AVE,SE,35438,120,4.0,First Hill,,-1,Paid Parking,POINT (-122.32245377 47.61236006),4
1,2019-05-24 19:57:00,3.0,REPUBLICAN ST BETWEEN QUEEN ANNE AVE N AND 1ST...,S,57862,120,3.0,Uptown,Core,-1,Paid Parking,POINT (-122.35604157 47.62320827),4
2,2019-05-24 11:21:00,1.0,8TH AVE BETWEEN PINE ST AND OLIVE WAY,NE,53542,120,4.0,Commercial Core,Retail,-1,Paid Parking,POINT (-122.33353011 47.61360677),4
3,2019-05-24 11:56:00,2.0,BLANCHARD ST BETWEEN 1ST AVE AND 2ND AVE,SE,9354,120,4.0,Belltown,South,-1,Paid Parking,POINT (-122.3446913 47.61292255),4
4,2019-05-24 10:57:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4


time: 495 ms


# Extract date information

In [14]:
# Split the timestamp into its calendar/time components.
# Uses the public `.dt` accessor instead of reaching into the private
# `Series._column` attribute, which is an implementation detail.
df['year'] = df['OccupancyDateTime'].dt.year
df['month'] = df['OccupancyDateTime'].dt.month
df['day'] = df['OccupancyDateTime'].dt.day

df['hour'] = df['OccupancyDateTime'].dt.hour
df['minute'] = df['OccupancyDateTime'].dt.minute

# Show the source timestamp next to the derived columns as a sanity check.
df[['OccupancyDateTime','year','month','day','hour', 'minute']].head().to_pandas()

Unnamed: 0,OccupancyDateTime,year,month,day,hour,minute
0,2019-05-24 08:35:00,2019,5,24,8,35
1,2019-05-24 19:57:00,2019,5,24,19,57
2,2019-05-24 11:21:00,2019,5,24,11,21
3,2019-05-24 11:56:00,2019,5,24,11,56
4,2019-05-24 10:57:00,2019,5,24,10,57


time: 28.1 ms


In [15]:
# Re-check the dtypes now that year/month/day/hour/minute have been added.
df.dtypes

OccupancyDateTime           datetime64[ms]
PaidOccupancy                      float64
BlockfaceName                       object
SideOfStreet                        object
SourceElementKey                     int64
ParkingTimeLimitCategory             int64
ParkingSpaceCount                  float64
PaidParkingArea                     object
PaidParkingSubArea                  object
PaidParkingRate                       int8
ParkingCategory                     object
Location                            object
DayOfWeek                             int8
year                                 int16
month                                int16
day                                  int16
hour                                 int16
minute                               int16
dtype: object

time: 3.43 ms


In [16]:
# (rows, columns) after adding the date-part columns.
df.shape

(48675039, 18)

time: 1.64 ms


# All parking locations

In [17]:
# One row per distinct combination of the descriptive (non time-varying)
# columns of a parking location.
locations = (
    df[[
        'SourceElementKey',
        'BlockfaceName',
        'SideOfStreet',
        'ParkingTimeLimitCategory',
        'ParkingSpaceCount',
        'PaidParkingArea',
        'PaidParkingSubArea',
        'ParkingCategory',
        'Location',
    ]]
    .drop_duplicates()
)

locations.head().to_pandas()

Unnamed: 0,SourceElementKey,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory,Location
4080,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,120,4.0,Pioneer Square,Core,Paid Parking,POINT (-122.33469356 47.6028728)
1336,1002,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,NE,120,8.0,Pioneer Square,Core,Paid Parking,POINT (-122.33451266 47.60294861)
4455,1006,1ST AVE BETWEEN COLUMBIA ST AND MARION ST,NE,120,7.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33514326 47.60367439)
1026,1009,1ST AVE BETWEEN MADISON ST AND SPRING ST,SW,30,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.3366575 47.60501765)
3208,1010,1ST AVE BETWEEN MADISON ST AND SPRING ST,NE,120,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33644748 47.6051007)


time: 8.73 s


In [18]:
# The row count of the de-duplicated `locations` frame.
print('Number of parking locations in Seattle: {0}'.format(locations.shape[0]))

Number of parking locations in Seattle: 1528
time: 395 µs


In [44]:
def extractLon(location):
    """Pull the longitude out of a column of point strings.

    location -- string column exposing `.str.extract`; the values in this
                notebook look like 'POINT (-122.33469356 47.6028728)',
                i.e. longitude first, then latitude.

    Returns the first captured number converted with `.str.stod()`.
    """
    # Raw string: `\.` and `\-` are regex escapes, not Python string escapes.
    # Both groups accept a minus sign so that points with a negative latitude
    # still match as a pair.
    lon = location.str.extract(r'([0-9\.\-]+) ([0-9\.\-]+)')[0]
    return lon.str.stod()

def extractLat(location):
    """Pull the latitude out of a column of point strings.

    location -- string column exposing `.str.extract`; the values in this
                notebook look like 'POINT (-122.33469356 47.6028728)',
                i.e. longitude first, then latitude.

    Returns the second captured number converted with `.str.stod()`.
    """
    # Raw string: `\.` and `\-` are regex escapes, not Python string escapes.
    # The latitude group also accepts a minus sign, so negative latitudes
    # are no longer rejected.
    lat = location.str.extract(r'([0-9\.\-]+) ([0-9\.\-]+)')[1]
    return lat.str.stod()
    
# Parse the point string once per coordinate and store the numeric values
# as their own columns.
point_text = locations['Location']
locations['longitude'] = extractLon(point_text)
locations['latitude'] = extractLat(point_text)

# Show the raw string beside the parsed numbers.
locations[['Location', 'longitude', 'latitude']].head().to_pandas()

Unnamed: 0,Location,longitude,latitude
4080,POINT (-122.33469356 47.6028728),-122.334694,47.602873
1336,POINT (-122.33451266 47.60294861),-122.334513,47.602949
4455,POINT (-122.33514326 47.60367439),-122.335143,47.603674
1026,POINT (-122.3366575 47.60501765),-122.336658,47.605018
3208,POINT (-122.33644748 47.6051007),-122.336447,47.605101


time: 25.7 ms


# Average occupancy

In [20]:
def avgOccupancy(PaidOccupancy, ParkingSpaceCount, AvgOccupancy):
    """Row-wise kernel: share of spaces that are paid for, capped at 1.0.

    PaidOccupancy     -- input: number of paid-for spaces per row
    ParkingSpaceCount -- input: number of available spaces per row
    AvgOccupancy      -- output: written in place, one value per row
    """
    for row, (n_paid, n_spaces) in enumerate(zip(PaidOccupancy, ParkingSpaceCount)):
        ratio = n_paid / n_spaces
        # Cap at 100%: the data sometimes reports more paid occupancy than
        # there are spaces available.
        AvgOccupancy[row] = min(1.0, ratio)
        
# Run the kernel over every row; the result lands in a new float64 column
# named AvgOccupancy.
df = df.apply_rows(
    avgOccupancy,
    incols=['PaidOccupancy', 'ParkingSpaceCount'],
    outcols={'AvgOccupancy': np.float64},
    kwargs={},
)

df.head().to_pandas()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,DayOfWeek,year,month,day,hour,minute,AvgOccupancy
0,2019-05-24 08:35:00,1.0,SENECA ST BETWEEN BOYLSTON AVE AND HARVARD AVE,SE,35438,120,4.0,First Hill,,-1,Paid Parking,POINT (-122.32245377 47.61236006),4,2019,5,24,8,35,
1,2019-05-24 19:57:00,3.0,REPUBLICAN ST BETWEEN QUEEN ANNE AVE N AND 1ST...,S,57862,120,3.0,Uptown,Core,-1,Paid Parking,POINT (-122.35604157 47.62320827),4,2019,5,24,19,57,
2,2019-05-24 11:21:00,1.0,8TH AVE BETWEEN PINE ST AND OLIVE WAY,NE,53542,120,4.0,Commercial Core,Retail,-1,Paid Parking,POINT (-122.33353011 47.61360677),4,2019,5,24,11,21,
3,2019-05-24 11:56:00,2.0,BLANCHARD ST BETWEEN 1ST AVE AND 2ND AVE,SE,9354,120,4.0,Belltown,South,-1,Paid Parking,POINT (-122.3446913 47.61292255),4,2019,5,24,11,56,
4,2019-05-24 10:57:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,57,


time: 1.09 s


In [27]:
# Spot-check a single blockface (SourceElementKey 35889) for the
# 10:00:00-10:59:59 window on 2019-05-24.
search_date_f = dt.datetime(2019, 5, 24, 10, 0, 0)
search_date_t = dt.datetime(2019, 5, 24, 10, 59, 59)

# `@name` inside the query string refers to the Python variables above.
# (Sorting by OccupancyDateTime before head() stays disabled, as before.)
(
    df
    .query('''SourceElementKey == 35889 and OccupancyDateTime >= @search_date_f and OccupancyDateTime <= @search_date_t''')
    .head(5)
    .to_pandas()
)

Unnamed: 0,OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location,DayOfWeek,year,month,day,hour,minute,AvgOccupancy
4,2019-05-24 10:57:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,57,
50257,2019-05-24 10:18:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,18,
54095,2019-05-24 10:32:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,32,
99470,2019-05-24 10:19:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,19,
199865,2019-05-24 10:44:00,13.0,VALLEY ST BETWEEN MINOR AVE N AND YALE AVE N,N,35889,600,7.0,South Lake Union,North,-1,Paid Parking,POINT (-122.33168043 47.62602813),4,2019,5,24,10,44,


time: 157 ms


In [28]:
def calcMean(AvgOccupancy, ParkingSpaceCount, MeanOccupancy):
    '''
        Row-wise kernel: divide a per-group sum by a per-group count.

        AvgOccupancy      -- input: the summed occupancy of each group
        ParkingSpaceCount -- input: the number of rows in each group
        MeanOccupancy     -- output: written in place, sum / count
    '''
    for row, (occupancy_total, n_rows) in enumerate(zip(AvgOccupancy, ParkingSpaceCount)):
        total_as_float = float(occupancy_total)
        MeanOccupancy[row] = total_as_float / n_rows

# Per (meter, day of week, hour): count the rows and sum the capped occupancy.
# After the aggregation `ParkingSpaceCount` holds a row count, not a number
# of spaces.
df_agg_dt = df.groupby(['SourceElementKey', 'DayOfWeek', 'hour']).agg({
    'ParkingSpaceCount': 'count',
    'AvgOccupancy': 'sum',
}).reset_index()

# Mean occupancy = sum / count, computed row-wise.
df_agg_dt = df_agg_dt.apply_rows(
    calcMean,
    incols=['AvgOccupancy', 'ParkingSpaceCount'],
    outcols={'MeanOccupancy': np.float64},
    kwargs={},
)

# The two helper columns are no longer needed once the mean exists.
for helper_col in ('AvgOccupancy', 'ParkingSpaceCount'):
    df_agg_dt.drop_column(helper_col)

df_agg_dt.head().to_pandas()

RuntimeError: NVRTC error: NVRTC_ERROR_INVALID_OPTION

time: 69 ms


# Find the best parking

In [40]:
!pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl (100kB)
[K     |################################| 102kB 12.5MB/s ta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/5b/ac/4f348828091490d77899bc74e92238e2b55c59392f21948f296e94e50e2b/geographiclib-1.49.tar.gz
Building wheels for collected packages: geographiclib
  Building wheel for geographiclib (setup.py) ... [?25ldone
[?25h  Created wheel for geographiclib: filename=geographiclib-1.49-cp36-none-any.whl size=35860 sha256=4988f9064d8c1420f4f9a320adaf10f90043126217ac170104a7bdd11d7cebb6
  Stored in directory: /root/.cache/pip/wheels/99/45/d1/14954797e2a976083182c2e7da9b4e924509e59b6e5c661061
Successfully built geographiclib
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.49 geopy-1.20.0
time: 2.12 s


In [41]:
from geopy.geocoders import Nominatim

# Resolve the destination address to coordinates (network call).
geolocator = Nominatim(user_agent="todrabas_test")
location = geolocator.geocode("2300 7th Ave Seattle")

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the lines below.
if location is None:
    raise ValueError('Could not geocode the destination address: 2300 7th Ave Seattle')

# Broadcast the destination coordinates to every parking location so the
# row-wise distance kernel can read them as ordinary columns.
locations['LON_Ref'] = location.longitude
locations['LAT_Ref'] = location.latitude

time: 2.2 s


In [1]:
# Stand-alone check, independent of the parking data: build a six-row cuDF
# frame from pandas and sort it by `id`.
# NOTE(review): the recorded output is the same NVRTC RuntimeError seen in the
# aggregation cell, so this looks like a leftover environment repro — confirm
# whether it should stay in the notebook.
import pandas as pd
import cudf

gpu_df = cudf.from_pandas(
    pd.DataFrame({
          'id': [1,2,4,6,7,3]
        , 'val': [66,2345,789,11,53,987]
    })
)
gpu_df.sort_values('id')

RuntimeError: NVRTC error: NVRTC_ERROR_INVALID_OPTION

In [45]:
from math import sin, cos, sqrt, atan2, pi

def degToRad(deg):
    """Convert an angle from degrees to radians."""
    half_turns = deg / 180.0
    return half_turns * pi

def calculateDistance(latitude, longitude, LAT_Ref, LON_Ref, Distance):
    """Row-wise kernel: haversine distance from each point to a reference point.

    latitude, longitude -- inputs: coordinates of each point, in degrees
    LAT_Ref, LON_Ref    -- inputs: coordinates of the reference point, in degrees
    Distance            -- output: written in place, distance in feet
    """
    R = 3958.8 # Earth's radius in miles

    for i, (lt, ln, lt_r, ln_r) in enumerate(zip(latitude, longitude, LAT_Ref, LON_Ref)):
        # The conversion is inlined (deg / 180 * pi) rather than delegated to a helper.
        lt_rad = lt / 180.0 * pi
        lt_r_rad = lt_r / 180.0 * pi

        dlon = (ln_r - ln) / 180.0 * pi
        dlat = (lt_r - lt) / 180.0 * pi

        # Haversine: the second term is cos(lat1) * cos(lat2). It used to
        # multiply the cosine of the point's latitude by itself.
        a = (sin(dlat/2.0))**2 + cos(lt_rad) * cos(lt_r_rad) * (sin(dlon/2.0))**2
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        distance = R * c
        Distance[i] = distance * 5280 # in feet
        
# Run the distance kernel over every location; the result lands in a new
# float64 column named Distance.
locations = locations.apply_rows(
    calculateDistance,
    incols=['latitude', 'longitude', 'LAT_Ref', 'LON_Ref'],
    outcols={'Distance': np.float64},
    kwargs={},
)

locations.head().to_pandas()

Unnamed: 0,SourceElementKey,BlockfaceName,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory,Location,longitude,latitude,LON_Ref,LAT_Ref,Distance
4080,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,120,4.0,Pioneer Square,Core,Paid Parking,POINT (-122.33469356 47.6028728),-122.334694,47.602873,-122.341789,47.617858,5738.588242
1336,1002,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,NE,120,8.0,Pioneer Square,Core,Paid Parking,POINT (-122.33451266 47.60294861),-122.334513,47.602949,-122.341789,47.617858,5725.99973
4455,1006,1ST AVE BETWEEN COLUMBIA ST AND MARION ST,NE,120,7.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33514326 47.60367439),-122.335143,47.603674,-122.341789,47.617858,5426.381931
1026,1009,1ST AVE BETWEEN MADISON ST AND SPRING ST,SW,30,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.3366575 47.60501765),-122.336658,47.605018,-122.341789,47.617858,4851.330142
3208,1010,1ST AVE BETWEEN MADISON ST AND SPRING ST,NE,120,5.0,Commercial Core,Waterfront,Paid Parking,POINT (-122.33644748 47.6051007),-122.336447,47.605101,-122.341789,47.617858,4835.860012


time: 379 ms


In [46]:
# Keep only the meters within 1000 ft of the destination.
closest = locations.query('Distance < 1000')

# Attach the aggregated occupancy, keep a single weekday/hour slot
# (DayOfWeek 3, hour 13), and put the emptiest blockfaces first.
closest = (
    closest
    .merge(df_agg_dt, how='inner', on=['SourceElementKey'])
    .query('DayOfWeek == 3 and hour == 13')
    .sort_values(by='MeanOccupancy')
)

# Bring the top rows to the host for display and plotting.
closest_host = closest[[
    'BlockfaceName',
    'SideOfStreet',
    'ParkingTimeLimitCategory',
    'ParkingSpaceCount',
    'PaidParkingArea',
    'PaidParkingSubArea',
    'ParkingCategory',
    'Location',
    'Distance',
    'DayOfWeek',
    'hour',
    'MeanOccupancy',
    'longitude',
    'latitude',
]].head().to_pandas()
closest_host

NameError: name 'df_agg_dt' is not defined

time: 225 ms


# Plot the parking spots on the map

In [21]:
# HTML snippet shown in the pop-up of each parking marker; the placeholders
# are filled from one record (row) of `closest_host`.
info_box_template = """
<dl>
<dt>Name</dt><dd>{BlockfaceName}</dd>
<dt>Distance</dt><dd>{Distance:.0f}</dd>
<dt>Occupancy (AVG)</dt><dd>{MeanOccupancy:.3f}</dd>
</dl>
"""

# Convert the rows to dicts once. The earlier stand-alone to_dict() call was
# removed because its result was discarded.
parking_records = closest_host[['BlockfaceName', 'Distance', 'MeanOccupancy']].to_dict('records')
parking_info = [info_box_template.format(**parking) for parking in parking_records]

time: 2.71 ms


In [22]:
import os

import gmaps
import gmaps.datasets

# Read the Google API key from the GOOGLE_API_KEY environment variable so it
# never has to be saved in the notebook; the placeholder is used when the
# variable is not set.
gmaps.configure(api_key=os.environ.get("GOOGLE_API_KEY", "..YOUR KEY HERE")) # Your Google API key

# Green markers: candidate parking spots, each with its info box.
parking_layer = gmaps.symbol_layer(
    closest_host[['latitude', 'longitude']], fill_color="green", stroke_color="green", scale=3, info_box_content=parking_info
)

# Red marker: the geocoded destination.
destination = gmaps.symbol_layer(
    [[location.latitude, location.longitude]]
    , info_box_content=['DESTINATION']
    , scale=3
    , fill_color="red"
    , stroke_color="red"
)

fig = gmaps.figure()
fig.add_layer(parking_layer)
fig.add_layer(destination)
fig

Figure(layout=FigureLayout(height='420px'))

time: 46.1 ms
