In [237]:
import pandas as pd
import numpy as np

In [238]:
df = pd.read_csv("taxi_weather.csv")
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,DATE,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,distance_traveled_km,week_day
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,3.13,0.5,0.0,0,72,57,17.231137,0
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,9.84,0.0,0.0,0,30,20,246.295936,1
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,4.92,0.07,0.0,0,83,67,55.251875,3
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,4.92,0.22,0.0,0,71,54,38.152003,5
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,4.47,0.0,0.0,0,61,44,74.122374,1


Drop the nans from the dataframe

In [239]:
df = df.dropna(axis='rows')
print('New number of observations: %d' % len(df))

New number of observations: 990845


A lot of the coordinates are wrong and would mess up the analysis, so first take those out of the df

After checking the coordinates against a map of NYC, I've determined that reasonable longitude bounds are [-74.20, -73.25] which reasonable latitude bounds are [40.95, 40.50].

In [240]:
# lower bound for latitude
df = df[df['pickup_latitude'] >= 40.5]
df = df[df['dropoff_latitude'] >= 40.5]

# upper bound for latitude
df = df[df['pickup_latitude'] <= 40.95]
df = df[df['dropoff_latitude'] <= 40.95]

# upper bound for longitude
df = df[df['pickup_longitude'] <= -73.75]
df = df[df['dropoff_longitude'] <= -73.75]

# lower bound for longitude
df = df[df['pickup_longitude'] >= -74.2]
df = df[df['dropoff_longitude'] >= -74.2]

In [241]:
len(df)

968328

Convert the lattitudes and longitudes to np arrays of x and y so that I can form a grid. Create a bounding box inside which all of the points will live

In [242]:
y = np.concatenate([df['pickup_latitude'].to_numpy(), df['dropoff_latitude'].to_numpy()])
x = np.concatenate([df['pickup_longitude'].to_numpy(), df['dropoff_longitude'].to_numpy()])

bottom_left, bottom_right = (x.min(), y.min()), (x.max(), y.min())
top_left, top_right = (x.min(), y.max()), (x.max(), y.max())

BBox = (x.min(), x.max(), y.min(),y.max())
BBox

(-74.199701, -73.75006103515625, 40.501978, 40.949817)

Divide the area up into grid of 100 by 100 rectangles

In [243]:
cols = np.linspace(bottom_left[0], bottom_right[0], num=100)
rows = np.linspace(bottom_left[1], top_left[1], num=100)

create columns for each observation's dropoff and pickup places in the grid

In [244]:
df['pick_col'] = np.searchsorted(cols, df['pickup_longitude'])
df['drop_col'] = np.searchsorted(cols, df['dropoff_longitude'])

df['pick_row'] = np.searchsorted(rows, df['pickup_latitude'])
df['drop_row'] = np.searchsorted(rows, df['dropoff_latitude'])

In [245]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,DATE,AWND,PRCP,SNOW,SNWD,TMAX,TMIN,distance_traveled_km,week_day,pick_col,drop_col,pick_row,drop_row
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,3.13,0.5,0.0,0,72,57,17.231137,0,79,79,49,47
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,9.84,0.0,0.0,0,30,20,246.295936,1,41,49,47,62
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,4.92,0.07,0.0,0,83,67,55.251875,3,48,46,58,55
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,4.92,0.22,0.0,0,71,54,38.152003,5,47,46,52,57
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,4.47,0.0,0.0,0,61,44,74.122374,1,51,54,59,63


In [246]:
rows[2]

40.51102525252525

In [247]:
rows[20]

40.592450525252524

In [248]:
bottomRight

(39.7775, 116.58888889)

In [249]:
len(cols)

100

In [250]:
bottom_left

(-74.199701, 40.501978)

In [None]:
bottom_left[0] - bottom_right[0]

In [None]:
top_left[1] - bottom_left[1]

In [None]:
cols = np.linspace(bottomLeft[1], bottomRight[1], num=18)
rows = np.linspace(bottomLeft[0], topLeft[0], num=15)
df['col'] = np.searchsorted(cols, df['long'])
df['row'] = np.searchsorted(rows, df['lat'])

In [None]:


bottomLeft = (39.77750000, 116.17944444)
bottomRight = (39.77750000, 116.58888889)
topLeft = (40.04722222, 116.58888889)
topRight = (40.04722222, 116.17944444)

cols = np.linspace(bottomLeft[1], bottomRight[1], num=18)
rows = np.linspace(bottomLeft[0], topLeft[0], num=15)
df['col'] = np.searchsorted(cols, df['long'])
df['row'] = np.searchsorted(rows, df['lat'])



numpy.linspace

In [None]:
BBox

In [None]:
x_coordinates.min(), x_coordinates.max()

In [None]:
y_coordinates.min(), y_coordinates.max()