In [2]:
import pyodbc                  # package that connects notebook to database
import pandas as pd  # package for data manipulation
import numpy as np
import seaborn as sns          # package for visualziation
import statsmodels.api as sm   # package for lm
import matplotlib.pyplot as plt

In [3]:
zone_data = pd.read_csv("dataset/taxi+_zone_lookup.csv")
zone_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


In [5]:
zone_data['Zone'] = zone_data['Zone'].str.split('/')

# Explode the DataFrame to create separate rows
zone_data = zone_data.explode('Zone')

In [7]:
zone_data.head(20)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton,Boro Zone
2,3,Bronx,Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
5,6,Staten Island,Arrochar,Boro Zone
5,6,Staten Island,Fort Wadsworth,Boro Zone
6,7,Queens,Astoria,Boro Zone
7,8,Queens,Astoria Park,Boro Zone


In [9]:
from geopy.geocoders import Nominatim
import time

# Initialize the geolocator
geolocator = Nominatim(user_agent="geo_locator")

# Function to get latitude and longitude
def get_lat_long(row, attempt=1, max_attempts = 265):
    time.sleep(0.5)
    location = geolocator.geocode(f"{row['Zone']}, {row['Borough']}, New York, USA")
    try:
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except GeocoderTimedOut:
        if attempt <= max_attempts:
            return do_geocode(address, attempt=attempt+1)
        raise 


In [10]:
# Apply the function to create new 'latitude' and 'longitude' columns
df1 = zone_data[:50]
df1[['latitude', 'longitude']] = df1.apply(get_lat_long, axis=1, result_type='expand')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [11]:
df2 = zone_data[50:80]
df2[['latitude', 'longitude']] = df2.apply(get_lat_long, axis=1, result_type='expand')

In [12]:
df3 = zone_data[81:120]
df3[['latitude', 'longitude']] = df3.apply(get_lat_long, axis=1, result_type='expand')

In [13]:
df4 = zone_data[121:144]
df4[['latitude', 'longitude']] = df4.apply(get_lat_long, axis=1, result_type='expand')

In [14]:
df5 = zone_data[145:160]
df5[['latitude', 'longitude']] = df5.apply(get_lat_long, axis=1, result_type='expand')

In [15]:
df6 = zone_data[161:220]
df6[['latitude', 'longitude']] = df6.apply(get_lat_long, axis=1, result_type='expand')

In [16]:
df7 = zone_data[221:]
df7[['latitude', 'longitude']] = df7.apply(get_lat_long, axis=1, result_type='expand')

In [17]:
zone_data = pd.concat([df1, df2, df3, df4, df5, df6, df7]).reset_index(drop=True)

In [18]:
zone_data.head(10)

Unnamed: 0,LocationID,Borough,Zone,service_zone,latitude,longitude
0,1,EWR,Newark Airport,EWR,,
1,2,Queens,Jamaica Bay,Boro Zone,40.603994,-73.835412
2,3,Bronx,Allerton,Boro Zone,40.86543,-73.867365
3,3,Bronx,Pelham Gardens,Boro Zone,,
4,4,Manhattan,Alphabet City,Yellow Zone,40.725102,-73.979583
5,5,Staten Island,Arden Heights,Boro Zone,40.5637,-74.191603
6,6,Staten Island,Arrochar,Boro Zone,40.598438,-74.072641
7,6,Staten Island,Fort Wadsworth,Boro Zone,40.608993,-74.062641
8,7,Queens,Astoria,Boro Zone,40.772014,-73.930267
9,8,Queens,Astoria Park,Boro Zone,40.778828,-73.922658


In [19]:
zone_data = zone_data.dropna(subset=['latitude'])

In [20]:
zone_data

Unnamed: 0,LocationID,Borough,Zone,service_zone,latitude,longitude
1,2,Queens,Jamaica Bay,Boro Zone,40.603994,-73.835412
2,3,Bronx,Allerton,Boro Zone,40.865430,-73.867365
4,4,Manhattan,Alphabet City,Yellow Zone,40.725102,-73.979583
5,5,Staten Island,Arden Heights,Boro Zone,40.563700,-74.191603
6,6,Staten Island,Arrochar,Boro Zone,40.598438,-74.072641
...,...,...,...,...,...,...
316,259,Bronx,Woodlawn,Boro Zone,40.886272,-73.878581
317,259,Bronx,Wakefield,Boro Zone,40.906197,-73.855511
318,260,Queens,Woodside,Boro Zone,40.745380,-73.905415
319,261,Manhattan,World Trade Center,Yellow Zone,40.711900,-74.012527


In [22]:
zone_data.to_csv('dataset/zone_data.csv', index=False)