In [1]:
import os

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely
from shapely.geometry import shape, Point


In [4]:
# "sf" is short for "shape file": one row per NYC taxi zone
sf = gpd.read_file(
    "./data/NYC Taxi Zones/geo_export_6bb49971-2dc7-4ad9-84f3-972baf69aca8.shp"
)

# Reproject the zone geometries to longitude/latitude (WGS84) coordinates
# (approach taken from University of Melbourne code)
sf['geometry'] = sf['geometry'].to_crs(
    crs="+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)

In [12]:
def get_zone(sf, lon, lat):
    '''
    Returns the ID of the taxi zone that contains a given coordinate.

    Zones are tested in the row order of ``sf`` and the first zone whose
    geometry contains the point wins. Containment is delegated to the
    geometry's own ``contains`` method, which works for both Polygon and
    MultiPolygon zones, so multi-part zones are not iterated by hand
    (direct iteration over a MultiPolygon is not supported by Shapely 2.x).

            Parameters:
                    sf (geopandas.geodataframe.GeoDataFrame): Zone shapes; must provide a
                        'geometry' column and a 'location_i' column, with geometries
                        expressed in the same coordinates as ``lon``/``lat``
                    lon (float): Longitude of location
                    lat (float): Latitude of location

            Returns:
                    location_id: The 'location_i' value of the first zone containing the
                        point, or 0 if no zone contains it
    '''
    point = Point(lon, lat)
    # Walk only the two columns that are needed instead of building a full
    # row Series per zone with iterrows(); this function is called once per
    # listing, so the per-row overhead adds up.
    for location_id, taxi_zone in zip(sf['location_i'], sf['geometry']):
        # A row without a geometry cannot contain the point.
        if taxi_zone is None:
            continue
        if taxi_zone.contains(point):
            return location_id

    # No zone matched
    return 0

In [14]:
# Columns of the raw AirBnB listing file that are not used afterwards
COLS_TO_DROP = [
       'id',
       'name',
       'host_id',
       'host_name',
       'neighbourhood_group',
       'neighbourhood',
       'last_review',
       'calculated_host_listings_count',
       'availability_365'
       ]

# Read the CSV and discard the unused columns.
# 'last_review' is among the dropped columns, so it is not parsed as a date:
# the parsed values would be thrown away immediately.
# drop() raises a KeyError if any listed column is missing from the file.
airbnb_df = (
    pd.read_csv('data/nyc_airbnb_data/AirBnB_NYC_2019.csv')
    .drop(columns=COLS_TO_DROP)
)

In [None]:
# Tag every listing with the ID of the taxi zone containing its coordinates
# (get_zone returns 0 when no zone matches)
airbnb_df['ABLocationID'] = airbnb_df[['longitude', 'latitude']].apply(
    lambda coords: get_zone(sf, coords['longitude'], coords['latitude']),
    axis=1,
)

In [None]:
# create new dataframe with stats per zone

# get number of listings in each zone, ordered by zone ID.
# sort_values() returns a new frame, so the sorted result has to be kept in
# the chain; calling reset_index(inplace=True) on it would only modify a
# temporary and leave zone_df in value_counts() order.
zone_df = (
    airbnb_df['ABLocationID']
    .value_counts()
    .rename_axis('zone')
    .reset_index(name='counts')
    .sort_values(by='zone')
    .reset_index(drop=True)
)

In [None]:
# Per-zone median of price, minimum_nights, number_of_reviews and reviews_per_month
zone_airbnb_info = (
    airbnb_df
    .groupby('ABLocationID')[
        ['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month']
    ]
    .median()
)

In [None]:
# Attach the borough and zone name of each zone from the shapefile.
# Inner join: only zone IDs present on both sides are kept.
zone_airbnb_info = zone_airbnb_info.merge(
    sf[['location_i', 'borough', 'zone']],
    how='inner',
    left_on='ABLocationID',
    right_on='location_i',
)

In [None]:
# Add the number of listings per zone.
# zone_df's 'zone' column holds the zone ID; after the merge it is selected
# under the name 'zone_y' and becomes the index of the result.
zone_airbnb_info = (
    zone_airbnb_info
    .set_index('location_i')
    .merge(zone_df, left_on='location_i', right_on='zone')
    .set_index('zone_y')
)

In [None]:
# Zones whose median 'reviews_per_month' is missing get 0 instead
zone_airbnb_info = zone_airbnb_info.fillna({'reviews_per_month': 0})

In [7]:
# Round the median minimum_nights up to the next whole night and store it as an integer
zone_airbnb_info['minimum_nights'] = (
    np.ceil(zone_airbnb_info['minimum_nights']).astype(int)
)

In [11]:
# save the output
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there)
os.makedirs('processed_data', exist_ok=True)
zone_airbnb_info.to_csv('processed_data/zone_airbnb_info.csv')