<b>Notebook for retrieving and processing Airbnb listing data from the Inside Airbnb site (http://insideairbnb.com/)</b>
* All rights reserved to the respective owners.
* The author of this script is not affiliated with Airbnb or any of Airbnb's competitors.
* No private information is being used. Names, photographs, listings and review details are all publicly displayed on the Airbnb site and published by Inside Airbnb.

In [None]:
import pandas as pd
import os
import datetime
import geopandas as gpd
import glob as glob
import shutil
from scripts.A_data_wrangling import listing2gdf, census2gdf,CalculateTouristIntensity,aggregate


In [None]:
# Inside Airbnb listing snapshots for Amsterdam: one per year, each taken in April.
# The urls only differ in the snapshot date, so they are built from a shared template.
_url_template = (
    "http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/"
    "{}/data/listings.csv.gz"
)
apr2019 = _url_template.format("2019-04-08")
apr2018 = _url_template.format("2018-04-07")
apr2017 = _url_template.format("2017-04-02")
apr2016 = _url_template.format("2016-04-04")
apr2015 = _url_template.format("2015-04-05")

# newest snapshot first
listings_2015_2019 = [apr2019, apr2018, apr2017, apr2016, apr2015]

In [None]:
# Start every run from an empty output folder: anything left over from a
# previous run is deleted before the folder is (re)created.
out_folder = "./output/airbnb_data"
stale_output_present = os.path.exists(out_folder)
if stale_output_present:
    shutil.rmtree(out_folder)
os.makedirs(out_folder)

#retrieve and process listing data, one snapshot (url) per iteration
for url in listings_2015_2019:

    # the snapshot date is the third-from-last path segment of the url, e.g. '2019-04-08'
    date = url.split('/')[-3]
    year = date[0:4]
    # compact form of the date (dashes removed), used in the output file names
    date_stamp = date.replace('-', '')

    print("Processing: \n" + url)

    #load airbnb data and separate features
    airbnb_gdf = listing2gdf(url)
    room = airbnb_gdf.loc[airbnb_gdf['room_type'].isin(['Private room','Shared room'])]
    entire_home = airbnb_gdf.loc[airbnb_gdf['room_type']=='Entire home/apt']
    superhost = airbnb_gdf.loc[airbnb_gdf['host_is_superhost']=='t']
    # listings available more than 30 days per year
    # NOTE(review): the name suggests a legal rental-day cap; confirm the threshold
    illegal = airbnb_gdf.loc[airbnb_gdf['availability_365']>30]

    #loading Amsterdam census data
    # reloaded for every snapshot so each year starts from an unmodified frame
    # NOTE(review): could be hoisted out of the loop if aggregate() does not modify its input; confirm
    nbh_gdf = census2gdf("./data/amsterdam_neighbourhoods.geojson")

    #aggregate airbnb features and census data
    nbh_gdf = aggregate(airbnb_gdf,nbh_gdf,room,entire_home,superhost,illegal)

    #calculate tourist intensity
    nbh_gdf = CalculateTouristIntensity(nbh_gdf,year)
    nbh_gdf['date'] = date

    #export listing and airbnb data as geojson files
    # paths are derived from out_folder so the exports always land in the folder created above
    poly_path = "{}/AirbnbPoly_{}.geojson".format(out_folder, date_stamp)
    points_path = "{}/AirbnbPoints_{}.geojson".format(out_folder, date_stamp)
    nbh_gdf.to_file(poly_path, driver="GeoJSON",encoding='utf-8')
    airbnb_gdf.to_file(points_path,driver="GeoJSON",encoding='utf-8')

    print("Done!")

In [None]:
#concatenate geojson from different years (poly)
concat_path = "./output/airbnb_data/AirbnbPoly_Concat_2015to2019.geojson"

# The combined file matches the glob pattern too, so it is skipped: otherwise
# re-running this cell would feed the previous result back in and duplicate rows.
# sorted() makes the row order reproducible (glob returns files in arbitrary order).
poly_files = sorted(
    path for path in glob.glob("./output/airbnb_data/AirbnbPoly_*")
    if os.path.basename(path) != os.path.basename(concat_path)
)
if not poly_files:
    # fail with a clear message instead of a NameError further down
    raise FileNotFoundError(
        "No yearly AirbnbPoly_* files found in ./output/airbnb_data; "
        "run the processing cell first."
    )

data_poly = []
for poly in poly_files:
    gdf = gpd.read_file(poly, driver='GeoJSON')
    pdf = pd.DataFrame(gdf)
    data_poly.append(pdf)

# concatenate once, after all files are read, instead of on every iteration
concat = pd.concat(data_poly, axis=0)
combined_gdf = gpd.GeoDataFrame(concat)
combined_gdf.to_file(concat_path, driver="GeoJSON",encoding='utf-8')
