## Airbnb Scraper
Author: Dan Segal ([@djsegal](https://github.com/djsegal))

#### Import needed functionality 

In [None]:
import requests
import datetime
import gzip
import os

import pandas as pd

from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import clean_data

from importlib import reload
reload(clean_data)

import time
import re

import numpy as np

import matplotlib.pyplot as plt

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


#### Download zipped data files

In [None]:
# Parse the locally saved listing page. The context manager closes the HTML
# file handle as soon as BeautifulSoup has consumed it.
with open("data/airbnb/airbnb.html") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

rentals_table = soup.find("table", {"id": "js-detailed-table"})
rental_rows = rentals_table.find("tbody").find_all("tr")

for row in tqdm(rental_rows):
    cols = row.find_all("td")
    row_date, row_link = cols[0].text, cols[1].find("a")["href"]

    # Re-format the table's date (e.g. "05 March, 2019") so it can be embedded
    # in the local file name.
    work_date = datetime.datetime.strptime(row_date, '%d %B, %Y')
    date_string = work_date.strftime("%Y_%m_%d")

    # Split only on the first dot so multi-part extensions stay together.
    file_name, file_type = row_link.split("/")[-1].split(".", 1)
    cur_file = f"{file_name}_{date_string}.{file_type}"

    # Files already on disk are not downloaded again.
    abs_file_path = f'data/airbnb/zipped_data/{cur_file}'
    if os.path.exists(abs_file_path):
        continue

    cur_request = requests.get(row_link, allow_redirects=True)
    # Fail loudly on an HTTP error: otherwise the error body would be saved
    # under the data file's name and skipped by the exists-check on re-runs.
    cur_request.raise_for_status()

    with open(abs_file_path, 'wb') as out_file:
        out_file.write(cur_request.content)
    

#### Prune rental files

In [None]:
# Convert every downloaded .csv.gz snapshot into a cleaned, date-stamped CSV.
for filename in tqdm(sorted(os.listdir("data/airbnb/zipped_data"))):
    if not filename.endswith(".csv.gz"):
        continue

    cur_file = f'data/airbnb/zipped_data/{filename}'

    # Read through the gzip handle itself and release it right after parsing.
    with gzip.open(cur_file, 'rb') as f:
        file_data = pd.read_csv(f, low_memory=False)

    file_data = clean_data.clean_airbnb_data(file_data)

    # The download step embedded a YYYY_MM_DD date in the file name; recover
    # it and snap it to the first day of that month.
    work_date = datetime.datetime.strptime(
        re.search(r"\d{4}_\d{2}_\d{2}", filename)[0], '%Y_%m_%d'
    ).replace(day=1)

    # Stamp the rows with the month *before* the file's date, rolling January
    # back to December of the previous year.
    # NOTE(review): presumably a snapshot describes the preceding month - confirm.
    if work_date.month == 1:
        work_date = work_date.replace(month=12, year=work_date.year - 1)
    else:
        work_date = work_date.replace(month=work_date.month - 1)

    file_data["date"] = work_date
    file_data["month"] = work_date.month
    file_data["year"] = work_date.year

    csv_file = f'data/airbnb/csv_data/{filename.replace(".gz", "")}'
    file_data.to_csv(csv_file, index=False)
        

#### Compile Airbnb database

In [None]:
# Load every cleaned monthly CSV, then stack them in one pass. Collecting the
# frames first avoids re-copying the growing table on every iteration and does
# not rely on DataFrame.append, which newer pandas releases no longer provide.
monthly_frames = []

for filename in tqdm(sorted(os.listdir("data/airbnb/csv_data"))):
    if not filename.endswith(".csv"):
        continue

    cur_file = f'data/airbnb/csv_data/{filename}'
    monthly_frames.append(pd.read_csv(cur_file, low_memory=False))

if not monthly_frames:
    raise FileNotFoundError("no .csv files found in data/airbnb/csv_data")

# Row indices are kept as read from each file (no ignore_index), matching what
# repeated appends produced.
airbnb_data = pd.concat(monthly_frames)

airbnb_data.sort_values(["neighbourhood_group", "id", "date"], ascending=False, inplace=True)

# Snapshot of the compiled table, restored by a later cell.
backup_data_1 = airbnb_data.copy()


#### Test database uniformity

In [None]:
# Exact column layout, in order, that the compiled table must have.
test_columns = [
    'id',
    'neighbourhood',
    'neighbourhood_group',
    'latitude',
    'longitude',
    'property_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'price',
    'number_of_reviews',
    'review_scores_rating',
    'date',
    'month',
    'year',
]

# Stop here if the compiled columns differ from the expected layout.
assert airbnb_data.columns.tolist() == test_columns

# Display the compiled table as the cell output.
airbnb_data

#### Remove bad data

In [None]:
# Start again from a copy of the snapshot saved as backup_data_1
# (presumably so the filtering below can be re-run from unfiltered data).
airbnb_data = backup_data_1.copy()


In [None]:
# Ids rejected by each criterion; every listed id is dropped from the table.
novice_ids = []       # fewer than 3 rows
unreviewed_ids = []   # never reaches 5 reviews
bad_ids = []          # first-row rating below 50
moved_ids = []        # appears in more than one neighbourhood_group
slow_ids = []         # review count grows by less than 0.16 per row
geo_ids = []          # coordinates drift from the listing's mean position

# groupby visits ids in ascending order and keeps each group's rows in table
# order, so this is equivalent to filtering the table once per sorted id, but
# it scans the table a single time.
for unique_id, sub_data in tqdm(airbnb_data.groupby("id")):
    row_count = len(sub_data)

    if row_count < 3:
        novice_ids.append(unique_id)
    if len(sub_data.neighbourhood_group.unique()) > 1:
        moved_ids.append(unique_id)

    max_reviews = sub_data.number_of_reviews.max()
    min_reviews = sub_data.number_of_reviews.min()

    if sub_data.review_scores_rating.iloc[0] < 50:
        bad_ids.append(unique_id)
    if max_reviews < 5:
        unreviewed_ids.append(unique_id)

    # Average number of reviews gained per row of this listing.
    cur_rate = (max_reviews - min_reviews) / row_count
    if cur_rate < 0.16:
        # Record this listing's id (not the whole id list).
        slow_ids.append(unique_id)

    # Largest relative deviation of each coordinate from the listing's mean.
    lat_err = (1 - sub_data["latitude"] / sub_data["latitude"].mean()).abs().max()
    lon_err = (1 - sub_data["longitude"] / sub_data["longitude"].mean()).abs().max()
    if lat_err > 1e-5 or lon_err > 5e-5:
        geo_ids.append(unique_id)

old_count = len(airbnb_data)
for rejected_ids in (novice_ids, unreviewed_ids, bad_ids, moved_ids, slow_ids, geo_ids):
    airbnb_data = airbnb_data[~airbnb_data.id.isin(rejected_ids)]
new_count = len(airbnb_data)

# Short pause before printing (presumably lets the progress bar finish
# rendering so the summary is not interleaved with it).
time.sleep(0.5)

print(f"beg: {old_count}")
print(f"novice: {len(novice_ids)}")
print(f"unreviewed: {len(unreviewed_ids)}")
print(f"bad: {len(bad_ids)}")
print(f"moved: {len(moved_ids)}")
print(f"slow: {len(slow_ids)}")
print(f"geo: {len(geo_ids)}")
print(f"end: {new_count}")


#### Clean up data

In [None]:
# Work on an explicit copy so the column assignments below are not made on a
# frame pandas may still be tracking as a slice of an earlier one.
airbnb_data = airbnb_data.copy()

airbnb_data["month_count"] = np.nan

# Collect one summary record per listing. groupby keeps each group's rows in
# table order, so iloc[0] is the listing's first row in the current sort order.
rental_summaries = {}

for unique_id, sub_data in tqdm(airbnb_data.groupby("id")):
    rental_summaries[unique_id] = {
        "property_type": sub_data.property_type.iloc[0],
        # Most frequent coordinate; ties are averaged.
        "latitude": np.mean(sub_data.latitude.mode()),
        "longitude": np.mean(sub_data.longitude.mode()),
        # Number of rows this listing has in the table.
        "month_count": len(sub_data),
        "neighbourhood": sub_data.neighbourhood.iloc[0],
        "number_of_reviews": sub_data.number_of_reviews.iloc[0],
        "review_scores_rating": sub_data.review_scores_rating.iloc[0],
    }

# Broadcast each per-listing value onto all of that listing's rows with one id
# lookup per column, instead of one boolean-mask assignment per id and column.
summary_table = pd.DataFrame.from_dict(rental_summaries, orient="index")
for cur_key in summary_table.columns:
    airbnb_data[cur_key] = airbnb_data["id"].map(summary_table[cur_key])

# The column was created as NaN (float); keep that dtype after filling it.
airbnb_data["month_count"] = airbnb_data["month_count"].astype(float)

airbnb_data.dropna(subset=["review_scores_rating"], inplace=True)

airbnb_data.sort_values(["neighbourhood_group", "month_count", "id", "date"], ascending=False, inplace=True)

# Display the cleaned table as the cell output.
airbnb_data

### Add new columns

In [None]:
# Log-scaled versions of the review count and the price.
airbnb_data["log_reviews"] = np.log10(airbnb_data.number_of_reviews)
airbnb_data["log_price"] = np.log10(airbnb_data.price)

# Year expressed as an offset from 2015, plus its square.
airbnb_data["years_since_2015"] = airbnb_data.year - 2015
airbnb_data["years_since_2015_squared"] = airbnb_data.years_since_2015 ** 2

# Room ratios.
airbnb_data["bedrooms_per_bathrooms"] = airbnb_data.bedrooms / airbnb_data.bathrooms
airbnb_data["log_beds_per_bedrooms"] = np.log10(airbnb_data.beds / airbnb_data.bedrooms)

# Squared terms. The first squares the log ratio defined just above, which is
# the column its name refers to.
airbnb_data["log_beds_per_bedrooms_squared"] = airbnb_data.log_beds_per_bedrooms ** 2
airbnb_data["accommodates_squared"] = airbnb_data.accommodates ** 2
airbnb_data["month_squared"] = airbnb_data.month ** 2

# Season flags: June-September count as summer, November-February as winter.
airbnb_data["is_summer"] = airbnb_data.month.isin([6, 7, 8, 9])
airbnb_data["is_winter"] = airbnb_data.month.isin([1, 2, 11, 12])

# One boolean flag per calendar month.
# NOTE(review): no is_july flag is created (it was commented out in the
# original cell) - presumably July is the reference month; confirm before
# adding it.
month_flags = {
    "is_january": 1,
    "is_february": 2,
    "is_march": 3,
    "is_april": 4,
    "is_may": 5,
    "is_june": 6,
    "is_august": 8,
    "is_september": 9,
    "is_october": 10,
    "is_november": 11,
    "is_december": 12,
}

for flag_name, month_number in month_flags.items():
    airbnb_data[flag_name] = airbnb_data.month == month_number


### Finalize initial dataset

In [None]:
# Put the columns in alphabetical order and renumber the rows from zero.
airbnb_data.sort_index(axis=1, inplace=True)
airbnb_data.reset_index(drop=True, inplace=True)

# Columns to show first, in this order; all other columns follow alphabetically.
reordered_columns = [
    "id", "neighbourhood_group", "neighbourhood",
    "price", "number_of_reviews", "review_scores_rating",
    "property_type", "month_count", "date", "year"
]

# A missing leading column is an error rather than a silently created empty
# column (reindex would otherwise add it filled with NaN).
missing_columns = [
    column for column in reordered_columns if column not in airbnb_data.columns
]
if missing_columns:
    raise ValueError(f"columns missing from airbnb_data: {missing_columns}")

data_columns = reordered_columns + [
    column for column in airbnb_data.columns if column not in reordered_columns
]

# Reindex once with the final ordering.
airbnb_data = airbnb_data.reindex(columns=data_columns)

# Snapshot of the finished table, restored by a later cell.
backup_data_2 = airbnb_data.copy()


### Print database summaries

In [None]:
# Start again from a copy of the snapshot saved as backup_data_2
# (presumably so the cells below can be re-run on the finalized data).
airbnb_data = backup_data_2.copy()


In [None]:
# Show the dataset's column names as a plain list.
airbnb_data.columns.tolist()


In [None]:
# Ascending list of the distinct listing ids; the cell displays how many there are.
unique_ids = sorted(airbnb_data["id"].unique())
len(unique_ids)


In [None]:
# Display pandas' descriptive-statistics table for the dataset.
airbnb_data.describe()


In [None]:
# Print pandas' structural summary of the dataset.
airbnb_data.info()


### Plot useful information

In [None]:
%matplotlib inline

cur_latitudes = [cur_item[0] for cur_item in airbnb_data.groupby("id").latitude.unique()]
cur_longitudes = [cur_item[0] for cur_item in airbnb_data.groupby("id").longitude.unique()]

plt.scatter(cur_longitudes, cur_latitudes,alpha=0.01);
plt.axes().set_aspect('equal', 'datalim')

plt.xlabel("longitude")
plt.ylabel("latitude")


In [None]:
def rental_plot(id_index, borough):
    """Plot one listing's price over time.

    The listing is the ``id_index``-th distinct id found among the rows of
    ``airbnb_data`` whose ``neighbourhood_group`` equals ``borough``. Each of
    its rows is drawn at the first day of that row's year/month, as a line
    with markers, and the y-axis runs from 0 to 110% of the highest price.
    """
    in_borough = airbnb_data.neighbourhood_group == borough
    chosen_id = airbnb_data[in_borough].id.unique()[id_index]

    listing = airbnb_data[(airbnb_data.id == chosen_id) & in_borough]

    # One x position per row: the first day of the row's month.
    month_starts = []
    for row_year, row_month in zip(listing.year, listing.month):
        month_starts.append(datetime.datetime(year=row_year, month=row_month, day=1))

    prices = listing.price

    plt.plot(month_starts, prices)
    plt.scatter(month_starts, prices)

    # Anchor the y-axis at zero and leave 10% headroom above the peak price.
    plt.ylim(0, 1.1 * prices.max())
    

In [None]:
%matplotlib inline

interact(
    rental_plot, 
    borough=fixed("Manhattan"), 
    id_index=widgets.IntSlider(
        min=0, step=1, value=0, 
        max=len(airbnb_data[airbnb_data.neighbourhood_group == "Manhattan"].id.unique())-1
    )
);


In [None]:
%matplotlib inline

interact(
    rental_plot, 
    borough=fixed("Brooklyn"), 
    id_index=widgets.IntSlider(
        min=0, step=1, value=0, 
        max=len(airbnb_data[airbnb_data.neighbourhood_group == "Brooklyn"].id.unique())-1
    )
);


In [None]:
%matplotlib notebook

plt.subplot(3,4,1)
plt.hist(airbnb_data.log_price);
plt.title("log price")
plt.yticks([], [])

plt.subplot(3,4,2)
plt.hist(airbnb_data.log_reviews);
plt.title("log reviews")
plt.yticks([], [])

plt.subplot(3,4,3)
plt.hist(airbnb_data.month_count);
plt.title("month count")
plt.yticks([], [])

plt.subplot(3,4,4)
plt.hist(airbnb_data.year);
plt.title("year")
plt.yticks([], [])

plt.subplot(3,4,5)
plt.hist(airbnb_data.bathrooms);
plt.ylabel("bathrooms")
plt.yticks([], [])

plt.subplot(3,4,6)
plt.hist(airbnb_data.bedrooms);
plt.ylabel("bedrooms")
plt.yticks([], [])

plt.subplot(3,4,7)
plt.hist(airbnb_data.beds);
plt.ylabel("beds")
plt.yticks([], [])

plt.subplot(3,4,8)
plt.hist(airbnb_data.month);
plt.ylabel("month")
plt.yticks([], [])

plt.subplot(3,4,9)
plt.hist(airbnb_data.log_beds_per_bedrooms);
plt.xlabel("log_beds_per_rooms")
plt.yticks([], [])

plt.subplot(3,4,10)
plt.hist(airbnb_data.bedrooms_per_bathrooms);
plt.xlabel("bed_per_bath")
plt.yticks([], [])

plt.subplot(3,4,11)
plt.hist(airbnb_data.latitude);
plt.xlabel("latitude")
plt.yticks([], [])

plt.subplot(3,4,12)
plt.hist(airbnb_data.longitude);
plt.xlabel("longitude")
plt.yticks([], []);
