# Airbnb Web Scraping

### Sources

InsideAirbnb.com

### Approach

This web-scraping notebook downloads the data files for every available city and organizes them into folders.

For the sake of this project I will run the scraper for "San Francisco" only. However, if it were run on the full list of links, it would download every csv and gzip file, decompress the gzip files and save them as csvs, and create the correct folder for each file and place it there (data/country/city/).

In [15]:
import pandas as pd
import glob as glob
import requests
import urllib
from bs4 import BeautifulSoup

# 1. Inside Airbnb

## a. Get All Download Links

In [16]:
def url_to_links_array(url):
    """Return the ``href`` target of every anchor tag found on a web page.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of ``href`` strings in document order. Anchors that have no
        ``href`` attribute are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    r = requests.get(url, timeout=30)
    # Fail loudly rather than scraping links out of an error page.
    r.raise_for_status()

    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser happens to be installed, so results vary by machine.
    soup = BeautifulSoup(r.content, "html.parser")

    # href=True restricts the search to anchors that actually carry an href,
    # which replaces the previous catch-everything try/except.
    return [a['href'] for a in soup.find_all('a', href=True)]

In [17]:
# Collect every hyperlink found on the Inside Airbnb "get the data" page.
url = "http://insideairbnb.com/get-the-data.html"
airbnb_full_links = url_to_links_array(url)

## b. Organize in Table

In [18]:
def links_array_to_link_descriptions_dataframe(links_array):
    """Tabulate Inside Airbnb data-download links, keeping the newest snapshot.

    Only links that start with ``http://data.insideairbnb.com/`` and have the
    layout ``<prefix><country>/<region>/<city>/<date>/<folder>/<file_name>``
    are kept; anything else is ignored.

    Args:
        links_array: Iterable of URL strings (may be empty).

    Returns:
        A DataFrame with the columns ``link``, ``country``, ``region``,
        ``city``, ``date`` and ``file_name``, holding one row per link whose
        ``date`` is the most recent one available for that
        country/region/city/file_name combination.
    """
    data_prefix = "http://data.insideairbnb.com/"

    # Build the frame from a dict so that an empty input still yields a
    # 'link' column instead of a length-mismatch error.
    links_df = pd.DataFrame(
        {'link': pd.Series(list(links_array), dtype='object')}
    )

    # Literal prefix match: the dots in the host name must not be treated as
    # regex wildcards, and the positional parsing below assumes the prefix is
    # at the very start of the URL.
    is_data_link = links_df['link'].str.startswith(data_prefix, na=False)
    links_df = links_df[is_data_link.astype(bool)].reset_index(drop=True)

    # Split each URL once and reuse the pieces for every derived column.
    parts = links_df['link'].str.split('/')

    # A URL needs at least 9 pieces for index 8 (the file name) to exist;
    # shorter ones are dropped instead of raising IndexError.
    well_formed = parts.str.len() >= 9
    links_df = links_df[well_formed].reset_index(drop=True)
    parts = parts[well_formed].reset_index(drop=True)

    # Position of each field in the '/'-split URL.
    field_positions = (
        ('country', 3),
        ('region', 4),
        ('city', 5),
        ('date', 6),
        ('file_name', 8),
    )
    for column, position in field_positions:
        links_df[column] = parts.str[position]

    if links_df.empty:
        return links_df

    # Dates are ISO formatted (YYYY-MM-DD), so the string maximum is the most
    # recent date. Group on the full location so that two cities sharing a
    # name in different countries/regions do not evict each other.
    location_and_file = ['country', 'region', 'city', 'file_name']
    most_recent_links = (
        links_df.groupby(location_and_file)[['date']].max().reset_index()
    )

    # Inner merge on the shared columns keeps only the most recent rows.
    recent_links_df = links_df.merge(most_recent_links)

    return recent_links_df

In [19]:
# Keep only the data-download links and tabulate them (country, region, city,
# date, file name), retaining the most recent date for each file.
airbnb_data_links = links_array_to_link_descriptions_dataframe(airbnb_full_links)

In [20]:
# Preview the first rows of the links table.
airbnb_data_links.head()

Unnamed: 0,link,country,region,city,date,file_name
0,http://data.insideairbnb.com/the-netherlands/n...,the-netherlands,north-holland,amsterdam,2019-03-07,listings.csv.gz
1,http://data.insideairbnb.com/the-netherlands/n...,the-netherlands,north-holland,amsterdam,2019-03-07,calendar.csv.gz
2,http://data.insideairbnb.com/the-netherlands/n...,the-netherlands,north-holland,amsterdam,2019-03-07,reviews.csv.gz
3,http://data.insideairbnb.com/the-netherlands/n...,the-netherlands,north-holland,amsterdam,2019-03-07,listings.csv
4,http://data.insideairbnb.com/the-netherlands/n...,the-netherlands,north-holland,amsterdam,2019-03-07,reviews.csv


## c. Download Files

In [21]:
# (rows, columns) of the links table.
airbnb_data_links.shape

(637, 6)

In [22]:
import urllib
import gzip
import io
import os

def _save_url_to_file(url, destination):
    """Stream ``url`` to ``destination`` unless that file already exists."""
    if os.path.exists(destination):
        return

    # Write to a temporary name first so an interrupted download is never
    # mistaken for a finished file on the next run.
    partial_path = destination + ".part"
    response = requests.get(url, timeout=30, stream=True)
    try:
        response.raise_for_status()
        with open(partial_path, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=1 << 16):
                out_file.write(chunk)
    finally:
        response.close()
    os.rename(partial_path, destination)


def download_files(dataset):
    """Download every file listed in ``dataset`` into ``data/<country>/<city>/``.

    Handling depends on the file name:

    * ``*.gz``  - downloaded, decompressed, parsed as CSV and written to
      ``<name>_full.csv`` (re-downloaded on every call).
    * ``*.csv`` - saved as ``<name>_sample.csv`` if not already present.
    * anything else - saved under its original name if not already present.

    ``<name>`` is the part of the file name before its first dot.

    A failure on one file is reported with ``print`` and does not stop the
    remaining downloads.

    Args:
        dataset: DataFrame with the columns ``link``, ``country``, ``city``
            and ``file_name``.

    Returns:
        None. The function is used for its side effects on disk.
    """
    # itertuples avoids label lookups, so duplicate index labels are harmless.
    for row in dataset.itertuples(index=False):
        download_link = row.link
        file_type = row.file_name

        # Directory
        directory = 'data/' + row.country + '/' + row.city + '/'
        if not os.path.exists(directory):
            os.makedirs(directory)

        try:
            file_name = file_type.split('.')[0]

            # Process differently depending on file type
            if file_type.endswith(".gz"):
                r = requests.get(download_link, timeout=30)
                # Without this an HTML error page would be fed to gzip.
                r.raise_for_status()

                with gzip.GzipFile(fileobj=io.BytesIO(r.content)) as fh:
                    # Read unzipped csv to DataFrame
                    dataframe = pd.read_csv(fh, low_memory=False)
                dataframe.to_csv(directory + file_name + "_full.csv")

            elif file_type.endswith(".csv"):
                _save_url_to_file(
                    download_link, directory + file_name + "_sample.csv"
                )
            else:
                _save_url_to_file(download_link, directory + file_type)
        except Exception as err:
            # Best effort: report the failure and carry on with the next file.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            print("Failed to download %s: %s" % (download_link, err))

### Only Select San Francisco Files

In [23]:
# Keep only the rows whose city is 'san-francisco', then preview them.
san_francisco_links = airbnb_data_links[airbnb_data_links.city == 'san-francisco']

san_francisco_links.head()

Unnamed: 0,link,country,region,city,date,file_name
476,http://data.insideairbnb.com/united-states/ca/...,united-states,ca,san-francisco,2019-03-06,listings.csv.gz
477,http://data.insideairbnb.com/united-states/ca/...,united-states,ca,san-francisco,2019-03-06,calendar.csv.gz
478,http://data.insideairbnb.com/united-states/ca/...,united-states,ca,san-francisco,2019-03-06,reviews.csv.gz
479,http://data.insideairbnb.com/united-states/ca/...,united-states,ca,san-francisco,2019-03-06,listings.csv
480,http://data.insideairbnb.com/united-states/ca/...,united-states,ca,san-francisco,2019-03-06,reviews.csv


In [24]:
# (rows, columns) of the San Francisco subset.
san_francisco_links.shape

(7, 6)

In [25]:
# Download the San Francisco files into data/<country>/<city>/.
download_files(san_francisco_links)

## Now To Clean The Data