In [1]:
import os
import shutil
import requests
import warnings
import pandas as pd

from bs4 import BeautifulSoup

In [2]:
# Notebook-wide settings: silence every warning raised by the libraries below
# and let pandas show all columns of a frame instead of truncating wide ones.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None

## Web Scraping

In [3]:
# Download the "get the data" page and parse it so its links can be inspected.
url = 'http://insideairbnb.com/get-the-data.html'
# a timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
r.raise_for_status()
html_doc = r.text
# name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed on this machine
soup = BeautifulSoup(html_doc, 'html.parser')

In [4]:
# Collect the href of every <a> tag that points at a Los Angeles
# 'listings.csv' or 'reviews.csv' download, skipping the 'visualisations'
# summaries. The page links to these files as gzip archives.
hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
zipped_links = [
    href
    for href in hrefs
    # anchors without an href yield None and are dropped first
    if href is not None
    and ('listings.csv' in href or 'reviews.csv' in href)
    and 'los-angeles' in href
    and 'visualisations' not in href
]

In [5]:
# function to write information from the links to their own files
def write_files(ls, directory):
    """Download each link in ``ls`` into ``directory``, skipping files that already exist.

    The local file name is built from the 4th-from-last, 3rd-from-last and
    last '/'-separated pieces of the link, joined with '_' (for the links
    collected in this notebook that is presumably <city>_<date>_<file name>).

    Parameters
    ----------
    ls : iterable of str
        URLs to download.
    directory : str
        Folder to write into; a trailing separator is optional.

    Notes
    -----
    The request is made *before* the target file is opened, so a failed or
    timed-out request no longer leaves an empty file behind. Links that
    answer with an HTTP error status are reported and skipped instead of
    having their error page saved as if it were data.
    """
    for link in ls:
        file_url_split = link.split('/')
        filename = '_'.join([file_url_split[-4], file_url_split[-3], file_url_split[-1]])
        target = os.path.join(directory, filename)

        # already downloaded on a previous run
        if os.path.isfile(target):
            continue

        # the timeout keeps one stalled download from blocking the whole run
        r = requests.get(link, timeout=60)
        if not r.ok:
            print('Skipping broken link: ' + link)
            continue

        with open(target, "wb") as f:
            f.write(r.content)

In [6]:
# implement function in script
directory = '/Users/limesncoconuts2/datasets/airbnb-web/'

# make sure the download folder exists before listing it
os.makedirs(directory, exist_ok=True)

# remove files that are 0 bytes (program timed out while they were being written previously)
for file in os.listdir(directory):
    file_path = os.path.join(directory, file)
    # only regular files are candidates; sub-folders are left alone
    if os.path.isfile(file_path) and os.path.getsize(file_path) == 0:
        os.remove(file_path)

# write_files skips every file that is already on disk, so it is safe to call
# it on every run. Comparing the number of links with the number of files in
# the folder is not reliable: stray files or previously removed broken
# downloads make the counts disagree (or agree) for the wrong reasons.
write_files(zipped_links, directory)

# reaching this line means write_files went through every link without raising
print('ALL FILES WRITTEN!')

ALL FILES WRITTEN!


In [7]:
# Some download links on the website are broken; what gets saved for them is
# not a real csv.gz archive. Anything under 1000 bytes is treated as such a
# broken download and deleted so it is excluded from the data.
for file in os.listdir(directory):
    candidate = directory + file
    is_too_small = os.path.getsize(candidate) < 1000
    if is_too_small:
        os.remove(candidate)

## Combining Data

In [8]:
def consolidate_data(city, directory, destination):
    """ Create the consolidated listings and reviews csv files for a city.

        For each kind of data ('listings', then 'reviews') checks whether
        '<city>_<kind>.csv' already exists in the destination folder.
        If it does not, the matching files in `directory` are combined
        with combine_files and the result is saved with export_csv.

        city:        city name as it appears in the downloaded file names
        directory:   folder holding the downloaded files
        destination: folder for the consolidated csv files (a trailing
                     separator is optional)
    """
    for kind in ('listings', 'reviews'):
        filename = city + '_' + kind + '.csv'
        # os.path.join matches how export_csv builds the target path, so the
        # check works whether or not destination ends with a separator
        if not os.path.isfile(os.path.join(destination, filename)):
            combined_df = combine_files(city, directory, kind)
            export_csv(filename, combined_df, destination)

In [9]:
def combine_files(city, directory, kind):
    """ Combine every file in the directory that belongs to the given
        city and kind of data.

        A file is selected when both the city string and the kind string
        occur somewhere in its name. The selected names, together with
        the directory, are handed to concat_files and its result is
        returned.
    """
    # keep names that mention both the target city and the requested kind
    matching = [
        name
        for name in os.listdir(directory)
        if city in name and kind in name
    ]
    return concat_files(matching, directory)

In [10]:
def concat_files(file_list, directory):
    """Read each named csv file into a pandas dataframe and stack them.

       The dataframes are concatenated row-wise in the order given,
       exact duplicate rows are dropped (first occurrence kept) and the
       index is reset to 0..n-1.

       file_list: names of the files inside `directory`
       directory: folder containing the files (a trailing separator is
                  optional)

       Returns the combined dataframe.
       Raises ValueError if file_list is empty.
    """
    # TODO: make the dataframes more memory-efficient (e.g. narrower dtypes)

    # fail with a clear message instead of pandas' generic
    # "No objects to concatenate" when nothing matched
    if not file_list:
        raise ValueError('No files to concatenate in ' + str(directory))

    # one dataframe per file; os.path.join tolerates a missing trailing separator
    all_dfs = [pd.read_csv(os.path.join(directory, file)) for file in file_list]

    # stack rows, drop duplicate rows, renumber the index
    return (
        pd.concat(all_dfs)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [11]:
def export_csv(filename, df, destination):
    """ Save the dataframe as `filename` inside the destination folder,
        unless a file with that name already exists there.

        The csv is written straight to its final location (without the
        index column). Writing to the working directory first and moving
        the file afterwards is avoided because a stale copy left in the
        working directory would silently block the export.

        filename:    name of the csv file to create
        df:          dataframe to export
        destination: folder to write into; created if it does not exist
    """
    target = os.path.join(destination, filename)
    # export dataframe to csv if file doesn't already exist
    if not os.path.isfile(target):
        # an empty destination means "current directory"; nothing to create
        if destination:
            os.makedirs(destination, exist_ok=True)
        df.to_csv(target, index=False)
In [12]:
# identify the directory and destination folder
# directory:   the same folder the download cell writes the scraped files into;
#              passed to consolidate_data as the input folder
# destination: folder in which the consolidated per-city csv files are looked up
#              and saved
# Both are machine-specific absolute paths. The trailing '/' matters: other
# cells build file paths from these values by plain string concatenation.
directory = '/Users/limesncoconuts2/datasets/airbnb-web/'
destination = '/Users/limesncoconuts2/datasets/airbnb/'

In [13]:
city = 'los-angeles'
# The consolidated outputs expected for this city. Consolidation runs as soon
# as one of them is found missing; if both are present nothing happens.
expected_outputs = (
    destination + city + '_listings.csv',
    destination + city + '_reviews.csv',
)
if not all(os.path.isfile(path) for path in expected_outputs):
    consolidate_data(city, directory, destination)