In [1]:
import csv
import os
import requests
from bs4 import BeautifulSoup
import wget
import urllib
import gzip
import datetime
import os
import sys
import shutil
import glob

In [2]:
def create_data_dir():
    """Create the local directory structure the notebook stores data in.

    Builds ``data/``, ``data/raw/`` and ``data/processed/`` one level above
    ``sys.path[0]``.  A directory that is already present is reported with a
    printed message and left untouched, so the function can be called on
    every run.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Raises
    ------
    OSError
        If a directory cannot be created for any reason other than already
        existing (for example a missing parent directory, insufficient
        permissions, or a regular file occupying the path).
    """

    root_dir = sys.path[0] + "/../"

    data_dir = root_dir + "data/"
    raw_dir = data_dir + "raw/"
    processed_dir = data_dir + "processed/"

    # Parent first: os.mkdir only creates the final path component.
    targets = (
        (data_dir, "Data directory exists"),
        (raw_dir, "Raw data directory exists"),
        (processed_dir, "processed data directory exists"),
    )

    for path, exists_message in targets:
        try:
            os.mkdir(path)
        except FileExistsError:
            # Something that is not a directory is in the way: that is a real
            # problem, not the benign "already created" case.
            if not os.path.isdir(path):
                raise
            print(exists_message)

In [3]:
def get_date(date, url_list, city, print_option):
    """Retrieve the date with available data closest to the provided date.

    Each URL is expected to carry its snapshot date as the third-from-last
    ``/``-separated component, formatted ``YYYY-MM-DD``.

    Parameters
    ----------
    date: datetime.date or datetime.datetime
        Desired date for the data.  Only the calendar date is used; any time
        of day is ignored.
    url_list: list
        List of URLs to be parsed
    city: string
        City of the desired data; only used in the printed header
    print_option: bool
        True or False indicating whether the dates available for the city
        should be printed

    Returns
    -------
    closest_date: string
        ``YYYY-MM-DD`` string of the available date closest to ``date``.
        When two dates are equally close the earlier one is returned.
        ``None`` is returned (after printing a message) when ``date`` is not
        a date object or ``url_list`` is empty.

    Raises
    ------
    ValueError, IndexError
        If a URL does not contain a ``YYYY-MM-DD`` component in the expected
        position.
    """

    if not isinstance(date, datetime.date):
        print("date object must be datetime.date")
        return

    if len(url_list) == 0:
        print("Please provide non-empty list of urls")
        return

    # datetime.datetime is a subclass of datetime.date, but the two cannot be
    # subtracted from each other, so compare on plain calendar dates.
    target = date.date() if isinstance(date, datetime.datetime) else date

    # Unique dates data is available for, sorted so that both the printout
    # and tie-breaking are deterministic.
    available = sorted({
        datetime.datetime.strptime(url.split("/")[-3], "%Y-%m-%d")
        for url in url_list
    })

    # Print dates data is available
    if print_option:
        print(f"Data for {city} is available for:")
        print("")
        for d in available:
            print(d)

    # min() has no distance cap, so a match is found however far away the
    # nearest snapshot is; on ties it keeps the first (earliest) entry.
    closest_date = min(available, key=lambda d: abs((target - d.date()).days))

    return closest_date.strftime("%Y-%m-%d")

In [4]:
def download_data(data_urls, clear_dir=False):
    """Download the data to the local machine.

    Files are stored in ``data/raw/`` one level above ``sys.path[0]``; that
    directory must already exist.  Files whose name ends in ``.gz`` are
    decompressed next to the archive and the archive is then deleted.  A URL
    whose final file is already present is skipped, so re-running does not
    download it again.

    Parameters
    ----------
    data_urls: list
        List of urls for city at closest to desired date
    clear_dir: bool
        Boolean indicating whether the raw data should be cleared out before
        downloading.  Only files are removed; sub-directories are left in
        place.  False by default.

    Returns
    -------
    None

    Some code borrowed from the following:
    https://stackabuse.com/download-files-with-python/
    https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python
    """

    raw_dir = sys.path[0] + "/../data/raw/"

    if clear_dir:
        for entry in glob.glob(raw_dir + "*"):
            # os.remove cannot delete directories; skip them rather than fail.
            if os.path.isfile(entry) or os.path.islink(entry):
                os.remove(entry)

    for file in data_urls:
        file_name = file.split("/")[-1]
        file_path = raw_dir + file_name

        # Test the suffix, not mere containment: the 3-character slice below
        # is only correct when the name actually ends in ".gz".
        is_gzipped = file_name.endswith(".gz")
        final_path = file_path[:-3] if is_gzipped else file_path

        if not os.path.isfile(file_path):
            if os.path.isfile(final_path):
                # Already downloaded (and extracted) on an earlier run.
                continue
            wget.download(file, file_path)

        if is_gzipped:
            # The archive is deleted only after a complete extraction, so an
            # archive still on disk means extraction has to be (re)done.
            with gzip.open(file_path, 'rb') as f_in:
                with open(final_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

            os.remove(file_path)

In [5]:
# Download the Inside Airbnb get-the-data page and parse its HTML
url = "http://insideairbnb.com/get-the-data.html"
# Bound the wait so a stalled connection cannot hang the notebook
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed
soup = BeautifulSoup(response.content, "html.parser")

In [6]:
# Get all CSV urls associated with city
city = "san-francisco"
# A comprehension keeps its loop variable local, so the page `url` defined
# earlier in the notebook is not overwritten by the last link visited
data_links = [
    anchor["href"]
    for anchor in soup.find_all("a", href=True)
    if city in anchor["href"] and ".csv" in anchor["href"]
]

In [7]:
# Get links closest to selected date
date = datetime.datetime.strptime("2021-08-04", "%Y-%m-%d")
closest_date = get_date(date, data_links, city, print_option=False)
# get_date prints a message and returns None when given no links; stop with a
# clear error instead of failing on `None in link` below
if closest_date is None:
    raise ValueError(f"No dated data links found for {city}")
final_data_links = [link for link in data_links if closest_date in link]

In [8]:
# download_data writes into data/raw/ but does not create it, so make sure the
# directory structure exists first
create_data_dir()

# Download data
download_data(final_data_links)