### This imports the required libraries

In [1]:
#import kaggle

import pandas as pd

# This is for pulling from Google Drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# For some file navigation to download the files to the correct spots
import os
from pathlib import Path
import zipfile

class color:
    """ANSI escape sequences for styling text printed to a terminal.

    Prepend one or more attributes to a string and append ``color.END``
    afterwards to reset the formatting.
    """

    # Text styles
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colors
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Reset all styles and colors
    END = '\033[0m'

# On windows, if using the pip install method, you must: 
#    manually download GDAL and Fiona packages from https://www.lfd.uci.edu/~gohlke/pythonlibs/
#    go to directory where they are downloaded, pip install <filename>.whl
#    after that, 'pip install geopandas' works normally.
import geopandas as gpd

# import requests
# from bs4 import BeautifulSoup

# This downloads the kaggle data set (Not used as far as I know)

In [None]:
'''

#Kaggle data api
kaggle.api.authenticate()

#download the Kaggle Dataset
kaggle.api.dataset_download_files('dgomonov/new-york-city-airbnb-open-data', path='C:\\Users\\darie\\Desktop\\Python Practice\\Airbnb-Price-Prediction\\data\\new-york-city-airbnb-open-data', unzip=True)

'''

# This goes to the Inside Airbnb website (insideairbnb.com) and pulls the HTML from the data page. #
**From there it finds the new-york-city table and gets the URLs for the first 7 data files (6 CSVs and 1 GeoJSON); anything past that is historical data.**

In [None]:
# get the latest csv file urls for the data sets

''' COMMENTED OUT TO AVOID ACCIDENTALLY RUNNING IT; SITE GOT OVERLOADED BY DOWNLOADS; SO ONLY DO THIS IF NECESSARY

# To do this, I go to the website that lists all the downloads ...

airbnb_data_url = 'http://insideairbnb.com/get-the-data.html'
soup = BeautifulSoup(requests.get(airbnb_data_url).text)

# Find the table with the new york city data sets ...
new_york_table = soup.find_all('table',{'class':'new-york-city'})

# Get the first 7 rows in the table (excluding the header row) ...
csv_files = new_york_table[0].find_all('tr')[1:8]

'''

**Once we have the file URLs, we can import them using pandas (or geopandas for the geojson file).**

In [None]:
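# Read each file into its own variable, depending on its URL
# Note: the .gz files are gzip-compressed, so we need to decompress them when we read them in.
# Note: this cell is disabled; before re-enabling it, add `pass` to the two branches that contain only a comment.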
'''

for x in csv_files:
    href = x.find('a')['href']
    print(href)
    
    if 'data/listings.csv.gz' in href:
        airbnb_listings_data = pd.read_csv(href, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False)
        
    elif 'data/calendar.csv.gz' in href:
        airbnb_calendar_data = pd.read_csv(href, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False)
        
    elif 'data/reviews.csv.gz' in href:
        airbnb_reviews_data = pd.read_csv(href, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False)
        
    elif 'visualisations/listings.csv' in href:
        #airbnb_listings_visual = pd.read_csv(href)
        
    elif 'visualisations/reviews.csv' in href:
        #airbnb_reviews_visual = pd.read_csv(href)
        
    elif 'visualisations/neighbourhoods.csv' in href:
        airbnb_neighbourhoods_visual = pd.read_csv(href)
        
    elif 'visualisations/neighbourhoods.geojson' in href:
        airbnb_neighbourhoods_visual_json = gpd.read_file(href)
        
'''

# Pull files from Google Drive #

In [None]:
# created this to find the main Airbnb folder
def GoToParentFolder(folder_name):
    """Find an ancestor of the current working directory by its trailing name.

    Starting at ``os.getcwd()``, steps up one directory at a time until the
    path string ends with ``folder_name``. Each parent path visited is
    printed. The working directory itself is not changed.

    Args:
        folder_name: Text the target path must end with, normally the
            directory name (e.g. ``'Airbnb-Price-Prediction'``). An empty
            string matches immediately and returns the current directory.

    Returns:
        The matching directory path as a string.

    Raises:
        FileNotFoundError: If the filesystem root is reached without
            finding a path that ends with ``folder_name``.
    """
    start = Path(os.getcwd())
    path = start

    while not str(path).endswith(folder_name):
        parent = path.parent

        # The root directory is its own parent; without this check the loop
        # would spin forever when no ancestor matches.
        if parent == path:
            raise FileNotFoundError(
                "No folder ending with '{}' was found above '{}'".format(folder_name, start)
            )

        path = parent
        print(str(path))

    return str(path)

In [None]:
# Locate the project root and build the path of the folder that holds the data files
airbnb_main_directory = GoToParentFolder('Airbnb-Price-Prediction')

data_folder = os.path.join(airbnb_main_directory, 'data', 'Google Drive Docs')

# Remember whether the folder had to be created; the file-presence check
# further down treats a freshly created folder as "data must be downloaded".
missing_folder = not os.path.exists(data_folder)

if missing_folder:
    # makedirs also creates the intermediate 'data' directory; os.mkdir raised
    # FileNotFoundError when that directory did not exist yet.
    os.makedirs(data_folder)

# Change working directory to the data folder
os.chdir(data_folder)

In [None]:
# Main data files
listings_path = os.path.join(data_folder, 'listings.csv.gz')
reviews_path = os.path.join(data_folder, 'reviews.csv.gz')
calendar_path = os.path.join(data_folder, 'calendar.csv.gz')

# Supplementary files
neighbourhoods_geo_path = os.path.join(data_folder, 'neighbourhoods.geojson')
neighbourhoods_csv_path = os.path.join(data_folder, 'neighbourhoods.csv')
federal_holidays_path = os.path.join(data_folder, 'federal-holidays-usa-19662020.zip')

# Every file that has to be present locally before the data can be loaded
expected_paths = (
    listings_path,
    reviews_path,
    calendar_path,
    neighbourhoods_geo_path,
    neighbourhoods_csv_path,
    federal_holidays_path,
)

# A download is needed when any expected file is absent or the data folder
# had to be created
need_to_pull_data = (
    any(not os.path.exists(file_path) for file_path in expected_paths)
    or missing_folder
)

In [None]:
# Default to keeping files that already exist locally. Without this default,
# override_download was never assigned when the download is forced below, and
# the download cell raised NameError for any file that was already present.
override_download = "N"

if need_to_pull_data:
    # At least one required file is missing, so downloading is not optional
    print("One or more missing files don't appear to be downloaded. The next few cells will download them...")
    drive_download = "Y"

else:
    # Everything is already present; let the user decide whether to refresh.
    # strip() tolerates stray whitespace around the typed answer.
    drive_download = input("Would you like to download the data files from Google Drive? Y/N").strip().upper()

    if drive_download == "Y":
        override_download = input("Where files already exist on your machine, would you like to overwrite them? Y/N").strip().upper()
    else:
        print("Files will NOT be downloaded...")

In [None]:
if drive_download == "Y":
    # Do Google authorization to access the Google Drive.
    # This requires setting up a Google OAuth client ID key for Google Drive (https://console.developers.google.com/)
    #     then downloading and adding the client_secrets.json to the Google Drive Docs folder created in the above cell

    # 1. Authorization is needed to reach the shared Drive folder
    #    (the original note mentions Georgetown's Google Drive - TODO confirm the access restriction)
    try:
        # The authorization calls are inside the try block as well: they are
        # the steps that fail when client_secrets.json is missing.
        gauth = GoogleAuth()
        gauth.LocalWebserverAuth()
        drive = GoogleDrive(gauth)
    except Exception:
        print("You must download client_secrets.json from https://console.developers.google.com/ "
              "and add it to the 'Google Drive Docs' folder before files can be pulled from Google Drive.")
        # Re-raise so the failure is visible here instead of surfacing later
        # as an undefined 'drive' in the download cell.
        raise

In [None]:
if drive_download == "Y":
    # 2. List every file in the shared Drive folder using the query syntax
    #    https://developers.google.com/drive/v2/web/search-parameters
    file_list = drive.ListFile(
        {'q': "'1wKkNN17AUxKU0dCYxmu_iUBe89QuUOQ4' in parents"}).GetList()  #use your own folder ID here

    for f in file_list:
        # 3. Create & download by id.
        fname = f['title']
        print('\n title: %s, id: %s' % (fname, f['id']))

        # Use one explicit target path for both the existence check and the
        # download, so the result does not depend on the current working directory.
        local_path = os.path.join(data_folder, fname)

        if not os.path.exists(local_path) or override_download == "Y":
            try:
                print("downloading '{}'...".format(fname))
                f_ = drive.CreateFile({'id': f['id']})
                f_.GetContentFile(local_path)

            except Exception as err:
                # Catch Exception rather than using a bare except, so Ctrl+C
                # can still abort a long download; keep going with the other
                # files and report why this one failed.
                print(color.BOLD + color.RED + "** '{}' Failed to download".format(fname) + color.END)
                print("   reason: {}".format(err))

        else:
            print(" ** '{}' already exists. Nothing was done.".format(fname))

In [None]:
# 4. Load CSVs into pandas data frames

# Parser options shared by the gzip-compressed CSV exports.
# on_bad_lines='warn' replaces error_bad_lines=False (removed in pandas 2.0)
# and keeps the same behavior: malformed rows are skipped with a warning.
gzip_csv_options = dict(compression='gzip', header=0, sep=',', quotechar='"', on_bad_lines='warn')

# Main data files
listings_data = pd.read_csv(listings_path, **gzip_csv_options)
reviews_data = pd.read_csv(reviews_path, **gzip_csv_options)
calendar_data = pd.read_csv(calendar_path, **gzip_csv_options)

# Supplementary files
neighbourhoods_geo_data = gpd.read_file(neighbourhoods_geo_path)
neighbourhoods_csv_data = pd.read_csv(neighbourhoods_csv_path)

# Read the holidays CSV straight out of the zip archive; the context managers
# close both the archive and the member handle once the frame is loaded.
with zipfile.ZipFile(federal_holidays_path) as zf:
    with zf.open('usholidays.csv') as holidays_file:
        federal_holidays_data = pd.read_csv(holidays_file)

In [None]:
#Just checking that they are all imported correctly

'''

print(listings_data.head(10))
print('\n+++++++++++++++++++++++++++++++++++++++++++++\n')

print(reviews_data.head(10))
print('\n+++++++++++++++++++++++++++++++++++++++++++++\n')

print(calendar_data.head(10))
print('\n+++++++++++++++++++++++++++++++++++++++++++++\n')

# supplimentary files
print(neighbourhoods_geo_data.head(10))
print('\n+++++++++++++++++++++++++++++++++++++++++++++\n')

print(neighbourhoods_csv_data.head(10))
print('\n+++++++++++++++++++++++++++++++++++++++++++++\n')

print(federal_holidays_data.head(10))
print('\n+++++++++++++++++++++++++++++++++++++++++++++\n')

'''