#### Installing and importing the necessary libraries 

In [42]:
pip install selenium beautifulsoup4 webdriver_manager

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import requests

In [44]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from urllib.parse import urlparse

#### Opening the website using ChromeDriver

In [45]:
# Folder inside the current working directory meant to be used as the browser's download directory
download_dir = os.path.join(os.getcwd(), 'Airbnb_data')
# Path to a locally downloaded chromedriver executable
# NOTE(review): the visible cells obtain the driver through webdriver_manager and never read this path — confirm it is still needed
driver_path = r'C:\Users\Trabalho\Downloads\chromedriver_win32\chromedriver.exe'

In [46]:
# The Service object starts the ChromeDriver executable; ChromeDriverManager().install()
# supplies the path of the driver binary to launch
service = Service(ChromeDriverManager().install())

# webdriver.ChromeOptions is used to define the preferences for the Chrome browser
options = webdriver.ChromeOptions()
prefs = {'download.default_directory': download_dir}  # Define the download directory
# Attach the preferences to the options; without this call Chrome never receives them
options.add_experimental_option('prefs', prefs)

# Start the Chrome WebDriver instance with the configured service and options
driver = webdriver.Chrome(service=service, options=options)

In [47]:
# Inside Airbnb page from which the download links are collected
url = 'https://insideairbnb.com/get-the-data/'
# Load that page in the Selenium-controlled browser
driver.get(url)

#### Downloading and separating the files

In [48]:
# Get the rendered HTML, then close the browser right away: the files themselves are
# fetched with requests, so the browser is not needed during the downloads and must be
# released even if something below fails
try:
    html = driver.page_source
finally:
    driver.quit()

# Analyze the HTML using BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find all the links that start with "https://data.insideairbnb.com/italy" to extract all data from Italy
links = soup.find_all('a', href=True)
download_links = [link['href'] for link in links if link['href'].startswith('https://data.insideairbnb.com/italy')]

# Create a subfolder to save all the downloaded files
os.makedirs('downloads', exist_ok=True)

# Download all the files, separating them by city
for link in download_links:
    # Split the URL path into its segments; the leading '/' makes index 0 an empty string
    parsed_url = urlparse(link)
    path_parts = parsed_url.path.split('/')
    # At least five segments are needed so that index 3 (city) and the last segment
    # (file name) are two different things
    if len(path_parts) < 5 or not path_parts[-1]:
        print(f'Skipped {link}: unexpected URL layout')
        continue
    cidade = path_parts[3]  # Index 3 has the city name
    file_name = path_parts[-1]  # File name, taken from the path so a query string can never leak into it
    folder_path = os.path.join('downloads', cidade)
    os.makedirs(folder_path, exist_ok=True)  # Create a folder for each city
    file_path = os.path.join(folder_path, file_name)
    try:
        # Stream the body to disk in chunks instead of holding whole files in memory;
        # the timeout keeps a stalled connection from hanging the notebook
        with requests.get(link, stream=True, timeout=60) as response:
            # Fail on 4xx/5xx answers instead of saving an error page as if it were data
            response.raise_for_status()
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
    except requests.RequestException as e:
        print(f'Failed to download {link}: {e}')
        # Do not leave a partially written file behind
        if os.path.exists(file_path):
            os.remove(file_path)
        continue
    print(f'Downloaded {file_name} in {cidade}')

Downloaded listings.csv.gz in bergamo
Downloaded calendar.csv.gz in bergamo
Downloaded reviews.csv.gz in bergamo
Downloaded listings.csv in bergamo
Downloaded reviews.csv in bergamo
Downloaded neighbourhoods.csv in bergamo
Downloaded neighbourhoods.geojson in bergamo
Downloaded listings.csv.gz in bologna
Downloaded calendar.csv.gz in bologna
Downloaded reviews.csv.gz in bologna
Downloaded listings.csv in bologna
Downloaded reviews.csv in bologna
Downloaded neighbourhoods.csv in bologna
Downloaded neighbourhoods.geojson in bologna
Downloaded listings.csv.gz in florence
Downloaded calendar.csv.gz in florence
Downloaded reviews.csv.gz in florence
Downloaded listings.csv in florence
Downloaded reviews.csv in florence
Downloaded neighbourhoods.csv in florence
Downloaded neighbourhoods.geojson in florence
Downloaded listings.csv.gz in milan
Downloaded calendar.csv.gz in milan
Downloaded reviews.csv.gz in milan
Downloaded listings.csv in milan
Downloaded reviews.csv in milan
Downloaded neighb

#### Unzipping the files

In [None]:
import gzip
import shutil
import logging

In [None]:
# Logging configuration: every message is reported with a timestamp and its severity level
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Path to the main folder where all the subfolders with the .gz files are.
# It is resolved against the current working directory, which is where the download step
# creates its 'downloads' folder, instead of a machine-specific absolute path.
downloads_dir = os.path.abspath('downloads')

# Maps each .gz file name to the name its unzipped copy receives. The '_2' suffix keeps the
# unzipped listings/reviews files from overwriting the plain listings.csv and reviews.csv
# that are downloaded into the same city folder.
file_mappings = {
    'calendar.csv.gz': 'calendar_2.csv',
    'listings.csv.gz': 'listings_2.csv',
    'reviews.csv.gz': 'reviews_2.csv'
}

# Function to unzip a .gz file under a new name and delete the original
def decompress_and_rename(gz_file_path, output_file_path):
    """Decompress a gzip file to ``output_file_path`` and delete the original archive.

    The data is first written to a temporary ``<output_file_path>.part`` file and only
    moved to its final name once decompression has finished, so a corrupt or truncated
    archive never leaves a partial output file behind and never damages an existing one.
    Errors are logged instead of raised, so a batch run can continue with the next file.

    Args:
        gz_file_path: Path of the .gz file to decompress.
        output_file_path: Path where the decompressed content is stored.

    Returns:
        True when the file was decompressed (even if deleting the original failed),
        False when decompression failed.
    """
    partial_path = output_file_path + '.part'
    try:
        with gzip.open(gz_file_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Move the finished file into place only after the archive was read completely
        os.replace(partial_path, output_file_path)
    except Exception as e:
        logging.error(f'Erro ao descompactar {gz_file_path}: {e}')
        # Clean up whatever was written before the failure; the original .gz is kept
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False
    logging.info(f'Descompactação bem-sucedida: {gz_file_path} para {output_file_path}')

    # Delete the original .gz file; a failure here is reported as a deletion problem,
    # not as a decompression error, because the output file is already complete
    try:
        os.remove(gz_file_path)
        logging.info(f'Arquivo original excluído: {gz_file_path}')
    except OSError as e:
        logging.error(f'Erro ao excluir {gz_file_path}: {e}')
    return True


# Visit the main folder and every subfolder below it, unzipping each file listed in file_mappings
for root, dirs, files in os.walk(downloads_dir):
    for file in files:
        # Only the file names listed in file_mappings are processed
        if file not in file_mappings:
            continue
        new_file_name = file_mappings[file]
        # The unzipped copy is written next to the archive, in the same folder
        gz_file_path = os.path.join(root, file)
        output_file_path = os.path.join(root, new_file_name)
        logging.info(f'Descompactando {gz_file_path} para {output_file_path}')
        decompress_and_rename(gz_file_path, output_file_path)