In [None]:
# Author: Muhammad Fathur Rizky

In [None]:
import os
import requests
import gzip
import shutil
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Scrape the Inside Airbnb "Get the Data" page and collect the unique city
# names that appear in the first column of its data tables.
# Defines BASE_URL, response, soup, tables and cities for the cells below.
BASE_URL = "https://insideairbnb.com/get-the-data/"
PAGE_TIMEOUT_S = 30  # fail instead of hanging forever if the site is unresponsive

response = requests.get(BASE_URL, timeout=PAGE_TIMEOUT_S)
response.raise_for_status()

# Parse the raw bytes, not response.text. When the server sends no charset,
# requests decodes text/* bodies as ISO-8859-1, which turns UTF-8 characters
# in the hrefs into mojibake ("sÃ¼dtirol" instead of "südtirol") and makes the
# resulting download URLs invalid. Given bytes, BeautifulSoup detects the
# document encoding itself.
soup = BeautifulSoup(response.content, 'html.parser')

tables = soup.find_all('table', class_='data')

cities = []          # unique city names, in first-seen order
seen_cities = set()  # O(1) membership test for de-duplication

for table in tables:
    for row in table.select('tbody tr'):
        cols = row.find_all('td')
        if not cols:
            continue
        city = cols[0].text.strip()
        if city and city not in seen_cities:
            seen_cities.add(city)
            cities.append(city)

print(cities)

['Trentino', 'Twin Cities MSA', 'Valencia', 'Vancouver', 'Vaud', 'Venice', 'Victoria', 'Vienna', 'Washington, D.C.', 'Western Australia', 'Winnipeg', 'Zurich', 'Ireland', 'Malta', 'New Zealand']


0

In [None]:
# Download the first few files listed for every city table and unpack any
# .gz archives next to them. Uses `tables` and `BASE_URL` from the scraping
# cell above. Files are written to ./<city>/<file name>.
ROWS_PER_TABLE = 3            # only the first rows of each table are fetched
DOWNLOAD_TIMEOUT = (10, 120)  # (connect, read) timeouts in seconds
CHUNK_SIZE = 1 << 20          # stream downloads in 1 MiB pieces


def _safe_name(text):
    """Return `text` stripped and with path separators replaced by '_'."""
    return text.strip().replace('/', '_').replace('\\', '_')


def _remove_if_exists(path):
    """Best-effort removal of a partial file; cleanup must not mask the real error."""
    try:
        os.remove(path)
    except OSError:
        pass


def _download(url, dest):
    """Stream `url` into `dest`; raises on HTTP, network or file errors."""
    with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as file_response:
        file_response.raise_for_status()
        with open(dest, 'wb') as f:
            # Streaming keeps memory flat even for very large archives.
            for chunk in file_response.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)


def _gunzip(src, dest):
    """Decompress gzip file `src` into `dest`."""
    with gzip.open(src, 'rb') as f_in, open(dest, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


failed = 0

for table in tables:
    for row in table.select('tbody tr')[:ROWS_PER_TABLE]:
        cols = row.find_all('td')
        if len(cols) < 2:
            continue

        # href=True skips anchors without a target instead of raising KeyError.
        link_tag = cols[1].find('a', href=True)
        if not link_tag:
            continue

        file_url = urljoin(BASE_URL, link_tag['href'].strip())

        # Both names come from the scraped page, so neutralise path separators
        # before using them on the local filesystem.
        safe_city = _safe_name(cols[0].text)
        filename = _safe_name(link_tag.text)
        if not safe_city or not filename:
            continue

        os.makedirs(safe_city, exist_ok=True)
        file_path = os.path.join(safe_city, filename)

        print(f"Downloading {file_url} -> {file_path}")
        try:
            _download(file_url, file_path)
        except Exception as e:
            # Deliberately best-effort: report, clean up and move on to the next file.
            print(f"Failed to download {file_url}: {e}")
            _remove_if_exists(file_path)  # do not leave a truncated archive behind
            failed += 1
            continue

        if file_path.endswith('.gz'):
            extracted_csv_path = file_path[:-3]
            print(f"Extracting {file_path} -> {extracted_csv_path}")
            try:
                _gunzip(file_path, extracted_csv_path)
                os.remove(file_path)  # archive is only dropped after a clean extraction
            except Exception as e:
                print(f"Failed to extract {file_path}: {e}")
                _remove_if_exists(extracted_csv_path)  # discard the partial CSV
                failed += 1

if failed:
    print(f"Finished with {failed} failed download(s)/extraction(s).")
else:
    print("All downloads and extractions complete.")

Downloading https://data.insideairbnb.com/italy/trentino-alto-adige-sÃ¼dtirol/trentino/2024-12-31/data/listings.csv.gz -> Trentino\listings.csv.gz
Failed to download https://data.insideairbnb.com/italy/trentino-alto-adige-sÃ¼dtirol/trentino/2024-12-31/data/listings.csv.gz: 403 Client Error: Forbidden for url: https://data.insideairbnb.com/italy/trentino-alto-adige-s%C3%83%C2%BCdtirol/trentino/2024-12-31/data/listings.csv.gz
Downloading https://data.insideairbnb.com/italy/trentino-alto-adige-sÃ¼dtirol/trentino/2024-12-31/data/calendar.csv.gz -> Trentino\calendar.csv.gz
Failed to download https://data.insideairbnb.com/italy/trentino-alto-adige-sÃ¼dtirol/trentino/2024-12-31/data/calendar.csv.gz: 403 Client Error: Forbidden for url: https://data.insideairbnb.com/italy/trentino-alto-adige-s%C3%83%C2%BCdtirol/trentino/2024-12-31/data/calendar.csv.gz
Downloading https://data.insideairbnb.com/italy/trentino-alto-adige-sÃ¼dtirol/trentino/2024-12-31/data/reviews.csv.gz -> Trentino\reviews.csv.gz