<h2>Scraping Weather Underground</h2>

Here we scrape Weather Underground for historical weather data from the Haifa Airport. They eliminated their free API long ago, and I heard that pricing now starts at about 800 USD per month, although I wasn't able to find any mention of their API anywhere, period. So I scrape.

In [4]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from io import StringIO 
from tqdm.notebook import tqdm

import datetime
import time
from datetime import datetime, timedelta
from os import listdir
from os.path import isfile, join

In [3]:
# Build the list of dates to scrape: one 'YYYY-MM-DD' string per calendar day,
# starting on 1 January 2015 and running up to and including today.

start_date = datetime(2015, 1, 1)
end_date = datetime.now()

date_list = []
current_day = start_date
while current_day <= end_date:
    date_list.append(current_day.strftime('%Y-%m-%d'))
    current_day += timedelta(days=1)

In [55]:
# Scraping approach credited to QHarr on Stack Overflow:
# https://stackoverflow.com/questions/55306320/scraping-wunderground-without-api-using-python
# The expected condition was changed from the original answer so the wait is a bit quicker
# and does not block until every table on the page has loaded (which probably included ads).

# URL prefix for the Haifa (LLHA) daily history pages; scrape_pages appends the date string to it.
base_url = 'https://www.wunderground.com/history/daily/il/haifa/LLHA/date/'

# Selenium Chrome session created once at cell execution and used by scrape_pages as a global.
driver = webdriver.Chrome()

def scrape_pages(date_list, ignore_timeouts=True, output_dir="saved_tables"):
    """Scrape the daily observation table for each date and save it as a CSV.

    Each date is appended to the module-level ``base_url`` and loaded in the
    module-level Selenium ``driver``. The function waits up to 20 seconds for
    elements with class ``observation-table`` to be present, parses each
    element's HTML with ``pandas.read_html`` and writes the first parsed
    table to ``<output_dir>/<date>.csv``. If several elements match, each one
    is written to the same file, so the last one wins.

    Parameters
    ----------
    date_list : iterable of str
        Date strings; each is used both as the URL suffix and as the CSV
        file name.
    ignore_timeouts : bool, default True
        If True, a page that raises ``TimeoutException`` is skipped silently
        and scraping continues with the next date. If False, the failing URL
        is printed and scraping stops at that date.
    output_dir : str, default "saved_tables"
        Directory the CSV files are written to. It is created if it does not
        exist yet, so the first save cannot fail on a missing directory.

    Returns
    -------
    None
    """
    # Local import so this cell does not depend on a change to the imports cell.
    import os

    # Without this, to_csv raises OSError on the very first page of a fresh run.
    os.makedirs(output_dir, exist_ok=True)

    for date in tqdm(date_list, desc="Scraping Pages"):
        # Built outside the try so the except branch can always report it.
        url = f"{base_url}{date}"
        try:
            driver.get(url)
            tables = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "observation-table"))
            )
            for table in tables:
                newTable = pd.read_html(StringIO(table.get_attribute('outerHTML')))
                if newTable:
                    newTable[0].to_csv(os.path.join(output_dir, f"{date}.csv"))
            # Pause between successful page loads; skipped when the page timed out.
            time.sleep(1)
        except TimeoutException:
            if not ignore_timeouts:
                print("Failed to scrape ", url)
                break

In [None]:
# Scrape every date in date_list. ignore_timeouts is left at its default (True),
# so pages that time out are skipped without any message.
scrape_pages(date_list)

Scraping Pages:   0%|          | 0/2639 [00:00<?, ?it/s]

<h3>Let's see which pages are missing</h3>

In [40]:
# Directory-listing idiom credited to pycruft on Stack Overflow:
# https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory

path = "saved_tables"

# Keep only the entries of `path` that are files; sub-directories are dropped.
files = list(filter(lambda entry: isfile(join(path, entry)), listdir(path)))

In [47]:
# Dates already on disk: CSV file names with the ".csv" extension removed.
# Files with any other extension are ignored instead of being blindly truncated.
scraped_dates = [filename[:-4] for filename in files if filename.endswith(".csv")]

# Dates in date_list that have no saved CSV. Sorted so the order is chronological
# (the dates are 'YYYY-MM-DD' strings) and identical on every run; a bare set
# difference comes back in arbitrary order, which makes positional slicing of
# this list unreliable.
missing_dates = sorted(set(date_list) - set(scraped_dates))

In [56]:
# Retry only the dates that have no saved CSV. With ignore_timeouts=False the run
# prints the failing URL and stops at the first timeout, so it can be checked by hand.
scrape_pages(missing_dates, ignore_timeouts=False)

Scraping Pages:   0%|          | 0/24 [00:00<?, ?it/s]

Failed to scrape  https://www.wunderground.com/history/daily/il/haifa/LLHA/date/2024-06-29


Some pages are missing because the data itself is missing on the website, so we check every timeout manually.

In [57]:
# Retry everything after the first missing date, silently skipping timeouts.
# NOTE(review): this assumes the date that failed in the previous run is
# missing_dates[0] — confirm before relying on the slice.
scrape_pages(missing_dates[1:])

Scraping Pages:   0%|          | 0/23 [00:00<?, ?it/s]