In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


def scrape_weather_data(city, year, timeout=30):
    """
    Scrape the weather observation table from Weather Underground.

    The page at ``.../history/daily/{city}/date/{year}-1`` is downloaded and
    the first table with class ``mat-table cdk-table`` inside the
    ``lib-city-history-observation`` element is converted to a DataFrame.

    NOTE(review): the date segment is built as ``{year}-1``, which looks like
    it selects a single period rather than the whole year -- confirm against
    the site's URL scheme.

    Args:
        city (str): The city path segment inserted into the URL.
        year (int): The year inserted into the URL's date segment.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame: One row per table row, with the table's header
        cells as column names, or None if the request fails, the table
        cannot be located, or the rows do not match the headers.
    """
    url = f"https://www.wunderground.com/history/daily/{city}/date/{year}-1"

    try:
        # Without a timeout a stalled connection would hang the run forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4XX, 5XX)
    except requests.exceptions.RequestException as e:
        print(f"Error during requests to {url} : {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Look the elements up one step at a time: chaining .find() calls would
    # raise AttributeError as soon as the outer element is missing.
    container = soup.find("lib-city-history-observation")
    table = None
    if container is not None:
        table = container.find("table", class_="mat-table cdk-table")
    if table is None:
        # NOTE(review): presumably the table is rendered client-side, in
        # which case it is never present in the raw HTML -- confirm.
        print("Could not find the table containing weather data.")
        return None

    headers = [th.text.strip() for th in table.find_all("th")]

    # Fall back to the table itself when the markup has no explicit <tbody>.
    body = table.find("tbody")
    if body is None:
        body = table

    data = []
    for tr in body.find_all("tr"):
        cells = [td.text.strip() for td in tr.find_all("td")]
        if cells:  # header-only or empty rows carry no observations
            data.append(cells)

    try:
        return pd.DataFrame(data, columns=headers)
    except ValueError as e:
        # Raised when the number of cells per row differs from the headers.
        print(f"Could not build a DataFrame from the scraped table: {e}")
        return None


def main(city, start_year, end_year, delay=5):
    """
    Scrapes and combines weather data for a city across a range of years.

    Each year from ``start_year`` to ``end_year`` (inclusive) is scraped with
    ``scrape_weather_data``. The successful results are concatenated and
    written to ``{city}_weather_{start_year}_{end_year}.csv`` in the current
    working directory. Nothing is written when every year fails.

    Args:
        city (str): The name of the city to scrape data for.
        start_year (int): The first year to scrape data from.
        end_year (int): The last year to scrape data from.
        delay (float): Seconds to pause between consecutive requests.
    """
    all_data = []
    for year in range(start_year, end_year + 1):
        print(f"Scraping data for {city} in {year}...")
        df = scrape_weather_data(city, year)
        if df is not None:
            all_data.append(df)
            print(f"Successfully scraped data for {city} in {year}.")
        else:
            print(f"Failed to scrape data for {city} in {year}.")

        # Be nice to the server: pause after failures as well as successes,
        # but skip the pointless wait once the last year has been handled.
        if year < end_year:
            time.sleep(delay)

    if not all_data:
        print("No data was scraped.")
        return

    # Build the file name once so the saved path and the message cannot drift.
    output_path = f"{city}_weather_{start_year}_{end_year}.csv"
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df.to_csv(output_path, index=False)
    print(
        f"Data for {city} from {start_year} to {end_year} has been saved to {output_path}"
    )


if __name__ == "__main__":
    # Script entry point: edit the city and the inclusive year range below
    # to scrape a different location or period.
    city, start_year, end_year = "riga", 2021, 2024
    main(city, start_year, end_year)

Scraping data for riga in 2021...
Could not find the table containing weather data.
Failed to scrape data for riga in 2021.
Scraping data for riga in 2022...
Could not find the table containing weather data.
Failed to scrape data for riga in 2022.
Scraping data for riga in 2023...
Could not find the table containing weather data.
Failed to scrape data for riga in 2023.
Scraping data for riga in 2024...
Could not find the table containing weather data.
Failed to scrape data for riga in 2024.
No data was scraped.
