# Web Scraping

This notebook defines two functions that use Selenium and pandas to scrape historical hourly weather data from Weather Underground (wunderground.com).

#### Import libraries

In [4]:
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## Automate

### Single Station, Single Day

In [39]:
def scrape_wunderground(station_url, date, timeout=20):
    """
    Scrape the hourly observations of a specified station on a specified date.

    Parameters
    ----------
    station_url : str
        Station part of the history URL, e.g. 'são-paulo/SBSP'.
    date : str
        Date part of the history URL, e.g. '2024-8-3'.
    timeout : int or float, optional
        Maximum number of seconds to wait for the tables to appear (default 20).

    Returns
    -------
    pandas.DataFrame
        The second table on the page (hourly data), with every row that
        contains a missing value dropped.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If fewer than two tables are present after `timeout` seconds.
    """

    # Set the URL
    url = 'https://www.wunderground.com/history/daily/br/' + station_url + '/date/' + date

    def hourly_table_present(drv):
        # The hourly data is the SECOND table, so waiting for "any table" is not
        # enough: keep polling until at least two tables are in the DOM.
        found = drv.find_elements(By.CSS_SELECTOR, "table")
        return found if len(found) >= 2 else False

    # Set the driver
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Get the tables on the webpage
        tables = WebDriverWait(driver, timeout).until(hourly_table_present)

        # Copy the HTML out while the browser session is still alive
        hourly_html = tables[1].get_attribute('outerHTML')
    finally:
        # Always close the browser, even on a timeout, so no Chrome process leaks
        driver.quit()

    # Read in the hourly table as Pandas DataFrame.
    # StringIO: passing literal HTML to read_html is deprecated in recent pandas.
    df = pd.read_html(StringIO(hourly_html))[0].dropna()

    # Return DataFrame
    return df

In [40]:
# Scrape a single station for a single day
df_1 = scrape_wunderground(
    station_url='são-paulo/SBSP',
    date='2024-8-3',
)

# Preview the first rows of the hourly table
df_1.head()

Unnamed: 0,Time,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precip.,Condition
0,12:00 AM,66 °F,52 °F,60 °%,ENE,9 °mph,0 °mph,27.47 °in,0.0 °in,Fair
1,1:00 AM,66 °F,52 °F,60 °%,ENE,7 °mph,0 °mph,27.47 °in,0.0 °in,Fair
2,2:00 AM,64 °F,54 °F,68 °%,ENE,5 °mph,0 °mph,27.47 °in,0.0 °in,Fair
3,3:00 AM,63 °F,54 °F,72 °%,ENE,5 °mph,0 °mph,27.44 °in,0.0 °in,Fair
4,4:00 AM,63 °F,54 °F,72 °%,ENE,6 °mph,0 °mph,27.44 °in,0.0 °in,Fair


### Multiple Stations, Multiple Days

In [41]:
# Weather stations to scrape (the station part of the history URL)
station_names = [
    'são-paulo/SBSP',
    'guarulhos/SBGR',
]

# Dates to scrape for every station (the date part of the history URL)
dates = [
    '2024-8-3',
    '2024-8-2',
    '2024-7-24',
    '2024-2-1',
]

In [42]:
def scrape_wunderground_multiple(station_list, date_list, timeout=20):
    """
    Scrape the hourly observations of multiple stations on multiple dates.

    A single browser session is reused for every station/date combination and
    is always closed before the function returns or raises.

    Parameters
    ----------
    station_list : iterable of str
        Station parts of the history URL, e.g. ['são-paulo/SBSP', 'guarulhos/SBGR'].
    date_list : iterable of str
        Date parts of the history URL, e.g. ['2024-8-3', '2024-8-2'].
    timeout : int or float, optional
        Maximum number of seconds to wait for the tables of each page (default 20).

    Returns
    -------
    dict of str -> pandas.DataFrame
        One entry per station/date pair. The key is 'df' + the text after the
        last '/' of the station + the date with '-' replaced by '_'
        (e.g. 'dfSBSP2024_8_3'). Each value is the second table of the page
        (hourly data) with rows containing missing values dropped.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If a page shows fewer than two tables after `timeout` seconds.
    """

    def hourly_table_present(drv):
        # The hourly data is the SECOND table, so waiting for "any table" is not
        # enough: keep polling until at least two tables are in the DOM.
        found = drv.find_elements(By.CSS_SELECTOR, "table")
        return found if len(found) >= 2 else False

    # Create an empty dictionary
    dfs = {}

    # Set the driver
    driver = webdriver.Chrome()
    try:
        # For every station
        for station in station_list:

            # Station code = everything after the last '/'. Unlike slicing the
            # last four characters, this also works for codes of other lengths.
            station_code = station.rsplit('/', 1)[-1]

            # For every date
            for date in date_list:

                # Set the URL
                url = 'https://www.wunderground.com/history/daily/br/' + station + '/date/' + date
                driver.get(url)

                # Get the tables on the webpage
                tables = WebDriverWait(driver, timeout).until(hourly_table_present)

                # Read in the second table (hourly data) as Pandas DataFrame.
                # StringIO: passing literal HTML to read_html is deprecated in recent pandas.
                hourly_html = tables[1].get_attribute('outerHTML')
                df = pd.read_html(StringIO(hourly_html))[0].dropna()

                # Set a new name for each station + date
                df_name = 'df' + station_code + date.replace("-", "_")

                # Add to dictionary
                dfs[df_name] = df
    finally:
        # Always close the browser, even if a page fails, so no Chrome process leaks
        driver.quit()

    # Return the dictionary
    return dfs

In [43]:
# Scrape every station/date combination into a dictionary of DataFrames
df_2 = scrape_wunderground_multiple(
    station_list=station_names,
    date_list=dates,
)

In [46]:
# List the dictionary keys: one per station/date pair, so 2 stations x 4 dates = 8 keys expected
df_2.keys()

dict_keys(['dfSBSP2024_8_3', 'dfSBSP2024_8_2', 'dfSBSP2024_7_24', 'dfSBSP2024_2_1', 'dfSBGR2024_8_3', 'dfSBGR2024_8_2', 'dfSBGR2024_7_24', 'dfSBGR2024_2_1'])

In [47]:
# Look up the DataFrame of one station on one date by its key
# (keys are built as 'df' + station code + date with '-' replaced by '_')
df_2['dfSBGR2024_2_1']

Unnamed: 0,Time,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precip.,Condition
0,12:00 AM,66 °F,61 °F,83 °%,ENE,5 °mph,0 °mph,27.45 °in,0.0 °in,Fair
1,1:00 AM,66 °F,61 °F,83 °%,SE,3 °mph,0 °mph,27.48 °in,0.0 °in,Fair
2,2:00 AM,64 °F,63 °F,94 °%,E,9 °mph,0 °mph,27.48 °in,0.0 °in,Fair
3,3:00 AM,64 °F,63 °F,94 °%,ENE,7 °mph,0 °mph,27.48 °in,0.0 °in,Mostly Cloudy
4,3:30 AM,63 °F,63 °F,100 °%,ESE,2 °mph,0 °mph,27.48 °in,0.0 °in,Mostly Cloudy
5,4:00 AM,63 °F,63 °F,100 °%,CALM,0 °mph,0 °mph,27.48 °in,0.0 °in,Fair
6,5:00 AM,61 °F,61 °F,100 °%,W,3 °mph,0 °mph,27.48 °in,0.0 °in,Fair
7,6:00 AM,61 °F,61 °F,100 °%,CALM,0 °mph,0 °mph,27.48 °in,0.0 °in,Fair
8,7:00 AM,64 °F,63 °F,94 °%,ENE,2 °mph,0 °mph,27.51 °in,0.0 °in,Fair
9,8:00 AM,68 °F,63 °F,83 °%,SW,1 °mph,0 °mph,27.54 °in,0.0 °in,Fair
