In [37]:
# Import relevant libraries
from datetime import datetime
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [38]:
# Function to initialize the browser and visit the page
def initialize_browser(url, driver_name='firefox', wait_seconds=5):
    """Start a splinter ``Browser`` and open ``url`` in it.

    Parameters
    ----------
    url : str
        Address passed to ``browser.visit``.
    driver_name : str, optional
        Driver name passed to ``Browser``. Defaults to ``'firefox'``.
    wait_seconds : int or float, optional
        Fixed pause after ``visit`` to give the page time to load.
        Defaults to 5.

    Returns
    -------
    The started browser. The caller is responsible for calling ``quit()``.

    Raises
    ------
    Any exception raised by ``browser.visit`` is re-raised after the
    browser has been closed.
    """
    browser = Browser(driver_name)
    try:
        browser.visit(url)
        time.sleep(wait_seconds)  # Allow time for page load (adjust as necessary)
    except Exception:
        # Without this, a failed visit would leave an orphaned browser process
        # that nothing holds a reference to.
        browser.quit()
        raise
    return browser

In [39]:
# Function to scrape tables and extract URLs
def scrape_table_and_urls(browser, url, wait_seconds=5):
    """Load ``url`` in ``browser`` and extract the 'Storms' grid table.

    Parameters
    ----------
    browser
        Object exposing ``visit(url)`` and an ``html`` attribute
        (a splinter ``Browser`` in this notebook).
    url : str
        Page to visit.
    wait_seconds : int or float, optional
        Fixed pause after ``visit`` before the HTML is read. Defaults to 5.

    Returns
    -------
    tuple
        ``(data, year_endpoints, retired_names_endpoints, header_texts)``:

        * ``data`` - one list of stripped cell texts per row that has ``<td>`` cells.
        * ``year_endpoints`` - ``href`` of the first link in each row's first
          cell; rows without such a link contribute nothing.
        * ``retired_names_endpoints`` - per row, the list of ``href`` values of
          the links in the last cell; rows whose last cell has no links
          contribute nothing.
        * ``header_texts`` - stripped text of every ``<th>`` in the table.

    Raises
    ------
    AttributeError
        If the page contains no table with ``role="grid"`` and
        ``aria-labelledby="Storms"``. The exception type is kept as
        ``AttributeError`` because the calling cells catch exactly that.
    """
    # Visit the page and give it time to load
    browser.visit(url)
    time.sleep(wait_seconds)

    soup = BeautifulSoup(browser.html, 'html.parser')

    # Find the table (customize this selector if necessary)
    table = soup.find('table', {'role': 'grid', 'aria-labelledby': 'Storms'})
    if table is None:
        raise AttributeError(f"No 'Storms' grid table found at {url}")

    # Extract headers. Prefer the sort-header label, but fall back to the <th>
    # itself so one plain header cell does not abort the whole page. Texts are
    # stripped, like the cell texts below, so column names carry no padding.
    header_texts = []
    for header in table.find_all('th'):
        label = header.find('div', class_='mat-sort-header-content')
        source = label if label is not None else header
        header_texts.append(source.text.strip())

    data = []
    year_endpoints = []
    retired_names_endpoints = []

    # Loop through each row and extract data and URLs
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if not cols:
            # Header rows contain only <th> cells
            continue
        data.append([col.text.strip() for col in cols])

        # First cell: link to the per-year page, when present
        year_link = cols[0].find('a')
        if year_link is not None and 'href' in year_link.attrs:
            year_endpoints.append(year_link['href'])

        # Last cell: zero or more links to individual storm pages
        retired_links = cols[-1].find_all('a')
        if retired_links:
            retired_names_endpoints.append(
                [link['href'] for link in retired_links if 'href' in link.attrs]
            )

    return data, year_endpoints, retired_names_endpoints, header_texts

In [40]:
# Begin scraping process
base_url = "https://www.wunderground.com"
archive_url = f"{base_url}/hurricane/archive"

# Archive entries older than this year are not scraped
min_year = 1971

# Initialize browser (it stays open on success; a later cell calls browser.quit())
browser = Browser('firefox')

# Scrape main table and URLs
try:
    main_data, year_endpoints, retired_names_endpoints, archive_header = scrape_table_and_urls(browser, archive_url)
except Exception:
    # Nothing downstream can run without the archive table, so close the
    # browser here instead of leaving an orphaned process behind.
    browser.quit()
    raise

# Create DataFrame for the main archive table
archive_summary_df = pd.DataFrame(main_data, columns=archive_header)

# List collecting one DataFrame per scraped year
year_dataframes = []

# Parse the trailing four-digit year of every endpoint exactly once.
# Endpoints that do not end in "/YYYY" are skipped instead of crashing on
# `.group()` of a failed match.
year_pattern = re.compile(r'/(\d{4})$')
filtered_endpoints = []
years = []
for endpoint in year_endpoints:
    year_match = year_pattern.search(endpoint)
    if year_match is None:
        continue
    endpoint_year = int(year_match.group(1))
    if endpoint_year >= min_year:
        filtered_endpoints.append(endpoint)
        years.append(endpoint_year)

# Find the current year
current_year = datetime.now().year

# Latest year present in the archive listing; with no usable endpoints,
# fall back to the current year so that nothing is appended below.
latest_year = max(years, default=current_year)

# Append missing entries from the latest year up to the current year
for year in range(latest_year + 1, current_year + 1):
    filtered_endpoints.append(f'/hurricane/archive/AL/{year}')

# Print extracted URLs
print("Year Endpoints:", filtered_endpoints)

# Scrape tables from year URLs
for year_endpoint in filtered_endpoints:
    year_url = f"{base_url}{year_endpoint}"
    year = year_endpoint.split('/')[-1]
    print(f"Scraping year URL: {year_url}")

    try:
        year_data, _, _, year_header = scrape_table_and_urls(browser, year_url)

        # Convert scraped year data into a DataFrame
        year_df = pd.DataFrame(year_data, columns=year_header)
        year_df.insert(0, 'Year', year)

        # Append the DataFrame to the list
        year_dataframes.append(year_df)

    except (KeyError, AttributeError, ValueError) as e:
        # ValueError covers a header/column-count mismatch raised by
        # pd.DataFrame; one malformed page should not abort the whole run.
        print(f"Error scraping {year_url}: {e}")

# Concatenate all year data into a single DataFrame
# (pd.concat raises on an empty list, so fall back to an empty frame)
all_year_data_df = pd.concat(year_dataframes, ignore_index=True) if year_dataframes else pd.DataFrame()

Year Endpoints: ['/hurricane/archive/AL/2018', '/hurricane/archive/AL/2017', '/hurricane/archive/AL/2016', '/hurricane/archive/AL/2015', '/hurricane/archive/AL/2014', '/hurricane/archive/AL/2013', '/hurricane/archive/AL/2012', '/hurricane/archive/AL/2011', '/hurricane/archive/AL/2010', '/hurricane/archive/AL/2009', '/hurricane/archive/AL/2008', '/hurricane/archive/AL/2007', '/hurricane/archive/AL/2006', '/hurricane/archive/AL/2005', '/hurricane/archive/AL/2004', '/hurricane/archive/AL/2003', '/hurricane/archive/AL/2002', '/hurricane/archive/AL/2001', '/hurricane/archive/AL/2000', '/hurricane/archive/AL/1999', '/hurricane/archive/AL/1998', '/hurricane/archive/AL/1997', '/hurricane/archive/AL/1996', '/hurricane/archive/AL/1995', '/hurricane/archive/AL/1994', '/hurricane/archive/AL/1993', '/hurricane/archive/AL/1992', '/hurricane/archive/AL/1991', '/hurricane/archive/AL/1990', '/hurricane/archive/AL/1989', '/hurricane/archive/AL/1988', '/hurricane/archive/AL/1987', '/hurricane/archive/AL/

In [12]:
# Print extracted URLs
print("Retired Names Endpoints:", retired_names_endpoints)

# List collecting one DataFrame per retired-name storm page
retired_names_dataframes = []

# Scrape tables from retired names URLs; the browser is closed even if
# something unexpected goes wrong part-way through.
try:
    for retired_names_endpoint in retired_names_endpoints:
        for retired_endpoint in retired_names_endpoint:
            retired_url = f"{base_url}{retired_endpoint}"
            # The second-to-last path segment looks like "Hurricane-Florence".
            # Keep everything after the first hyphen, so hyphenated names are
            # not truncated and a segment without a hyphen does not raise.
            hurricane_name = retired_endpoint.split('/')[-2].split('-', 1)[-1]
            print(f"Scraping retired names URL: {retired_url} - Hurricane name: {hurricane_name}")

            try:
                retired_data, _, _, retired_header = scrape_table_and_urls(browser, retired_url)

                # Convert scraped retired names data into a DataFrame
                retired_df = pd.DataFrame(retired_data, columns=retired_header)
                retired_df.insert(0, 'Hurricane_Name', hurricane_name)

                # Append the DataFrame to the list
                retired_names_dataframes.append(retired_df)

            except (KeyError, AttributeError, ValueError) as e:
                # Report the URL that actually failed. ValueError covers a
                # header/column-count mismatch raised by pd.DataFrame.
                print(f"Error scraping {retired_url}: {e}")
finally:
    # Quit browser
    browser.quit()

# Concatenate all retired names data into a single DataFrame
# (pd.concat raises on an empty list, so fall back to an empty frame)
all_retired_names_data_df = (
    pd.concat(retired_names_dataframes, ignore_index=True)
    if retired_names_dataframes
    else pd.DataFrame()
)

Retired Names Endpoints: [['/hurricane/archive/AL/2018/Hurricane-Florence/2018242N13343', '/hurricane/archive/AL/2018/Hurricane-Michael/2018280N18273'], ['/hurricane/archive/AL/2017/Hurricane-Harvey/2017228N14314', '/hurricane/archive/AL/2017/Hurricane-Irma/2017242N16333', '/hurricane/archive/AL/2017/Hurricane-Maria/2017260N12310', '/hurricane/archive/AL/2017/Hurricane-Nate/2017277N11279'], ['/hurricane/archive/AL/2016/Hurricane-Matthew/2016273N13300', '/hurricane/archive/AL/2016/Hurricane-Otto/2016323N13279'], ['/hurricane/archive/AL/2015/Hurricane-Erika/2015237N14315', '/hurricane/archive/AL/2015/Hurricane-Joaquin/2015270N27291'], ['/hurricane/archive/AL/2013/Hurricane-Ingrid/2013255N19268'], ['/hurricane/archive/AL/2012/Hurricane-Sandy/2012296N14283'], ['/hurricane/archive/AL/2011/Hurricane-Irene/2011233N15301'], ['/hurricane/archive/AL/2010/Hurricane-Igor/2010251N14337', '/hurricane/archive/AL/2010/Hurricane-Tomas/2010302N09306'], ['/hurricane/archive/AL/2008/Hurricane-Gustav/20082

In [33]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Function to initialize the browser
def initialize_browser():
    """Start a new Selenium Firefox session and return its driver.

    Note: this shares its name with the splinter-based
    ``initialize_browser(url)`` defined earlier in the notebook; whichever
    definition was executed most recently is the one in effect.
    """
    firefox_driver = webdriver.Firefox()
    return firefox_driver

# List to capture clicked URLs
clicked_urls = []

# Loop through the desired years
for year in range(2019, 2024):
    url = f'https://www.wunderground.com/hurricane/archive/AL/{year}'  # Base URL

    # Initialize the Selenium WebDriver
    driver = initialize_browser()

    # try/finally guarantees the browser is closed even when loading or
    # parsing the listing page fails.
    try:
        # Open the page
        driver.get(url)

        # Allow time for JavaScript to load the table
        time.sleep(5)

        # Get the HTML of the fully rendered page
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Now try finding the table and URLs
        table = soup.find('table', {'role': 'grid', 'aria-labelledby': 'Storms'})
        if table is None:
            continue  # the finally clause below still closes the browser

        print(year)

        # Loop through the rows and click each storm's link to learn its URL
        for row in table.find_all('tr'):
            cols = row.find_all('td')
            if not cols:
                continue

            hurricane_name = cols[0].text.strip()
            print(f"\t{hurricane_name}")  # Modify based on what you find

            try:
                # Remember where we are so the navigation can be detected
                listing_url = driver.current_url

                # Find and click the link for the hurricane
                link = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.LINK_TEXT, hurricane_name))
                )
                link.click()

                # Wait for the navigation to happen; reading current_url
                # straight after click() can still return the listing URL.
                WebDriverWait(driver, 10).until(EC.url_changes(listing_url))

                # Capture the resulting URL
                current_url = driver.current_url
                clicked_urls.append(current_url)  # Append the URL to the list
                print(f"The URL is: {current_url}")

            except Exception as e:
                print(f"An error occurred: {e}")

            finally:
                # Restart on the listing page for the next row. This runs after
                # failures too, because a failed click may have left the driver
                # on a page where the next link cannot be found.
                driver.quit()
                driver = initialize_browser()
                driver.get(url)

                # Allow time for the page to fully load again
                time.sleep(5)

    finally:
        # Close the browser after processing each year
        driver.quit()

# Output the collected URLs
print("Collected URLs:")
for url in clicked_urls:
    print(url)


2019
	ANDREA
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Andrea/2019141N28291
	BARRY
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Barry/2019192N29274
	CHANTAL
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Chantal/2019233N40300
	DORIAN
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Dorian/2019236N10314
	ERIN
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Erin/2019239N32287
	FERNAND
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Fernand/2019246N24266
	GABRIELLE
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Gabrielle/2019246N16330
	HUMBERTO
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Humberto/2019256N23286
	IMELDA
The URL is: https://www.wunderground.com/hurricane/archive/AL/2019/Hurricane-Imelda/2019261N28264
	JERRY
The URL is: https://www.wu

In [45]:
# Initialize browser
browser = Browser('firefox')

# List collecting one DataFrame per named-storm page
named_dataframes = []

# The browser is closed even if something unexpected goes wrong part-way through.
try:
    for named_url in clicked_urls:

        # Second-to-last URL segment, e.g. "Hurricane-Andrea"
        hurricane_name = named_url.split('/')[-2]

        try:
            named_data, _, _, named_header = scrape_table_and_urls(browser, named_url)

            # Convert scraped named-storm data into a DataFrame
            named_df = pd.DataFrame(named_data, columns=named_header)
            named_df.insert(0, 'Hurricane_Name', hurricane_name)

            # Append the DataFrame to the list
            named_dataframes.append(named_df)

        except (KeyError, AttributeError, ValueError) as e:
            # Report the URL that actually failed (not variables left over
            # from other cells). ValueError covers a header/column-count
            # mismatch raised by pd.DataFrame.
            print(f"Error scraping {named_url}: {e}")
finally:
    browser.quit()

# Concatenate all named-storm data into a single DataFrame
# (pd.concat raises on an empty list, so fall back to an empty frame)
all_named_data_df = pd.concat(named_dataframes, ignore_index=True) if named_dataframes else pd.DataFrame()

In [68]:
# archive_summary_df.to_csv('data/hurricane_archive_summary.csv', index=False)
# all_year_data_df.to_csv('data/hurricane_summary.csv', index=False)
# all_retired_names_data_df.to_csv('data/hurricane_details_retired.csv', index=False)
# all_named_data_df.to_csv('data/hurricane_details.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7917 entries, 0 to 7916
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Hurricane_Name  7917 non-null   object
 1   Date            7917 non-null   object
 2    Time           7917 non-null   object
 3    Lat            7917 non-null   object
 4    Lon            7917 non-null   object
 5    Wind(mph)      7917 non-null   object
 6    Pressure(mb)   7917 non-null   object
 7    Storm Type     7917 non-null   object
dtypes: object(8)
memory usage: 494.9+ KB
