In [1]:
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from selenium import webdriver
import time
import pandas as pd

In [2]:
# Launch browser
browser = Browser('chrome') 
url = "https://www.aoml.noaa.gov/hrd/hurdat/most_intense.html"
browser.visit(url)
time.sleep(2) 

In [3]:
html = browser.html
soup = bs(html, 'html.parser')
time.sleep(2) 

hurricanes = []

# Find the hurricane data table
try:
    table = soup.find('table', class_='content').find('td', id='tdcontent')
    rows = table.find_all('tr')

    # Initialize a flag to skip the first row
    first_row = True
    
    for row in rows:
        data = row.find_all('td')
        record = [d.get_text(strip=True) for d in data]
        
        # Skip the first row
        if first_row:
            first_row = False
            continue
            
        # Check if the row contains the "Notes" section
        if any("Notes:" in cell for cell in record):
            break
        
        # Skip the row if it's empty
        if any(record):
            hurricanes.append(record)
            
except Exception as e:
    print(f"An error occurred: {e}")

# Close the browser
browser.quit()

In [4]:
# Print the extracted data
for hurricane in hurricanes:
    print(hurricane)

['Rank', '#', 'Date', 'Time', 'Latitude', 'Longitude', 'Max Winds(kt)', 'SSHWS', 'RMWnm', 'CentralPressure(mb)', 'States Affected', 'Name']
['1', '3', '9/3/1935', '0200Z', '24.8N', '80.8W', '160', '5', '5', '892', 'CFL5,BFL5', '"Labor Day"']
['2', '9', '8/18/1969', '0400Z', '30.3N', '89.4W', '150', '5', '10', '900', 'MS5,LA5,AL1', 'Camille']
['3', '4', '8/26/1992', '0905Z', '25.5N', '80.3W', '145', '5', '10', '922', 'CFL5,BFL4', 'Andrew']
['4', '14', '10/10/2018', '1730Z', '30.0N', '85.5W', '140', '5', '10', '919', 'AFL5,I-GA2', 'Michael']
['5', '1', '8/10/1856$', '1800Z', '29.2N', '91.1W', '130', '4', '10', '934', 'LA4', '"Last Island"']
['5', '5', '8/20/1886', '1300Z', '28.1N', '96.8W', '130', '4', '15', '925', 'BTX4', '"Indianola"']
['5', '2', '9/10/1919', '0700Z', '24.6N', '82.9W', '130', '4', '15', '927', 'BFL4,CFL2', '---------']
['5', '2', '8/14/1932', '0400Z', '29.0N', '95.2W', '130', '4', '10', '935', 'CTX4,BTX1', '"Freeport"']
['5', '3', '8/13/2004', '1945Z', '26.6N', '82.2W'

In [7]:
# Create hrricane dataframe
hurricane_df = pd.DataFrame(hurricanes,
                           columns = [
                           'Rank',
                            '#',
                            'Date',
                            'Time',
                            'Latitude',
                            'Longitude',
                            'Max Winds(kt)',
                            'SSHWS',
                            'RMWnm',
                            'CentralPressure(mb)',
                            'States Affected',
                            'Name'])
hurricane_df

Unnamed: 0,Rank,#,Date,Time,Latitude,Longitude,Max Winds(kt),SSHWS,RMWnm,CentralPressure(mb),States Affected,Name
0,Rank,#,Date,Time,Latitude,Longitude,Max Winds(kt),SSHWS,RMWnm,CentralPressure(mb),States Affected,Name
1,1,3,9/3/1935,0200Z,24.8N,80.8W,160,5,5,892,"CFL5,BFL5","""Labor Day"""
2,2,9,8/18/1969,0400Z,30.3N,89.4W,150,5,10,900,"MS5,LA5,AL1",Camille
3,3,4,8/26/1992,0905Z,25.5N,80.3W,145,5,10,922,"CFL5,BFL4",Andrew
4,4,14,10/10/2018,1730Z,30.0N,85.5W,140,5,10,919,"AFL5,I-GA2",Michael
...,...,...,...,...,...,...,...,...,...,...,...,...
92,11,6,9/6/1996,0030Z,33.9N,78.0W,100,3,40,954,NC3,Fran
93,11,3,8/23/1999,0000Z,26.9N,97.4W,100,3,10,951,ATX3,Bret
94,11,18,9/24/2005,0740Z,29.7N,93.7W,100,3,20,937,"LA3,CTX2",Rita
95,11,28,10/28/2020,2100Z,29.2N,90.6W,100,3,20,970,"LA3,MS2,I-AL1",Zeta


In [8]:
# Save huricane data for future use
hurricane_df.to_csv('../01_Extract/Output/hurricane_df.csv',index=False)