In [2]:
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd

In [3]:
# URL to fetch data from
url = 'https://www.aoml.noaa.gov/hrd/hurdat/hurdat2.html'

# Send a GET request to fetch the data
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the first <pre> tag
    pre_tag = soup.find('pre')

    # Extract the text from the <pre> tag
    if pre_tag:
        data_between_pre_tags = pre_tag.get_text()
        
        # Save the text to a CSV file
        with open('hurricane_data.csv', 'w', encoding='utf-8') as file:
            file.write(data_between_pre_tags)

        print("Data has been saved to 'hurricane_data.csv'")
    else:
        print("No <pre> tags found in the HTML content.")
else:
    print(f"Failed to fetch data from {url}. Status code: {response.status_code}")

Data has been saved to 'hurricane_data.csv'


In [4]:
# Read the CSV file, skipping the first 7 rows and selecting every other row starting from row 8
df = pd.read_csv('hurricane_data.csv', skiprows=6, header=None, skip_blank_lines=True)

# Reset index
df.reset_index(drop=True, inplace=True)

# Display the cleaned DataFrame
df

  df = pd.read_csv('hurricane_data.csv', skiprows=6, header=None, skip_blank_lines=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,18510625,0600,,HU,28.0N,95.4W,80,-999.0,-999.0,-999.0,...,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999.0
1,18510625,1200,,HU,28.0N,96.0W,80,-999.0,-999.0,-999.0,...,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999.0
2,18510625,1800,,HU,28.1N,96.5W,80,-999.0,-999.0,-999.0,...,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999.0
3,18510625,2100,L,HU,28.2N,96.8W,80,-999.0,-999.0,-999.0,...,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999.0
4,18510626,0000,,HU,28.2N,97.0W,70,-999.0,-999.0,-999.0,...,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999,-999.0,-999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55946,20221110,1900,L,TS,29.2N,83.0W,40.0,989.0,200.0,60,...,130.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0
55947,20221111,0000,L,TS,30.1N,84.0W,35.0,992.0,200.0,60,...,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0
55948,20221111,0600,,TD,31.2N,84.6W,30.0,996.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
55949,20221111,1200,,TD,33.2N,84.6W,25.0,999.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0


In [11]:

# Convert the 'date' column to datetime format, handling errors by coercing
df[0] = pd.to_datetime(df[0], format='%Y%m%d', errors='coerce')

# Filter out data prior to the year 1924
filtered_df = df[df[0].dt.year == 1977]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
34318,1977-06-13,1200,,TD,25.0N,94.5W,20.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
34319,1977-06-13,1800,,TD,26.0N,94.8W,25.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
34320,1977-06-14,0000,,TD,27.0N,95.2W,25.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
34321,1977-06-14,0600,,TD,28.0N,95.6W,25.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
34322,1977-06-14,1200,,TD,29.0N,96.0W,25.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34501,1977-10-24,1800,,TD,26.5N,94.2W,25.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
34502,1977-10-25,0000,,TD,27.5N,92.8W,30.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
34503,1977-10-25,0600,,TD,28.5N,91.4W,30.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
34504,1977-10-25,1200,,TD,29.5N,90.0W,25.0,-999.0,-999.0,-999,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [9]:
# Create a new DataFrame by dropping columns 6 to 20
new_df = filtered_df.iloc[:, :6]  # Select columns from index 0 to 4 (columns 1-5)

# Display the new DataFrame
new_df

Unnamed: 0,0,1,2,3,4,5
33372,1975-06-24,1200,,TD,32.5N,52.0W
33373,1975-06-24,1800,,TD,32.6N,52.6W
33374,1975-06-25,0000,,TD,32.7N,53.2W
33375,1975-06-25,0600,,TD,32.8N,53.2W
33376,1975-06-25,1200,,TD,33.0N,54.5W
...,...,...,...,...,...,...
55946,2022-11-10,1900,L,TS,29.2N,83.0W
55947,2022-11-11,0000,L,TS,30.1N,84.0W
55948,2022-11-11,0600,,TD,31.2N,84.6W
55949,2022-11-11,1200,,TD,33.2N,84.6W
