In [19]:
from ftplib import FTP
import xml.etree.ElementTree as ET
import pandas as pd
import re

# Connect to FTP and download the XML file.
# The `with FTP(...)` block closes the connection even when the login, the
# directory change or the transfer raises, so a failed run no longer leaves a
# dangling session behind.
with FTP('ftp.bom.gov.au') as ftp:
    ftp.login()  # no credentials supplied
    ftp.cwd('/anon/gen/fwo/')
    # Stream the remote file straight into a local file of the same name
    with open('IDN11060.xml', 'wb') as file:
        ftp.retrbinary('RETR IDN11060.xml', file.write)

# Parse the XML file
tree = ET.parse('IDN11060.xml')
root = tree.getroot()

# Prepare an empty list to store the data (one dictionary per forecast period)
data = []

# Loop through each 'area' element to get all forecast data
for area in root.findall('.//area'):
    location = area.attrib.get('description')

    # Loop through each forecast period in the location
    for period in area.findall('.//forecast-period'):
        # Every record starts with the area description and the period's
        # 'start-time-local' attribute
        forecast_data = {
            'Location': location,
            'Date': period.attrib.get('start-time-local')
        }

        # <element> children are stored under "type (units)"; a missing
        # 'units' attribute leaves empty parentheses, e.g. 'precipitation_range ()'
        for element in period.findall('element'):
            units = element.attrib.get('units', '')
            forecast_data[f"{element.attrib.get('type')} ({units})"] = element.text

        # <text> children are stored under their bare 'type' attribute
        for text in period.findall('text'):
            forecast_data[text.attrib.get('type')] = text.text

        # Append the forecast data to the list
        data.append(forecast_data)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# A column only exists when at least one forecast period supplied that
# element. Add any absent column as all-missing so the selection below does
# not raise KeyError when, for instance, no period carries a precipitation range.
required_columns = ['Location', 'Date',
                    'air_temperature_maximum (Celsius)', 'air_temperature_minimum (Celsius)',
                    'precipitation_range ()']
for column in required_columns:
    if column not in df.columns:
        df[column] = float('nan')

df = df[required_columns].copy()

# Renaming the columns to the desired format
df.columns = ['Location', 'Date', 'Temp_Max', 'Temp_Min', 'Rain']

def extract_rain_value(rain):
    """Return the upper bound of a rainfall range string as a number.

    Parameters
    ----------
    rain : str or missing value
        Text such as "0 to 2 mm". None/NaN is treated as "no rain".

    Returns
    -------
    int or float
        The largest number found in the text. Whole values come back as int
        (so "0 to 2 mm" still gives 2); fractional values come back as float.
        Returns 0 when the value is missing or contains no number.
    """
    if pd.isnull(rain):
        return 0  # If NaN, return 0

    # Match whole decimals so "0.2 to 1.5 mm" yields 0.2 and 1.5 rather than
    # the digit fragments 0, 2, 1 and 5. str() lets numeric input through
    # instead of raising TypeError inside re.findall.
    amounts = [float(token) for token in re.findall(r'\d+(?:\.\d+)?', str(rain))]
    if not amounts:
        return 0  # Default to 0 if no numbers are found

    # Take the maximum explicitly instead of relying on the larger number
    # being written last.
    upper = max(amounts)
    return int(upper) if upper.is_integer() else upper

# Apply the extraction function to the 'Rain' column
df['Rain'] = df['Rain'].apply(extract_rain_value)

# Reduce each timestamp to its leading 'YYYY-MM-DD' part before parsing.
# Parsing the full strings breaks at `.dt` when the UTC offsets differ between
# rows (e.g. a forecast week spanning a daylight-saving change), because
# pandas does not build a datetime column from mixed offsets.
# NOTE(review): assumes 'start-time-local' is ISO-8601 text starting with the
# local date, as in the saved output — confirm.
df['Date'] = pd.to_datetime(
    df['Date'].apply(lambda stamp: stamp[:10] if isinstance(stamp, str) else stamp)
).dt.strftime('%Y-%m-%d')

# Convert 'Temp_Max' and 'Temp_Min' columns to numeric, forcing errors to NaN if conversion fails
df['Temp_Max'] = pd.to_numeric(df['Temp_Max'], errors='coerce')
df['Temp_Min'] = pd.to_numeric(df['Temp_Min'], errors='coerce')

# Average gap between Temp_Max and Temp_Min over all rows where both are
# known (mean() skips NaN). No helper column is added to df for this.
avg_temp_diff = (df['Temp_Max'] - df['Temp_Min']).mean()

# Fill missing Temp_Min with Temp_Max minus the average gap. Assign the result
# back instead of calling fillna(inplace=True) on the selected column: that
# chained in-place form does not update df under pandas Copy-on-Write.
df['Temp_Min'] = df['Temp_Min'].fillna(df['Temp_Max'] - avg_temp_diff)

# Independent copy of the rows whose Location is exactly 'Sydney'
df_sydney = df[df['Location'] == 'Sydney'].copy()
# Connect to FTP and download the XML file.
# The `with FTP(...)` block closes the connection even when the login, the
# directory change or the transfer raises, so a failed run no longer leaves a
# dangling session behind.
with FTP('ftp.bom.gov.au') as ftp:
    ftp.login()  # no credentials supplied
    ftp.cwd('/anon/gen/fwo/')
    # Stream the remote file straight into a local file of the same name
    with open('IDN11050.xml', 'wb') as file:
        ftp.retrbinary('RETR IDN11050.xml', file.write)

# Load the downloaded document and keep a handle on its root node
tree = ET.parse('IDN11050.xml')
root = tree.getroot()

# Collects one dictionary per (area, forecast period) pair
data = []

for area in root.findall('.//area'):
    location = area.attrib.get('description')

    for period in area.findall('.//forecast-period'):
        # Each record opens with the area description and the period's
        # 'start-time-local' attribute
        forecast_data = {'Location': location, 'Date': period.attrib.get('start-time-local')}

        # <element> children go in under "type (units)"; units default to ''
        forecast_data.update(
            (f"{element.attrib.get('type')} ({element.attrib.get('units', '')})", element.text)
            for element in period.findall('element')
        )

        # <text> children go in under their bare 'type' attribute
        forecast_data.update(
            (text.attrib.get('type'), text.text)
            for text in period.findall('text')
        )

        data.append(forecast_data)

# Turn the collected records into a table, one row per forecast period
df_wind = pd.DataFrame(data)


# Mapping wind direction text to degrees
wind_direction_degrees = {
    'northerly': 0, 'north to northeasterly': 22.5, 'northeasterly': 45, 'east to northeasterly': 67.5,
    'easterly': 90, 'east to southeasterly': 112.5, 'southeast': 135, 'south to southeast': 157.5,
    'southerly': 180, 'south to southwesterly': 202.5, 'southwesterly': 225, 'west-southwesterly': 247.5,
    # Same bearing as 'west-southwesterly', spelled like the other "X to Y" keys
    'west to southwesterly': 247.5,
    'westerly': 270, 'west to northwesterly': 292.5, 'northwesterly': 315, 'north to northwesterly': 337.5
}

# Function to extract wind speed and direction from the 'Forecast' column
def extract_wind_info(forecast):
    """Pull the first wind direction and speed range out of a forecast text.

    Parameters
    ----------
    forecast : str or missing value
        Forecast text expected to contain a phrase such as
        "Winds southerly 15 to 25 km/h" or "becoming northerly 20 km/h".

    Returns
    -------
    tuple
        (direction_degrees, speed_min, speed_max). Speeds are the integers
        written before "km/h"; speed_max is None when only one speed is given.
        direction_degrees is None when the direction words match no key of
        wind_direction_degrees. All three are None when the text is missing
        or contains no wind phrase.
    """
    if pd.isnull(forecast):
        return None, None, None  # Handle None or NaN values

    # Regular expression to handle both "Winds ..." and "becoming ..."
    wind_info = re.search(r'(Winds|becoming)\s([a-zA-Z\s]+)\s(\d+)(?:\sto\s(\d+))?\skm/h', forecast)
    if not wind_info:
        return None, None, None

    wind_direction = wind_info.group(2).strip().lower()
    wind_speed_min = int(wind_info.group(3))
    wind_speed_max = int(wind_info.group(4)) if wind_info.group(4) else None

    # Keys are matched as substrings, so try the longest phrases first.
    # Otherwise a short key wins over the phrase that contains it: 'westerly'
    # would claim "northwesterly" and 'easterly' would claim "southeasterly".
    wind_direction_degrees_value = None  # Stays None for an unknown direction
    for direction in sorted(wind_direction_degrees, key=len, reverse=True):
        if direction in wind_direction:
            wind_direction_degrees_value = wind_direction_degrees[direction]
            break

    return wind_direction_degrees_value, wind_speed_min, wind_speed_max

# Run the extractor over every forecast text, then split the resulting
# (direction, min speed, max speed) tuples into three parallel sequences and
# store each one as its own column.
wind_tuples = df_wind['forecast'].apply(extract_wind_info)
directions, speeds_min, speeds_max = zip(*wind_tuples)
df_wind['Wind Direction (Degrees)'] = directions
df_wind['Wind Speed Min'] = speeds_min
df_wind['Wind Speed Max'] = speeds_max

def kmh_to_ms(speed_kmh):
    """Convert a speed from km/h to m/s, rounded to two decimal places.

    None is passed through unchanged so missing speeds stay missing.
    """
    if speed_kmh is None:
        return None
    metres_per_second = speed_kmh * 0.27778
    return round(metres_per_second, 2)

# Apply the conversion after extracting the wind data
df_wind['Wind Speed Min (m/s)'] = df_wind['Wind Speed Min'].apply(kmh_to_ms)
df_wind['Wind Speed Max (m/s)'] = df_wind['Wind Speed Max'].apply(kmh_to_ms)

# Reduce each timestamp to its leading 'YYYY-MM-DD' part before parsing.
# Parsing the full strings breaks at `.dt` when the UTC offsets differ between
# rows (e.g. a forecast week spanning a daylight-saving change), because
# pandas does not build a datetime column from mixed offsets.
# NOTE(review): assumes 'start-time-local' is ISO-8601 text starting with the
# local date, as in the saved output — confirm.
df_wind['Date'] = pd.to_datetime(
    df_wind['Date'].apply(lambda stamp: stamp[:10] if isinstance(stamp, str) else stamp)
).dt.strftime('%Y-%m-%d')

# Keep only the columns used from here on
df_wind = df_wind[['Location', 'Date', 'Wind Direction (Degrees)', 'Wind Speed Min (m/s)', 'Wind Speed Max (m/s)', 'forecast']]

# Sydney rows only, without 'Location' so the merge below does not duplicate it
df_wind_sydney = df_wind[df_wind['Location'] == 'Sydney'][['Date', 'Wind Direction (Degrees)', 'Wind Speed Min (m/s)', 'Wind Speed Max (m/s)', 'forecast']].copy()

# Inner join on the date string: only dates present in both tables survive
df_forecast_sydney = pd.merge(df_sydney, df_wind_sydney, on='Date', how='inner')

df_forecast_sydney = df_forecast_sydney.drop(columns=['Location'])  # Drop the "Location" column
df_forecast_sydney['Date'] = pd.to_datetime(df_forecast_sydney['Date'])  # Convert 'Date' column to datetime
df_forecast_sydney = df_forecast_sydney.set_index('Date')  # Set 'Date' as the index


In [18]:
# Preview the first rows of the wind table.
# NOTE(review): the saved output below has no 'forecast' column although the
# cell that builds df_wind keeps it, and this cell's execution count (18) is
# lower than that cell's (19) — presumably stale output; re-run to refresh.
df_wind.head()

Unnamed: 0,Location,Date,Wind Direction (Degrees),Wind Speed Min (m/s),Wind Speed Max (m/s)
0,Wollongong,2024-10-12,180.0,6.94,11.11
1,Wollongong,2024-10-13,45.0,6.94,9.72
2,Wollongong,2024-10-14,0.0,4.17,6.94
3,Wollongong,2024-10-15,180.0,5.56,8.33
4,Wollongong,2024-10-16,0.0,4.17,6.94


In [14]:
# NOTE(review): `long` is not defined in any visible cell — presumably left
# over in the kernel from a deleted or out-of-order cell. Confirm it is
# created before this line, otherwise a fresh Restart & Run All fails here.
long.head()

Unnamed: 0,Location,Date,forecast,fire_danger,uv_alert
0,Wollongong,2024-10-12T00:00:00+11:00,"Cloudy. Medium chance of showers this morning,...",Moderate,"Sun protection 9:30am to 4:00pm, UV Index pred..."
1,Wollongong,2024-10-13T00:00:00+11:00,Cloud clearing. Slight chance of a shower in t...,,
2,Wollongong,2024-10-14T00:00:00+11:00,Partly cloudy. High chance of showers. The cha...,,
3,Wollongong,2024-10-15T00:00:00+11:00,Partly cloudy. Medium chance of showers. Winds...,,
4,Wollongong,2024-10-16T00:00:00+11:00,Partly cloudy. Medium chance of showers. Light...,,


In [20]:
# Show up to the first 10 rows of the merged Sydney forecast (indexed by Date)
df_forecast_sydney.head(10)

Unnamed: 0_level_0,Temp_Max,Temp_Min,Rain,Wind Direction (Degrees),Wind Speed Min (m/s),Wind Speed Max (m/s),forecast
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-10-12,20,7.583333,1,180.0,4.17,6.94,"Cloudy morning, becoming partly cloudy this af..."
2024-10-13,22,14.0,0,45.0,5.56,8.33,"Partly cloudy morning, clearing to a mostly su..."
2024-10-14,25,14.0,10,0.0,4.17,5.56,"Partly cloudy. High chance of showers, most li..."
2024-10-15,21,16.0,7,180.0,4.17,6.94,Cloudy. High chance of showers. Winds southerl...
2024-10-16,22,13.0,3,22.5,4.17,5.56,Partly cloudy. Medium chance of showers. Light...
2024-10-17,24,15.0,1,90.0,4.17,6.94,Partly cloudy. Medium chance of showers. Light...
2024-10-18,26,16.0,10,22.5,4.17,6.94,Cloudy. High chance of showers. The chance of ...
