In [26]:
import requests
import numpy as np
import json
import pandas as pd
from datetime import datetime, timedelta
import time
from ftplib import FTP
import xml.etree.ElementTree as ET
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
import re
import streamlit as st

# Define the base URL for the POST request
url = 'https://data.airquality.nsw.gov.au/api/Data/get_Observations'

# Set start and end dates dynamically
start_date = datetime.strptime("2015-04-01", "%Y-%m-%d")
end_date = datetime.today()  # End date is today
max_chunk_days = 112  # Maximum number of days per API call
retries = 3  # Retry attempts for failed iterations

# Function to make the API request and process the data
def process_iteration(start_date_str, end_date_str):
    payload = {
        "Parameters": ["PM10", "PM2.5", "CO", "NH3", "NO", "NO2", "SO2", "OZONE", "TSPd", "RAIN", 
                       "SOLAR", "TEMP", "SD1", "WDR", "WSP", "Humid", "NEPH"],
        "Sites": [39],  # List of site IDs
        "StartDate": start_date_str,
        "EndDate": end_date_str,
        "Categories": ["Averages"],
        "SubCategories": ["Hourly"],
        "Frequency": ["Hourly Average"]
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, headers=headers, data=json.dumps(payload))

    if response.status_code == 200:
        data = response.json()
        return pd.DataFrame(data)
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")

# Fetch Air Quality Data dynamically with automatic chunking
def fetch_air_quality_data():
    global start_date, end_date  # Use global to modify the start_date/end_date if necessary
    data_frames = []
    
    # Continue fetching data until start_date exceeds end_date
    while start_date < end_date:
        start_date_str = start_date.strftime("%Y-%m-%d")
        end_date_chunk = start_date + timedelta(days=max_chunk_days)
        
        # Ensure end_date_chunk doesn't exceed today's date
        if end_date_chunk > end_date:
            end_date_chunk = end_date
        
        end_date_str = end_date_chunk.strftime("%Y-%m-%d")
        attempt = 0
        
        while attempt < retries:
            try:
                # Process each iteration for the specified date chunk
                df = process_iteration(start_date_str, end_date_str)
                data_frames.append(df)
                print(f"Data fetched from {start_date_str} to {end_date_str}")
                break
            except Exception as e:
                attempt += 1
                print(f"Error fetching data: {e}. Retrying...")
                time.sleep(5)

        # Update start_date to be the day after the current chunk
        start_date = end_date_chunk + timedelta(days=1)
        time.sleep(2)  # Sleep to avoid hitting rate limits
    
    # Combine all the fetched data into a single DataFrame
    combined_df = pd.concat(data_frames, ignore_index=True)
    combined_df = combined_df.infer_objects()  # Ensure data types are inferred correctly
    print(f"Air quality data collection completed at {datetime.now()}")
    combined_df.to_csv('combined_df.csv')
    return combined_df

# Process air quality data to daily aggregation
def process_air_quality_data(df):
    df['ParameterCode'] = df['Parameter'].apply(lambda x: x.get('ParameterCode') if isinstance(x, dict) else None)
    df['ParameterDescription'] = df['Parameter'].apply(lambda x: x.get('ParameterDescription') if isinstance(x, dict) else None)

    df_wide = df.pivot_table(index=['Site_Id', 'Date', 'Hour', 'HourDescription'],
                             columns='ParameterCode', 
                             values='Value', 
                             aggfunc='first').reset_index()

    df_wide.interpolate(method='linear', axis=0, inplace=True)
    df_wide['datetime'] = pd.to_datetime(df_wide['Date']) + pd.to_timedelta(df_wide['Hour'], unit='h')
    df_wide.set_index('datetime', inplace=True)

    aggregation_rules_mean = {
        'CO': 'mean', 'HUMID': 'mean', 'NEPH': 'mean', 'NO': 'mean', 'NO2': 'mean', 'OZONE': 'mean', 
        'SO2': 'mean', 'PM10': 'mean', 'PM2.5': 'mean', 'RAIN': 'sum', 'TEMP': ['min', 'max'], 
        'WSP': 'max', 'SD1': 'mean', 'WDR': 'mean'
    }
    daily_aggregated_mean = df_wide.resample('D').agg(aggregation_rules_mean)
    daily_aggregated_mean.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in daily_aggregated_mean.columns]
    daily_aggregated_mean.interpolate(method='linear', inplace=True)

    return daily_aggregated_mean

# Process traffic data
def process_traffic_data():
    df = pd.read_csv('/Users/evankerivan/Desktop/IOD/capstone/traffic_victoria_road.csv')
    
    df_melted = pd.melt(df, 
                        id_vars=['year', 'date', 'cardinal_direction_seq', 'classification_seq', 'public_holiday', 'school_holiday'],
                        value_vars=[f'hour_{str(i).zfill(2)}' for i in range(24)],
                        var_name='hour', 
                        value_name='traffic_count')

    df_melted['hour'] = df_melted['hour'].str.replace('hour_', '').astype(int)
    df_pivoted = df_melted.pivot_table(index=['year', 'date', 'hour', 'public_holiday', 'school_holiday'],
                                       columns='classification_seq', 
                                       values='traffic_count').reset_index()

    df_pivoted.rename(columns={'Heavy Vehicles': 'heavy_vehicle', 'Light Vehicles': 'light_vehicle'}, inplace=True)
    df_clean = df_pivoted.dropna()

    df_clean.loc[:, 'datetime'] = pd.to_datetime(df_clean['date']) + pd.to_timedelta(df_clean['hour'], unit='h')
    df_clean.set_index('datetime', inplace=True)

    daily_traffic_data = df_clean.resample('D').agg({
        'public_holiday': 'max', 'school_holiday': 'max', 'heavy_vehicle': 'sum', 'light_vehicle': 'sum'
    })

    return daily_traffic_data

# LSTM to impute missing traffic data and predict 7 days ahead
def lstm_traffic_imputation(daily_traffic_data):
    # Handle missing values before scaling
    daily_traffic_data = daily_traffic_data.interpolate().fillna(method='bfill').fillna(method='ffill')

    # Scale the traffic data
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(daily_traffic_data[['heavy_vehicle', 'light_vehicle']])

    # Prepare sequences (7-day sequences for LSTM)
    def create_sequences(data, seq_length):
        X, y = [], []
        for i in range(seq_length, len(data)):
            X.append(data[i-seq_length:i])
            y.append(data[i])
        return np.array(X), np.array(y)

    sequence_length = 7
    X, y = create_sequences(scaled_data, sequence_length)

    # Split the data into training and testing
    split_index = int(len(X) * 0.8)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=(sequence_length, X.shape[2])))
    model.add(LSTM(64))
    model.add(Dense(2))  # Predict heavy_vehicle and light_vehicle
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # Predict missing traffic data for 7 days
    last_sequence = X_test[-1]
    predictions = []
    for _ in range(7):  # Predict 7 days ahead
        pred = model.predict(np.expand_dims(last_sequence, axis=0))
        predictions.append(pred[0])
        last_sequence = np.vstack([last_sequence[1:], pred[0]])

    # Inverse transform the predictions
    predictions = scaler.inverse_transform(predictions)
    return pd.DataFrame(predictions, columns=['heavy_vehicle', 'light_vehicle'], index=pd.date_range(start=daily_traffic_data.index[-1], periods=7))

# Feature selection using RFE


# Function for feature selection with missing value handling
def feature_selection(X, y):
    # Fill missing values (NaNs) in X and y using interpolation or drop NaNs
    X = X.interpolate().fillna(method='bfill').fillna(method='ffill')  # Interpolate, then backfill and forward fill
    y = y.interpolate().fillna(method='bfill').fillna(method='ffill')  # Interpolate, then backfill and forward fill
    
    # Alternatively, you can use dropna if you want to remove rows with NaN values
    # X = X.dropna()
    # y = y.dropna()

    # RandomForest for feature selection
    model = RandomForestRegressor()
    rfe = RFE(model, n_features_to_select=5)  # Adjust n_features_to_select as needed
    fit = rfe.fit(X, y)

    # Print and return selected features
    selected_features = [f for f, selected in zip(X.columns, fit.support_) if selected]
    print(f"Selected Features: {selected_features}")
    return selected_features


# Create LSTM model with parameters for grid search
def create_lstm_model(units=64, optimizer='adam'):
    model = Sequential()
    model.add(LSTM(units, return_sequences=True, input_shape=(7, len(selected_features))))
    model.add(LSTM(units))
    model.add(Dense(len(target_columns)))  # Output air quality features
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Grid search for LSTM model parameters
def grid_search_lstm(X_train, y_train):
    model = KerasRegressor(build_fn=create_lstm_model, verbose=0)
    param_grid = {
        'units': [32, 64, 128],
        'batch_size': [16, 32, 64],
        'epochs': [10, 20],
        'optimizer': ['adam', 'rmsprop']
    }
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=TimeSeriesSplit(n_splits=3))
    grid_result = grid.fit(X_train, y_train)
    print(f"Best Params: {grid_result.best_params_}")
    return grid_result.best_estimator_

# LSTM for air quality prediction using weather, traffic, and air quality data
def lstm_air_quality_prediction(combined_data):
    # Handle missing values before scaling
    combined_data = combined_data.interpolate().fillna(method='bfill').fillna(method='ffill')

    # Define features and target columns
    features = ['TEMP_max', 'TEMP_min', 'RAIN_sum', 'WSP_max', 'WDR_mean', 'heavy_vehicle', 'light_vehicle']  # Add other features if needed
    target_columns = ['PM10_mean', 'PM2.5_mean', 'CO_mean', 'NO_mean', 'SO2_mean']

    # Perform feature selection
    X = combined_data[features]
    y = combined_data[target_columns]
    selected_features = feature_selection(X, y)

    # Scale the selected features and target
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X[selected_features])
    y_scaled = scaler.fit_transform(y)

    # Prepare sequences (7-day sequences for LSTM)
    def create_sequences(data, seq_length):
        X, y = [], []
        for i in range(seq_length, len(data)):
            X.append(data[i-seq_length:i])
            y.append(data[i])
        return np.array(X), np.array(y)

    sequence_length = 7
    X_seq, y_seq = create_sequences(X_scaled, sequence_length)

    # Split the data into training and testing
    split_index = int(len(X_seq) * 0.8)
    X_train, X_test = X_seq[:split_index], X_seq[split_index:]
    y_train, y_test = y_seq[:split_index], y_seq[split_index:]

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=(sequence_length, X_train.shape[2])))
    model.add(LSTM(64))
    model.add(Dense(y_train.shape[1]))  # Output for all target variables
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # Predict air quality data for 7 days
    last_sequence = X_test[-1]
    predictions = []
    for _ in range(7):  # Predict 7 days ahead
        pred = model.predict(np.expand_dims(last_sequence, axis=0))
        predictions.append(pred[0])
        last_sequence = np.vstack([last_sequence[1:], pred[0]])

    # Inverse transform the predictions
    predictions = scaler.inverse_transform(predictions)
    return pd.DataFrame(predictions, columns=target_columns, index=pd.date_range(start=combined_data.index[-1], periods=7))

# Process weather forecast data
def process_weather_forecast():
    # Connect to FTP and download the XML file
    ftp = FTP('ftp.bom.gov.au')
    ftp.login()
    ftp.cwd('/anon/gen/fwo/')
    with open('IDN11060.xml', 'wb') as file:
        ftp.retrbinary('RETR IDN11060.xml', file.write)
    ftp.quit()

    # Parse the XML file
    tree = ET.parse('IDN11060.xml')
    root = tree.getroot()

    # Prepare an empty list to store the data
    data = []
    for area in root.findall('.//area'):
        location = area.attrib.get('description')
        for period in area.findall('.//forecast-period'):
            start_time = period.attrib.get('start-time-local')
            forecast_data = {'Location': location, 'Date': start_time}
            for element in period.findall('element'):
                param_type = element.attrib.get('type')
                value = element.text
                units = element.attrib.get('units', '')
                forecast_data[f'{param_type} ({units})'] = value
            for text in period.findall('text'):
                text_type = text.attrib.get('type')
                text_value = text.text
                forecast_data[text_type] = text_value
            data.append(forecast_data)

    df = pd.DataFrame(data)
    df = df[['Location', 'Date', 'air_temperature_maximum (Celsius)', 'air_temperature_minimum (Celsius)', 'precipitation_range ()']].copy()

    df.columns = ['Location', 'Date', 'Temp_Max', 'Temp_Min', 'Rain']
    df['Rain'] = df['Rain'].apply(lambda rain: max(map(int, re.findall(r'\d+', rain))) if pd.notna(rain) else 0)
    df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
    df['Temp_Max'] = pd.to_numeric(df['Temp_Max'], errors='coerce')
    df['Temp_Min'] = pd.to_numeric(df['Temp_Min'], errors='coerce')

    avg_temp_diff = df['Temp_Max'] - df['Temp_Min']
    avg_temp_diff = avg_temp_diff.mean()
    df['Temp_Min'].fillna(df['Temp_Max'] - avg_temp_diff, inplace=True)
    
    return df[df['Location'] == 'Sydney'][['Date', 'Temp_Max', 'Temp_Min', 'Rain']].copy()

# Combine all data (traffic, weather, and air quality)
def combine_all_data(daily_air_quality, daily_traffic, daily_forecast):
    # Convert 'Date' column in daily_forecast to datetime to ensure uniformity in index
    daily_forecast['Date'] = pd.to_datetime(daily_forecast['Date'])
    
    # Set 'Date' as the index for daily_forecast if it's not already
    daily_forecast = daily_forecast.set_index('Date')
    
    # Combine all the data using outer join
    combined_data = pd.concat([daily_air_quality, daily_traffic, daily_forecast], axis=1, join='outer')
    
    # Ensure the index is in datetime format
    combined_data.index = pd.to_datetime(combined_data.index, errors='coerce')

    # Sort the index after ensuring it's all datetime
    combined_data = combined_data.sort_index()
    
    return combined_data

# Main workflow
def main():
    # Fetch and process data
    air_quality_df = fetch_air_quality_data()
    daily_air_quality = process_air_quality_data(air_quality_df)
    daily_traffic = process_traffic_data()
    daily_forecast = process_weather_forecast()

    # Combine all data
    combined_data = combine_all_data(daily_air_quality, daily_traffic, daily_forecast)

    # LSTM traffic imputation for the next 7 days
    traffic_predictions = lstm_traffic_imputation(daily_traffic)

    # LSTM air quality prediction for the next 7 days
    air_quality_predictions = lstm_air_quality_prediction(combined_data)

    combined_predictions = pd.concat([traffic_predictions, air_quality_predictions], axis=1)
    print("Combined predictions for the next 7 days:")
    print(combined_predictions)

    return combined_predictions

# Run the main workflow
predictions = main()



KeyboardInterrupt: 

In [21]:
combined_df=pd.read_csv('combined_df.csv')

In [19]:
combined_df.head()

Unnamed: 0.1,Unnamed: 0,Site_Id,Parameter,Date,Hour,HourDescription,Value,AirQualityCategory,DeterminingPollutant
0,0,39,"{'ParameterCode': 'CO', 'ParameterDescription'...",2015-04-01,1,12 am - 1 am,0.754289,,
1,1,39,"{'ParameterCode': 'HUMID', 'ParameterDescripti...",2015-04-01,1,12 am - 1 am,94.199,,
2,2,39,"{'ParameterCode': 'NEPH', 'ParameterDescriptio...",2015-04-01,1,12 am - 1 am,0.232,,
3,3,39,"{'ParameterCode': 'NO', 'ParameterDescription'...",2015-04-01,1,12 am - 1 am,6.026604,,
4,4,39,"{'ParameterCode': 'NO2', 'ParameterDescription...",2015-04-01,1,12 am - 1 am,1.959503,GOOD,


In [22]:

# Define the base URL for the POST request
url = 'https://data.airquality.nsw.gov.au/api/Data/get_Observations'

# Set start and end dates dynamically
start_date = datetime.strptime("2015-04-01", "%Y-%m-%d")
end_date = datetime.today()  # End date is today
max_chunk_days = 112  # Maximum number of days per API call
retries = 3  # Retry attempts for failed iterations

# Function to make the API request and process the data
def process_iteration(start_date_str, end_date_str):
    payload = {
        "Parameters": ["PM10", "PM2.5", "CO", "NH3", "NO", "NO2", "SO2", "OZONE", "TSPd", "RAIN", 
                       "SOLAR", "TEMP", "SD1", "WDR", "WSP", "Humid", "NEPH"],
        "Sites": [39],  # List of site IDs
        "StartDate": start_date_str,
        "EndDate": end_date_str,
        "Categories": ["Averages"],
        "SubCategories": ["Hourly"],
        "Frequency": ["Hourly Average"]
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, headers=headers, data=json.dumps(payload))

    if response.status_code == 200:
        data = response.json()
        return pd.DataFrame(data)
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")

# Fetch Air Quality Data dynamically with automatic chunking
def fetch_air_quality_data():
    global start_date, end_date  # Use global to modify the start_date/end_date if necessary
    data_frames = []
    
    # Continue fetching data until start_date exceeds end_date
    while start_date < end_date:
        start_date_str = start_date.strftime("%Y-%m-%d")
        end_date_chunk = start_date + timedelta(days=max_chunk_days)
        
        # Ensure end_date_chunk doesn't exceed today's date
        if end_date_chunk > end_date:
            end_date_chunk = end_date
        
        end_date_str = end_date_chunk.strftime("%Y-%m-%d")
        attempt = 0
        
        while attempt < retries:
            try:
                # Process each iteration for the specified date chunk
                df = process_iteration(start_date_str, end_date_str)
                data_frames.append(df)
                print(f"Data fetched from {start_date_str} to {end_date_str}")
                break
            except Exception as e:
                attempt += 1
                print(f"Error fetching data: {e}. Retrying...")
                time.sleep(5)

        # Update start_date to be the day after the current chunk
        start_date = end_date_chunk + timedelta(days=1)
        time.sleep(2)  # Sleep to avoid hitting rate limits
    
    # Combine all the fetched data into a single DataFrame
    combined_df = pd.concat(data_frames, ignore_index=True)
    combined_df = combined_df.infer_objects()  # Ensure data types are inferred correctly
    print(f"Air quality data collection completed at {datetime.now()}")
    combined_df.to_csv('combined_df.csv')
    return combined_df

In [27]:
import requests
import numpy as np
import json
import pandas as pd
from datetime import datetime, timedelta
import time
from ftplib import FTP
import xml.etree.ElementTree as ET
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
import re
import streamlit as st


# Process traffic data
def process_traffic_data():
    df = pd.read_csv('/Users/evankerivan/Desktop/IOD/capstone/traffic_victoria_road.csv')
    df_melted = pd.melt(df, 
                        id_vars=['year', 'date', 'cardinal_direction_seq', 'classification_seq', 'public_holiday', 'school_holiday'],
                        value_vars=[f'hour_{str(i).zfill(2)}' for i in range(24)],
                        var_name='hour', 
                        value_name='traffic_count')

    df_melted['hour'] = df_melted['hour'].str.replace('hour_', '').astype(int)
    df_pivoted = df_melted.pivot_table(index=['year', 'date', 'hour', 'public_holiday', 'school_holiday'],
                                       columns='classification_seq', 
                                       values='traffic_count').reset_index()

    df_pivoted.rename(columns={'Heavy Vehicles': 'heavy_vehicle', 'Light Vehicles': 'light_vehicle'}, inplace=True)
    df_clean = df_pivoted.dropna()

    df_clean.loc[:, 'datetime'] = pd.to_datetime(df_clean['date']) + pd.to_timedelta(df_clean['hour'], unit='h')
    df_clean.set_index('datetime', inplace=True)

    daily_traffic_data = df_clean.resample('D').agg({
        'public_holiday': 'max', 'school_holiday': 'max', 'heavy_vehicle': 'sum', 'light_vehicle': 'sum'
    })

    return daily_traffic_data

# Function to parse weather data and extract wind speed and direction
def process_weather_forecast():
    # Connect to FTP and download the XML file
    ftp = FTP('ftp.bom.gov.au')
    ftp.login()
    ftp.cwd('/anon/gen/fwo/')
    with open('IDN11060.xml', 'wb') as file:
        ftp.retrbinary('RETR IDN11060.xml', file.write)
    ftp.quit()

    # Parse the XML file
    tree = ET.parse('IDN11060.xml')
    root = tree.getroot()

    # Prepare an empty list to store the data
    data = []
    for area in root.findall('.//area'):
        location = area.attrib.get('description')
        for period in area.findall('.//forecast-period'):
            start_time = period.attrib.get('start-time-local')
            forecast_data = {'Location': location, 'Date': start_time}
            for element in period.findall('element'):
                param_type = element.attrib.get('type')
                value = element.text
                units = element.attrib.get('units', '')
                forecast_data[f'{param_type} ({units})'] = value
            for text in period.findall('text'):
                text_type = text.attrib.get('type')
                text_value = text.text
                forecast_data[text_type] = text_value
            data.append(forecast_data)

    df = pd.DataFrame(data)

    # Extract the relevant columns for weather data
    df = df[['Location', 'Date', 'air_temperature_maximum (Celsius)', 'air_temperature_minimum (Celsius)', 'precipitation_range ()', 'forecast']].copy()
    df.columns = ['Location', 'Date', 'Temp_Max', 'Temp_Min', 'Rain', 'Forecast']

    # Process temperature and precipitation data
    df['Rain'] = df['Rain'].apply(lambda rain: max(map(int, re.findall(r'\d+', rain))) if pd.notna(rain) else 0)
    df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
    df['Temp_Max'] = pd.to_numeric(df['Temp_Max'], errors='coerce')
    df['Temp_Min'] = pd.to_numeric(df['Temp_Min'], errors='coerce')

    avg_temp_diff = df['Temp_Max'] - df['Temp_Min']
    avg_temp_diff = avg_temp_diff.mean()
    df['Temp_Min'].fillna(df['Temp_Max'] - avg_temp_diff, inplace=True)

    # Extract wind information (direction and speed)
    df['Wind_Direction'], df['Wind_Speed_Min'], df['Wind_Speed_Max'] = zip(*df['Forecast'].apply(extract_wind_info))
    
    # Convert wind direction to degrees
    df['Wind_Direction_Degrees'] = df['Wind_Direction'].apply(wind_direction_to_degrees)

    # Convert wind speed from km/h to m/s
    df['Wind_Speed_Min (m/s)'] = df['Wind_Speed_Min'].apply(kmh_to_ms)
    df['Wind_Speed_Max (m/s)'] = df['Wind_Speed_Max'].apply(kmh_to_ms)

    # Filter the data for Sydney
    df_sydney = df[df['Location'] == 'Sydney'].copy()

    # Drop unnecessary columns
    df_sydney.drop(columns=['Location', 'Forecast'], inplace=True)

    # Convert 'Date' column to datetime format and set it as the index
    df_sydney['Date'] = pd.to_datetime(df_sydney['Date'])
    df_sydney.set_index('Date', inplace=True)

    # Rename columns to match desired format
    df_sydney.rename(columns={
        'Temp_Max': 'TEMP_max',
        'Temp_Min': 'TEMP_min',
        'Rain': 'RAIN_sum',
        'Wind_Direction_Degrees': 'WDR_mean',
        'Wind_Speed_Max (m/s)': 'WSP_max'
    }, inplace=True)

    return df_sydney[['TEMP_max', 'TEMP_min', 'RAIN_sum', 'WSP_max', 'WDR_mean']]

# LSTM model to predict air quality based on past air quality, weather, and traffic data
def lstm_air_quality_prediction(combined_data, forecasted_weather, traffic_predictions):
    features = ['TEMP_min', 'TEMP_max', 'RAIN_sum', 'WSP_max', 'WDR_mean', 'heavy_vehicle', 'light_vehicle', 
                'PM10_mean', 'PM2.5_mean', 'OZONE_mean']
    target_columns = ['PM10_mean', 'PM2.5_mean', 'OZONE_mean']

    # Feature selection can be applied here if necessary
    X = combined_data[features]
    y = combined_data[target_columns]
    
    # Handling missing values
    X.fillna(X.mean(), inplace=True)
    y.fillna(y.mean(), inplace=True)

    # Scaling the data
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Prepare sequences (for LSTM input)
    sequence_length = 7
    def create_sequences(data, seq_length):
        X_seq, y_seq = [], []
        for i in range(seq_length, len(data)):
            X_seq.append(data[i-seq_length:i])
            y_seq.append(data[i])
        return np.array(X_seq), np.array(y_seq)
    
    X_seq, y_seq = create_sequences(X_scaled, sequence_length)

    # Split into training and testing sets (80% train, 20% test)
    split_idx = int(len(X_seq) * 0.8)
    X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
    y_train, y_test = y_seq[:split_idx], y_seq[split_idx:]

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=(sequence_length, X_train.shape[2])))
    model.add(LSTM(64))
    model.add(Dense(len(target_columns)))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the LSTM model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # Predict for the next 7 days using forecasted weather and traffic
    last_sequence = X_test[-1]  # Start with the last available sequence
    predictions = []
    for _ in range(7):
        pred = model.predict(np.expand_dims(last_sequence, axis=0))
        predictions.append(pred[0])
        last_sequence = np.vstack([last_sequence[1:], pred[0]])

    return pd.DataFrame(predictions, columns=target_columns, index=pd.date_range(start=combined_data.index[-1], periods=7))

# Main workflow to combine all components
def main():
    # Fetch and process air quality data
    air_quality_df = pd.read_csv('combined_df.csv', parse_dates=['Date'], index_col='Date')
    
    numeric_columns = air_quality_df.select_dtypes(include=['number']).columns

    # Resample and aggregate only numeric columns
    daily_air_quality = air_quality_df[numeric_columns].resample('D').mean()

    # Merge the non-numeric columns back if needed
    non_numeric_columns = air_quality_df.select_dtypes(exclude=['number']).columns
    
    daily_air_quality = daily_air_quality.merge(air_quality_df[non_numeric_columns], left_index=True, right_index=True, how='left')
    # Fetch and process traffic data
    daily_traffic = process_traffic_data()

    # Fetch and process weather forecast data
    forecasted_weather = process_weather_forecast()

    # Combine data
    combined_data = pd.concat([daily_air_quality, daily_traffic, forecasted_weather], axis=1, join='outer')

    # Make predictions for air quality using LSTM model
    air_quality_predictions = lstm_air_quality_prediction(combined_data, forecasted_weather, daily_traffic)

    print("Air Quality Predictions for the next 7 days:")
    print(air_quality_predictions)

# Run the workflow
if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['datetime'] = pd.to_datetime(df_clean['date']) + pd.to_timedelta(df_clean['hour'], unit='h')


KeyError: "['forecast'] not in index"

In [28]:
df.columns

NameError: name 'df' is not defined

In [29]:
# Process traffic data
def process_traffic_data():
    df = pd.read_csv('/Users/evankerivan/Desktop/IOD/capstone/traffic_victoria_road.csv')
    df_melted = pd.melt(df, 
                        id_vars=['year', 'date', 'cardinal_direction_seq', 'classification_seq', 'public_holiday', 'school_holiday'],
                        value_vars=[f'hour_{str(i).zfill(2)}' for i in range(24)],
                        var_name='hour', 
                        value_name='traffic_count')

    df_melted['hour'] = df_melted['hour'].str.replace('hour_', '').astype(int)
    df_pivoted = df_melted.pivot_table(index=['year', 'date', 'hour', 'public_holiday', 'school_holiday'],
                                       columns='classification_seq', 
                                       values='traffic_count').reset_index()

    df_pivoted.rename(columns={'Heavy Vehicles': 'heavy_vehicle', 'Light Vehicles': 'light_vehicle'}, inplace=True)
    df_clean = df_pivoted.dropna()

    df_clean.loc[:, 'datetime'] = pd.to_datetime(df_clean['date']) + pd.to_timedelta(df_clean['hour'], unit='h')
    df_clean.set_index('datetime', inplace=True)

    daily_traffic_data = df_clean.resample('D').agg({
        'public_holiday': 'max', 'school_holiday': 'max', 'heavy_vehicle': 'sum', 'light_vehicle': 'sum'
    })

    return daily_traffic_data

# Function to parse weather data and extract wind speed and direction
def process_weather_forecast():
    # Connect to FTP and download the XML file
    ftp = FTP('ftp.bom.gov.au')
    ftp.login()
    ftp.cwd('/anon/gen/fwo/')
    with open('IDN11060.xml', 'wb') as file:
        ftp.retrbinary('RETR IDN11060.xml', file.write)
    ftp.quit()

    # Parse the XML file
    tree = ET.parse('IDN11060.xml')
    root = tree.getroot()

    # Prepare an empty list to store the data
    data = []
    for area in root.findall('.//area'):
        location = area.attrib.get('description')
        for period in area.findall('.//forecast-period'):
            start_time = period.attrib.get('start-time-local')
            forecast_data = {'Location': location, 'Date': start_time}
            for element in period.findall('element'):
                param_type = element.attrib.get('type')
                value = element.text
                units = element.attrib.get('units', '')
                forecast_data[f'{param_type} ({units})'] = value
            for text in period.findall('text'):
                text_type = text.attrib.get('type')
                text_value = text.text
                forecast_data[text_type] = text_value
            data.append(forecast_data)

    df = pd.DataFrame(data)

    # Extract the relevant columns for weather data
    # Checking column names with print(df.columns) to ensure "forecast" exists
    print(df.columns)

    # Adjusted to match actual column names in your dataset.
    if 'forecast' in df.columns:
        df = df[['Location', 'Date', 'air_temperature_maximum (Celsius)', 'air_temperature_minimum (Celsius)', 'precipitation_range ()', 'forecast']].copy()
    else:
        df = df[['Location', 'Date', 'air_temperature_maximum (Celsius)', 'air_temperature_minimum (Celsius)', 'precipitation_range ()']].copy()

    df.columns = ['Location', 'Date', 'Temp_Max', 'Temp_Min', 'Rain']

    # Process temperature and precipitation data
    df['Rain'] = df['Rain'].apply(lambda rain: max(map(int, re.findall(r'\d+', rain))) if pd.notna(rain) else 0)
    df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
    df['Temp_Max'] = pd.to_numeric(df['Temp_Max'], errors='coerce')
    df['Temp_Min'] = pd.to_numeric(df['Temp_Min'], errors='coerce')

    avg_temp_diff = df['Temp_Max'] - df['Temp_Min']
    avg_temp_diff = avg_temp_diff.mean()
    df['Temp_Min'].fillna(df['Temp_Max'] - avg_temp_diff, inplace=True)

    # Extract wind information (direction and speed)
    df['Wind_Direction'], df['Wind_Speed_Min'], df['Wind_Speed_Max'] = zip(*df['Forecast'].apply(extract_wind_info) if 'Forecast' in df.columns else (None, None, None))
    
    # Convert wind direction to degrees
    df['Wind_Direction_Degrees'] = df['Wind_Direction'].apply(wind_direction_to_degrees)

    # Convert wind speed from km/h to m/s
    df['Wind_Speed_Min (m/s)'] = df['Wind_Speed_Min'].apply(kmh_to_ms)
    df['Wind_Speed_Max (m/s)'] = df['Wind_Speed_Max'].apply(kmh_to_ms)

    # Filter the data for Sydney
    df_sydney = df[df['Location'] == 'Sydney'].copy()

    # Drop unnecessary columns
    df_sydney.drop(columns=['Location', 'Forecast'], inplace=True)

    # Convert 'Date' column to datetime format and set it as the index
    df_sydney['Date'] = pd.to_datetime(df_sydney['Date'])
    df_sydney.set_index('Date', inplace=True)

    # Rename columns to match desired format
    df_sydney.rename(columns={
        'Temp_Max': 'TEMP_max',
        'Temp_Min': 'TEMP_min',
        'Rain': 'RAIN_sum',
        'Wind_Direction_Degrees': 'WDR_mean',
        'Wind_Speed_Max (m/s)': 'WSP_max'
    }, inplace=True)

    return df_sydney[['TEMP_max', 'TEMP_min', 'RAIN_sum', 'WSP_max', 'WDR_mean']]

# Main workflow to combine all components
def main():
    # Fetch and process air quality data
    air_quality_df = pd.read_csv('combined_df.csv', parse_dates=['Date'], index_col='Date')
    
    numeric_columns = air_quality_df.select_dtypes(include=['number']).columns

    # Resample and aggregate only numeric columns
    daily_air_quality = air_quality_df[numeric_columns].resample('D').mean()

    # Merge the non-numeric columns back if needed
    non_numeric_columns = air_quality_df.select_dtypes(exclude=['number']).columns
    
    daily_air_quality = daily_air_quality.merge(air_quality_df[non_numeric_columns], left_index=True, right_index=True, how='left')

    # Fetch and process traffic data
    daily_traffic = process_traffic_data()

    # Fetch and process weather forecast data
    forecasted_weather = process_weather_forecast()

    # Combine data
    combined_data = pd.concat([daily_air_quality, daily_traffic, forecasted_weather], axis=1, join='outer')

    # Make predictions for air quality using LSTM model
    air_quality_predictions = lstm_air_quality_prediction(combined_data, forecasted_weather, daily_traffic)

    print("Air Quality Predictions for the next 7 days:")
    print(air_quality_predictions)

# Run the workflow
if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.loc[:, 'datetime'] = pd.to_datetime(df_clean['date']) + pd.to_timedelta(df_clean['hour'], unit='h')


Index(['Location', 'Date', 'forecast_icon_code ()', 'precis',
       'probability_of_precipitation', 'air_temperature_minimum (Celsius)',
       'air_temperature_maximum (Celsius)', 'precipitation_range ()'],
      dtype='object')


TypeError: 'NoneType' object is not iterable