In [1]:
### Load and Process the Files

import os

import pandas as pd

# Folder holding both imputed LAEI 2019 CSV files. Set the LAEI_DATA_DIR
# environment variable to run this notebook on another machine; when it is
# unset, the original location is used.
data_dir = os.environ.get(
    "LAEI_DATA_DIR",
    "/Users/dianenacario/scikit_learn_data/uol_group_d/datasets_mean_median",
)

# Define file paths
file_mean = os.path.join(data_dir, "LAEI_2019_NA_FILLED_WITH_MEAN.csv")
file_median = os.path.join(data_dir, "LAEI_2019_NA_FILLED_WITH_MEDIAN.csv")

# Load the datasets
mean_df = pd.read_csv(file_mean)
median_df = pd.read_csv(file_median)

# Check the first few rows of each dataset
print("Data with Mean Imputation:\n", mean_df.head())
print("\nData with Median Imputation:\n", median_df.head())

Data with Mean Imputation:
    Year  Grid ID 2019  LAEI 1km2 ID  Easting  Northing  Borough     Zone  \
0  2030             1          5910   510500    203500  Non GLA  Non GLA   
1  2030             2          5911   511500    203500  Non GLA  Non GLA   
2  2030             3          5912   512500    203500  Non GLA  Non GLA   
3  2030             4          5915   515500    203500  Non GLA  Non GLA   
4  2030             5          5916   516500    203500  Non GLA  Non GLA   

  Main Source Category   Sector        Source  ...  n2o  nh3  nmvoc  nox  pb  \
0             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
1             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
2             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
3             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
4             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   

   pcb      pm10     pm2.5  so2  E

In [2]:
### Filter Data for 2013, 2016, 2019

# Years whose rows are used for training
train_years = [2013, 2016, 2019]

# Filter dataset to include only years 2013, 2016, and 2019 for training.
# .copy() makes each result an independent frame: later cells add and overwrite
# columns on these frames, which on a bare boolean-filtered slice raises
# SettingWithCopyWarning.
train_data_mean = mean_df[mean_df['Year'].isin(train_years)].copy()
train_data_median = median_df[median_df['Year'].isin(train_years)].copy()

# Check if filtering worked
print("Filtered Data (Mean Imputation):\n", train_data_mean.head())
print("Filtered Data (Median Imputation):\n", train_data_median.head())

Filtered Data (Mean Imputation):
         Year  Grid ID 2019  LAEI 1km2 ID  Easting  Northing  Borough     Zone  \
285264  2019             1          5910   510500    203500  Non GLA  Non GLA   
285265  2019             2          5911   511500    203500  Non GLA  Non GLA   
285266  2019             3          5912   512500    203500  Non GLA  Non GLA   
285267  2019             4          5915   515500    203500  Non GLA  Non GLA   
285268  2019             5          5916   516500    203500  Non GLA  Non GLA   

       Main Source Category   Sector        Source  ...  n2o  nh3  nmvoc  nox  \
285264             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN   
285265             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN   
285266             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN   
285267             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN   
285268             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  

In [4]:
### Process Categorical Variables

from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoder
encoder = LabelEncoder()

# Fit ONCE on the categories found in either dataset, so a given
# 'Main Source Category' always maps to the same integer code in both frames
# and `encoder` stays valid for both afterwards. (Refitting per dataset only
# gives matching codes when both frames hold exactly the same categories.)
encoder.fit(
    pd.concat(
        [
            train_data_mean["Main Source Category"],
            train_data_median["Main Source Category"],
        ]
    ).unique()
)

# assign() returns a new frame instead of writing into a filtered slice,
# so no SettingWithCopyWarning is raised and re-running the cell is harmless.
train_data_mean = train_data_mean.assign(
    main_source_encoded=encoder.transform(train_data_mean["Main Source Category"])
)
train_data_median = train_data_median.assign(
    main_source_encoded=encoder.transform(train_data_median["Main Source Category"])
)


In [5]:
### Handle Missing Values

# Each frame is paired with the statistic used to fill its gaps: the column
# mean for the mean-imputed dataset, the column median for the median-imputed one.
for frame, statistic in ((train_data_mean, "mean"), (train_data_median, "median")):
    for col in ["nox", "pm10", "pm2.5", "co2"]:
        # Compute the column statistic, then write the filled column back in place
        fill_value = getattr(frame[col], statistic)()
        frame.loc[:, col] = frame[col].fillna(fill_value)

In [7]:
### Prepare Data for Model Training
# Define the years for training and testing
train_years = [2013, 2016, 2019]
test_year = 2025

# Define the pollutants to predict
pollutants = ["nox", "pm10", "pm2.5", "co2"]


def _rows_for_test_year(full_df, train_df):
    """Return the `test_year` rows of `full_df` with the columns of `train_df`.

    The training frames were filtered down to the training years, so the test
    year has to be selected from the full dataset. The integer
    'main_source_encoded' column that exists only on the training frame is
    rebuilt here from the category -> code pairs observed in `train_df`;
    categories never seen in training get the code -1.
    """
    rows = full_df[full_df["Year"] == test_year].copy()
    if "main_source_encoded" in train_df.columns:
        code_by_category = (
            train_df.drop_duplicates("Main Source Category")
            .set_index("Main Source Category")["main_source_encoded"]
        )
        rows["main_source_encoded"] = (
            rows["Main Source Category"].map(code_by_category).fillna(-1).astype(int)
        )
    return rows


# Select the rows once per dataset so features and targets share the same index
train_rows_mean = train_data_mean[train_data_mean["Year"].isin(train_years)]
test_rows_mean = _rows_for_test_year(mean_df, train_data_mean)

train_rows_median = train_data_median[train_data_median["Year"].isin(train_years)]
test_rows_median = _rows_for_test_year(median_df, train_data_median)

# Prepare the features (X) by dropping target variables and Year
X_train_mean = train_rows_mean.drop(columns=pollutants + ["Year"])
X_test_mean = test_rows_mean.drop(columns=pollutants + ["Year"])

X_train_median = train_rows_median.drop(columns=pollutants + ["Year"])
X_test_median = test_rows_median.drop(columns=pollutants + ["Year"])

# Prepare the target variables (y) for both the training and testing sets
# Using a dictionary to store the targets for each pollutant.
# Test targets are taken as loaded from the full datasets (no extra filling here).
y_train_mean = {pollutant: train_rows_mean[pollutant] for pollutant in pollutants}
y_test_mean = {pollutant: test_rows_mean[pollutant] for pollutant in pollutants}

y_train_median = {pollutant: train_rows_median[pollutant] for pollutant in pollutants}
y_test_median = {pollutant: test_rows_median[pollutant] for pollutant in pollutants}


In [8]:
### One-Hot Encoding of Categorical Variables
# Since the Main Source Category and potentially other categorical variables are present,
# we need to encode these for model training using One-Hot Encoding


def _one_hot_like_train(train_features, test_features):
    """One-hot encode both frames so the test columns mirror the training columns.

    Only the training frame drops its first level per categorical column.
    The test frame is encoded WITHOUT dropping and then reindexed to the
    training columns. Encoding both with drop_first=True lets each frame drop
    a different baseline level; after alignment a test category could then end
    up as an all-zero row and be indistinguishable from the training baseline.

    Returns:
        (train_encoded, test_encoded) with identical columns in identical order.
        Levels present only in the test frame are discarded; training columns
        missing from the test frame are filled with 0.
    """
    train_encoded = pd.get_dummies(train_features, drop_first=True)
    test_encoded = pd.get_dummies(test_features).reindex(
        columns=train_encoded.columns, fill_value=0
    )
    return train_encoded, test_encoded


# One-Hot Encode the categorical variables in both training and testing sets,
# keeping the training columns as the reference layout
X_train_mean_encoded, X_test_mean_encoded = _one_hot_like_train(X_train_mean, X_test_mean)
X_train_median_encoded, X_test_median_encoded = _one_hot_like_train(X_train_median, X_test_median)

In [10]:
# Prepare the target variables (y) for both the training and testing sets
pollutants = ["nox", "pm10", "pm2.5", "co2"]

# Training targets: rows of the training years, one Series per pollutant
y_train_mean = {
    pollutant: train_data_mean.loc[train_data_mean["Year"].isin(train_years), pollutant]
    for pollutant in pollutants
}
y_train_median = {
    pollutant: train_data_median.loc[train_data_median["Year"].isin(train_years), pollutant]
    for pollutant in pollutants
}

# Test targets are read from the full datasets: train_data_mean / train_data_median
# only hold the training years, so filtering them on test_year always produced
# empty Series.
y_test_mean = {
    pollutant: mean_df.loc[mean_df["Year"] == test_year, pollutant]
    for pollutant in pollutants
}
y_test_median = {
    pollutant: median_df.loc[median_df["Year"] == test_year, pollutant]
    for pollutant in pollutants
}


In [None]:
### Train a Random Forest Model for Each Pollutant

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Target columns: one Random Forest model is trained per entry
pollutants = [
    "nox",
    "pm10",
    "pm2.5",
    "co2",
]

# Function to train Random Forest for each pollutant
def train_rf_for_all_pollutants(
    X_train_mean,
    X_train_median,
    train_data_mean,
    train_data_median,
    pollutants,
    train_years=(2013, 2016, 2019),
    n_estimators=100,
    random_state=42,
):
    """Fit one RandomForestRegressor per pollutant on each imputed dataset.

    Args:
        X_train_mean: Feature frame for the mean-imputed data (not yet one-hot encoded).
        X_train_median: Feature frame for the median-imputed data (not yet one-hot encoded).
        train_data_mean: Mean-imputed frame holding a "Year" column and the pollutant columns.
        train_data_median: Median-imputed frame holding a "Year" column and the pollutant columns.
        pollutants: Names of the target columns; one model is trained per name.
        train_years: Years whose rows supply the targets. Must select the same
            rows the feature frames were built from.
        n_estimators: Number of trees in each forest.
        random_state: Seed passed to every forest.

    Returns:
        (models_mean, models_median): two dicts mapping pollutant name to the
        fitted model for the mean- and median-imputed dataset respectively.
    """
    # Store the models for each pollutant
    models_mean = {}
    models_median = {}

    # The encoded features do not depend on the pollutant, so encode them once
    # instead of repeating the one-hot encoding on every loop iteration.
    X_train_mean_encoded = pd.get_dummies(X_train_mean, drop_first=True)
    X_train_median_encoded = pd.get_dummies(X_train_median, drop_first=True)

    # Likewise select the target rows once per dataset
    years = list(train_years)
    target_rows_mean = train_data_mean[train_data_mean["Year"].isin(years)]
    target_rows_median = train_data_median[train_data_median["Year"].isin(years)]

    for pollutant in pollutants:
        # Train the model using the mean-imputed dataset
        rf_model_mean = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        rf_model_mean.fit(X_train_mean_encoded, target_rows_mean[pollutant])
        models_mean[pollutant] = rf_model_mean  # Save the trained model

        # Train the model using the median-imputed dataset
        rf_model_median = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        rf_model_median.fit(X_train_median_encoded, target_rows_median[pollutant])
        models_median[pollutant] = rf_model_median  # Save the trained model

        print(f"Training completed for pollutant: {pollutant}")

    return models_mean, models_median

# Build the training features for both datasets: keep the 2013, 2016 and 2019
# rows, then remove the target columns and Year
X_train_mean, X_train_median = (
    frame[frame["Year"].isin([2013, 2016, 2019])].drop(columns=pollutants + ["Year"])
    for frame in (train_data_mean, train_data_median)
)

# Fit one Random Forest per pollutant on each dataset
models_mean, models_median = train_rf_for_all_pollutants(
    X_train_mean,
    X_train_median,
    train_data_mean,
    train_data_median,
    pollutants,
)

# models_mean and models_median now map each pollutant name to its trained Random Forest model.
