In [14]:
import pandas as pd

# Location of the mean-imputed CSV on the author's machine.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run elsewhere without editing it.
file_mean = "/Users/dianenacario/scikit_learn_data/downloadedapih4apcdatasets/LAEI_2019_NA_FILLED_WITH_MEAN.csv"

# Read the CSV into a DataFrame
mean_df = pd.read_csv(file_mean)

# Show the first five rows as a quick sanity check of the load
preview_rows = mean_df.head()
print("Data with Mean Imputation:\n", preview_rows)

Data with Mean Imputation:
    Year              Sector          nox         n2o        pm10       pm2.5  \
0  2025    Accidental Fires    17.258441   53.946697   71.915973   66.742782   
1  2025         Agriculture   203.303679  210.885332   42.031963   15.701566   
2  2025            Aviation  3795.035535   53.946697   55.967384   45.304076   
3  2025             Biomass   785.040736   53.946697  615.931169  615.931169   
4  2025  Commercial Cooking   785.040736   53.946697  510.625152  510.625152   

             co2  
0  807486.524214  
1    7464.299415  
2  979096.462843  
3  807486.524214  
4  807486.524214  


In [15]:
# Keep only the rows whose Year is 2025 or 2030; these form the training data.
# .copy() makes train_data_mean an independent DataFrame instead of a slice of
# mean_df, so the column assignments performed on it in later cells neither
# trigger pandas' SettingWithCopyWarning nor risk writing into a temporary.
train_data_mean = mean_df[mean_df['Year'].isin([2025, 2030])].copy()

# Check if filtering worked
print("Filtered Data (Mean Imputation):\n", train_data_mean.head())

Filtered Data (Mean Imputation):
    Year              Sector          nox         n2o        pm10       pm2.5  \
0  2025    Accidental Fires    17.258441   53.946697   71.915973   66.742782   
1  2025         Agriculture   203.303679  210.885332   42.031963   15.701566   
2  2025            Aviation  3795.035535   53.946697   55.967384   45.304076   
3  2025             Biomass   785.040736   53.946697  615.931169  615.931169   
4  2025  Commercial Cooking   785.040736   53.946697  510.625152  510.625152   

             co2  
0  807486.524214  
1    7464.299415  
2  979096.462843  
3  807486.524214  
4  807486.524214  


In [16]:
# For each listed pollutant column, replace NaN entries with that column's mean
# (the mean is taken over the rows currently held in train_data_mean).
for col in ("nox", "pm10", "pm2.5", "co2"):
    col_mean = train_data_mean[col].mean()
    train_data_mean.loc[:, col] = train_data_mean[col].fillna(col_mean)

In [19]:
### Prepare Data for Model Training
# Year whose rows are used for both the training and the testing split
year = 2025

# Columns the models are meant to predict
pollutants = ["nox", "pm10", "pm2.5", "co2"]

# Select the rows of the chosen year once and reuse them below
rows_for_year = train_data_mean.loc[train_data_mean["Year"] == year]
non_feature_columns = [*pollutants, "Year"]

# Features (X): everything except the target columns and Year.
# NOTE(review): the train and test frames are built from exactly the same rows,
# so anything measured on the "test" split reflects fit on the training data.
X_train_mean = rows_for_year.drop(columns=non_feature_columns)
X_test_mean = rows_for_year.drop(columns=non_feature_columns)

# Targets (y): one Series per pollutant, keyed by pollutant name
y_train_mean = {name: rows_for_year[name] for name in pollutants}
y_test_mean = {name: rows_for_year[name] for name in pollutants}


In [21]:
# Build the per-pollutant target Series for the training and testing splits.
# NOTE(review): train_years and test_year are not assigned in any cell visible
# in this notebook, so this cell relies on state from elsewhere -- confirm.
# The == comparison is element-wise against the "Year" column, which presumably
# expects scalar values here -- TODO confirm.
pollutants = ["nox", "pm10", "pm2.5", "co2"]

is_train_row = train_data_mean["Year"] == train_years
y_train_mean = {name: train_data_mean.loc[is_train_row, name] for name in pollutants}

is_test_row = train_data_mean["Year"] == test_year
y_test_mean = {name: train_data_mean.loc[is_test_row, name] for name in pollutants}

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import joblib  # To save the models

# Target columns: one Random Forest model is trained per entry
pollutants = [
    "nox",
    "pm10",
    "pm2.5",
    "co2",
]

# Train one Random Forest per pollutant and persist each fitted model to disk
def train_rf_for_all_pollutants(X_train_mean, train_data_mean, pollutants, year):
    """Fit, save and return one RandomForestRegressor per pollutant.

    Parameters
    ----------
    X_train_mean : pandas.DataFrame
        Feature rows. They are one-hot encoded here with
        ``pd.get_dummies(..., drop_first=True)`` before fitting.
        Assumed to correspond row-for-row to the rows of ``train_data_mean``
        whose "Year" equals ``year`` -- TODO confirm at the call site.
    train_data_mean : pandas.DataFrame
        Frame containing a "Year" column and one column per pollutant.
    pollutants : iterable of str
        Names of the target columns; one model is trained for each.
    year
        Value compared against the "Year" column to select the target rows.

    Returns
    -------
    dict
        Maps each pollutant name to its fitted RandomForestRegressor.

    Side effects
    ------------
    Writes ``rf_model_mean_{pollutant}.pkl`` into the current working
    directory for every pollutant and prints one progress line per model.
    """
    models_mean = {}

    # Neither the encoded features nor the year filter depends on the
    # pollutant, so both are computed once instead of on every iteration.
    X_train_mean_encoded = pd.get_dummies(X_train_mean, drop_first=True)
    rows_for_year = train_data_mean[train_data_mean["Year"] == year]

    for pollutant in pollutants:
        # Target values of the current pollutant for the selected year
        y_train_mean = rows_for_year[pollutant]

        # Fixed random_state keeps the fitted forest reproducible across runs
        rf_model_mean = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model_mean.fit(X_train_mean_encoded, y_train_mean)
        models_mean[pollutant] = rf_model_mean

        # Persist the fitted model so later cells can reload it without retraining
        joblib.dump(rf_model_mean, f'rf_model_mean_{pollutant}.pkl')

        print(f"Training and saving completed for pollutant: {pollutant}")

    return models_mean

# Training uses the 2025 rows only
year = 2025

# Feature matrix: the 2025 rows without the target columns and without Year
is_training_year = train_data_mean["Year"] == year
X_train_mean = train_data_mean.loc[is_training_year].drop(columns=[*pollutants, "Year"])

# Fit one Random Forest per pollutant; each model is also written to disk
models_mean = train_rf_for_all_pollutants(
    X_train_mean,
    train_data_mean,
    pollutants,
    year,
)


Training and saving completed for pollutant: nox
Training and saving completed for pollutant: pm10
Training and saving completed for pollutant: pm2.5
Training and saving completed for pollutant: co2


In [26]:
import pandas as pd
import numpy as np
import joblib
import sys

# Raise the interpreter's recursion limit to 10000.
# NOTE(review): nothing in this cell is visibly recursive; confirm this is needed.
sys.setrecursionlimit(10000)

# Absolute path of the mean-imputed CSV (same file as loaded at the top)
file_mean = "/Users/dianenacario/scikit_learn_data/downloadedapih4apcdatasets/LAEI_2019_NA_FILLED_WITH_MEAN.csv"

# Re-read the CSV; this rebinds train_data_mean to the full, unfiltered file
print("Loading dataset...")
train_data_mean = pd.read_csv(file_mean)
print("Dataset loaded successfully!")

# Target columns, one saved model per entry
pollutants = ["nox", "pm10", "pm2.5", "co2"]

# Reload every persisted model from the current working directory.
# NOTE(review): joblib.load unpickles the file, so only load trusted .pkl files.
models_mean = {}

print("Loading models...")
for pollutant in pollutants:
    model_path = f'rf_model_mean_{pollutant}.pkl'
    models_mean[pollutant] = joblib.load(model_path)
print("Models loaded successfully!")

# Build a single feature row for 2025 from the column means of the 2025 rows
print("Generating input features for 2025...")

# 2025 rows without the target columns and without Year
is_2025_row = train_data_mean["Year"] == 2025
X_train_mean = train_data_mean.loc[is_2025_row].drop(columns=[*pollutants, "Year"])

# Only numeric columns can be averaged; non-numeric ones are left out here
numeric_cols = list(X_train_mean.select_dtypes(include=[np.number]).columns)

# Column-wise means reshaped into one row (shape: 1 x number of numeric columns)
X_2025_mean_numeric = X_train_mean[numeric_cols].mean().to_numpy().reshape(1, -1)

# Wrap the row in a DataFrame so the columns carry their names
X_2025_mean_encoded = pd.DataFrame(X_2025_mean_numeric, columns=numeric_cols)

# The models expect the one-hot encoded column layout seen during fitting
print("Aligning columns with the training set encoding...")

# Feature names recorded by the 'nox' model at fit time
trained_feature_names = models_mean['nox'].feature_names_in_

def align_features(X_2025, trained_feature_names):
    """Return a new frame whose columns match ``trained_feature_names``.

    Columns listed in ``trained_feature_names`` but absent from ``X_2025`` are
    created and filled with 0. Columns of ``X_2025`` that are not listed are
    left out. The column order of the result follows ``trained_feature_names``.

    The input frame is left untouched; the earlier implementation added the
    missing columns directly to the caller's DataFrame as a side effect.

    Parameters
    ----------
    X_2025 : pandas.DataFrame
        Feature rows to align.
    trained_feature_names : sequence of str
        Column names, in the desired order (e.g. a model's feature_names_in_).

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the requested columns.
    """
    # reindex builds a new frame: it selects, orders and zero-fills in one step
    return X_2025.reindex(columns=list(trained_feature_names), fill_value=0)

# Bring the single-row 2025 feature frame into the column layout of the models
X_2025_mean_encoded_aligned = align_features(X_2025_mean_encoded, trained_feature_names)

# Run every per-pollutant model on the aligned row; each value is the array
# returned by that model's predict()
print("Predicting pollutant levels for 2025...")
predictions_2025_mean = {
    name: models_mean[name].predict(X_2025_mean_encoded_aligned)
    for name in pollutants
}

# Report the first (and, for a one-row input, only) prediction per pollutant
print("Displaying predictions for 2025...")
for pollutant in pollutants:
    first_prediction = predictions_2025_mean[pollutant][0]
    print(f"Predicted {pollutant} for 2025 (Mean Imputed): {first_prediction}")

print("Prediction completed successfully!")


Loading dataset...
Dataset loaded successfully!
Loading models...
Models loaded successfully!
Generating input features for 2025...
Aligning columns with the training set encoding...
Predicting pollutant levels for 2025...
Displaying predictions for 2025...
Predicted nox for 2025 (Mean Imputed): 677.4906096278188
Predicted pm10 for 2025 (Mean Imputed): 193.18715964770374
Predicted pm2.5 for 2025 (Mean Imputed): 80.36792450569972
Predicted co2 for 2025 (Mean Imputed): 504409.72351702576
Prediction completed successfully!
