# Linear Regression

Using linear regression, we try to predict the pollutant values (NOx, PM10, PM2.5 and CO2) for 2025.

To do this, we first load the mean-imputed and the median-imputed versions of the dataset.

In [113]:
import pandas as pd
# Turns off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Locations of the two imputed CSV files (shipped in datasets_mean_median.zip)
file_mean = "../datasets/LAEI_2019_NA_FILLED_WITH_MEAN.csv"
file_median = "../datasets/LAEI_2019_NA_FILLED_WITH_MEDIAN.csv"

# Read both files into DataFrames in one pass
mean_df, median_df = (pd.read_csv(csv_path) for csv_path in (file_mean, file_median))

# Preview the first rows of each dataset
for heading, frame in (
    ("Data with Mean Imputation:\n", mean_df),
    ("\nData with Median Imputation:\n", median_df),
):
    print(heading, frame.head())

Data with Mean Imputation:
    Year  Grid ID 2019  LAEI 1km2 ID  Easting  Northing  Borough     Zone  \
0  2030             1          5910   510500    203500  Non GLA  Non GLA   
1  2030             2          5911   511500    203500  Non GLA  Non GLA   
2  2030             3          5912   512500    203500  Non GLA  Non GLA   
3  2030             4          5915   515500    203500  Non GLA  Non GLA   
4  2030             5          5916   516500    203500  Non GLA  Non GLA   

  Main Source Category   Sector        Source  ...  n2o  nh3  nmvoc  nox  pb  \
0             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
1             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
2             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
3             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
4             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   

   pcb      pm10     pm2.5  so2  E

In [114]:
# Keep the training years (2013, 2016, 2019) together with 2025: the 2025 rows
# provide the features that are predicted on later in the notebook.
# .copy() turns each subset into an independent DataFrame, so the column
# assignments below never write into a slice of mean_df / median_df.
train_data_mean = mean_df[mean_df['Year'].isin([2013, 2016, 2019, 2025])].copy()
train_data_median = median_df[median_df['Year'].isin([2013, 2016, 2019, 2025])].copy()

### Process Categorical Variables

from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoder
encoder = LabelEncoder()

# Encode 'Main Source Category' as integer codes.
# The same encoder object is re-fitted for each dataset.
mean_source_encoded = encoder.fit_transform(train_data_mean["Main Source Category"])
median_source_encoded = encoder.fit_transform(train_data_median["Main Source Category"])
train_data_mean["main_source_encoded"] = mean_source_encoded
train_data_median["main_source_encoded"] = median_source_encoded

pollutants = ["nox", "pm10", "pm2.5", "co2"]


# Fill the remaining missing pollutant values: column mean for the mean-imputed
# dataset, column median for the median-imputed dataset.
# NOTE(review): the statistics are computed over all retained years, including
# the 2025 rows - confirm this is intended.
for col in pollutants:
    train_data_mean[col] = train_data_mean[col].fillna(train_data_mean[col].mean())
    train_data_median[col] = train_data_median[col].fillna(train_data_median[col].median())

Defining the function to train the model

In [115]:

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
import numpy as np
import joblib

def train_linear_regression_for_all_polutants(X_train_mean, X_train_median, train_data_mean, train_data_median, pollutants, train_years=(2013, 2016, 2019)):
    """Fit one linear regression model per pollutant for both imputation variants.

    Parameters
    ----------
    X_train_mean, X_train_median : pandas.DataFrame
        Feature frames for the training rows. Categorical columns are one-hot
        encoded here with ``pd.get_dummies(..., drop_first=True)``.
    train_data_mean, train_data_median : pandas.DataFrame
        Frames containing a "Year" column and one column per pollutant; the
        targets are read from the rows whose "Year" is in ``train_years``.
        These rows must correspond to the rows of the matching feature frame.
    pollutants : iterable of str
        Names of the target columns; one model is fitted per name.
    train_years : iterable of int, optional
        Years whose rows provide the targets. Defaults to (2013, 2016, 2019).

    Returns
    -------
    tuple of (dict, dict)
        ``(models_mean, models_median)``, each mapping a pollutant name to its
        fitted ``LinearRegression`` model.
    """
    models_mean = {}
    models_median = {}

    # The one-hot encoded features do not depend on the pollutant, so they are
    # built once instead of once per loop iteration.
    X_train_mean_encoded = pd.get_dummies(X_train_mean, drop_first=True)
    X_train_median_encoded = pd.get_dummies(X_train_median, drop_first=True)

    # The target rows are the same for every pollutant; only the column differs.
    years = list(train_years)
    target_rows_mean = train_data_mean[train_data_mean["Year"].isin(years)]
    target_rows_median = train_data_median[train_data_median["Year"].isin(years)]

    for pollutant in pollutants:
        print(f"pollutant: {pollutant}")

        # Fit linear regression on the mean-imputed data
        linear_regression_mean = LinearRegression()
        linear_regression_mean.fit(X_train_mean_encoded, target_rows_mean[pollutant])
        models_mean[pollutant] = linear_regression_mean

        # Fit linear regression on the median-imputed data
        linear_regression_median = LinearRegression()
        linear_regression_median.fit(X_train_median_encoded, target_rows_median[pollutant])
        models_median[pollutant] = linear_regression_median

    return models_mean, models_median
        

Running the function to train the model

In [116]:
# Columns removed from the feature matrix: the modelled pollutants (the targets)
# plus the additional columns listed here.
excluded_columns = pollutants + ["bap", "cd", "c4h6", "c6h6", "ch4", "co", "hc", "hcl", "hg", "n2o", "nh3", "nmvoc", "pb", "pcb", "so2"]
training_years = [2013, 2016, 2019]

X_train_mean = train_data_mean[train_data_mean["Year"].isin(training_years)].drop(columns=excluded_columns)
X_train_median = train_data_median[train_data_median["Year"].isin(training_years)].drop(columns=excluded_columns)

# Train one linear regression model per pollutant for each imputation variant
models_mean, models_median = train_linear_regression_for_all_polutants(X_train_mean, X_train_median, train_data_mean, train_data_median, pollutants)

pollutant: nox
pollutant: pm10
pollutant: pm2.5
pollutant: co2


After training a model for each pollutant on both the mean-imputed and the median-imputed data, we predict the values for 2025.

In [117]:
# Columns removed from the feature matrix: the modelled pollutants (the targets)
# plus the additional columns listed here.
excluded_columns = pollutants + ["bap", "cd", "c4h6", "c6h6", "ch4", "co", "hc", "hcl", "hg", "n2o", "nh3", "nmvoc", "pb", "pcb", "so2"]

X_test_mean = train_data_mean[train_data_mean["Year"].isin([2025])].drop(columns=excluded_columns)
X_test_median = train_data_median[train_data_median["Year"].isin([2025])].drop(columns=excluded_columns)


def encode_like_training(features, fitted_models):
    """One-hot encode ``features`` using the columns the fitted models were trained on.

    Encoding the 2025 rows on their own with ``drop_first=True`` yields the
    training columns only when the 2025 rows contain the same categories as the
    training rows. To avoid relying on that, all dummy columns are created and
    then re-indexed to the feature names stored on a fitted model: columns the
    model never saw are dropped, columns missing from ``features`` are added
    and filled with 0.

    If no model is available, or the model does not expose
    ``feature_names_in_``, the plain ``drop_first=True`` encoding is returned.
    """
    # All models in one dict were fitted on the same feature frame, so any of
    # them can serve as the reference for the expected columns.
    reference_model = next(iter(fitted_models.values()), None)
    expected_columns = getattr(reference_model, "feature_names_in_", None)
    if expected_columns is None:
        return pd.get_dummies(features, drop_first=True)
    return pd.get_dummies(features).reindex(columns=expected_columns, fill_value=0)


X_test_mean_encoded = encode_like_training(X_test_mean, models_mean)
X_test_median_encoded = encode_like_training(X_test_median, models_median)

# Predict every pollutant for 2025 with both the median- and the mean-based model
for pollutant in pollutants:
    print(f"Predicting for {pollutant}")
    median_regression_model = models_median[pollutant]
    mean_regression_model = models_mean[pollutant]
    print(f"Median {median_regression_model.predict(X_test_median_encoded)}")
    print(f"Mean {mean_regression_model.predict(X_test_mean_encoded)}")
    
    

Predicting for nox
Median [-0.23480225 -0.23590088 -0.23706055 ...  1.14343262  0.94445801
  1.51446533]
Mean [0.16882324 0.16766357 0.16662598 ... 1.14337158 0.94439697 1.51434326]
Predicting for pm10
Median [0.20690918 0.20681763 0.20672607 ... 0.14407349 0.0791626  0.22332764]
Mean [0.20690918 0.20681763 0.20675659 ... 0.14407349 0.0791626  0.22332764]
Predicting for pm2.5
Median [0.21218872 0.21206665 0.21194458 ... 0.06356812 0.04586792 0.08978271]
Mean [0.21209717 0.2119751  0.21185303 ... 0.06347656 0.04580688 0.08969116]
Predicting for co2
Median [-37.6875 -38.5625 -39.4375 ... 443.25   356.5625 484.25  ]
Mean [230.375 229.5   228.625 ... 443.25  356.625 484.25 ]
