# Linear Regression

With the Linear regression method, we are trying to predict the values for 2025

For this we are going to first load the data from the dataset

In [10]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# File paths based on  datasets_mean_median.zip
file_mean = "../datasets/LAEI_2019_NA_FILLED_WITH_MEAN.csv"
file_median = "../datasets/LAEI_2019_NA_FILLED_WITH_MEDIAN.csv"

# Load the datasets
mean_df = pd.read_csv(file_mean)
median_df = pd.read_csv(file_median)

# Printing the rows
print("Data with Mean Imputation:\n", mean_df.head())
print("\nData with Median Imputation:\n", median_df.head())

Data with Mean Imputation:
    Year  Grid ID 2019  LAEI 1km2 ID  Easting  Northing  Borough     Zone  \
0  2030             1          5910   510500    203500  Non GLA  Non GLA   
1  2030             2          5911   511500    203500  Non GLA  Non GLA   
2  2030             3          5912   512500    203500  Non GLA  Non GLA   
3  2030             4          5915   515500    203500  Non GLA  Non GLA   
4  2030             5          5916   516500    203500  Non GLA  Non GLA   

  Main Source Category   Sector        Source  ...  n2o  nh3  nmvoc  nox  pb  \
0             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
1             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
2             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
3             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   
4             Domestic  Biomass  Wood Burning  ...  NaN  NaN    NaN  NaN NaN   

   pcb      pm10     pm2.5  so2  E

In [11]:
# Filter dataset to include only years 2013, 2016, and 2019 for training
train_data_mean = mean_df[mean_df['Year'].isin([2013, 2016, 2019])]
train_data_median = median_df[median_df['Year'].isin([2013, 2016, 2019])]

### Process Categorical Variables

from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoder
encoder = LabelEncoder()

# Encode 'Main Source Category' using explicit row and column indexing
mean_source_encoded = encoder.fit_transform(train_data_mean.loc[:, "Main Source Category"])
median_source_encoded = encoder.fit_transform(train_data_median.loc[:, "Main Source Category"])
train_data_mean.loc[:, "main_source_encoded"] = mean_source_encoded
train_data_median.loc[:, "main_source_encoded"] = median_source_encoded

pollutants = ["nox", "pm10", "pm2.5", "co2"]


# Fill missing values with the mean for the mean-imputed dataset
for col in pollutants:
    train_data_mean.loc[:, col] = train_data_mean[col].fillna(train_data_mean[col].mean())

# Fill missing values with the median for the median-imputed dataset
for col in pollutants:
    train_data_median.loc[:, col] = train_data_median[col].fillna(train_data_median[col].median())

### Prepare Data for Model Training
# Define the years for training and testing
train_years = [2013, 2016, 2019]
test_year = 2025

# Prepare the features (X) by dropping target variables and Year
X_train_mean = train_data_mean[train_data_mean["Year"].isin(train_years)].drop(columns=pollutants + ["Year"])
X_test_mean = train_data_mean[train_data_mean["Year"] == test_year].drop(columns=pollutants + ["Year"])

X_train_median = train_data_median[train_data_median["Year"].isin(train_years)].drop(columns=pollutants + ["Year"])
X_test_median = train_data_median[train_data_median["Year"] == test_year].drop(columns=pollutants + ["Year"])

# Prepare the target variables (y) for both the training and testing sets
# Using a dictionary to store the targets for each pollutant
y_train_mean = {pollutant: train_data_mean[train_data_mean["Year"].isin(train_years)][pollutant] for pollutant in pollutants}
y_test_mean = {pollutant: train_data_mean[train_data_mean["Year"] == test_year][pollutant] for pollutant in pollutants}

y_train_median = {pollutant: train_data_median[train_data_median["Year"].isin(train_years)][pollutant] for pollutant in pollutants}
y_test_median = {pollutant: train_data_median[train_data_median["Year"] == test_year][pollutant] for pollutant in pollutants}

Defining the function to train the model

In [12]:

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
import numpy as np
import joblib

def train_linear_regression_for_all_polutants(X_train_mean, X_train_median, train_data_mean, train_data_median, pollutants):

    models_mean = {}
    models_median = {}

    for pollutant in pollutants:
        # Prepare the target variable for the current pollutant
        y_train_mean = train_data_mean[train_data_mean["Year"].isin([2013, 2016, 2019])][pollutant]
        y_train_median = train_data_median[train_data_median["Year"].isin([2013, 2016, 2019])][pollutant]

        # Apply One-Hot Encoding to the training sets
        X_train_mean_encoded = pd.get_dummies(X_train_mean, drop_first=True)
        X_train_median_encoded = pd.get_dummies(X_train_median, drop_first=True)

        
        #Fit linear regression for the mean
        linear_regression_mean = LinearRegression()
        linear_regression_mean.fit(X_train_mean_encoded, y_train_mean)
        models_mean[pollutant] = linear_regression_mean  # Save the trained model

        # Save the mean-imputed model to disk
        joblib.dump(linear_regression_mean, f'linear_regression_model_mean_{pollutant}.pkl')

        #Fit linear regression for the median

        #Fit linear regression for the mean
        linear_regression_median = LinearRegression()
        linear_regression_median.fit(X_train_median_encoded, y_train_median)
        models_mean[pollutant] = linear_regression_median  # Save the trained model

        # Save the mean-imputed model to disk
        joblib.dump(linear_regression_mean, f'linear_regression_model_mean_{pollutant}.pkl')
        

Running the function to train the model

In [13]:
X_train_mean = train_data_mean[train_data_mean["Year"].isin([2013, 2016, 2019])].drop(columns=pollutants + ["Year"])
X_train_median = train_data_median[train_data_median["Year"].isin([2013, 2016, 2019])].drop(columns=pollutants + ["Year"])


# Train Random Forest models for each pollutant and save them
models_mean, models_median = train_linear_regression_for_all_polutants(X_train_mean, X_train_median, train_data_mean, train_data_median, pollutants)

        Grid ID 2019  LAEI 1km2 ID  Easting  Northing    Borough     Zone  \
285264             1          5910   510500    203500    Non GLA  Non GLA   
285265             2          5911   511500    203500    Non GLA  Non GLA   
285266             3          5912   512500    203500    Non GLA  Non GLA   
285267             4          5915   515500    203500    Non GLA  Non GLA   
285268             5          5916   516500    203500    Non GLA  Non GLA   
...              ...           ...      ...       ...        ...      ...   
699115          3456         10059   531500    179500  Southwark    Inner   
699116          3457         10059   531500    179500  Southwark  Central   
699117          3458          9714   530500    181500     Camden  Central   
699118          3459          9716   532500    181500  Islington  Central   
699119          3460          9716   532500    181500       City  Central   

       Main Source Category          Sector        Source  bap  ...  hcl  h

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values