In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_regression
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Show every column when a DataFrame is rendered (None disables the column cap).
pd.options.display.max_columns = None

### Link to reference: [How to perform multi-output regression with SVMs in Python](https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-perform-multioutput-regression-with-svms-in-python.md)

## 1. Import Data

In [2]:
# Define file paths
# The data directory can be overridden with the LAEI_DATA_DIR environment variable so the
# notebook also runs on machines other than the original author's; when the variable is
# unset, the original absolute location is used unchanged.
DATA_DIR = os.environ.get(
    "LAEI_DATA_DIR",
    "/Users/kitili/masters-big-data/uol_group_d/ipynb_files",
)
# file_mean = os.path.join(DATA_DIR, "LAEI_2019_NA_FILLED_WITH_MEAN.csv")
# NOTE(review): the name says "median", but the file loaded is the plain LAEI_2019.csv
# (missing values are still present at this point) -- confirm the intended input file.
file_median = os.path.join(DATA_DIR, "LAEI_2019.csv")

# Load the datasets
# mean_df = pd.read_csv(file_mean)
df = pd.read_csv(file_median)

# Quick sanity check of (rows, columns) right after loading
print(df.shape)

(699120, 30)


In [3]:
# Distinct values of the 'Year' column, in order of first appearance
pd.unique(df['Year'])

array([2030, 2025, 2019, 2016, 2013])

In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,Year,Grid ID 2019,LAEI 1km2 ID,Easting,Northing,Borough,Zone,Main Source Category,Sector,Source,bap,cd,c4h6,c6h6,ch4,co,co2,hc,hcl,hg,n2o,nh3,nmvoc,nox,pb,pcb,pm10,pm2.5,so2,Emissions Unit
0,2030,1,5910,510500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,,,,,,,,,,,,,,,,,0.019183,0.019183,,tonnes/annum
1,2030,2,5911,511500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,,,,,,,,,,,,,,,,,0.015719,0.015719,,tonnes/annum
2,2030,3,5912,512500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,,,,,,,,,,,,,,,,,0.019878,0.019878,,tonnes/annum
3,2030,4,5915,515500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,,,,,,,,,,,,,,,,,0.020946,0.020946,,tonnes/annum
4,2030,5,5916,516500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,,,,,,,,,,,,,,,,,0.020105,0.020105,,tonnes/annum


In [5]:
# Keep only the years used for modelling: 2013/2016/2019 (later used as train_years) and
# 2025 (later used as test_years). Every other year in the file is discarded.
# .copy() makes the result an independent frame, so the column assignments performed in
# later cells do not raise SettingWithCopyWarning or act on a view of the original data.
df = df[df['Year'].isin([2013, 2016, 2019, 2025])].copy()
df.shape

(556488, 30)

## 2. Data clean up and handling missing values

In [6]:
# Share of missing entries in each column (0.0 = complete, 1.0 = entirely missing)
df.isna().mean()

Year                    0.000000
Grid ID 2019            0.000000
LAEI 1km2 ID            0.000000
Easting                 0.000000
Northing                0.000000
Borough                 0.000000
Zone                    0.000000
Main Source Category    0.000000
Sector                  0.000000
Source                  0.000000
bap                     0.745080
cd                      0.807256
c4h6                    0.713992
c6h6                    0.664252
ch4                     0.595858
co                      0.602076
co2                     0.229748
hc                      0.950259
hcl                     0.875649
hg                      0.794820
n2o                     0.713992
nh3                     0.745080
nmvoc                   0.583423
nox                     0.148920
pb                      0.794820
pcb                     0.788603
pm10                    0.024870
pm2.5                   0.024870
so2                     0.713992
Emissions Unit          0.000000
dtype: float64

In [7]:
# Collect the names of all columns that contain at least one missing value;
# these are the columns imputed in the next step.
has_null = df.isna().any()
columns_with_nulls = has_null[has_null].index.tolist()
# columns_with_nulls

In [8]:
# Replace nulls with the median of specified columns per 'Grid ID 2019'
# The medians for all affected columns are computed in a single vectorised groupby pass
# (instead of one Python-level lambda call per column and per group) and then used to
# fill only the missing entries. A group whose values are all missing has a NaN median,
# so its nulls stay NaN -- the same outcome as filling with x.median() group by group.
# NOTE(review): the medians are taken over every row still in df, which includes the
# 2025 rows later used as the test set -- confirm this leakage is acceptable.
if columns_with_nulls:  # nothing to impute when no column has missing values
    group_medians = df.groupby('Grid ID 2019')[columns_with_nulls].transform('median')
    df[columns_with_nulls] = df[columns_with_nulls].fillna(group_medians)

In [9]:
# Preview the first five rows after imputation
df.head(n=5)

Unnamed: 0,Year,Grid ID 2019,LAEI 1km2 ID,Easting,Northing,Borough,Zone,Main Source Category,Sector,Source,bap,cd,c4h6,c6h6,ch4,co,co2,hc,hcl,hg,n2o,nh3,nmvoc,nox,pb,pcb,pm10,pm2.5,so2,Emissions Unit
142632,2025,1,5910,510500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.0,0.0,0.0,0.204897,0.0,0.0,0.0,0.0,0.0,0.0,0.000146,0.0,0.0,0.020429,0.020429,0.0,tonnes/annum
142633,2025,2,5911,511500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.0,0.0,0.0,7.285452,0.0,0.0,0.0,0.0,0.0,0.0,0.000943,0.0,0.0,0.01674,0.01674,0.0,tonnes/annum
142634,2025,3,5912,512500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.0,0.0,0.0,3.839111,0.0,0.0,0.0,0.0,0.0,0.0,0.002829,0.0,0.0,0.021169,0.021169,0.0,tonnes/annum
142635,2025,4,5915,515500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.000138,0.0,0.0,0.434381,0.008365,0.0,0.0,0.0,0.0,0.00765,0.000788,0.0,0.0,0.022307,0.022307,0.0,tonnes/annum
142636,2025,5,5916,516500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,4.3e-05,0.0,0.0,0.308439,0.005322,0.0,0.0,0.0,0.0,0.001464,0.000336,0.0,0.0,0.02141,0.02141,0.0,tonnes/annum


In [10]:
# Drop the two grid identifier columns and the 'Emissions Unit' column,
# then preview the remaining columns.
columns_to_drop = ['Grid ID 2019', 'LAEI 1km2 ID', 'Emissions Unit']
df = df.drop(columns_to_drop, axis='columns')
df.head()

Unnamed: 0,Year,Easting,Northing,Borough,Zone,Main Source Category,Sector,Source,bap,cd,c4h6,c6h6,ch4,co,co2,hc,hcl,hg,n2o,nh3,nmvoc,nox,pb,pcb,pm10,pm2.5,so2
142632,2025,510500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.0,0.0,0.0,0.204897,0.0,0.0,0.0,0.0,0.0,0.0,0.000146,0.0,0.0,0.020429,0.020429,0.0
142633,2025,511500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.0,0.0,0.0,7.285452,0.0,0.0,0.0,0.0,0.0,0.0,0.000943,0.0,0.0,0.01674,0.01674,0.0
142634,2025,512500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.0,0.0,0.0,3.839111,0.0,0.0,0.0,0.0,0.0,0.0,0.002829,0.0,0.0,0.021169,0.021169,0.0
142635,2025,515500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,0.000138,0.0,0.0,0.434381,0.008365,0.0,0.0,0.0,0.0,0.00765,0.000788,0.0,0.0,0.022307,0.022307,0.0
142636,2025,516500,203500,Non GLA,Non GLA,Domestic,Biomass,Wood Burning,0.0,0.0,0.0,4.3e-05,0.0,0.0,0.308439,0.005322,0.0,0.0,0.0,0.0,0.001464,0.000336,0.0,0.0,0.02141,0.02141,0.0


## 3. Feature Engineering

In [11]:
# Treat the grid coordinates as categorical labels rather than numbers, so the
# one-hot encoding step creates one indicator column per distinct coordinate value.
for coordinate in ('Easting', 'Northing'):
    df[coordinate] = df[coordinate].astype('category')

In [12]:
### One-Hot Encoding of Categorical Variables
# Every categorical/object column is expanded into indicator columns; the first level of
# each is dropped (drop_first=True) so the indicators of one variable are not redundant.
df_encoded = pd.get_dummies(data=df, drop_first=True)
df_encoded.head(n=5)

Unnamed: 0,Year,bap,cd,c4h6,c6h6,ch4,co,co2,hc,hcl,hg,n2o,nh3,nmvoc,nox,pb,pcb,pm10,pm2.5,so2,Easting_502500,Easting_503500,Easting_504500,Easting_505500,Easting_506500,Easting_507500,Easting_508500,Easting_509500,Easting_510500,Easting_511500,Easting_512500,Easting_513500,Easting_514500,Easting_515500,Easting_516500,Easting_517500,Easting_518500,Easting_519500,Easting_520500,Easting_521500,Easting_522500,Easting_523500,Easting_524500,Easting_525500,Easting_526500,Easting_527500,Easting_528500,Easting_529500,Easting_530500,Easting_531500,Easting_532500,Easting_533500,Easting_534500,Easting_535500,Easting_536500,Easting_537500,Easting_538500,Easting_539500,Easting_540500,Easting_541500,Easting_542500,Easting_543500,Easting_544500,Easting_545500,Easting_546500,Easting_547500,Easting_548500,Easting_549500,Easting_550500,Easting_551500,Easting_552500,Easting_553500,Easting_554500,Easting_555500,Easting_556500,Easting_557500,Easting_558500,Easting_559500,Easting_560500,Easting_561500,Northing_153500,Northing_154500,Northing_155500,Northing_156500,Northing_157500,Northing_158500,Northing_159500,Northing_160500,Northing_161500,Northing_162500,Northing_163500,Northing_164500,Northing_165500,Northing_166500,Northing_167500,Northing_168500,Northing_169500,Northing_170500,Northing_171500,Northing_172500,Northing_173500,Northing_174500,Northing_175500,Northing_176500,Northing_177500,Northing_178500,Northing_179500,Northing_180500,Northing_181500,Northing_182500,Northing_183500,Northing_184500,Northing_185500,Northing_186500,Northing_187500,Northing_188500,Northing_189500,Northing_190500,Northing_191500,Northing_192500,Northing_193500,Northing_194500,Northing_195500,Northing_196500,Northing_197500,Northing_198500,Northing_199500,Northing_200500,Northing_201500,Northing_202500,Northing_203500,Borough_Barnet,Borough_Bexley,Borough_Brent,Borough_Bromley,Borough_Camden,Borough_City,Borough_City of 
Westminster,Borough_Croydon,Borough_Ealing,Borough_Enfield,Borough_Greenwich,Borough_Hackney,Borough_Hammersmith and Fulham,Borough_Haringey,Borough_Harrow,Borough_Havering,Borough_Hillingdon,Borough_Hounslow,Borough_Islington,Borough_Kensington and Chelsea,Borough_Kingston,Borough_Lambeth,Borough_Lewisham,Borough_Merton,Borough_Newham,Borough_Non GLA,Borough_Redbridge,Borough_Richmond,Borough_Southwark,Borough_Sutton,Borough_Tower Hamlets,Borough_Waltham Forest,Borough_Wandsworth,Zone_Inner,Zone_Non GLA,Zone_Outer,Main Source Category_Industrial and Commercial,Main Source Category_Miscellaneous,Main Source Category_Resuspension,Main Source Category_Transport,Sector_Agriculture,Sector_Aviation,Sector_Biomass,Sector_Commercial Cooking,Sector_Construction,Sector_Forestry,Sector_Gas Leakage,Sector_Heat and Power Generation,Sector_Industrial Processes,Sector_Machinery,Sector_Rail,Sector_Resuspension,Sector_River,Sector_Road Transport,Sector_Waste,Source_Agriculture,Source_Aviation,Source_Car - Diesel,Source_Car - Electric,Source_Car - Petrol,Source_Commercial Cooking,Source_Commercial Shipping,Source_Construction / Demolition Dust,Source_Forestry,Source_Freight,Source_Gas Combustion,Source_HGV - Articulated,Source_HGV - Rigid,Source_Household and Garden NRMM,Source_LGV - Diesel,Source_LGV - Electric,Source_LGV - Petrol,Source_Landfill,Source_Motorcycle,Source_NRMM Exhaust,Source_Natural Gas Leakage,Source_Non-TfL Bus / Coach,Source_Oil/Coal Combustion,Source_PHV - Diesel,Source_PHV - Electric,Source_PHV - Petrol,Source_Part A1,Source_Part A2 / B,Source_Passenger Shipping,Source_Passengers,Source_Resuspension,Source_STW,Source_Small Private Vessels,Source_Small Scale Waste Burning,Source_Taxi,Source_TfL Bus,Source_WTS,Source_Wood Burning
142632,2025,0.0,0.0,0.0,0.0,0.0,0.0,0.204897,0.0,0.0,0.0,0.0,0.0,0.0,0.000146,0.0,0.0,0.020429,0.020429,0.0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
142633,2025,0.0,0.0,0.0,0.0,0.0,0.0,7.285452,0.0,0.0,0.0,0.0,0.0,0.0,0.000943,0.0,0.0,0.01674,0.01674,0.0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
142634,2025,0.0,0.0,0.0,0.0,0.0,0.0,3.839111,0.0,0.0,0.0,0.0,0.0,0.0,0.002829,0.0,0.0,0.021169,0.021169,0.0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
142635,2025,0.0,0.0,0.0,0.000138,0.0,0.0,0.434381,0.008365,0.0,0.0,0.0,0.0,0.00765,0.000788,0.0,0.0,0.022307,0.022307,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
142636,2025,0.0,0.0,0.0,4.3e-05,0.0,0.0,0.308439,0.005322,0.0,0.0,0.0,0.0,0.001464,0.000336,0.0,0.0,0.02141,0.02141,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


## 4. Train a model

### 4.1 Train test split

In [13]:
pollutants = ["nox", "pm10", "pm2.5", "co2"]   # regression targets
train_years = [2013, 2016, 2019]
test_years = [2025]


def _rows_for_years(frame, years):
    """Return the rows of `frame` whose 'Year' is in `years`, with 'Year' removed."""
    in_years = frame['Year'].isin(years)
    return frame[in_years].drop(columns='Year')


# Features: everything except the target pollutants.
# Targets: the pollutants, with 'Year' kept temporarily so rows can be split by year.
X = df_encoded.drop(columns=pollutants)
Y = df_encoded[pollutants + ['Year']]

X_train = _rows_for_years(X, train_years)
Y_train = _rows_for_years(Y, train_years)

X_test = _rows_for_years(X, test_years)
Y_test = _rows_for_years(Y, test_years)

In [14]:
# Standardise the features: the scaler is fitted on the training rows only and the
# same fitted transformation is then applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 4.2 Train the model

In [15]:
# Create the SVR regressor
# svr = SVR(epsilon=0.2)

In [16]:
# Wrap a LinearSVR in MultiOutputRegressor so that one independent linear SVR is
# fitted for each target column.
# random_state is fixed because LinearSVR shuffles the data with a pseudo-random
# generator; without a seed the fitted model and the reported metrics can differ
# between runs.
mor = MultiOutputRegressor(LinearSVR(max_iter=10000, random_state=42))

In [17]:
# Fit the multi-output regressor on the scaled training features and the training targets
mor.fit(X=X_train_scaled, y=Y_train)



In [18]:
# Display the MultiOutputRegressor fitted in the previous cell (its repr is the cell output)
mor

In [19]:
# Predict all targets for the scaled test features; the result is indexed by column position
Y_pred = mor.predict(X=X_test_scaled)

In [24]:
# Predictions on testing data
Y_pred = mor.predict(X_test_scaled)

# Evaluate
# The target names are taken from Y_test.columns and each name is paired with its own
# column position in Y_pred, so a label can never drift away from the column it
# describes if the list of targets is changed or reordered.
target_names = list(Y_test.columns)

# Mean squared error per target
for position, target in enumerate(target_names):
    print(f'MSE {target}:', mean_squared_error(Y_test[target], Y_pred[:, position]))

print('\n')
print('-'*50)

# Mean absolute error per target
for position, target in enumerate(target_names):
    print(f'MAE {target}:', mean_absolute_error(Y_test[target], Y_pred[:, position]))


MSE nox: 1474.4965118294188
MSE pm10: 0.15453055915996647
MSE pm2.5: 0.19232265223990952
MSE co2: 2159550898.1401033


--------------------------------------------------
MAE nox: 0.4361962525415856
MAE pm10: 0.057003121375935856
MAE pm2.5: 0.02687167185997753
MAE co2: 455.2559251718847
