In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_regression
from sklearn.metrics import r2_score
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

In [43]:
import sys
sys.setrecursionlimit(10000)

### Link to reference: [Link](https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-perform-multioutput-regression-with-svms-in-python.mdm)

## 1. Import Data

In [44]:
# Define file paths
file_mean = "/Users/kitili/masters-big-data/uol_group_d/ipynb_files/LAEI_2019_NA_FILLED_WITH_MEAN.csv"

# Load the datasets
# mean_df = pd.read_csv(file_mean)
df = pd.read_csv(file_mean)

df.head()

Unnamed: 0,Year,Sector,nox,n2o,pm10,pm2.5,co2
0,2013,Accidental Fires,21.129667,55.804167,84.125344,78.04113,895587.8
1,2013,Agriculture,244.437997,204.042963,29.984091,16.049516,6776.104
2,2013,Aviation,3851.256755,55.804167,69.838248,58.381094,1054197.0
3,2013,Biomass,1257.982209,55.804167,1054.000324,1054.000324,895587.8
4,2013,Commercial Cooking,1257.982209,55.804167,547.828374,547.828374,895587.8


In [45]:
df['Year'].unique()

array([2013, 2016, 2019, 2025, 2030])

In [46]:
# sum emissions per year
df_sum = df.groupby(['Year']).sum()
df_sum = df_sum.reset_index()
df_sum

Unnamed: 0,Year,Sector,nox,n2o,pm10,pm2.5,co2
0,2013,Accidental FiresAgricultureAviationBiomassComm...,67998.611444,2007.135045,9877.244675,4834.691018,36766680.0
1,2016,Accidental FiresAgricultureAviationBiomassComm...,62880.177309,2179.036897,9770.843908,4368.504513,38741490.0
2,2019,Accidental FiresAgricultureAviationBiomassComm...,52953.303739,2518.37106,9389.845447,4110.513467,37124630.0
3,2025,Accidental FiresAgricultureAviationBiomassComm...,38298.716117,2284.0593,8726.431828,3635.249158,35713140.0
4,2030,Accidental FiresAgricultureAviationBiomassComm...,32054.313404,2287.065063,8255.946953,3369.858919,32614560.0


In [47]:
# filter training data
df = df[df['Year'].isin([2013, 2016, 2019, 2025])]
df.shape

(64, 7)

## 2. Data clean up and handling missing values

## 3. Feature Engineering

In [48]:
### One-Hot Encoding of Categorical Variables

df_encoded = pd.get_dummies(df, drop_first=False)
df_encoded.head()

Unnamed: 0,Year,nox,n2o,pm10,pm2.5,co2,Sector_Accidental Fires,Sector_Agriculture,Sector_Aviation,Sector_Biomass,Sector_Commercial Cooking,Sector_Construction,Sector_Forestry,Sector_Gas Leakage,Sector_Heat and Power Generation,Sector_Industrial Processes,Sector_Machinery,Sector_Rail,Sector_Resuspension,Sector_River,Sector_Road Transport,Sector_Waste
0,2013,21.129667,55.804167,84.125344,78.04113,895587.8,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2013,244.437997,204.042963,29.984091,16.049516,6776.104,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2013,3851.256755,55.804167,69.838248,58.381094,1054197.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2013,1257.982209,55.804167,1054.000324,1054.000324,895587.8,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
4,2013,1257.982209,55.804167,547.828374,547.828374,895587.8,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False


## 4. Train a model

### 4.1 Train test split

In [49]:
pollutants = ["nox", "pm10", "pm2.5", "co2"]
train_years = [2013, 2016, 2019]
test_years = [2025]

X = df_encoded.drop(columns=pollutants)
Y = df_encoded[pollutants + ['Year']]

X_train = X[X['Year'].isin(train_years)].drop(columns='Year')
Y_train = Y[Y['Year'].isin(train_years)].drop(columns='Year')

X_test = X[X['Year'].isin(test_years)].drop(columns='Year')
Y_test = Y[Y['Year'].isin(test_years)].drop(columns='Year')

In [50]:
# scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 4.2 Train the model

In [51]:
# Create the SVR regressor
# svr = SVR(epsilon=0.2)

In [74]:
# wrap the SVR in MultiOutputRegressor
# mor = MultiOutputRegressor(LinearSVR(max_iter=10000))
mor = MultiOutputRegressor(SVR())

In [75]:
# train the regressor
mor.fit(X_train_scaled, Y_train)

In [76]:
# average r2 score for all pollutants
mor.score(X_train_scaled, Y_train)

-0.19645498968871067

In [77]:
# r2 score for each pollutant
Y_pred = mor.predict(X_train_scaled)
# convert to df
Y_pred = pd.DataFrame(Y_pred, columns=pollutants)

r2_scores = {col: r2_score(Y_train[col], Y_pred[col]) for col in Y_test.columns}

r2_scores

{'nox': -0.11899736480466339,
 'pm10': -0.29438301992218263,
 'pm2.5': -0.24227054310588025,
 'co2': -0.1301690309221164}