# Linear Models

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

from source.preprocessing import create_df
from source.preprocessing import train_test_split
from source.preprocessing import create_lagged_features
from source.saving_results import save_results_to_csv

## MODEL 1: Linear model by year averaged
### (Baseline Model for Year Averaged)

- Need a consistent test set to measure performance throughout the model comparison - perhaps training on sea-ice data up to and including 2011, and evaluating on years 2012-2025

- Metrics I want to explicitly test are rmse and mae

- Need to consider how my test set selection impacts extrapolation when investigating first ice-free year

In [2]:
# Build the yearly-averaged sea-ice extent table and preview its first rows.
yearly_df = create_df("yearly")
print(yearly_df.head())

# Split on the cut-off year 2011 using the project helper from
# source.preprocessing (this is not scikit-learn's train_test_split).
# NOTE(review): presumably 2011 itself falls in the training portion, which
# would match the "1978-2011" / "2012-2025" periods recorded later - confirm.
(
    X_train,
    y_train,
    X_test,
    y_test,
) = train_test_split(yearly_df, 2011)

   Year     Extent
0  1978  12.487000
1  1979  12.319560
2  1980  12.334148
3  1981  12.135486
4  1982  12.439445


In [3]:
# Baseline 1: ordinary least squares fitted on the yearly training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out years.
y_pred = model.predict(X_test)

# Hand the predictions and run metadata to the project's results helper; the
# recorded cell output shows MAE, RMSE and R^2 being reported by this call.
# NOTE(review): presumably this also writes a row to a results CSV - see
# source.saving_results for the destination.
save_results_to_csv(
    y_test=y_test, y_pred=y_pred,
    model_name="Linear Regression",
    features="Year", target="Yearly averaged SIE",
    train_period="1978-2011", test_period="2012-2025",
)

mae = 0.2083645120170348, rmse=0.2419211873045379, r2= -0.02499090472636145


## MODEL 2: Linear model by monthly average
### (Baseline Model for Monthly Averaged)

- Target is now monthly averaged rather than yearly

In [2]:
# Build the monthly-averaged sea-ice extent table and preview its first rows.
monthly_average = create_df("monthly")
print(monthly_average.head())

# Same cut-off year (2011) as the yearly baseline so the test windows line up.
# NOTE(review): split semantics live in source.preprocessing.train_test_split;
# confirm that the cut-off year is included in the training portion.
(
    X_train_month,
    y_train_month,
    X_test_month,
    y_test_month,
) = train_test_split(monthly_average, 2011)

   Year  Month     Extent
0  1978     10  10.402667
1  1978     11  11.645133
2  1978     12  13.667063
3  1979      1  15.414000
4  1979      2  16.175286


In [7]:
# Baseline 2: ordinary least squares fitted on the monthly training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
# Note that this rebinds `model`, replacing the previously fitted estimator.
model = LinearRegression().fit(X_train_month, y_train_month)

# Predictions for the held-out months.
y_pred_month = model.predict(X_test_month)

# Hand the predictions and run metadata to the project's results helper; the
# recorded cell output shows MAE, RMSE and R^2 being reported by this call.
# NOTE(review): presumably this also writes a row to a results CSV - see
# source.saving_results for the destination.
save_results_to_csv(
    y_test=y_test_month, y_pred=y_pred_month,
    model_name="Linear Regression",
    features="Year, Month", target="Monthly averaged SIE for each year",
    train_period="1978-2011", test_period="2012-2025",
)

mae = 2.1808971714695016, rmse=2.610417471637942, r2= 0.4338773239433915


- Larger average error but higher r2 score
- Larger variability for monthly-averaged data --> larger average errors
- However, relative to this variability, the model accounts for a fair proportion of it (≈ 0.434)

## Generating Lag Features

- Generating features that hold the extent from each of the previous 3 months, and from half a year, 1 year and 2 years earlier
- Referred to as Lag 1, Lag 2, Lag 3, Lag 6, Lag 12, Lag 24 (lags are measured in months)

In [3]:
# Lag offsets, in rows of the monthly table (i.e. months).
lags = [
    1, 2, 3,  # the three preceding months
    6,        # half a year back
    12,       # one year back
    24,       # two years back
]

# Add one lag column per offset via the project helper. In the recorded output
# the first retained row has index 24, so the earliest rows are not present.
# NOTE(review): presumably rows without a full lag history are dropped inside
# create_lagged_features - confirm in source.preprocessing.
df_lagged = create_lagged_features(monthly_average, lags)
print(df_lagged.head())

    Year  Month     Extent      lag_1      lag_2      lag_3      lag_6  \
24  1980     10   9.182750   7.667067   7.984267  10.100062  15.429067   
25  1980     11  11.382867   9.182750   7.667067   7.984267  13.792600   
26  1980     12  13.592933  11.382867   9.182750   7.667067  12.204600   
27  1981      1  14.909688  13.592933  11.382867   9.182750  10.100062   
28  1981      2  15.604071  14.909688  13.592933  11.382867   7.984267   

       lag_12     lag_24  
24   8.747937  10.402667  
25  10.943067  11.645133  
26  13.336267  13.667063  
27  14.861875  15.414000  
28  15.955143  16.175286  


In [6]:
# Split the lag-augmented monthly table at the same cut-off year (2011) used
# for the two baselines, via the project helper from source.preprocessing.
(
    X_train_lagged,
    y_train_lagged,
    X_test_lagged,
    y_test_lagged,
) = train_test_split(df_lagged, 2011)

## MODEL 3: Linear model using lag features (monthly averaged)

In [7]:
# Model 3: ordinary least squares fitted on the lag-augmented training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
# Note that this rebinds `model`, replacing the previously fitted estimator.
model = LinearRegression().fit(X_train_lagged, y_train_lagged)

# Predictions for the held-out months.
y_pred_lagged = model.predict(X_test_lagged)

# Hand the predictions and run metadata to the project's results helper; the
# recorded cell output shows MAE, RMSE and R^2 being reported by this call.
# The training period starts in 1980 because the lagged table starts there.
# NOTE(review): presumably this also writes a row to a results CSV - see
# source.saving_results for the destination.
save_results_to_csv(
    y_test=y_test_lagged, y_pred=y_pred_lagged,
    model_name="Linear Regression",
    features="Year, Month, Lags", target="Monthly averaged SIE for each year",
    train_period="1980-2011", test_period="2012-2025",
)

mae = 0.2840172055801048, rmse=0.38597079969389864, r2= 0.9876234524341628


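- Much higher r2 score and lower average errors compared to the baseline model for monthly-averaged data
- Worth noting that we had less training data as a result of the lag generation: the lagged table starts at index 24 (October 1980), so the first 24 months are not available for training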