# Linear Models

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

from source.preprocessing import create_SIE_df
from source.preprocessing import train_test_split
from source.preprocessing import create_lagged_features
from source.saving_results import save_results_to_csv
from source.preprocessing import merge_temperature_data

## MODEL 1: Linear model for year averaged
### (Baseline Model for Year Averaged)

- Need a consistent test set to measure performance throughout the model comparison: train on sea-ice data up to and including 2011, and evaluate on the years 2012-2025

- Metrics I want to explicitly test are rmse and mae

- Need to consider how my test set selection impacts extrapolation when investigating first ice-free year

In [2]:
# Build the yearly-averaged sea-ice extent (SIE) frame and preview its first rows.
yearly_df = create_SIE_df("yearly")
print(yearly_df.head())

# Hold-out split around 2011. NOTE(review): presumably 2011 is the last
# training year (results are labelled train 1978-2011 / test 2012-2025) —
# confirm against source.preprocessing.train_test_split.
(
    X_train,
    y_train,
    X_test,
    y_test,
) = train_test_split(yearly_df, 2011)

   Year     Extent
0  1978  12.487000
1  1979  12.319560
2  1980  12.334148
3  1981  12.135486
4  1982  12.439445


In [3]:
# Model 1 baseline: ordinary least squares fitted on the yearly training split.
# fit() returns the fitted estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hand the held-out truth and predictions to the project's results helper,
# together with the labels that describe this run.
save_results_to_csv(
    model_name="Linear Regression",
    features="Year",
    target="Yearly averaged SIE",
    train_period="1978-2011",
    test_period="2012-2025",
    y_test=y_test,
    y_pred=y_pred,
)

mae = 0.2083645120170348, rmse=0.2419211873045379, r2= -0.02499090472636145


## MODEL 2: Linear model for monthly averaged
### (Baseline Model for Monthly Averaged)

- Target is now monthly averaged SIE per year, rather than yearly averaged

In [3]:
# Build the monthly-averaged sea-ice extent (SIE) frame and preview its first rows.
monthly_df = create_SIE_df("monthly")
print(monthly_df.head())

# Same 2011 cut-off as the yearly model so the two baselines share a test window.
(
    X_train_month,
    y_train_month,
    X_test_month,
    y_test_month,
) = train_test_split(monthly_df, 2011)

   Year  Month     Extent
0  1978     10  10.402667
1  1978     11  11.645133
2  1978     12  13.667063
3  1979      1  15.414000
4  1979      2  16.175286


In [7]:
# Model 2 baseline: ordinary least squares on the monthly training split.
# fit() returns the fitted estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train_month, y_train_month)
y_pred_month = model.predict(X_test_month)

# Hand the held-out truth and predictions to the project's results helper,
# together with the labels that describe this run.
save_results_to_csv(
    model_name="Linear Regression",
    features="Year, Month",
    target="Monthly averaged SIE for each year",
    train_period="1978-2011",
    test_period="2012-2025",
    y_test=y_test_month,
    y_pred=y_pred_month,
)

mae = 2.1808971714695016, rmse=2.610417471637942, r2= 0.4338773239433915


- Larger average error but higher r2 score
- Larger variability for monthly-averaged data --> larger average errors
- However, relative to this variability, the model accounts for a fair proportion of it (r2 ≈ 0.434)

## Generating lag features

- Generating lag features that hold the extent from each of the previous 3 months, and from 6 months, 1 year and 2 years earlier
- Referred to as lag_1, lag_2, lag_3, lag_6, lag_12, lag_24

In [6]:
# Lag offsets in months: the previous three months, half a year, one year, two years.
lags = [1, 2, 3, 6, 12, 24]

# Add one lag column per offset to the monthly frame, then preview the result.
monthly_df_lagged = create_lagged_features(monthly_df, lags)
print(monthly_df_lagged.head())

   Year  Month     Extent  ...      lag_6     lag_12     lag_24
0  1980     10   9.182750  ...  15.429067   8.747937  10.402667
1  1980     11  11.382867  ...  13.792600  10.943067  11.645133
2  1980     12  13.592933  ...  12.204600  13.336267  13.667063
3  1981      1  14.909688  ...  10.100062  14.861875  15.414000
4  1981      2  15.604071  ...   7.984267  15.955143  16.175286

[5 rows x 9 columns]


In [7]:
# Split the lag-augmented monthly frame with the same 2011 cut-off as the baselines.
(
    X_train_lagged,
    y_train_lagged,
    X_test_lagged,
    y_test_lagged,
) = train_test_split(monthly_df_lagged, 2011)

## Model 3: Linear model using lag features (monthly averaged)

In [7]:
# Model 3: ordinary least squares on the lag-augmented monthly training split.
# fit() returns the fitted estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train_lagged, y_train_lagged)
y_pred_lagged = model.predict(X_test_lagged)

# The training label starts at 1980 because the lagged frame's first row is 1980-10.
save_results_to_csv(
    model_name="Linear Regression",
    features="Year, Month, Lags",
    target="Monthly averaged SIE for each year",
    train_period="1980-2011",
    test_period="2012-2025",
    y_test=y_test_lagged,
    y_pred=y_pred_lagged,
)

mae = 0.2840172055801048, rmse=0.38597079969389864, r2= 0.9876234524341628


- Much larger r2 score and lower average errors compared to baseline model for monthly averaged
- Worth noting that we had less training data as a result of the lag generation

## Generating Mean Temperature Anomaly Feature

In [15]:
# Read the temperature-anomaly table; the first line of the file is skipped so
# that the line after it supplies the column names.
temp_df = pd.read_csv("../data/NH.Ts+dSST.csv", skiprows=1)

# Month abbreviations used as column headers, in calendar order; the same list
# drives both the reshape and the name -> number mapping (Jan=1 ... Dec=12).
month_abbrs = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
month_numbers = {abbr: number for number, abbr in enumerate(month_abbrs, start=1)}

# Wide (one column per month) -> long (one row per Year/Month pair).
# melt() returns a new frame, so temp_df itself is left untouched.
temp_long = temp_df.melt(
    id_vars="Year",
    value_vars=month_abbrs,
    var_name="Month",
    value_name="Temp Anomaly",
)
temp_long["Month"] = temp_long["Month"].map(month_numbers)

# Attach the anomaly column to the lag-augmented SIE frame and preview it.
monthly_SIE = merge_temperature_data(monthly_df_lagged, temp_long, ["Temp Anomaly"])
print("Monthly SIE merged with Temperature Data:\n",monthly_SIE.head())

Monthly SIE merged with Temperature Data:
    Year  Month     Extent  ...     lag_12     lag_24  Temp Anomaly
0  1980     10   9.182750  ...   8.747937  10.402667          0.12
1  1980     11  11.382867  ...  10.943067  11.645133          0.21
2  1980     12  13.592933  ...  13.336267  13.667063          0.09
3  1981      1  14.909688  ...  14.861875  15.414000          0.80
4  1981      2  15.604071  ...  15.955143  16.175286          0.62

[5 rows x 10 columns]


In [13]:
# Split the lag + temperature-anomaly frame with the same 2011 cut-off as the other models.
(
    X_train_lagged_temp,
    y_train_lagged_temp,
    X_test_lagged_temp,
    y_test_lagged_temp,
) = train_test_split(monthly_SIE, 2011)

## Model 4: Linear model using lag features and temperature anomalies (monthly averaged) 

In [18]:
# Model 4: ordinary least squares on lag features plus the temperature anomaly.
# fit() returns the fitted estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train_lagged_temp, y_train_lagged_temp)
y_pred_lagged_temp = model.predict(X_test_lagged_temp)

# Hand the held-out truth and predictions to the project's results helper,
# together with the labels that describe this run.
save_results_to_csv(
    model_name="Linear Regression",
    features="Year, Month, Lags, Monthly Averaged Temperature Anomaly",
    target="Monthly averaged SIE for each year",
    train_period="1980-2011",
    test_period="2012-2025",
    y_test=y_test_lagged_temp,
    y_pred=y_pred_lagged_temp,
)

mae = 0.2833555134830227, rmse=0.3843897256804391, r2= 0.9877976962929572


- Slightly higher predictive accuracy (lower MAE and RMSE) and a marginally higher r2 score when including temperature anomaly as a feature