# Linear Models

In [1]:
import pandas as pd
import numpy as np

from source.preprocessing import create_SIE_df, train_test_split, create_lagged_features, create_temp_df, merge_temperature_data, merge_co2_data
from source.saving_results import save_results_to_csv
from source.models import linear_regression


## MODEL 1: Linear model for year averaged
### (Baseline Model for Year Averaged)

- Need a consistent test set to measure performance throughout the model comparison: train on sea-ice data up to and including 2011, and evaluate on years 2012-2025

- Metrics I want to explicitly test are rmse and mae

- Need to consider how my test set selection impacts extrapolation when investigating first ice-free year

In [2]:
# Build the yearly-averaged sea-ice-extent (SIE) frame and preview its first rows.
yearly_df = create_SIE_df("yearly")
print(yearly_df.head())

# Split around the year 2011; the results below label this as
# training on 1978-2011 and testing on 2012-2025.
(
    X_train,
    y_train,
    X_test,
    y_test,
) = train_test_split(yearly_df, 2011)

# Sanity check: show the shape of each piece of the split.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

   Year     Extent
0  1978  12.487000
1  1979  12.319560
2  1980  12.334148
3  1981  12.135486
4  1982  12.439445
(34, 1) (34,) (14, 1) (14,)


In [4]:
# Model 1 (yearly baseline): fit on the training split and keep the value
# returned by linear_regression as the test-set predictions.
y_pred = linear_regression(X_train, y_train, X_test, y_test)

# Persist the predictions together with the metadata describing this run.
save_results_to_csv(
    y_test=y_test, y_pred=y_pred,
    model_name="Linear Regression", features="Year",
    target="Yearly averaged SIE",
    train_period="1978-2011", test_period="2012-2025",
)

mae = 0.2083645120170622, rmse=0.24192118730474046, r2= -0.02499090472807808


## MODEL 2: Linear model for monthly averaged
### (Baseline Model for Monthly Averaged)

- Target is now monthly averaged SIE per year, rather than yearly averaged

In [3]:
# Build the monthly-averaged SIE frame (one row per Year/Month) and preview it.
monthly_df = create_SIE_df("monthly")
print(monthly_df.head())

# Same 2011 split year as the yearly model, so the test periods are comparable.
(
    X_train_month,
    y_train_month,
    X_test_month,
    y_test_month,
) = train_test_split(monthly_df, 2011)

   Year  Month     Extent
0  1978     10  10.402667
1  1978     11  11.645133
2  1978     12  13.667063
3  1979      1  15.414000
4  1979      2  16.175286


In [6]:
# Model 2 (monthly baseline): fit on Year and Month only.
y_pred_month = linear_regression(
    X_train_month, y_train_month, X_test_month, y_test_month
)

# Persist the predictions together with the metadata describing this run.
save_results_to_csv(
    y_test=y_test_month, y_pred=y_pred_month,
    model_name="Linear Regression", features="Year, Month",
    target="Monthly averaged SIE for each year",
    train_period="1978-2011", test_period="2012-2025",
)

mae = 2.18089717147176, rmse=2.6104174716384017, r2= 0.433877323943192


- Larger average error but higher r2 score
- Larger variability for monthly-averaged data --> larger average errors
- However, relative to this variability, the model accounts for a fair proportion of it (r2 ≈ 0.434)

## Generating lag features

- Generating features to keep track of previous 3 months, half-year, 1 year and 2 years ago
- Referred to as lag_1, lag_2, lag_3, lag_6, lag_12, lag_24

In [4]:
# Lag offsets in months: previous 3 months, half a year, 1 year and 2 years.
lags = [1, 2, 3, 6, 12, 24]

# Add one lag_<n> column per offset, then preview the result.
# NOTE(review): the preview starts at 1980-10, presumably because rows without
# a full 24-month history are dropped - confirm in create_lagged_features.
monthly_df_lagged = create_lagged_features(monthly_df, lags)
print(monthly_df_lagged.head())

   Year  Month     Extent      lag_1      lag_2      lag_3      lag_6  \
0  1980     10   9.182750   7.667067   7.984267  10.100062  15.429067   
1  1980     11  11.382867   9.182750   7.667067   7.984267  13.792600   
2  1980     12  13.592933  11.382867   9.182750   7.667067  12.204600   
3  1981      1  14.909688  13.592933  11.382867   9.182750  10.100062   
4  1981      2  15.604071  14.909688  13.592933  11.382867   7.984267   

      lag_12     lag_24  
0   8.747937  10.402667  
1  10.943067  11.645133  
2  13.336267  13.667063  
3  14.861875  15.414000  
4  15.955143  16.175286  


In [8]:
# Split the lag-augmented monthly frame on the same 2011 split year.
(
    X_train_lagged,
    y_train_lagged,
    X_test_lagged,
    y_test_lagged,
) = train_test_split(monthly_df_lagged, 2011)

## Model 3: Linear model for monthly averaged; adding lag features 

In [9]:
# Model 3: monthly model with the lag features added.
y_pred_lagged = linear_regression(
    X_train_lagged, y_train_lagged, X_test_lagged, y_test_lagged
)

# Persist the predictions; the training period is labelled from 1980 because
# the lagged frame begins in 1980.
save_results_to_csv(
    y_test=y_test_lagged, y_pred=y_pred_lagged,
    model_name="Linear Regression", features="Year, Month, Lags",
    target="Monthly averaged SIE for each year",
    train_period="1980-2011", test_period="2012-2025",
)

mae = 0.28401720556050636, rmse=0.3859707996720303, r2= 0.9876234524355654


- Much larger r2 score and lower average errors compared to baseline model for monthly averaged
- Worth noting that we had less training data as a result of the lag generation

## Generating mean temperature anomaly feature

In [5]:
# Load the temperature data and attach its "Temp Anomaly" column to the
# lag-augmented monthly SIE frame.
temp_df = create_temp_df()
monthly_SIE = merge_temperature_data(
    monthly_df_lagged,
    temp_df,
    ["Temp Anomaly"],
)

# Preview the merged frame.
print("Monthly SIE merged with Temperature Data:\n", monthly_SIE.head())

Monthly SIE merged with Temperature Data:
    Year  Month     Extent      lag_1      lag_2      lag_3      lag_6  \
0  1980     10   9.182750   7.667067   7.984267  10.100062  15.429067   
1  1980     11  11.382867   9.182750   7.667067   7.984267  13.792600   
2  1980     12  13.592933  11.382867   9.182750   7.667067  12.204600   
3  1981      1  14.909688  13.592933  11.382867   9.182750  10.100062   
4  1981      2  15.604071  14.909688  13.592933  11.382867   7.984267   

      lag_12     lag_24  Temp Anomaly  
0   8.747937  10.402667          0.12  
1  10.943067  11.645133          0.21  
2  13.336267  13.667063          0.09  
3  14.861875  15.414000          0.80  
4  15.955143  16.175286          0.62  


In [11]:
# Split the lag + temperature frame on the same 2011 split year.
(
    X_train_lagged_temp,
    y_train_lagged_temp,
    X_test_lagged_temp,
    y_test_lagged_temp,
) = train_test_split(monthly_SIE, 2011)

## Model 4: Linear model for monthly averaged; adding temp anomaly as a feature

In [12]:
# Model 4: monthly model with lag features plus the temperature anomaly.
y_pred_lagged_temp = linear_regression(
    X_train_lagged_temp,
    y_train_lagged_temp,
    X_test_lagged_temp,
    y_test_lagged_temp,
)

# Persist the predictions together with the metadata describing this run.
save_results_to_csv(
    y_test=y_test_lagged_temp, y_pred=y_pred_lagged_temp,
    model_name="Linear Regression",
    features="Year, Month, Lags, Northern Hemisphere Temperature Anomaly",
    target="Monthly averaged SIE for each year",
    train_period="1980-2011", test_period="2012-2025",
)

mae = 0.28335551351890587, rmse=0.3843897257177004, r2= 0.9877976962905916


- Slightly higher predictive accuracy (lower MAE and RMSE) and slightly higher r2 score when including temperature anomaly as a feature

## Generating global CO2 concentration feature

In [6]:
# Read the monthly global CO2 file, skipping its first 38 lines.
# NOTE(review): the 38 skipped lines are presumably a descriptive preamble -
# confirm against the data file. If they are all "#"-prefixed, comment="#"
# would be less brittle than a hard-coded line count.
monthly_co2_df = pd.read_csv(
    "../data/co2_mm_gl.csv",
    skiprows=38,
)

# Attach the CO2 data to the lag + temperature frame and preview the result.
monthly_data_with_co2 = merge_co2_data(monthly_SIE, monthly_co2_df)
print("Monthly SIE merged with CO2 Data:\n", monthly_data_with_co2.head())

Monthly SIE merged with CO2 Data:
    Year  Month     Extent      lag_1      lag_2      lag_3      lag_6  \
0  1980     10   9.182750   7.667067   7.984267  10.100062  15.429067   
1  1980     11  11.382867   9.182750   7.667067   7.984267  13.792600   
2  1980     12  13.592933  11.382867   9.182750   7.667067  12.204600   
3  1981      1  14.909688  13.592933  11.382867   9.182750  10.100062   
4  1981      2  15.604071  14.909688  13.592933  11.382867   7.984267   

      lag_12     lag_24  Temp Anomaly  CO2 Concentration  
0   8.747937  10.402667          0.12             337.82  
1  10.943067  11.645133          0.21             338.93  
2  13.336267  13.667063          0.09             339.64  
3  14.861875  15.414000          0.80             340.18  
4  15.955143  16.175286          0.62             340.75  


In [14]:
# Split the lag + temperature + CO2 frame on the same 2011 split year.
(
    X_train_lagged_temp_co2,
    y_train_lagged_temp_co2,
    X_test_lagged_temp_co2,
    y_test_lagged_temp_co2,
) = train_test_split(monthly_data_with_co2, 2011)

## Model 5: Linear model for monthly averaged; adding CO2 concentration as a feature

In [15]:
# Model 5: monthly model with lag features, temperature anomaly and CO2.
y_pred_lagged_temp_co2 = linear_regression(
    X_train_lagged_temp_co2,
    y_train_lagged_temp_co2,
    X_test_lagged_temp_co2,
    y_test_lagged_temp_co2,
)

# Persist the predictions together with the metadata describing this run.
save_results_to_csv(
    y_test=y_test_lagged_temp_co2, y_pred=y_pred_lagged_temp_co2,
    model_name="Linear Regression",
    features="Year, Month, Lags, Northern Hemisphere Temperature Anomaly, Global CO2 Concentration",
    target="Monthly averaged SIE for each year",
    train_period="1980-2011", test_period="2012-2025",
)

mae = 0.3188156871537209, rmse=0.42200035085578225, r2= 0.9853605528659215


- Slightly lower predictive accuracy (higher MAE and RMSE) and slightly lower r2 score after including CO2 concentration as a feature