# Model Training

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

from scipy.optimize import minimize
import statsmodels.tsa.api as smt
import statsmodels.api as sm

import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings raised by the
# libraries imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

from scripts.helper_functions import *

In [2]:
# Read the cleaned sea ice extent table from disk and preview its first ten rows
sie_csv_path = "./data/arctic_sie_clean.csv"
sie_df = pd.read_csv(filepath_or_buffer=sie_csv_path)
sie_df.head(n=10)

Unnamed: 0,date,extent_million_sq_km
0,10/26/1978,10.231
1,10/28/1978,10.42
2,10/30/1978,10.557
3,11/1/1978,10.67
4,11/3/1978,10.777
5,11/5/1978,10.968
6,11/7/1978,11.08
7,11/9/1978,11.189
8,11/11/1978,11.314
9,11/13/1978,11.46


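The dataset contains sea ice extent measurements from October 1978 through early 2018 — roughly 40 years. The earliest records are taken every other day, while the later ones are daily. We keep everything before 2013 (about 34 years) for training and the remaining ~5 years for testing. The test set covers the 60 months from 2013 through 2017 plus an additional 3 months in 2018, i.e. roughly 63 × 30 ≈ 1,890 measurements (taking 30 as the average number of measurements per month).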

### Data Partitioning & Preprocessing

In [3]:
# Boundary date: rows dated earlier go to training, the rest go to testing
split_date = '1/1/2013'

# Parse the 'date' strings into datetimes so they can be compared chronologically
sie_df['date'] = pd.to_datetime(sie_df['date'])

# Build one boolean mask per partition, then slice the frame with each
is_train_row = sie_df['date'] < split_date
is_test_row = sie_df['date'] >= split_date
train_nfc = sie_df[is_train_row]
test_nfc = sie_df[is_test_row]

To partition the dataset at a year boundary, the 'date' column is converted from strings to pandas datetimes, which display as YYYY-MM-DD. The `split_date` parameter then separates the data at the start of 2013: rows dated before it form the training set, and rows dated on or after it form the test set.

In [4]:
# Display the training partition (pandas abbreviates the rendering of long frames)
train_nfc

Unnamed: 0,date,extent_million_sq_km
0,1978-10-26,10.231
1,1978-10-28,10.420
2,1978-10-30,10.557
3,1978-11-01,10.670
4,1978-11-03,10.777
...,...,...
10830,2012-12-27,12.669
10831,2012-12-28,12.834
10832,2012-12-29,12.926
10833,2012-12-30,12.931


In [5]:
# Print the training-set dates; the footer of the output reports their length and dtype
train_dates = train_nfc['date']
print(train_dates)

0       1978-10-26
1       1978-10-28
2       1978-10-30
3       1978-11-01
4       1978-11-03
           ...    
10830   2012-12-27
10831   2012-12-28
10832   2012-12-29
10833   2012-12-30
10834   2012-12-31
Name: date, Length: 10835, dtype: datetime64[ns]


In [None]:
# Visualize the training and test sets on a shared time axis
fig, ax = plt.subplots()

# plot() takes x and y positionally; passing them as x=/y= keywords does not draw the data
ax.plot(train_nfc['date'], train_nfc['extent_million_sq_km'], label='Training set')
ax.plot(test_nfc['date'], test_nfc['extent_million_sq_km'], label='Test set', color='orange')

# Title and axis labels so the figure can be read on its own
ax.set_title('Arctic sea ice extent: training and test sets', size=20)
ax.set_xlabel('Date')
ax.set_ylabel('Extent (million sq km)')
ax.legend();