In [1]:
from pathlib import Path
import os

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
from sklearn import utils
from sklearn.pipeline import Pipeline
from pygam import LinearGAM, s, f, te, l
from sklearn import ensemble
import pickle
import joblib

from data_processing import *

## Load data

In [9]:
# data is split into 15 min mean aggregates
df = pd.read_csv("./data/clear.csv")
df['czas'] = pd.to_datetime(df['czas'], utc=True)
df = utils.shuffle(df)

df.set_index('czas', inplace=True)

## Split data

In [10]:
train, val, test = split(df)

X_train, X_val, X_test = train.drop(["temp_zuz"], axis=1), val.drop(["temp_zuz"], axis=1), test.drop(["temp_zuz"], axis=1)
y_train, y_val, y_test = train["temp_zuz"], val["temp_zuz"], test["temp_zuz"]

In [11]:
y_val.to_csv("y_val.csv")
X_val.to_csv("x_val.csv")

## Define a RiggedRegression model

In [12]:
train

Unnamed: 0_level_0,TIR,TIX1,prob_s,prob_corg,FCX,NIR,UXM,1th_agg_TIR,1th_agg_TIX1,1th_agg_prob_s,...,3th_agg_UXM,4th_agg_TIR,4th_agg_TIX1,4th_agg_prob_s,4th_agg_prob_corg,4th_agg_FCX,4th_agg_NIR,4th_agg_UXM,temp_zuz,poprzednia_temp_zuz
czas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-08 09:00:00+00:00,28.191712,404.611148,10.620000,8.560000,26.784114,18.495937,94.251908,28.156127,404.622778,10.62,...,94.252362,28.043654,404.655195,10.620000,8.560,38.364648,18.911777,94.266597,1305.0,1308.0
2021-08-15 05:00:00+00:00,27.465522,429.861592,11.806666,8.764000,32.674263,24.439697,97.243747,27.503215,429.895788,11.82,...,97.250600,27.556993,430.170600,11.820000,8.770,31.739721,24.709425,97.252522,1302.0,1295.0
2021-08-30 04:00:00+00:00,28.319165,425.992804,11.460000,8.360000,30.718345,23.340388,97.249145,28.294528,426.023189,11.46,...,97.245469,28.354805,426.078272,11.460000,8.360,31.188879,23.490644,97.243892,1300.0,1304.0
2022-01-21 22:00:00+00:00,27.682771,417.717251,11.690000,8.237333,37.404513,12.585552,98.126427,27.766646,417.695646,11.68,...,98.132716,28.066627,417.629264,11.680000,8.240,37.434628,14.761579,98.135063,1302.0,1290.0
2021-03-12 00:30:00+00:00,28.681404,415.424901,10.500000,8.970000,40.254671,23.221130,94.245879,28.687986,415.411303,10.50,...,94.246850,28.960198,415.358450,10.500000,8.970,40.186759,19.146380,94.247382,1307.0,1310.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-19 21:00:00+00:00,28.722705,415.628295,9.890000,8.480000,39.651301,18.818214,92.260184,28.740589,415.605901,9.89,...,92.255642,28.846187,415.537749,9.890000,8.480,39.646743,19.682955,92.254755,1300.0,1302.0
2021-02-21 23:00:00+00:00,28.925618,414.482408,10.640000,8.540000,36.912076,21.651425,94.242306,29.010419,414.590262,10.64,...,94.251502,28.988751,414.914252,10.425333,8.484,21.736673,18.015348,94.250580,1309.0,1309.0
2021-07-03 17:00:00+00:00,28.087901,428.170067,9.967334,8.939334,32.475268,22.572060,97.241357,28.117184,428.173311,9.97,...,97.231171,28.244729,428.159355,9.970000,8.930,32.438760,23.612648,97.227547,1310.0,1306.0
2021-08-27 23:00:00+00:00,27.295793,425.189824,11.030000,8.260000,33.748229,24.418011,97.250829,27.326763,425.202443,11.03,...,97.237739,27.387487,425.241440,11.030000,8.260,33.717529,21.963178,97.237464,1309.0,1306.0


In [13]:
solvers = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
alphas = [.1, .5, 1]

for alpha in alphas:
    for solver in solvers:
        ridge_model = Pipeline([('normalization', preprocessing.StandardScaler()),
                        ('ridge', linear_model.Ridge(alpha=alpha, solver=solver))])
        ridge_model.fit(X_train, y_train)
        mse = metrics.mean_squared_error(y_val, ridge_model.predict(X_val))
        print(solver, alpha, mse)

svd 0.1 24.453238878716064
cholesky 0.1 24.453238878716917
lsqr 0.1 24.435974630887323
sparse_cg 0.1 24.460961737869088
sag 0.1 24.44798382484303
saga 0.1 24.440683403549837
svd 0.5 24.44811538936517
cholesky 0.5 24.448115389365825
lsqr 0.5 24.435853133131907
sparse_cg 0.5 24.453459943545337
sag 0.5 24.44180826561728
saga 0.5 24.436678757199537
svd 1 24.440009317519078
cholesky 1 24.440009317519834
lsqr 1 24.435707490477395
sparse_cg 1 24.446074995186088
sag 1 24.436365105566498
saga 1 24.4329166564034


## Train the model

In [14]:
ridge_model.fit(X_train, y_train)
metrics.mean_squared_error(y_val, ridge_model.predict(X_val))

24.43290794665738

In [15]:
filename = 'finalized_model.sav'
pickle.dump(ridge_model, open(filename, 'wb'))

In [16]:
model = pickle.load(open(filename, 'rb'))

In [17]:
metrics.mean_squared_error(y_val, model.predict(X_val))

24.43290794665738