In [1]:
from pathlib import Path
import os

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
from sklearn import utils
from sklearn.pipeline import Pipeline
from pygam import LinearGAM, s, f, te, l
from sklearn import ensemble

from data_processing import *

## Load data

In [16]:
# data is split into 15 min mean aggregates
df = pd.read_csv("../data/clear.csv")
df['czas'] = pd.to_datetime(df['czas'], utc=True)
df = utils.shuffle(df)

df.set_index('czas', inplace=True)

## Split data

In [18]:
train, val, test = split(df)

X_train, X_val, X_test = train.drop(["temp_zuz"], axis=1), val.drop(["temp_zuz"], axis=1), test.drop(["temp_zuz"], axis=1)
y_train, y_val, y_test = train["temp_zuz"], val["temp_zuz"], test["temp_zuz"]

## Define a RiggedRegression model

In [19]:
solvers = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
alphas = [.1, .5, 1]

for alpha in alphas:
    for solver in solvers:
        ridge_model = Pipeline([('normalization', preprocessing.StandardScaler()),
                        ('ridge', linear_model.Ridge(alpha=alpha, solver=solver))])
        ridge_model.fit(X_train, y_train)
        mse = metrics.mean_squared_error(y_val, ridge_model.predict(X_val))
        print(solver, alpha, mse)

svd 0.1 33.64620936208525
cholesky 0.1 33.64620936208525
lsqr 0.1 33.64378408831897
sparse_cg 0.1 33.646219989059354
sag 0.1 33.6460224471689
saga 0.1 33.64606639555378
svd 0.5 33.64622905791434
cholesky 0.5 33.646229057914354
lsqr 0.5 33.643805150239544
sparse_cg 0.5 33.64623968120399
sag 0.5 33.646286934897624
saga 0.5 33.6463073333421
svd 1 33.64625369185645
cholesky 1 33.64625369185646
lsqr 1 33.643831490461054
sparse_cg 1 33.64626431054097
sag 1 33.64568502494335
saga 1 33.64609543423505


## Train the model

In [20]:
ridge_model.fit(X_train, y_train)
metrics.mean_squared_error(y_val, ridge_model.predict(X_val))

33.6462223201496