In [17]:
from pathlib import Path
import os

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
from sklearn import utils
from sklearn.pipeline import Pipeline
from pygam import LinearGAM, s, f, te, l
from sklearn import ensemble
import pickle
import joblib

from data_processing import *

## Load data

In [30]:
# data is split into 15 min mean aggregates
df = pd.read_csv("../data/clear.csv")
df['czas'] = pd.to_datetime(df['czas'], utc=True)
df = utils.shuffle(df)

df.set_index('czas', inplace=True)

## Split data

In [31]:
train, val, test = split(df)

X_train, X_val, X_test = train.drop(["temp_zuz"], axis=1), val.drop(["temp_zuz"], axis=1), test.drop(["temp_zuz"], axis=1)
y_train, y_val, y_test = train["temp_zuz"], val["temp_zuz"], test["temp_zuz"]

## Define a RiggedRegression model

In [32]:
solvers = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
alphas = [.1, .5, 1]

for alpha in alphas:
    for solver in solvers:
        ridge_model = Pipeline([('normalization', preprocessing.StandardScaler()),
                        ('ridge', linear_model.Ridge(alpha=alpha, solver=solver))])
        ridge_model.fit(X_train, y_train)
        mse = metrics.mean_squared_error(y_val, ridge_model.predict(X_val))
        print(solver, alpha, mse)

svd 0.1 32.33896642208612
cholesky 0.1 32.338966422086145
lsqr 0.1 32.34230317011682
sparse_cg 0.1 32.339110883191076
sag 0.1 32.33877487883606
saga 0.1 32.339169390969474
svd 0.5 32.33891452308105
cholesky 0.5 32.33891452308104
lsqr 0.5 32.34225003597736
sparse_cg 0.5 32.339058905013715
sag 0.5 32.33912828123795
saga 0.5 32.33927113404872
svd 1 32.338849675679356
cholesky 1 32.338849675679356
lsqr 1 32.34218364535858
sparse_cg 1 32.33899395871959
sag 1 32.33900481873822
saga 1 32.33886281495525


## Train the model

In [33]:
ridge_model.fit(X_train, y_train)
metrics.mean_squared_error(y_val, ridge_model.predict(X_val))

32.338824457803064

In [34]:
filename = 'finalized_model.sav'
pickle.dump(ridge_model, open(filename, 'wb'))

In [35]:
model = pickle.load(open(filename, 'rb'))

In [36]:
metrics.mean_squared_error(y_val, model.predict(X_val))

32.338824457803064