In [1]:
from pathlib import Path
import os

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
from sklearn import utils
from sklearn.pipeline import Pipeline
from pygam import LinearGAM, s, f, te, l
from sklearn import ensemble
import pickle
import joblib

from data_processing import *

## Load data

In [35]:
# data is split into 15 min mean aggregates
df = pd.read_csv("./data/clear.csv")
df['czas'] = pd.to_datetime(df['czas'], utc=True)
df = utils.shuffle(df)

df.set_index('czas', inplace=True)

## Split data

In [36]:
train, val, test = split(df)

X_train, X_val, X_test = train.drop(["temp_zuz"], axis=1), val.drop(["temp_zuz"], axis=1), test.drop(["temp_zuz"], axis=1)
y_train, y_val, y_test = train["temp_zuz"], val["temp_zuz"], test["temp_zuz"]

In [37]:
y_val.to_csv("y_val.csv")
X_val.to_csv("x_val.csv")

## Define a RiggedRegression model

In [38]:
solvers = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
alphas = [.1, .5, 1]

for alpha in alphas:
    for solver in solvers:
        ridge_model = Pipeline([('normalization', preprocessing.StandardScaler()),
                        ('ridge', linear_model.Ridge(alpha=alpha, solver=solver))])
        ridge_model.fit(X_train, y_train)
        mse = metrics.mean_squared_error(y_val, ridge_model.predict(X_val))
        print(solver, alpha, mse)

svd 0.1 32.45390484269221
cholesky 0.1 32.45390484269221
lsqr 0.1 32.455496793097
sparse_cg 0.1 32.45390473827349
sag 0.1 32.45359010509
saga 0.1 32.45400421125648
svd 0.5 32.45387887172684
cholesky 0.5 32.453878871726864
lsqr 0.5 32.45547054703305
sparse_cg 0.5 32.4538787707022
sag 0.5 32.45389408816396
saga 0.5 32.45396746475847
svd 1 32.45384642954925
cholesky 1 32.45384642954925
lsqr 1 32.45543776084689
sparse_cg 1 32.45384633276102
sag 1 32.4538661451109
saga 1 32.45396030279106


## Train the model

In [39]:
ridge_model.fit(X_train, y_train)
metrics.mean_squared_error(y_val, ridge_model.predict(X_val))

32.45386467042702

In [40]:
filename = 'finalized_model.sav'
pickle.dump(ridge_model, open(filename, 'wb'))

In [41]:
model = pickle.load(open(filename, 'rb'))

In [16]:
metrics.mean_squared_error(y_val, model.predict(X_val))

33.58157706334468