In [None]:
import os
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Directory holding the input CSV files, relative to the notebook's working directory.
path = "../data"

# Alternative input file, kept for reference:
# filename = "jena_climate_2009_2016_prep_0.csv"
filename = 'jena_climate_2009_2016_simpl.csv'
# Read the CSV and parse the "Date Time" column as datetimes while loading.
df = pd.read_csv(os.path.join(path, filename), parse_dates=["Date Time"])
# Display the first rows as the cell output.
df.head()

In [None]:
# from statsmodels.tsa import stattools

# temps = pd.DataFrame(df['T (degC)'].values, columns=['T (degC)'], index=df['Date Time'])

# acf_djia, confint_djia, qstat_djia, pvalues_djia = stattools.acf(temps,
#                                                              adjusted=True,
#                                                              nlags=96,
#                                                              qstat=True,
#                                                              fft=True,
#                                                              alpha = 0.05, )

# plt.figure(figsize=(7, 5))
# plt.plot(pd.Series(acf_djia), color='r', linewidth=2)
# plt.title('Autocorrelation plot', weight='bold', fontsize=16)
# plt.xlabel('Lag', weight='bold', fontsize=14)
# plt.ylabel('Value', weight='bold', fontsize=14)
# plt.xticks(weight='bold', fontsize=12, rotation=45)
# plt.yticks(weight='bold', fontsize=12)
# plt.grid(color = 'y', linewidth = 0.5)

########################################################################################################################
# TODO: treat the number of lags as a tunable hyperparameter.
# This probably requires wrapping each regression model so that the lag count becomes one of its parameters and can be optimized together with the others.
########################################################################################################################

In [None]:
##################
# Dataprep utils
##################

def date_to_timestamp(df, date_col = "Date Time"):
    """
    Convert a date column to POSIX timestamps expressed in seconds.

    The column is parsed with ``pd.to_datetime`` only when it is not already
    datetime-typed; each value is then mapped through ``pd.Timestamp.timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the date column.
    date_col : str
        Name of the date column (default ``"Date Time"``).

    Returns
    -------
    pandas.Series
        One float timestamp per row, on the same index as ``df[date_col]``.
    """
    raw_dates = df[date_col]
    if is_datetime(raw_dates):
        parsed_dates = raw_dates
    else:
        parsed_dates = pd.to_datetime(df[date_col])
    return parsed_dates.map(pd.Timestamp.timestamp)

def timestamp_to_daily_sin_cos(timestamp_s):
    """
    Encode timestamps (in seconds) as a point on a 24-hour cycle.

    Returns a ``(sin, cos)`` tuple of the phase ``timestamp_s * 2*pi / 86400``.
    """
    seconds_per_day = 24*60*60
    # Compute the phase once and reuse it for both components.
    phase = timestamp_s * (2 * np.pi / seconds_per_day)
    return np.sin(phase), np.cos(phase)

def timestamp_to_weekly_sin_cos(timestamp_s):
    """
    Encode timestamps (in seconds) as a point on a 7-day cycle.

    Returns a ``(sin, cos)`` tuple of the phase ``timestamp_s * 2*pi / (7 * 86400)``.
    """
    seconds_per_day = 24*60*60
    seconds_per_week = seconds_per_day*7
    # Compute the phase once and reuse it for both components.
    phase = timestamp_s * (2 * np.pi / seconds_per_week)
    return np.sin(phase), np.cos(phase)

def timestamp_to_monthly_sin_cos(timestamp_s):
    """
    Encode timestamps (in seconds) as a point on a fixed 30-day cycle.

    Returns a ``(sin, cos)`` tuple of the phase ``timestamp_s * 2*pi / (30 * 86400)``.
    Note that the period is a constant 30 days, not the calendar month length.
    """
    seconds_per_day = 24*60*60
    seconds_per_month = seconds_per_day*30
    # Compute the phase once and reuse it for both components.
    phase = timestamp_s * (2 * np.pi / seconds_per_month)
    return np.sin(phase), np.cos(phase)

def timestamp_to_yearly_sin_cos(timestamp_s):
    """
    Encode timestamps (in seconds) as a point on a 365.2425-day cycle.

    Returns a ``(sin, cos)`` tuple of the phase ``timestamp_s * 2*pi / (365.2425 * 86400)``.
    """
    seconds_per_day = 24*60*60
    seconds_per_year = (365.2425)*seconds_per_day
    # Compute the phase once and reuse it for both components.
    phase = timestamp_s * (2 * np.pi / seconds_per_year)
    return np.sin(phase), np.cos(phase)


In [None]:
# Convert the "Date Time" column to timestamps in seconds.
timestamp_s = date_to_timestamp(df, "Date Time")
# Add daily and yearly sine/cosine columns to df.
# NOTE: this mutates df in place, so re-running the cell overwrites these four columns.
df['Day sin'], df['Day cos'] = timestamp_to_daily_sin_cos(timestamp_s)
df['Year sin'], df['Year cos'] = timestamp_to_yearly_sin_cos(timestamp_s)
# Display the first rows, now including the new columns.
df.head()

In [None]:
class TimeSeries():
    """
    Supervised-learning view of one target column of a time-indexed DataFrame.

    The constructor separates the target ``y`` from the remaining feature
    columns, optionally appends ``lag_1 .. lag_<lags>`` columns holding past
    target values, and holds out the last ``test_len`` rows as the test set.
    """

    def __init__(self, df, y, date_col="Date Time", lags=16, test_len=24):
        """
        Parameters
        ----------
        df : pandas.DataFrame
            Source data; it is not modified.
        y : str
            Name of the target column.
        date_col : str
            Name of the date column; parsed with ``pd.to_datetime`` when it is
            not already datetime-typed.
        lags : int
            Number of lagged target columns to add; values below 1 add none.
        test_len : int
            Number of trailing rows held out as the test set (0 = no test set).

        Raises
        ------
        ValueError
            If ``test_len`` is negative, or ``lags`` exceeds the number of rows.
        """
        if test_len < 0:
            raise ValueError(f"test_len must be >= 0, got {test_len}")

        dates = df[date_col]
        self.date_time = dates if is_datetime(dates) else pd.to_datetime(dates)
        self.y = df[y]
        # drop() returns a new frame, so adding lag columns never touches df.
        self.X = df.drop(columns=[y, date_col])
        self.lags = lags
        self.test_len = test_len

        self._add_lags()
        self._train_test_split()

    def _add_lags(self):
        """Append lagged copies of the target to X and drop rows lacking a full lag history."""
        if self.lags < 1:
            return
        if self.lags > len(self.y):
            raise ValueError(
                f"lags ({self.lags}) exceeds the number of rows ({len(self.y)})"
            )

        lag_names = [f"lag_{i}" for i in range(1, self.lags + 1)]
        # shift(i) moves the target down by i rows and pads the top with NaN;
        # all columns are built first and attached with a single concat.
        lagged = pd.DataFrame(
            {name: self.y.shift(i) for i, name in enumerate(lag_names, start=1)}
        )
        # A pre-existing column with a lag name is replaced rather than duplicated.
        base = self.X.drop(columns=lag_names, errors="ignore")
        self.X = pd.concat([base, lagged], axis=1)

        # Keep only rows where the deepest lag is available.
        keep = self.X[f"lag_{self.lags}"].notna()
        self.date_time = self.date_time[keep]
        self.X = self.X[keep]
        self.y = self.y[keep]

    def _train_test_split(self):
        """Split X and y by position: the last ``test_len`` rows are the test set."""
        # An explicit split position avoids the `[:-0]` / `[-0:]` pitfall,
        # where test_len == 0 would yield an empty train set.
        split = max(len(self.X) - self.test_len, 0)
        self.Xtrain = self.X.iloc[:split, :]
        self.ytrain = self.y.iloc[:split]
        self.Xtest = self.X.iloc[split:, :]
        self.ytest = self.y.iloc[split:]

    def get_Xtrain(self):
        """Return the training feature frame."""
        return self.Xtrain

    def get_Xtest(self):
        """Return the test feature frame."""
        return self.Xtest

    def get_ytrain(self):
        """Return the training target series."""
        return self.ytrain

    def get_ytest(self):
        """Return the test target series."""
        return self.ytest

In [None]:
# Build the dataset with 'T (degC)' as the target and 72 lagged target columns;
# test_len keeps its default value.
ts = TimeSeries(df, y='T (degC)', lags=72)
# Number of rows in the training features.
len(ts.get_Xtrain())

In [None]:
# Number of rows in the training target, for comparison with the training features.
len(ts.get_ytrain())

In [None]:
# Fixed hyperparameter presets, keyed by a short model label.
# Each value is a dict of keyword arguments; an empty dict means "no overrides".
# NOTE(review): no other cell visible in this notebook reads models_dict — confirm it is still needed.
models_dict = {
    "LR": {},
    "XGB": {
        'learning_rate': 0.05,
        'n_estimators': 500
        },
    "Ridge_": {
        'alpha': 0.2,
        'positive': True,
        'solver': 'lbfgs'
        },
    "Lasso_": {
        'alpha': 0.1,
        'warm_start': True,
        'positive': True,
        'selection': 'cyclic'
        },
    "Elasticnet_": {
        'alpha': 0.75,
        'l1_ratio': 0.25,
        'warm_start': True,
        'positive': True,
        'selection': 'cyclic'
        }
}

In [None]:
# Importing the Packages:
import functools
import optuna

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor as XGBReg
from sklearn import model_selection


############################################
# Decorator:
def optimizer(model):
    """
    Decorator factory binding a parameter-producing function to a model class.

    The decorated function must return a dict of keyword arguments; the
    resulting wrapper forwards its positional arguments to that function and
    returns ``model(**params)``.
    """
    def bind_model(param_func):
        @functools.wraps(param_func)
        def build_model(*args):
            # Produce the keyword arguments, then instantiate the model with them.
            return model(**param_func(*args))
        return build_model
    return bind_model


############################################
# Set trial parameters for each model:
@optimizer(LinearRegression)
def LinearRegression_optimizer(trial):
    """Return an empty parameter dict: nothing is tuned, so the model uses its defaults."""
    return {}

@optimizer(Ridge)
def Ridge_optimizer(trial):
    """
    Suggest Ridge keyword arguments from an Optuna trial.

    ``alpha`` is sampled in [0.1, 2] and ``positive`` from {True, False};
    ``solver`` is fixed to "auto". Trial parameter names carry a ``ridge_`` prefix.
    """
    alpha = trial.suggest_float("ridge_alpha", 0.1, 2)
    positive = trial.suggest_categorical("ridge_positive", [True, False])
    return {"alpha": alpha, "positive": positive, "solver": "auto"}

@optimizer(Lasso)
def Lasso_optimizer(trial):
    """
    Suggest Lasso keyword arguments from an Optuna trial.

    Samples ``alpha`` in [0.1, 2], the booleans ``warm_start`` and ``positive``,
    and ``selection`` from {"cyclic", "random"}. Trial parameter names carry a
    ``lasso_`` prefix.
    """
    alpha = trial.suggest_float("lasso_alpha", 0.1, 2)
    warm_start = trial.suggest_categorical("lasso_warm_start", [True, False])
    positive = trial.suggest_categorical("lasso_positive", [True, False])
    selection = trial.suggest_categorical("lasso_selection", ["cyclic", "random"])
    return {
        "alpha": alpha,
        'warm_start': warm_start,
        "positive": positive,
        "selection": selection,
    }

@optimizer(ElasticNet)
def ElasticNet_optimizer(trial):
    """
    Suggest ElasticNet keyword arguments from an Optuna trial.

    Samples ``alpha`` in [0.1, 2], ``l1_ratio`` in [0.1, 1], the booleans
    ``warm_start`` and ``positive``, and ``selection`` from {"cyclic", "random"}.
    Trial parameter names carry an ``elastic_`` prefix.
    """
    alpha = trial.suggest_float("elastic_alpha", 0.1, 2)
    l1_ratio = trial.suggest_float("elastic_l1_ratio", 0.1, 1)
    warm_start = trial.suggest_categorical("elastic_warm_start", [True, False])
    positive = trial.suggest_categorical("elastic_positive", [True, False])
    selection = trial.suggest_categorical("elastic_selection", ["cyclic", "random"])
    return {
        "alpha": alpha,
        "l1_ratio": l1_ratio,
        'warm_start': warm_start,
        "positive": positive,
        "selection": selection,
    }

@optimizer(XGBReg)
def XGBReg_optimizer(trial):
    """
    Suggest gradient-boosting keyword arguments from an Optuna trial.

    ``XGBReg`` is this notebook's import alias for scikit-learn's
    ``GradientBoostingRegressor``. Samples ``learning_rate`` in [0.1, 1],
    ``n_estimators`` in {200, 300, 400, 500} and ``max_depth`` in {3, 4, 5};
    ``loss`` is fixed to "squared_error". Trial parameter names carry an
    ``xgbr_`` prefix.
    """
    params = {
        "learning_rate": trial.suggest_float("xgbr_learning_rate", 0.1, 1),
        # `step` is passed by keyword: recent Optuna releases declare it
        # keyword-only, and the keyword form works on older releases as well.
        "n_estimators": trial.suggest_int("xgbr_n_estimators", 200, 500, step=100),
        'max_depth': trial.suggest_int("xgbr_max_depth", 3, 5, step=1),
        "loss": "squared_error" # trial.suggest_categorical("xgbr_loss", ["squared_error", "absolute_error", "quantile"])
        }
    return params


In [None]:
def objective(trial, Xtrain, ytrain, model_list):
    """
    Optuna objective: choose a model family, build it from trial-suggested
    parameters, and return its mean cross-validation score.

    Parameters
    ----------
    trial : optuna trial
        Used to suggest the model name and its hyperparameters.
    Xtrain, ytrain
        Training features and target handed to ``cross_val_score``.
    model_list : sequence of str
        Candidate model names. Each name must have a matching module-level
        factory called ``<name>_optimizer`` that takes the trial and returns
        an unfitted estimator.

    Returns
    -------
    float
        Mean of the 5-fold cross-validation scores, computed with the
        estimator's default scorer (higher is better).

    Raises
    ------
    NameError
        If no ``<name>_optimizer`` factory exists for the suggested name.
    """
    # The trial parameter is named "classifier" although the models are
    # regressors; the name is kept because get_best_model() pops this key.
    classifier_name = trial.suggest_categorical("classifier", model_list)
    factory_name = f"{classifier_name}_optimizer"

    # Look the factory up by name instead of eval()-ing a string built from
    # model_list; a missing name still surfaces as NameError, as it did with eval.
    factory = globals().get(factory_name)
    if not callable(factory):
        raise NameError(f"name '{factory_name}' is not defined")
    classifier_obj = factory(trial)

    # Scoring method. A failed fit must be the WORST possible result: the study
    # maximizes the score, so +inf here would let a crashing model win.
    # NOTE(review): cv=5 is a plain k-fold split; for lagged time-series
    # features a forward-chaining splitter may be more appropriate — confirm.
    score = model_selection.cross_val_score(
        classifier_obj,
        Xtrain, ytrain,
        n_jobs=-1,
        cv=5,
        error_score=float("-inf")
    )

    # Return the mean cross-validation score
    return score.mean()


def optimize(self, model_list, n_trials=5):
    """
    Run an Optuna study over the given model names and store it on the instance.

    The objective needs the training data and the model list in addition to the
    trial, so it is wrapped in a one-argument closure before being handed to
    ``study.optimize``. The finished study is saved as ``self.study``.

    Parameters
    ----------
    model_list : sequence of str
        Candidate model names forwarded to ``objective``.
    n_trials : int
        Number of trials to run (default 5).
    """
    train_features = self.get_Xtrain()
    train_target = self.get_ytrain()

    def run_trial(trial):
        # Bind the extra arguments; Optuna only supplies the trial.
        return objective(trial, train_features, train_target, model_list)

    new_study = optuna.create_study(direction="maximize")
    new_study.optimize(run_trial, n_trials=n_trials, show_progress_bar=True)

    # Assigned only after the optimisation finished without raising.
    self.study = new_study

# Attach the function to the class as a method.
TimeSeries.optimize = optimize

In [None]:
# Full set of candidates, kept for reference:
# model_list=["LinearRegression", "Ridge", "Lasso", "ElasticNet", "XGBReg"]
# Each name must match a `<name>_optimizer` function defined above.
model_list=["LinearRegression", "XGBReg"]
# Run 30 trials; the resulting study is stored on ts.study.
ts.optimize(model_list=model_list, n_trials=30)

In [None]:
def get_best_model(self):
    """
    Report the best trial of ``self.study`` and return its model name and parameters.

    Returns
    -------
    tuple (str, dict)
        The value stored under the 'classifier' trial parameter, and the
        remaining trial parameters with their leading ``<prefix>_`` removed.
    """

    def clean_param_names(params):
        """
        Drop everything up to and including the first underscore of each key,
        recovering the keyword-argument names expected by the model.
        """
        renamed = {}
        for prefixed_name, value in params.items():
            _prefix, *remainder = prefixed_name.split("_")
            renamed["_".join(remainder)] = value
        return renamed

    finished_study = self.study
    best_value = finished_study.best_value
    best_params = finished_study.best_params
    # Remove the model name so only hyperparameters remain in the dict.
    best_model = best_params.pop('classifier')
    best_params = clean_param_names(best_params)

    ############################################
    # Report the best result
    print(f"\nBest accuracy: {best_value}")
    print(f"Best algorithm: {best_model}")
    print(f"Best parameters (ready to use): {best_params}\n")

    return best_model, best_params

def performance(model):
    """Placeholder for model evaluation: not implemented yet, always returns None."""
    return None

def train_best_model(self):
    """
    Instantiate the best model found by the study, fit it on the training
    data, and store it as ``self.model``.

    The model class is resolved by name from the module namespace, so the name
    returned by ``get_best_model`` must match an imported estimator class.

    Raises
    ------
    NameError
        If no module-level object with the best model's name exists.
    """

    best_model, best_params = self.get_best_model()

    print(f"Running {best_model} as best model")
    print("Params:")
    print(best_params)

    # Resolve the class by name instead of eval()-ing the string; a missing
    # name still surfaces as NameError, as it did with eval.
    model_cls = globals().get(best_model)
    if not callable(model_cls):
        raise NameError(f"name '{best_model}' is not defined")

    model = model_cls(**best_params)
    Xtrain = np.array(self.get_Xtrain())
    # Keep the target 1-D, matching what cross-validation used during the
    # study; a column vector makes scikit-learn warn and changes the shape
    # of the predictions for some estimators.
    ytrain = np.array(self.get_ytrain())
    model.fit(Xtrain, ytrain)

    self.model = model

def predict(self, what="train"):
    """
    Predict with the stored model on the training or the test features.

    Parameters
    ----------
    what : str
        "train" (default) or "test". Any other value yields ``None``.

    Returns
    -------
    The model's predictions for the selected split, or ``None``.
    """
    fitted = self.model
    if what == "train":
        features = self.get_Xtrain()
    elif what == "test":
        features = self.get_Xtest()
    else:
        # Unknown split name: no prediction is made.
        return None
    return fitted.predict(np.array(features))
    
# Attach the functions defined in this cell to the class as methods.
TimeSeries.get_best_model = get_best_model
TimeSeries.train_best_model = train_best_model
TimeSeries.predict = predict

In [None]:
# Fit the best model found by the study; it is stored on ts.model.
ts.train_best_model()
# predict() returns one array per call, e.g.:
# train_pred = ts.predict("train"); test_pred = ts.predict("test")

In [None]:
def plot(self):
    """
    Plot observed and predicted target values over the held-out test window.

    Reads the test target, predicts on the test features with the stored
    model, and draws both against the matching trailing dates.
    """
    ytest = self.get_ytest().tolist()
    test_pred = self.predict("test")

    # Select the dates by position, sized from the actual test set. A plain
    # `[-test_len:]` slice would return every date when test_len is 0.
    date_time = self.date_time
    test_dates = date_time.iloc[len(date_time) - len(ytest):]

    plt.figure(figsize=(18, 8))

    plt.plot(test_dates, ytest, linewidth=2, label="Observed")
    plt.plot(test_dates, test_pred, linewidth=2, c='orange', label="predicted")

    plt.xlabel('Time', fontsize=18)
    plt.xticks(fontsize=14)
    plt.ylabel('Values', fontsize=18)
    plt.yticks(fontsize=14)

    plt.legend(fontsize=14)
    plt.show()

# Attach the function to the class as a method.
TimeSeries.plot = plot

In [None]:
# Draw observed vs. predicted values for the test window.
ts.plot()