In [None]:
from tqdm.auto import tqdm
import pandas as pd

# Default look-back windows as (lower offset, upper offset) pairs relative to
# the row being processed; both ends are parsed with pd.Timedelta.
time_deltas = [
    ("-1h", "-0h"),
    ("-2h", "-1h"),
    ("-8h", "-2h"),
    ("-24h", "-0h")
]

def compute_lag(df, deltas = time_deltas, index_inc = pd.Timedelta("15m"), relevant_index = lambda x: True):
    """Compute windowed mean/std features for the rows of ``df``.

    For every index value accepted by ``relevant_index`` and every
    ``(low, high)`` pair in ``deltas``, the rows whose index lies in
    ``[index + low, index + high]`` (both ends inclusive) are aggregated
    column-wise with ``mean`` and ``std``; a NaN std is replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. Its index must support adding a ``pd.Timedelta``.
        The positional pre-slice below only contains every window if the rows
        are sorted by index and spaced at least ``index_inc`` apart; this is
        not verified here.
    deltas : sequence of (str, str)
        Window bounds, each parseable by ``pd.Timedelta``.
    index_inc : pandas.Timedelta
        Expected spacing between consecutive rows, used to convert the window
        bounds into a number of rows.
    relevant_index : callable or None
        Predicate on an index value; rows for which it returns a falsy value
        are skipped. ``None`` keeps every row.

    Returns
    -------
    pandas.DataFrame
        One row per kept index value. For each window, in order, the columns
        are ``<col>_[<low>, <high>]_mean`` for all source columns followed by
        ``<col>_[<low>, <high>]_std`` for all source columns.
    """
    # Parse every offset once instead of once per processed row.
    windows = [(pd.Timedelta(low), pd.Timedelta(high)) for low, high in deltas]

    # Number of rows to keep on each side of the current row (plus one spare).
    max_l_inc = int(max(abs(low / index_inc) for low, _ in windows)) + 1
    max_u_inc = int(max(abs(high / index_inc) for _, high in windows)) + 1

    # Column names depend only on the source columns and the windows, so they
    # are built up front; an empty result therefore still carries its columns.
    columns = []
    for low, high in deltas:
        columns.extend(f"{c}_[{low}, {high}]_mean" for c in df.columns)
        columns.extend(f"{c}_[{low}, {high}]_std" for c in df.columns)

    new_data = []
    new_index = []
    # Only the index is needed per row, so iterate it directly.
    for i, index in tqdm(enumerate(df.index), total=df.shape[0]):

        if relevant_index is not None and not relevant_index(index):
            continue

        # Speed up by only selecting maximal relevant points. The start is
        # clamped at 0: a negative start would be read as "from the end" and
        # give an empty slice (all-NaN stats) for the first rows.
        local_range = df.iloc[max(0, i - max_l_inc) : i + max_u_inc]
        local_index = local_range.index

        local_data = []
        for low, high in windows:
            in_window = (local_index >= index + low) & (local_index <= index + high)
            temp = local_range.loc[in_window]

            # std is NaN for fewer than two rows; treat that as no spread.
            stats = pd.concat([temp.mean(), temp.std().fillna(0)])
            local_data.extend(stats.values.tolist())

        new_index.append(index)
        new_data.append(local_data)

    new_df = pd.DataFrame(new_data, columns=columns, index=new_index)

    return new_df
    

In [None]:
from src.functions import * 
import operator 
def preprocess(data):
    """Turn the raw per-location frames into lag-feature train/test sets.

    Parameters
    ----------
    data : dict
        Maps a location key to a dict holding the frames
        ``"X_train_estimated"``, ``"X_train_observed"``, ``"X_test_estimated"``
        and ``"train_targets"``. The input frames are not modified.

    Returns
    -------
    dict
        ``{key: {"X_train": ..., "y_train": ..., "X_test": ...}}`` for every
        processed key. Only the key ``"a"` `is processed; all others are skipped.
    """
    out = {}
    for key, values in data.items():

        # Only location "a" is handled here.
        if key != "a":
            continue

        # Get the local variables
        X_est = values["X_train_estimated"]
        X_obs = values["X_train_observed"]
        X_test = values["X_test_estimated"]
        y = values["train_targets"]

        # Concat and align
        X_train = pd.concat([X_obs, X_est], axis=0)

        # Set up indexing. The non-inplace forms return new frames, so the
        # caller's frames keep their columns and this function can be re-run
        # on the same input.
        X_train = X_train.set_index("date_forecast").drop(columns=["date_calc"])
        X_test = X_test.set_index("date_forecast").drop(columns=["date_calc"])
        y = y.set_index("time")

        # Preprocess - only predict on full hours (min == 0)
        # NOTE(review): the full-hour filter is applied to the training frame
        # only; X_test keeps every timestamp - confirm this is intended.
        X_train = compute_lag(X_train, relevant_index=lambda idx: idx.minute == 0)
        X_test = compute_lag(X_test)

        # Handle not aligned: keep only timestamps present in both frames
        y = y[y.index.isin(X_train.index)]
        X_train = X_train[X_train.index.isin(y.index)]

        out[key] = {
            "X_train": X_train,
            "y_train": y,
            "X_test": X_test
        }

    return out

In [None]:
from src.dataset import Dataset
from src.splitting import SplitterMonth

# Build the dataset with a 3-split, non-shuffled month splitter and the
# `preprocess` function as its preprocessing step.
data = Dataset(
    splitter=SplitterMonth(n_splits=3, shuffle=False),
    preprocess=preprocess,
)

In [None]:
from pathlib import Path

# Make sure the output directory exists, then save the dataset into it.
lag_dir = Path("lag_data")
lag_dir.mkdir(parents=True, exist_ok=True)

data.save(lag_dir)

In [None]:
from tqdm.auto import tqdm
import pandas as pd
from pathlib import Path
from src.dataset import Dataset
from src.splitting import SplitterMonth
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel

def select_and_impute(data):
    """Impute missing values and keep only the model-selected features.

    Parameters
    ----------
    data : dict
        Maps a location key to a dict with the frames ``"X_train"``,
        ``"y_train"`` and ``"X_test"``.

    Returns
    -------
    dict
        Same layout as the input. ``X_train``/``X_test`` are imputed with
        ``SimpleImputer`` and reduced to the columns chosen by
        ``SelectFromModel`` around a ``GradientBoostingRegressor``.
    """
    # Out data
    out = {}

    # Iterate over keys (locations)
    for key, values in tqdm(data.items()):

        # Load data
        X_train = values["X_train"]
        y = values["y_train"]
        X_test = values["X_test"]

        # Handle not aligned: keep only timestamps present in both frames
        X_train = X_train[X_train.index.isin(y.index)]
        y = y[y.index.isin(X_train.index)]

        # Impute nan
        imputer = SimpleImputer()
        X_train_ = imputer.fit_transform(X_train)
        X_test_ = imputer.transform(X_test)

        # SimpleImputer discards columns that had no observed value at fit
        # time (their statistic is NaN), so the imputed array can be narrower
        # than X_train. Track the surviving names so the labels stay aligned.
        imputed_columns = X_train.columns[~pd.isna(imputer.statistics_)]

        # Select relevant features
        selector = SelectFromModel(estimator=GradientBoostingRegressor(n_estimators=500, max_depth=5), max_features=None, threshold=None)
        X_train_ = selector.fit_transform(X_train_, y)
        X_test_ = selector.transform(X_test_)

        # Boolean indexing raises on a length mismatch instead of silently
        # truncating the way zip() would.
        columns = list(imputed_columns[selector.get_support()])

        out[key] = {
            "X_train": pd.DataFrame(X_train_, columns=columns, index=X_train.index),
            "y_train": y,
            "X_test": pd.DataFrame(X_test_, columns=columns, index=X_test.index),
        }

    return out

# Build a dataset from the `lag_data` directory, with `select_and_impute`
# as the preprocessing step and the same month splitter settings.
data_dir = Path("lag_data")
data = Dataset(
    data_dir,
    splitter=SplitterMonth(n_splits=3, shuffle=False),
    preprocess=select_and_impute,
)


In [None]:
from pathlib import Path

# Output directory for the imputed + feature-selected dataset.
folder = Path("lag_data_imputed+selected")
folder.mkdir(parents=True, exist_ok=True)

# Save the current dataset into that directory.
data.save(folder)

In [1]:
from pathlib import Path

from src.dataset import Dataset
from src.splitting import SplitterMonth

# NOTE(review): this reads from `data_selected`, while the cell above saves to
# `lag_data_imputed+selected` - confirm which folder is the intended input.
folder = Path("data_selected")

data = Dataset(
    folder,
    splitter=SplitterMonth(n_splits=3, shuffle=False),
)

Loaded pre-stored dataset from data_selected


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error
import pickle
from pathlib import Path

def hyperparameter_tuning(est, param_grid, data):
    """Grid-search ``est`` over ``param_grid`` on ``data`` using MAE.

    ``data`` must expose ``X``, ``y`` (with a ``"pv_measurement"`` column) and
    ``split_index()``, whose result is passed to ``GridSearchCV`` as ``cv``.

    Returns the tuple ``(best_params, fitted GridSearchCV)``.
    """
    import numpy as np

    # MAE scorer with greater_is_better=False, so reported scores are negated.
    search = GridSearchCV(
        estimator=est,
        param_grid=param_grid,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        cv=data.split_index(),
        verbose=2,
        n_jobs=-1,
    )

    # Report the feature-matrix shape and its NaN count before fitting.
    features = data.X
    print(features.shape, np.count_nonzero(np.isnan(features)))

    # Fit the grid search
    search.fit(data.X, data.y["pv_measurement"])

    # Report and return the winning configuration
    winner = search.best_params_
    print(f"Best parameters b: {winner} - {search.best_score_}")

    return winner, search


def run_hp_search(model, grid, data, folder : Path, splitter = None, keys = ("a",)):
    """Run a grid search per split for the selected locations and pickle the results.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``hyperparameter_tuning``.
    grid : dict
        Parameter grid handed to ``hyperparameter_tuning``.
    data : object
        Must expose ``train_data``, a mapping from location key to an object
        with ``split()`` (yielding dicts with ``"train"`` and ``"val"``) and a
        settable ``splitter`` attribute.
    folder : pathlib.Path
        Output directory (created if missing). One ``<key>_<i>.pkl`` file is
        written per split, holding ``{"mae": ..., "grid_search": ...}``.
    splitter : object or None
        If given, replaces the ``splitter`` attribute of each processed location.
    keys : iterable or None
        Location keys to process. Defaults to ``("a",)``, the previously
        hard-coded behaviour; ``None`` processes every location.
    """
    folder.mkdir(exist_ok=True, parents=True)

    for key, x in data.train_data.items():

        if keys is not None and key not in keys:
            continue

        if splitter is not None:
            x.splitter = splitter

        for i, split in enumerate(x.split()):
            _, grid_search = hyperparameter_tuning(model, grid, split["train"])

            data_val = split["val"]
            X_val, y_val = data_val.X, data_val.y
            # Use the same columns, in the same order, as seen during fit.
            X_val = X_val[grid_search.feature_names_in_]

            # Hold-out MAE, with arguments in (y_true, y_pred) order.
            mae = mean_absolute_error(
                y_val.pv_measurement,
                grid_search.best_estimator_.predict(X_val),
            )
            print(mae)

            with open(folder / f"{key}_{i}.pkl", "wb") as f:
                pickle.dump({"mae": mae, "grid_search": grid_search}, f)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Author's note on an earlier grid result:
#   'learning_rate': 0.01, 'loss': 'absolute_error', 'max_depth': 5, 'n_estimators': 500 ~ 190

grid_gbr = {
    "n_estimators": [1000],
    "loss": ["absolute_error"],
    "max_depth": [5],
    "learning_rate": [0.05],
    # None is not an accepted min_samples_split value (an int >= 2 or a float
    # in (0, 1] is required), so every fit would fail; 2 is the default.
    "min_samples_split": [2],
}

gbr = GradientBoostingRegressor()

run_hp_search(gbr, grid_gbr, data, Path("gbr_2"))

In [None]:
from catboost import CatBoostRegressor

# CatBoost grid: a fixed iteration count and two candidates for each of the
# other parameters.
grid_cb = dict(
    iterations=[1000],
    learning_rate=[1e-3, 0.1],
    depth=[1, 10],
    subsample=[0.05, 1.0],
    colsample_bylevel=[0.05, 1.0],
    min_data_in_leaf=[1, 100],
)

cb = CatBoostRegressor()

# Results are written to the "cb" folder.
run_hp_search(cb, grid_cb, data, Path("cb"))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest grid with a single candidate per parameter.
grid_RF = dict(
    criterion=["absolute_error"],
    n_estimators=[200],
    max_depth=[None],
    min_samples_leaf=[4],
)

rf = RandomForestRegressor()

# Results are written to the "rf" folder.
run_hp_search(rf, grid_RF, data, Path("rf"))

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours grid. The commas after the "algorithm" and "leaf_size"
# entries were missing, which made the dict literal a SyntaxError.
grid_KNN = {
    "n_neighbors": [3, 5, 7, 10],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 30, 50],
    "metric": ["euclidean", "manhattan"],
}

KNN = KNeighborsRegressor()

run_hp_search(KNN, grid_KNN, data, Path("KNN"))

In [None]:
from lightgbm import LGBMRegressor

# LightGBM grid. GridSearchCV treats every entry as a list of discrete
# candidates, not as a (min, max) range, so each pair below means exactly two
# values are tried. A bare string is rejected as a grid value, so the fixed
# objective is wrapped in a one-element list.
grid_LightGBM = {
    "colsample_bytree": [0.3, 1.0],
    "learning_rate": [0.001, 0.3],
    "max_depth": [-1, 15],             # -1 means no depth limit
    "n_estimators": [50, 1000],
    "objective": ["regression"],       # fixed for this task
    "subsample": [0.5, 1.0],
    "min_data_in_leaf": [1, 100],
    "num_leaves": [20, 100],
}

LGBM = LGBMRegressor()


run_hp_search(LGBM, grid_LightGBM, data, Path("LGBM"))

In [None]:
import xgboost as xgb 

# XGBoost grid: three candidates for each tuned parameter and a fixed
# squared-error objective.
grid_XGB = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    subsample=[0.5, 0.7, 0.9],
    colsample_bytree=[0.5, 0.7, 0.9],
    n_estimators=[100, 200, 500],
    objective=["reg:squarederror"],
)

xg_reg = xgb.XGBRegressor()

# Results are written to the "grid_XGB" folder.
run_hp_search(xg_reg, grid_XGB, data, Path("grid_XGB"))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Author's note on an earlier grid result:
#   'learning_rate': 0.01, 'loss': 'absolute_error', 'max_depth': 5, 'n_estimators': 500 ~ 190
# NOTE(review): an earlier cell in this notebook runs the same search and also
# writes to the "gbr_2" folder - confirm whether both are needed.

grid_gbr = {
    "n_estimators": [1000],
    "loss": ["absolute_error"],
    "max_depth": [5],
    "learning_rate": [0.05],
    # None is not an accepted min_samples_split value (an int >= 2 or a float
    # in (0, 1] is required), so every fit would fail; 2 is the default.
    "min_samples_split": [2],
}

gbr = GradientBoostingRegressor()

run_hp_search(gbr, grid_gbr, data, Path("gbr_2"))

In [20]:
from sklearn.svm import SVR

# RBF-kernel SVR grid: fixed C, three gamma candidates.
# NOTE: the names `grid_gbr` / `gbr` are reused from the gradient-boosting
# cells although they now hold an SVR grid and estimator.
grid_gbr = dict(
    kernel=["rbf"],
    C=[400],
    gamma=[6.89548836361599e-13, 1e-12, 9e-13],
)

gbr = SVR()

# Results are written to the "svr_2" folder.
run_hp_search(gbr, grid_gbr, data, Path("svr_2"))

(22683, 22) 0
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters b: {'C': 400, 'gamma': 1e-12, 'kernel': 'rbf'} - -209.3497835214799
196.18456559467407
(22706, 22) 0
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters b: {'C': 400, 'gamma': 1e-12, 'kernel': 'rbf'} - -198.16310383832126
218.97583281577454
(22685, 22) 0
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters b: {'C': 400, 'gamma': 1e-12, 'kernel': 'rbf'} - -208.79645228115123
197.32007163683934
[CV] END ......C=400, gamma=6.89548836361599e-13, kernel=rbf; total time=  14.8s
[CV] END .....................C=400, gamma=1e-10, kernel=rbf; total time=  20.5s
[CV] END .....................C=400, gamma=1e-10, kernel=rbf; total time=  23.3s
[CV] END .....................C=400, gamma=1e-08, kernel=rbf; total time=  31.1s
[CV] END ......C=400, gamma=6.89548836361599e-13, kernel=rbf; total time=  14.1s
[CV] END ......C=400, gamma=6.89548836361599e-13, kernel=rbf; tot

In [14]:
# Show the private `_gamma` attribute of whatever object is bound to `gbr`.
# NOTE(review): presumably the numeric gamma resolved by a fitted SVR - confirm.
# This cell ran as In [14], before the SVR cell (In [20]), so it depends on
# kernel state that a top-to-bottom run will not reproduce.
gbr._gamma

6.89548836361599e-13