Time Series

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [None]:
# Read the energy workbook into df1. The path points under /content/drive/MyDrive
# (presumably a Colab Google Drive mount — confirm the drive is mounted first).
energy_xlsx_path = '//content//drive//MyDrive//energy.xlsx'
df1 = pd.read_excel(energy_xlsx_path)

In [None]:
# Columns that are expected to hold numeric values but may have been read as objects
numeric_columns = ['ForecastWindProduction', 'SystemLoadEA', 'SMPEA',
                   'ORKTemperature', 'ORKWindspeed', 'CO2Intensity',
                   'ActualWindProduction', 'SystemLoadEP2', 'SMPEP2']

# Count the values that are already missing BEFORE coercion, so that only the
# NaNs created by the conversion itself are reported as conversion failures.
missing_before = df1[numeric_columns].isnull().sum()

# Coerce each column to numeric; entries that cannot be parsed become NaN.
# (No comma stripping is performed — values such as "1,234" would be coerced to NaN.)
for col in numeric_columns:
    df1[col] = pd.to_numeric(df1[col], errors='coerce')

# Verify the conversions
print(df1.dtypes)

# Report only the missing values that the conversion introduced, per column
introduced_missing = df1[numeric_columns].isnull().sum() - missing_before
if introduced_missing.sum() > 0:
    print("Warning: Missing values introduced after type conversion.")
    print(introduced_missing[introduced_missing > 0])
else:
    print("All columns successfully converted to numeric.")

DateTime                  datetime64[ns]
HolidayFlag                        int64
DayOfWeek                          int64
WeekOfYear                         int64
Day                                int64
Month                              int64
Year                               int64
PeriodOfDay                        int64
ForecastWindProduction           float64
SystemLoadEA                     float64
SMPEA                            float64
ORKTemperature                   float64
ORKWindspeed                     float64
CO2Intensity                     float64
ActualWindProduction             float64
SystemLoadEP2                    float64
SMPEP2                           float64
dtype: object


In [None]:
# Derive calendar features from 'DateTime' and then drop the raw column.
# The guard keeps this cell re-runnable: once 'DateTime' has been dropped, a
# second execution would otherwise fail with a KeyError.
if 'DateTime' in df1.columns:
    # Parse to pandas datetimes; values that cannot be parsed become NaT
    timestamps = pd.to_datetime(df1['DateTime'], errors='coerce')

    # Existing Year/Month/Day/DayOfWeek columns are overwritten in place,
    # 'Hour' is appended as a new column.
    df1 = df1.assign(
        Year=timestamps.dt.year,
        Month=timestamps.dt.month,
        Day=timestamps.dt.day,
        DayOfWeek=timestamps.dt.dayofweek,  # Monday=0, Sunday=6
        Hour=timestamps.dt.hour,
    ).drop(columns=['DateTime'])

# Verify the transformation
print(df1.head())


   HolidayFlag  DayOfWeek  WeekOfYear  Day  Month  Year  PeriodOfDay  \
0            0          0          44    1     11  2021            0   
1            0          0          44    1     11  2021            1   
2            0          0          44    1     11  2021            2   
3            0          0          44    1     11  2021            3   
4            0          0          44    1     11  2021            4   

   ForecastWindProduction  SystemLoadEA  SMPEA  ORKTemperature  ORKWindspeed  \
0                  315.31       3388.77  49.26             6.0           9.3   
1                  321.80       3196.66  49.26             6.0          11.1   
2                  328.57       3060.71  49.10             5.0          11.1   
3                  335.60       2945.56  48.04             6.0           9.3   
4                  342.90       2849.34  33.75             6.0          11.1   

   CO2Intensity  ActualWindProduction  SystemLoadEP2  SMPEP2  Hour  
0        600.71  

In [None]:
# Discard every row that still contains at least one missing entry
df1 = df1.dropna(axis=0, how='any')

# Confirm that nothing is missing any more
remaining_missing = df1.isna().sum().sum()
print("Number of missing values after dropping rows:", remaining_missing)


Number of missing values after dropping rows: 0


In [None]:
# Separate the target column from the predictor columns
target_column = 'SystemLoadEP2'
y = df1[target_column]
X = df1.drop(columns=[target_column])

# Hold out 20% of the rows for testing; random_state pins the split.
# NOTE(review): train_test_split shuffles rows by default — confirm that a random
# split is intended, given that the notebook is titled "Time Series".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Candidate hyperparameters: regularization strengths crossed with Ridge solvers
param_grid = dict(
    alpha=[0.01, 0.1, 1, 10, 100],
    solver=['auto', 'svd', 'cholesky', 'lsqr'],
)

# Exhaustive search over the grid, 5-fold CV, scored by negative MSE
ridge = Ridge()
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best Parameters:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split
best_ridge = grid_search.best_estimator_
y_pred = best_ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("Testing MSE:", test_mse)
print("Testing R2:", test_r2)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'alpha': 10, 'solver': 'auto'}
Testing MSE: 22826.30614256781
Testing R2: 0.9678191916242567


In [None]:
from scipy.stats import uniform

# Sampling space for the randomized search: alpha is drawn from a continuous
# uniform distribution, the solver is picked from a fixed list of options
param_dist = dict(
    alpha=uniform(0.01, 100),
    solver=['auto', 'svd', 'cholesky', 'lsqr'],
)

# 50 random draws, each evaluated with 5-fold CV on negative MSE;
# random_state makes the sampled candidates repeatable
ridge = Ridge()
random_search = RandomizedSearchCV(
    estimator=ridge,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Report the winning combination
print("Best Parameters:", random_search.best_params_)

# Score the best estimator found by the search on the held-out test split
best_ridge = random_search.best_estimator_
y_pred = best_ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("Testing MSE:", test_mse)
print("Testing R2:", test_r2)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'alpha': 14.102422497476264, 'solver': 'cholesky'}
Testing MSE: 22826.108591425625
Testing R2: 0.9678194701342973


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standalone scaled copies of the splits; the scaler is fitted on the training
# rows only. Kept in case other cells consume the pre-scaled arrays.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Run the grid search on a scaler + Ridge pipeline instead of on pre-scaled data.
# Inside a pipeline the scaler is re-fitted on the training part of every CV fold,
# so the validation fold never influences the scaling statistics. A separate
# search object is used so the unscaled `grid_search` is not silently overwritten.
scaled_ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])
scaled_param_grid = {
    'ridge__alpha': [0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr'],  # Different solvers for Ridge
}
scaled_grid_search = GridSearchCV(estimator=scaled_ridge, param_grid=scaled_param_grid,
                                  scoring='neg_mean_squared_error', cv=5, verbose=1)
scaled_grid_search.fit(X_train, y_train)

# Report the outcome so the scaled run can be compared with the unscaled one;
# the pipeline scales X_test itself, so the raw test features are passed in.
print("Best Parameters (scaled):", scaled_grid_search.best_params_)
y_pred_scaled = scaled_grid_search.predict(X_test)
print("Testing MSE (scaled):", mean_squared_error(y_test, y_pred_scaled))
print("Testing R2 (scaled):", r2_score(y_test, y_pred_scaled))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
!pip install optuna
import optuna
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error



In [None]:
def objective(trial):
    """Optuna objective: cross-validated MSE of a Ridge model.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample `alpha` (log scale, 1e-3 to 1e2) and `solver`.

    Returns
    -------
    float
        Mean 5-fold cross-validated mean squared error on
        (X_train, y_train); lower is better.
    """
    # Define the search space.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform,
    # which emitted a warning on every trial.
    alpha = trial.suggest_float('alpha', 1e-3, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr'])

    # Create Ridge model with sampled hyperparameters
    model = Ridge(alpha=alpha, solver=solver)

    # Evaluate using cross-validation; the scorer returns negative MSE
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
    return -score  # Flip the sign so the study minimizes the (positive) MSE

# Run the optimization.
# The sampler is seeded so that repeated runs propose the same sequence of trials;
# TPESampler is named explicitly only to be able to pass the seed.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best parameters
print("Best parameters:", study.best_params)

# Train the best model on the full training split
best_model = Ridge(**study.best_params)
best_model.fit(X_train, y_train)

# Evaluate on test data
y_pred = best_model.predict(X_test)
print("Test MSE:", mean_squared_error(y_test, y_pred))
print("Testing R2:", r2_score(y_test, y_pred))

[I 2025-01-13 16:32:50,804] A new study created in memory with name: no-name-f6370fdc-9157-4aa0-a3ea-048adf8f42fd
  alpha = trial.suggest_loguniform('alpha', 1e-3, 1e2)
[I 2025-01-13 16:32:50,907] Trial 0 finished with value: 22095.590916459136 and parameters: {'alpha': 1.7773996389845392, 'solver': 'cholesky'}. Best is trial 0 with value: 22095.590916459136.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 1e2)
[I 2025-01-13 16:32:51,079] Trial 1 finished with value: 22095.57042732198 and parameters: {'alpha': 16.718927035462272, 'solver': 'svd'}. Best is trial 1 with value: 22095.57042732198.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 1e2)
[I 2025-01-13 16:32:51,183] Trial 2 finished with value: 22095.601725508634 and parameters: {'alpha': 0.06026089205438035, 'solver': 'auto'}. Best is trial 1 with value: 22095.57042732198.
  alpha = trial.suggest_loguniform('alpha', 1e-3, 1e2)
[I 2025-01-13 16:32:51,287] Trial 3 finished with value: 22095.601688717034 and parameters: {'alph

Best parameters: {'alpha': 11.66728796553334, 'solver': 'svd'}
Test MSE: 22826.224838878996
Testing R2: 0.9678193062472015


In [None]:
pip install hyperopt




In [None]:
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Objective function handed to Hyperopt's fmin.
# NOTE(review): this reuses the name `objective` from the Optuna cell above and
# replaces it once this cell has run.
def objective(params):
    """Return the mean 5-fold cross-validated MSE of Ridge(**params).

    `params` is a dict of Ridge keyword arguments sampled from the search space.
    The value is computed on (X_train, y_train); lower is better.
    """
    candidate = Ridge(**params)
    fold_scores = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )
    # The scorer yields negative MSE; negate the mean so fmin minimizes MSE
    return -fold_scores.mean()

# Define the search space.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the bounds
# must be given as natural logs of the intended range. The previous bounds
# (-3, 2) covered only about [0.05, 7.39] instead of [0.001, 100].
space = {
    'alpha': hp.loguniform('alpha', np.log(1e-3), np.log(1e2)),  # alpha in [0.001, 100], log scale
    'solver': hp.choice('solver', ['auto', 'svd', 'cholesky', 'lsqr'])
}

from hyperopt import space_eval

# Run the optimization.
# rstate seeds the search so repeated runs propose the same candidates
# (hyperopt >= 0.2.7 expects a numpy Generator here).
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42),
)

# fmin reports hp.choice parameters as list indexes; space_eval maps the result
# back to actual values using the search space itself, so the list of solvers
# does not have to be duplicated (and kept in sync) here.
best_params = space_eval(space, best_params)
print("Best parameters:", best_params)

# Train the best model on the full training split
best_model = Ridge(**best_params)
best_model.fit(X_train, y_train)

# Evaluate on test data
y_pred = best_model.predict(X_test)
print("Test MSE:", mean_squared_error(y_test, y_pred))
print("Testing R2:", r2_score(y_test, y_pred))


100%|██████████| 50/50 [00:17<00:00,  2.85trial/s, best loss: 22095.568179160742]
Best parameters: {'alpha': 7.323465043846107, 'solver': 'auto'}
Test MSE: 22826.4396041136
Testing R2: 0.9678190034685208
