In [4]:
import pandas as pd
from numpy import asarray
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from pandas import DataFrame, concat

# Function to transform a time series dataset into a supervised learning dataset
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table.

    Every output row is the concatenation of the ``n_in`` lagged
    observations (oldest first: t-n_in ... t-1) followed by the ``n_out``
    observations starting at time t (t ... t+n_out-1).

    Args:
        data: Observations ordered in time. Anything ``DataFrame`` accepts:
            a list, a 1-D array, or a 2-D array whose rows are time steps.
        n_in: Number of lagged time steps to place before time t.
        n_out: Number of time steps to include from time t onwards.
        dropnan: When true, rows containing any NaN are removed. Shifting
            introduces NaN in the first ``n_in`` rows and, when ``n_out``
            is greater than 1, in the last ``n_out - 1`` rows.

    Returns:
        The assembled table as a NumPy array (``DataFrame.values``).
    """
    df = DataFrame(data)
    # Lagged inputs come first, then the forecast steps; shift(0) is time t.
    frames = [df.shift(lag) for lag in range(n_in, 0, -1)]
    frames += [df.shift(-step) for step in range(n_out)]
    agg = concat(frames, axis=1)
    if dropnan:
        agg = agg.dropna()
    return agg.values

# Function to split a dataset into train/test sets
def train_test_split(data, n_test):
    """Split a 2-D array into leading training rows and trailing test rows.

    Args:
        data: 2-D array whose rows are ordered in time.
        n_test: Number of rows at the end of ``data`` to hold out for
            testing. Must be non-negative; values larger than the number
            of rows put every row in the test set.

    Returns:
        A ``(train, test)`` tuple of row slices of ``data``.

    Raises:
        ValueError: If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError(f'n_test must be non-negative, got {n_test}')
    # Slice from an explicit index rather than with -n_test: data[:-0] is
    # empty and data[-0:] is the whole array, which would swap the two
    # sets when n_test == 0.
    split = max(len(data) - n_test, 0)
    return data[:split, :], data[split:, :]

# Fit an XGBoost model and make a one-step prediction
def xgboost_forecast(train, testX):
    """Train a fresh XGBoost regressor and predict a single target value.

    Args:
        train: Training rows; the last column of each row is the target
            and every preceding column is an input feature.
        testX: Feature values of the one sample to predict.

    Returns:
        The model's prediction for ``testX`` (first element of the
        prediction array).
    """
    history = asarray(train)
    features = history[:, :-1]
    target = history[:, -1]
    regressor = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
    regressor.fit(features, target)
    # Wrap the single sample in a list so predict receives a 2-D array.
    forecast = regressor.predict(asarray([testX]))
    return forecast[0]

# Walk-forward validation for time series data
def walk_forward_validation(data, n_test):
    """Evaluate one-step forecasts over the last ``n_test`` rows of ``data``.

    For each test row a model is fitted on all rows seen so far via
    ``xgboost_forecast``; the true row is then appended to the history
    before the next step. Progress is printed for every step.

    Args:
        data: 2-D array of supervised rows; the last column is the target.
        n_test: Number of trailing rows used as the test set.

    Returns:
        A tuple ``(error, expected, predictions)``: the mean absolute
        error, the test targets (last column of the test rows), and the
        list of predicted values.
    """
    train, test = train_test_split(data, n_test)
    expected = test[:, -1]
    # The history starts as the training rows and grows by one observed
    # test row after each forecast.
    history = list(train)
    predictions = []
    for i, row in enumerate(test):
        testX, testy = row[:-1], row[-1]
        yhat = xgboost_forecast(history, testX)
        predictions.append(yhat)
        history.append(row)
        print(f'>expected={testy:.1f}, predicted={yhat:.1f}, count = {i}')
    error = mean_absolute_error(expected, predictions)
    return error, expected, predictions

# Step 1: Read the dataset from the CSV file
data = pd.read_csv('solar_data_with_temperature_adjusted_irradiance.csv')

# Step 2: Select the predictor columns (X) and the column to forecast (y)
X = data[['GHI', 'DNI', 'DHI', 'Temperature']]
y = data['Tilted Irradiance (Adjusted)']

# Step 3: Join predictors and target with the target as the last column,
# discard rows that have missing values, and convert to a NumPy array
data_combined = pd.concat([X, y], axis=1)
data_combined = data_combined.dropna()
values = data_combined.values

# Step 4: Reframe the series as supervised rows using n_input lagged steps
n_input = 6
supervised_data = series_to_supervised(values, n_in=n_input)

# Step 5: Walk-forward validation over the last n_test rows.
# Note: y is rebound here to the expected test values.
n_test = 200
mae, y, yhat = walk_forward_validation(supervised_data, n_test)
print(f'MAE: {mae:.3f}')

# Step 6: Plot the expected values against the predictions
for series, name in ((y, 'Expected'), (yhat, 'Predicted')):
    pyplot.plot(series, label=name)
pyplot.legend()
pyplot.show()


>expected=27.8, predicted=34.2, count = 0
>expected=0.0, predicted=0.5, count = 1


KeyboardInterrupt: 