# Exploring Solar Data 2021

In [2]:
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima.model import ARIMA



## Loading the data

In [3]:
# Auxiliary CSV, resolved relative to the notebook's working directory.
weather_df = pd.read_csv("aws_10min.csv")

# NOTE(review): absolute, machine-specific path -- this read only succeeds where
# that directory exists; consider a path relative to a configurable data folder.
df = pd.read_csv(
    'C:\\Users\\danie\\PycharmProjects\\internalProject\\data\\SolarForecast\\solar_forecast_2021.csv',
    index_col='DateTime',
)

# Convert the index to datetimes so rows can be sliced with timestamp strings.
df.index = pd.to_datetime(df.index)

# Preview the rows of 1 January 2021 between 08:00 and 20:00.
df.loc['2021-01-01 08:00:00':'2021-01-01 20:00:00']

Unnamed: 0_level_0,Most recent forecast [MW],Day-Ahead forecast [MW],Week-Ahead forecast [MW],Real-time Upscaled Measurement [MW],Corrected Upscaled Measurement [MW],Monitored Capacity [MWp],Day-Ahead forecast (11h00) [MW]
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01 08:00:00,0.0,0.0,0.0,0.0,0.0,4787.56,0.0
2021-01-01 08:15:00,0.0,0.0,0.0,0.0,0.0,4787.56,0.0
2021-01-01 08:30:00,0.01,0.03,0.04,0.0,0.0,4787.56,0.02
2021-01-01 08:45:00,6.65,7.18,10.58,8.69,8.69,4787.56,7.04
2021-01-01 09:00:00,29.84,31.54,51.85,39.59,39.59,4787.56,31.44
2021-01-01 09:15:00,69.04,73.09,121.58,81.65,81.65,4787.56,75.56
2021-01-01 09:30:00,118.72,125.24,197.56,126.55,126.55,4787.56,126.18
2021-01-01 09:45:00,171.09,179.45,270.14,172.47,172.47,4787.56,178.27
2021-01-01 10:00:00,221.21,234.77,337.73,218.18,218.18,4787.56,233.6
2021-01-01 10:15:00,269.81,287.33,399.75,271.94,271.94,4787.56,288.08


Unnamed: 0_level_0,Most recent forecast [MW],Day-Ahead forecast [MW],Week-Ahead forecast [MW],Real-time Upscaled Measurement [MW],Corrected Upscaled Measurement [MW],Monitored Capacity [MWp],Day-Ahead forecast (11h00) [MW]
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01 08:00:00,0.0,0.0,0.0,0.0,0.0,4787.56,0.0
2021-01-01 08:15:00,0.0,0.0,0.0,0.0,0.0,4787.56,0.0
2021-01-01 08:30:00,0.01,0.03,0.04,0.0,0.0,4787.56,0.02
2021-01-01 08:45:00,6.65,7.18,10.58,8.69,8.69,4787.56,7.04
2021-01-01 09:00:00,29.84,31.54,51.85,39.59,39.59,4787.56,31.44
2021-01-01 09:15:00,69.04,73.09,121.58,81.65,81.65,4787.56,75.56
2021-01-01 09:30:00,118.72,125.24,197.56,126.55,126.55,4787.56,126.18
2021-01-01 09:45:00,171.09,179.45,270.14,172.47,172.47,4787.56,178.27
2021-01-01 10:00:00,221.21,234.77,337.73,218.18,218.18,4787.56,233.6
2021-01-01 10:15:00,269.81,287.33,399.75,271.94,271.94,4787.56,288.08


## EDA

In [None]:
# Row-wise difference between the corrected and the real-time upscaled
# measurements (both in MW).
# NOTE(review): the right-hand column name was left empty in this cell, which
# raises a KeyError; the real-time measurement is assumed to be the intended
# operand -- confirm.
df['Corrected Upscaled Measurement [MW]'] - df['Real-time Upscaled Measurement [MW]']

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# List the distinct values that occur in the monitored-capacity column.
df['Monitored Capacity [MWp]'].unique()

In [None]:
fig, axs = plt.subplots(figsize=(12, 4))

# Group the rows by the calendar month of their timestamp and draw the mean
# real-time measurement of each month as one bar.
df.groupby(df.index.month)["Real-time Upscaled Measurement [MW]"].mean().plot(
    kind='bar', rot=0, ax=axs
)
# The bars are keyed by month number, so name the x-axis explicitly instead of
# leaving whatever label pandas derives from the index.
axs.set_xlabel("Month")
axs.set_ylabel("Mean Solar Power Generation Per Month [MW]")
plt.show()

In [None]:
# Disagreement between the two forecast horizons: week-ahead minus day-ahead (MW).
forecast_disc = df['Week-Ahead forecast [MW]'] - df['Day-Ahead forecast [MW]']
forecast_disc.plot()
# Label the figure so it can be read on its own, and end with plt.show() so the
# cell does not echo the Axes repr.
plt.ylabel("Week-Ahead minus Day-Ahead forecast [MW]")
plt.title("Forecast discrepancy")
plt.show()

In [None]:
# Line plot of the real-time upscaled measurement over the whole index.
df['Real-time Upscaled Measurement [MW]'].plot()
# Label the y-axis and end with plt.show() so the cell does not echo the Axes repr.
plt.ylabel("Real-time Upscaled Measurement [MW]")
plt.show()

In [None]:
# Autocorrelation plot of the real-time measurement via statsmodels' plot_acf,
# called with its default settings.
plot_acf(df['Real-time Upscaled Measurement [MW]'])
plt.show()

In [None]:
# Same series drawn with pandas' own autocorrelation plot.
pd.plotting.autocorrelation_plot(df['Real-time Upscaled Measurement [MW]'])
# Optional zoom on small correlations (currently disabled):
# plt.ylim([-0.1, 0.1])
plt.show()

## ARIMA Modelling
An ARIMA(p, d, q) model is specified by three non-negative integer orders:

- **p**: the number of lag observations included in the model, also called the lag order.

- **d**: the number of times the raw observations are differenced, also called the degree of differencing.

- **q**: the size of the moving-average window, also called the order of the moving average.

In [None]:
# fit an ARIMA model and plot residual errors

# load dataset
def parser(x):
    """Convert a '<year digit>-<month>' label into a datetime.

    The text '190' is prepended to ``x`` and the result is parsed with the
    '%Y-%m' format, e.g. '1-01' becomes 1901-01-01 00:00.
    """
    stamp = '190' + x
    return datetime.strptime(stamp, '%Y-%m')

# Work on a copy so that relabelling the index leaves df untouched.
series = df['Real-time Upscaled Measurement [MW]'].copy()
# Label every observation with its own quarter-hour period. The rows are
# 15 minutes apart, so a monthly ('M') period index would give thousands of
# rows the same label and tell the model the data is monthly.
series.index = df.index.to_period('15min')

# fit model: p=4 lag observations, d=1 round of differencing, q=0 moving-average terms
model = ARIMA(series, order=(4, 1, 0))
model_fit = model.fit()

# summary of fit model
print(model_fit.summary())

In [None]:
# line plot of residuals
# The fitted model's residuals are wrapped in a DataFrame so that later cells
# can plot and summarise them through the DataFrame API.
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()

In [None]:
# density plot of residuals (kernel density estimate)
residuals.plot(kind='kde')
plt.show()

# summary stats of residuals, as produced by DataFrame.describe()
print(residuals.describe())

In [None]:
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    """Score one ARIMA order with walk-forward validation.

    The first 66% of ``X`` seeds the history. For each remaining observation
    the model is refit on the history, a one-step forecast is recorded, and
    the true observation is then appended to the history.

    Parameters
    ----------
    X : sequence of numbers
        Observations in time order (list, NumPy array or pandas Series).
    arima_order : tuple
        The (p, d, q) order handed to ``ARIMA``.

    Returns
    -------
    The mean squared error between the held-out observations and their
    one-step forecasts, as computed by ``mean_squared_error``.
    """
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = list(train)

    # make predictions
    predictions = []
    # Walk over the held-out values themselves instead of indexing test[t]:
    # on a pandas Series test[t] is a label lookup, and the labels of the
    # tail slice do not start at 0.
    for observed in test:
        model_fit = ARIMA(history, order=arima_order).fit()
        predictions.append(model_fit.forecast()[0])
        history.append(observed)

    # calculate out of sample error
    return mean_squared_error(test, predictions)

In [None]:
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    """Grid-search ARIMA orders and print the one with the lowest MSE.

    Every (p, d, q) combination drawn from the three iterables is scored with
    ``evaluate_arima_model``. Each score is printed as it is obtained and the
    best order is printed at the end. Nothing is returned.

    Parameters
    ----------
    dataset : array-like exposing ``astype``
        Observations in time order; cast to float32 before scoring.
    p_values, d_values, q_values : re-iterable collections of int
        Candidate values for each component of the order.
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                except Exception as exc:
                    # Best-effort search: an order that cannot be scored is
                    # reported and skipped. Catching Exception rather than
                    # everything lets Ctrl-C / kernel interrupt stop the search.
                    print('ARIMA%s skipped: %s: %s' % (order, type(exc).__name__, exc))
                    continue
                if mse < best_score:
                    best_score, best_cfg = mse, order
                print('ARIMA%s MSE=%.3f' % (order, mse))
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

In [None]:
# load dataset
def parser(x):
    """Convert a '<year digit>-<month>' label into a datetime.

    The text '190' is prepended to ``x`` and the result is parsed with the
    '%Y-%m' format, e.g. '1-01' becomes 1901-01-01 00:00.
    """
    stamp = '190' + x
    return datetime.strptime(stamp, '%Y-%m')
# NOTE(review): this rebinds `series`, which an earlier cell set to the solar
# measurement column -- confirm the shampoo data set is really intended here.
# parse_dates=True is spelled out so that `parser` is applied to the index
# column; without it, date parsing depends on the pandas version's default.
series = pd.read_csv('shampoo-sales.csv', header=0, index_col=0, parse_dates=True, date_parser=parser).squeeze()

In [None]:
# evaluate parameters
# Candidate orders: 7 values of p x 3 of d x 3 of q = 63 combinations.
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
# Installs an "ignore" filter for all warnings; it stays in effect for later
# cells too, not only for the search below.
warnings.filterwarnings("ignore")
# Only the raw values are handed over, so the series index plays no part in the search.
evaluate_models(series.values, p_values, d_values, q_values)

In [None]:
# grid search ARIMA parameters for time series
import warnings
from math import sqrt
# `datetime` is taken from the standard library: the `pandas.datetime` alias
# was deprecated in pandas 1.0 and removed in 2.0, where importing it fails.
from datetime import datetime
from pandas import read_csv
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
 
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    """Score one ARIMA order with walk-forward validation.

    The first 66% of ``X`` seeds the history. For each remaining observation
    the model is refit on the history, a one-step forecast is recorded, and
    the true observation is then appended to the history.

    Parameters
    ----------
    X : sequence of numbers
        Observations in time order (list, NumPy array or pandas Series).
    arima_order : tuple
        The (p, d, q) order handed to ``ARIMA``.

    Returns
    -------
    The mean squared error between the held-out observations and their
    one-step forecasts, as computed by ``mean_squared_error``.
    """
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = list(train)

    # make predictions
    predictions = []
    # Walk over the held-out values themselves instead of indexing test[t]:
    # on a pandas Series test[t] is a label lookup, and the labels of the
    # tail slice do not start at 0.
    for observed in test:
        model_fit = ARIMA(history, order=arima_order).fit()
        predictions.append(model_fit.forecast()[0])
        history.append(observed)

    # calculate out of sample error
    return mean_squared_error(test, predictions)
 
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    """Grid-search ARIMA orders and print the one with the lowest MSE.

    Every (p, d, q) combination drawn from the three iterables is scored with
    ``evaluate_arima_model``. Each score is printed as it is obtained and the
    best order is printed at the end.

    Parameters
    ----------
    dataset : array-like exposing ``astype``
        Observations in time order; cast to float32 before scoring.
    p_values, d_values, q_values : re-iterable collections of int
        Candidate values for each component of the order.

    Returns
    -------
    None
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                except Exception as exc:
                    # Best-effort search: an order that cannot be scored is
                    # reported and skipped. Catching Exception rather than
                    # everything lets Ctrl-C / kernel interrupt stop the search.
                    print('ARIMA%s skipped: %s: %s' % (order, type(exc).__name__, exc))
                    continue
                if mse < best_score:
                    best_score, best_cfg = mse, order
                print('ARIMA%s MSE=%.3f' % (order, mse))
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
    return None
 
# load dataset
def parser(x):
    """Convert a '<year digit>-<month>' label into a datetime.

    The text '190' is prepended to ``x`` and the result is parsed with the
    '%Y-%m' format, e.g. '1-01' becomes 1901-01-01 00:00.
    """
    stamp = '190' + x
    return datetime.strptime(stamp, '%Y-%m')
# The single data column is squeezed into a Series with .squeeze("columns");
# the read_csv(squeeze=True) keyword was deprecated in pandas 1.4 and removed
# in 2.0, where passing it raises a TypeError.
series = read_csv('shampoo-sales.csv', header=0, index_col=0, parse_dates=True, date_parser=parser).squeeze("columns")
# evaluate parameters
# Candidate orders: 7 values of p x 3 of d x 3 of q = 63 combinations.
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
# Installs an "ignore" filter for all warnings raised from here on.
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)