In [53]:
import pandas as pd
import numpy as np
import os
import json as json

In [3]:
def col_names(df):
    ''' Rename the verbose source columns to short, readable names.

    The frame is modified in place and is also returned, so the call works
    both as a statement and as an expression. Columns that do not appear in
    the mapping keep their name. The 'Date' column is not moved into the
    index here. '''
    readable_names = dict([
        ('Photovoltaics [MWh] Original resolutions', 'Solar_generation_MWh'),
        ('Photovoltaics [MW] Calculated resolutions', 'Solar_installed_MW'),
        ('Total (grid load) [MWh] Original resolutions', 'Total_consumption_MWh'),
        ('Germany/Luxembourg [€/MWh] Calculated resolutions', 'DE_LU_price_per_MWh'),
    ])
    # inplace=True: the caller's frame itself is renamed, not a copy
    df.rename(columns=readable_names, inplace=True)
    return df


In [4]:
def test_train_timeseries(df, target):
    ''' In the first part we select the train and test data.
    In the second per the columns we want to use for our predictions '''
    
    test = df[df.Date >= '2022-06-01']
    train = df[df.Date < '2022-06-01'] #alternatively use 2022-05-25

    # now we select the columns we want to use for our predictions

    test = test[target]
    train = train[target]
    return test, train



In [5]:
# split a univariate sequence into smaller samples to feed into the LSTM
def split_sequence(input, n_steps, pred_size, target=None):
    ''' Split a time series into supervised (X, y) snippets.

    input     = dataframe to be split; must contain a 'Date' column
    n_steps   = length of each X window (number of rows)
    pred_size = length of each y window (the rows directly after the X window)
    target    = list of columns to be split (default: no columns)

    Returns four arrays: the X windows, the y windows, the 'Date' values of
    every X window and the 'Date' values of every y window. Both y arrays are
    passed through np.squeeze, so every length-1 axis is removed from them
    (including the sample axis when only a single window exists). '''
    if target is None:  # None instead of a mutable [] default
        target = []

    # Convert once up front; slicing the dataframe inside the loop is far slower.
    values = np.asarray(input[target])
    dates = np.asarray(input['Date'])

    # Each window consumes n_steps + pred_size consecutive rows. A window that
    # ends exactly on the last row is still complete, so it is included.
    n_windows = max(len(input) - n_steps - pred_size + 1, 0)

    x, y = [], []
    x_index, y_index = [], []
    for start in range(n_windows):
        end_ix = start + n_steps        # end of the input part (exclusive)
        out_end_ix = end_ix + pred_size  # end of the output part (exclusive)
        x.append(values[start:end_ix])
        y.append(values[end_ix:out_end_ix])
        x_index.append(dates[start:end_ix])
        y_index.append(dates[end_ix:out_end_ix])

    return np.array(x), np.squeeze(np.array(y)), np.array(x_index), np.squeeze(np.array(y_index))

In [6]:
# Load the project dataframe from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle(filepath_or_buffer="../data/final_dataframe.pkl")

In [7]:
# Columns kept in both partitions: the timestamp and the normalized solar generation.
target = ['Date', 'Solar_generation_MWh_normalized']
test, train = test_train_timeseries(df=df, target=target)

# Store both partitions as pickle files in the data folder.
test.to_pickle(path='../data/test.pkl')
train.to_pickle(path='../data/train.pkl')

In [8]:
# Window the training partition into supervised samples.
# The frame is handed to split_sequence directly instead of being bound to a
# notebook-level variable called `input`, which would shadow Python's builtin
# input() for every cell that runs afterwards.

# number of past time steps per sample
# (672 = 7 * 96; presumably one week at 15-minute resolution -- TODO confirm)
n_steps = 672
# number of future time steps predicted per sample
# (presumably one day at 15-minute resolution -- TODO confirm)
pred_size = 96

# only the normalized solar generation is windowed
target = ['Solar_generation_MWh_normalized']

# split into samples
X, y, X_train_index, Y_train_index = split_sequence(train, n_steps, pred_size, target)

In [10]:
# Window the test partition with the window sizes and target column that are
# currently defined in the notebook.
X_test, y_test, X_test_index, Y_test_index = split_sequence(
    input=test, n_steps=n_steps, pred_size=pred_size, target=target
)

## Create a dataframe with the input sequences 

In [61]:
def _object_series(items):
    ''' Store every item as one cell of an object Series, so pandas does not
    try to stack the arrays into a multi-dimensional block. '''
    cells = np.empty(len(items), dtype=object)
    for pos, item in enumerate(items):
        cells[pos] = item
    return pd.Series(cells)


# we make a new df containing the input arrays and the timestamp from which we want to predict

def input_df(X, index, y):
    ''' Collect every input window, its target window and the last timestamp
    of the input window in one dataframe.

    X     = array of input windows; it is reshaped to (n_samples, n_steps), so
            it must hold exactly one value per time step
    index = 2-D array with the timestamps of every input window
    y     = array of target windows, one row per sample

    Returns a dataframe with the columns
    'Date'   -> last timestamp of the input window,
    'input'  -> array of shape (1, n_steps, 1),
    'output' -> 1-D array with the target window.
    The window length is taken from X.shape[1], so any n_steps works. '''
    n_samples, n_steps = X.shape[0], X.shape[1]
    windows = X.reshape(n_samples, n_steps)
    targets = pd.DataFrame(y).to_numpy()  # same row layout pandas gives y

    df_date = pd.DataFrame(index[:, -1], columns=['Date'])
    # Series assignment aligns on the default integer index of df_date
    df_date['input'] = _object_series([window.reshape(1, n_steps, 1) for window in windows])
    df_date['output'] = _object_series(list(targets))
    return df_date

In [62]:
# Bundle the test windows, their targets and the last input timestamp of each
# window into a single dataframe.
df_input = input_df(X=X_test, index=X_test_index, y=y_test)

In [65]:
# Pick a single example (row position 1, i.e. the second window) and turn its
# arrays into plain nested lists.
# NOTE(review): `input` shadows the builtin of the same name; the name is kept
# because the JSON export cell reads it.
input = df_input['input'].iloc[1].tolist()

output = df_input['output'].iloc[1].tolist()

In [69]:
# Write the example input window and its expected output to separate JSON files.
with open('../data/input.json', mode='w') as input_handle:
    json.dump(input, fp=input_handle)

with open('../data/output.json', mode='w') as output_handle:
    json.dump(output, fp=output_handle)

In [60]:
# Export the complete table of test samples to one JSON file.
df_input.to_json(path_or_buf='../data/test_input_solar.json')