In [4]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans
import os
from filter_01_6400_years import TSA64K
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

In [7]:
# restore a previously saved clustering model
def load_cluster_model(model_file):

    '''
    Rebuild a clustering model from its JSON dump.

    Arguments:
        model_file: path of the json file that stores the clustering model;
            it is passed unchanged to TimeSeriesKMeans.from_json. The
            original author expects the file to sit in the same folder as
            the notebook.
        ### future work: complete codes that can also read pkl file

    Return:
        clustering_model: whatever TimeSeriesKMeans.from_json returns for
            that file
    '''

    return TimeSeriesKMeans.from_json(model_file)

In [8]:
def calculate_ws(clustering_model, pred_csv, years = 6400, days_per_year = 364):

    '''
    calculate the demand frequency ws for the given data

    The rows returned by TSA64K.transform_origin_data are consumed in
    consecutive chunks of days_per_year rows, one chunk per simulation year.
    Inside each chunk a day falls into exactly one of three groups:
        * its values sum to 0,
        * its values sum to 24,
        * anything else; only these days are labelled with
          clustering_model.predict.
    Every group size is divided by days_per_year to obtain a frequency.

    Arguments:
        clustering_model: fitted model that provides predict(data) and the
            attribute n_clusters

        pred_csv: csv file that stores the data we are going to predict

        years: int, how many simulation years are we going to predict, default 6400

        days_per_year: int, how many consecutive rows form one simulation
            year, default 364

    Return:
        ws: np.ndarray with shape of (years, number of clusters + 2).
            Column 0 is the frequency of days that sum to 0, columns
            1..number of clusters are the cluster frequencies (in label
            order), and the last column is the frequency of days that sum
            to 24.

    '''

    # A day is treated as "full capacity" when its values add up to this
    # total (presumably 24 hourly capacity factors equal to 1 - TODO confirm
    # against TSA64K).
    full_day_total = 24

    # Use functions in TSA64K to get train_data
    metric = 'euclidean'
    tsa_task = TSA64K(pred_csv, metric, years)
    dispatch_array = tsa_task.read_data()
    tsa_task.read_input_pmax()
    train_data, _ = tsa_task.transform_origin_data(dispatch_array)

    # record the number of clusters
    num_clusters = clustering_model.n_clusters

    # one row per year: [sum-0 days, cluster 0, ..., cluster n-1, sum-24 days]
    ws = np.zeros((years, num_clusters + 2))

    for i in range(years):
        # NOTE(review): if train_data holds fewer than years * days_per_year
        # rows, the slice is short or empty and the row stays (partly) zero.
        year_data = np.asarray(train_data[i*days_per_year:(i+1)*days_per_year])

        # Sum every day once; summing day by day keeps the exact float
        # values that the equality checks below rely on.
        day_totals = np.array([day.sum() for day in year_data])
        zero_days = day_totals == 0
        full_days = day_totals == full_day_total

        ws[i, 0] = zero_days.sum() / days_per_year
        ws[i, -1] = full_days.sum() / days_per_year

        # just pred the days that are not 0/1.
        days_to_cluster = year_data[~(zero_days | full_days)]

        # In some cases the whole year is 0/1 capacity: nothing to predict,
        # the cluster frequencies simply stay 0.
        if len(days_to_cluster) == 0:
            continue

        labels = np.asarray(clustering_model.predict(days_to_cluster), dtype = np.intp)

        # minlength pads clusters that never occur; the slice drops any label
        # beyond the known clusters.
        cluster_counts = np.bincount(labels, minlength = num_clusters)[:num_clusters]
        ws[i, 1:-1] = cluster_counts / days_per_year

    return ws

In [9]:
def read_input_x(input_data_x, pred_csv, years = 6400):

    '''
    build the input matrix x that matches the prediction data

    Arguments:
        input_data_x: file that stores the input data; it is loaded with
            pd.read_hdf

        pred_csv: csv file that stores the data we are going to predict;
            it is only used to obtain data_index from TSA64K

        years: int, how many simulation years are we going to use, default 6400

    Return:
        x: np.ndarray with 8 columns and one row per entry of data_index
           (presumably one row per simulation year - TODO confirm against
           TSA64K.transform_origin_data)

    '''
    # cols: from jordan's code (positional column indices of the input table)
    feature_cols = [1,2,3,4,5,6,7,9]

    # Run the TSA64K pipeline; only the row index it reports is needed here.
    task = TSA64K(pred_csv, 'euclidean', years)
    raw_dispatch = task.read_data()
    task.read_input_pmax()
    _, kept_rows = task.transform_origin_data(raw_dispatch)

    inputs_table = pd.read_hdf(input_data_x)

    # rows: data_index
    selected = inputs_table.iloc[kept_rows, feature_cols]

    return selected.to_numpy()

In [10]:
# Files consumed by read_input_x; relative names, so they are looked up from
# the current working directory.
inp_file = 'prescient_generator_inputs.h5'
pred_csv = 'Dispatch_shuffled_data_0.csv'

# Input matrix for the default number of simulation years (years = 6400).
x = read_input_x(input_data_x = inp_file, pred_csv = pred_csv)