# Model description
The maintenance and failure behaviour of generators is modeled as a Markov chain: at each change of state a random value is generated to determine how much time the generator spends in the new state and at what maximum capacity.
The state transition probabilities are obtained by counting, for each possible transition, the number of times it occurs in the given dataset (df) and dividing by the total number of transitions with the same starting state. The time spent in each state is modeled independently for each type of generator by fitting the data with an exponential distribution. The capacity distribution is instead approximated by a kernel density estimator.

## Dataset

The dataset used for modeling is sourced from [ENTSOE](https://transparency.entsoe.eu/outage-domain/r2/unavailabilityOfProductionAndGenerationUnits/show?name=&defaultValue=true&viewType=TABLE&areaType=CTA&atch=false&dateTime.dateTime=28.02.2024+00:00|UTC|DAY&dateTime.endDateTime=01.03.2024+00:00|UTC|DAY&CTY|10YAT-APG------L|MULTI=CTY|10YAT-APG------L|MULTI&area.values=CTY|10YAT-APG------L!CTA|10YAT-APG------L&assetType.values=PU&assetType.values=GU&outageType.values=A54&outageType.values=A53&outageStatus.values=A05&masterDataFilterName=&masterDataFilterCode=&dv-datatable_length=10).

In [3]:
#Helper functions


def get_week(date):
    """
    input: date, an object exposing a `week` attribute
           (presumably a pandas Timestamp - verify against callers)
    output: the value of that attribute, i.e. the week of the year of the date
    """
    week_of_year = date.week
    return week_of_year


def truncate(number, digits) -> float:
    """
    input: number (int or float) and digits, the number of decimals to keep
    output: number cut (not rounded) to at most `digits` decimals; a number
            that already has `digits` decimals or fewer is returned unchanged
    """
    text = str(number)
    # Exponent notation ("1e-07", "1.5e-07") cannot be measured by counting the
    # characters after the '.', so it always goes through the arithmetic path.
    if "e" not in text.lower():
        # partition never fails: integers, "nan" and "inf" simply give an
        # empty fractional part
        fraction = text.partition(".")[2]
        # Returning short numbers untouched avoids floating point artefacts
        # such as truncate(16.4, 2) = 16.39 or truncate(-1.13, 2) = -1.12
        if len(fraction) <= digits:
            return number
    stepper = 10.0 ** digits
    return math.trunc(stepper * number) / stepper

In [4]:
import powerplantmatching as pm
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
from scipy.stats import expon
from scipy.optimize import curve_fit
from astropy.visualization import hist
#for fitting:
from scipy.stats import expon, rv_discrete
from scipy.optimize import curve_fit
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import StandardScaler
#for plotting Markov Chain graph
import networkx as nx
import matplotlib.pyplot as plt


#helper functions

def import_df(path, cols = ['StartTS', 'EndTS', 'TimeZone', 'Status', 'Type', 'AreaCode',
       'AreaTypeCode', 'AreaName', 'MapCode', 'PowerResourceEIC', 'UnitName',
       'ProductionType', 'InstalledCapacity', 'AvailableCapacity',
       'Reason']):
    """
    Reads the outages table and removes the repeated records.
    path: string containing the path of the tab separated csv file with the table
    cols: list of column names to keep (must include "UnitName" and "StartTS",
          which are used to detect repeated records)
    returns: dataframe restricted to cols, with a single row for each
             (UnitName, StartTS) pair; no filter on "Reason" is applied
    """
    #the first two columns of the file are parsed as dates
    raw_table = pd.read_csv(path, sep = "\t", parse_dates = [0,1])
    selected = raw_table[cols]
    #rows sharing unit and start time are redundant: only the first one is kept
    return selected.drop_duplicates(subset = ["UnitName","StartTS"])


def markov_graph(transitions, seed = 42, digits = 4, title = ""):
    """
    Draws the Markov chain described by transitions and shows it with matplotlib.
    input:
    transitions: a dictionary having as
        keys: tuples with 2 elements, the from state and the to state
        values: the transition probability
    seed: seed of the spring layout, so that the same input gives the same drawing
    digits: number of decimals kept in the probabilities written on the edges
    title: title of the figure
    output: None, the markov chain graph is displayed
    """
    G = nx.MultiDiGraph()

    for transition, probability in transitions.items():
        state_from, state_to = transition
        if probability != 0: 
        #if probability state_from to state_to is not 0 we add an edge to the graph
            G.add_edge(state_from, state_to, weight=truncate(probability, digits))

    #create positions of nodes: dictionary with coordinates.
    #seed must be passed by keyword: the second positional parameter of
    #spring_layout is k (the node spacing), not the seed
    pos = nx.spring_layout(G, seed=seed) 

    # Increase the scale to avoid overlap
    pos = {k: [v[0] * 2, v[1] * 2] for k, v in pos.items()}

    labels = nx.get_edge_attributes(G, 'weight')
    nx.draw(G, pos, with_labels=True, node_size=700, node_color='skyblue', font_size=8, font_color='black',
            connectionstyle='arc3,rad=0.1')

    # Annotate edges manually with adjusted positions to avoid overlap
    for edge, weight in labels.items():
        (x, y) = pos[edge[0]]
        #the label sits at one quarter of the way from the source to the target node
        text_x = 3/4*x + 1/4*pos[edge[1]][0]
        text_y = 3/4*y + 1/4*pos[edge[1]][1]
        #shift text of self loops to avoid overlap with the node
        text_y += 0.2 if edge[0] == edge[1] else 0


        plt.text(text_x, text_y, f"{weight}", fontsize=8, color='blue', verticalalignment='center',
                 horizontalalignment='center')
    plt.title(title)
    plt.show()
    
    
def combine_overlaps(df):
    """
    this function resolves the time overlaps present in the dataframe for each generator
    so that for every time t there is at most one row describing the generator at time t.
    df: dataframe containing UnitName, StartTS, EndTS (it is not modified)
    returns: new dataframe sorted by UnitName and StartTS; when a row starts
             before the end of the kept row of the same unit, the kept row is
             cut at that start and the overlapping row is discarded
    """

    # Step 1: Sort a copy, so that the dataframe of the caller is left untouched
    ordered = df.sort_values(by=["UnitName", "StartTS"])

    n_rows = ordered.shape[0]
    if n_rows == 0:
        # nothing to combine: return the (empty) sorted copy
        return ordered
    # progress is printed about every 5% of the rows; at least every row,
    # otherwise fewer than 100 rows would give a modulo by zero
    perc = max(n_rows // 100 * 5, 1)

    # Step 2 and 3: Combine overlapping intervals
    result = []
    current_interval = None

    # the progress counter is the position in the sorted frame, not the index label
    for position, (_, row) in enumerate(ordered.iterrows()):
        if position % perc == 0:
            print(f"percentage of rows parsed = {position / n_rows *100:.2f}%")
        if current_interval is None:
            current_interval = row.copy()
        elif row["StartTS"] >= current_interval["EndTS"] or row["UnitName"] != current_interval["UnitName"]:
            # No overlap or new UnitID
            result.append(current_interval)
            current_interval = row.copy()
        else:
            # Overlapping intervals, update the EndTS
            # NOTE(review): the kept interval is shortened to the start of the
            # overlapping row and that row is dropped; a merge would instead take
            # the later of the two EndTS - confirm which one is intended
            current_interval["EndTS"] = row["StartTS"]

    # the interval still open at the end of the loop belongs to the result too
    result.append(current_interval)

    result_df = pd.DataFrame(result)

    return result_df


def get_markov_probs(df, states_column):
    """
    input:
    df: dataframe having as columns: states_column, "UnitName", "StartTS", "RunningTime"
    states_column: string with name of column where the state of the generator is saved
    output: dictionary having as keys tuples (from state, to state), for every
            pair of states plus "Running", and as values the transition
            probability; transitions leaving a state that is never left are 0
    """
    states = list(df[states_column].unique())
    if "Running" not in states:
        states.append("Running")
    #every ordered pair of states starts with zero occurrences
    transitions_counter = {(x, y): 0 for x in states for y in states}

    #a gap between two rows longer than this counts as a passage through "Running".
    #NOTE(review): "RunningTime" is computed in hours elsewhere in this file, while
    #10 / (60 * 24) reads like 10 minutes expressed in days - confirm the unit
    min_running_gap = 10 / (60 * 24)

    for _, unit_df in df.groupby("UnitName"):
        unit_df = unit_df.sort_values(["StartTS"])
        #each generator starts from "Running": the last state of the previous
        #generator must not leak into the transitions of this one
        previous_state = "Running"
        #count transition occurrences for the unit
        for _, row in unit_df.iterrows():
            uptime = row["RunningTime"]
            #get current state from row
            current_state = row[states_column]

            if pd.isna(uptime):
                #a missing uptime marks the first recorded instance of the generator
                #(or a gap stored as missing), so before it the generator was running
                previous_state = "Running"
            elif uptime > min_running_gap and previous_state != "Running":
                #the generator ran for some time after the previous row:
                #count previous state --> Running before Running --> current state
                transitions_counter[(previous_state, "Running")] += 1
                previous_state = "Running"

            transitions_counter[(previous_state, current_state)] += 1
            #the current state becomes the previous_state
            previous_state = current_state

    #total number of transitions leaving each state
    outgoing = dict.fromkeys(states, 0)
    for (state_from, _), counter in transitions_counter.items():
        outgoing[state_from] += counter

    #get the transition probabilities; a state with no outgoing transition
    #keeps its (zero) counters to avoid dividing by zero
    transitions_probs = {}
    for transition, counter in transitions_counter.items():
        total = outgoing[transition[0]]
        transitions_probs[transition] = counter / total if total != 0 else counter
    return transitions_probs

def weighted_values(values, probabilities, size):
    """
    Draws random elements of values, the i-th one with probability probabilities[i].
    values: sequence of candidate values (list or numpy array), selected by position
    probabilities: sequence with one probability for each value, expected to sum to 1
    size: number (or shape) of samples, passed to np.random.random_sample
    returns: numpy array with the drawn values
    """
    values = np.asarray(values)  #a plain list cannot be indexed with an array
    bins = np.add.accumulate(probabilities)
    drawn = np.digitize(np.random.random_sample(size), bins)
    #if the cumulative sum ends slightly below 1 (float round-off) a sample may
    #fall past the last bin: it is assigned to the last value
    drawn = np.minimum(drawn, len(values) - 1)
    return values[drawn]
    



In [5]:
#Load the outages dataframe (ENTSO-E format) used to construct the model
data_path = "../outagesmodelingdata/"
outages_csv = data_path+"deltaWithEverything_df.csv"
df = pd.read_csv(outages_csv, parse_dates = [0,1])
#make sure both timestamp columns hold datetimes
for ts_column in ("StartTS", "EndTS"):
    df[ts_column] = pd.to_datetime(df[ts_column])
df = df.sort_values(["UnitName","StartTS"])

#Resolve the overlaps so that every generator is in only one state at a time
df = combine_overlaps(df)
#keep the first two characters of MapCode, so that the regions of a country
#share the code of the corresponding country
df["MapCode"] = df["MapCode"].map(lambda code: code[:2])



percentage of rows parsed = 0.00%
percentage of rows parsed = 5.00%
percentage of rows parsed = 10.00%
percentage of rows parsed = 15.00%
percentage of rows parsed = 20.00%
percentage of rows parsed = 25.00%
percentage of rows parsed = 30.00%
percentage of rows parsed = 35.00%
percentage of rows parsed = 39.99%
percentage of rows parsed = 44.99%
percentage of rows parsed = 49.99%
percentage of rows parsed = 54.99%
percentage of rows parsed = 59.99%
percentage of rows parsed = 64.99%
percentage of rows parsed = 69.99%
percentage of rows parsed = 74.99%
percentage of rows parsed = 79.99%
percentage of rows parsed = 84.99%
percentage of rows parsed = 89.99%
percentage of rows parsed = 94.99%
percentage of rows parsed = 99.99%


In [7]:

#statetime_df: dataframe with the parameters of the exponential distribution of the time spent in each state
#markov_d: dictionary of Markov chains of the various generator types
#capacity_d: dictionary of distributions of capacity in p.u.


#STATE TIME MODELING


#Change state_column to look at different state distributions
state_column = "Type"
#def state_time_distribution_fitting(df, state_column):
states = list(df[state_column].unique())
#sort_values returns a new dataframe, so df itself is not modified below
delta_df = df.sort_values(by = ["UnitName", "StartTS"])

#duration of every row in hours
row_hours = (delta_df["EndTS"] - delta_df["StartTS"]) / np.timedelta64(1, 'h')
for state in states:
    #time spent in each state: the duration of the rows in that state, NaN on
    #all the other rows (float column, no datetime/timedelta mixing)
    delta_df[state+"Time"] = row_hours.where(delta_df[state_column] == state)

#time in hours between the end of the previous row of the same unit and the
#start of the current one; NaN on the first row of each unit
previous_end = delta_df.groupby("UnitName")["EndTS"].shift()
delta_df["RunningTime"] = (delta_df["StartTS"] - previous_end) / np.timedelta64(1, 'h')
#gaps of exactly zero hours are stored as missing
delta_df.loc[delta_df["RunningTime"] == 0, "RunningTime"] = np.nan

states = states + ["Running"]


#fit outages distributions
grouped_delta = delta_df.groupby(["ProductionType"])
#parameter table: one row for each production type and one "<state>Time"
#column for each state, all of them NaN until a value is fitted
production_types = grouped_delta.first().reset_index()["ProductionType"]
statetime_df = pd.DataFrame({"ProductionType": production_types})
for state in states:
    statetime_df[state + "Time"] = np.full(len(statetime_df), np.nan)


def exponential_fit(x, scale):
    """
    Probability density of an exponential distribution.
    x: point(s) where the density is evaluated
    scale: scale parameter forwarded to scipy.stats.expon
    returns: the value of the density at x
    """
    distribution = expon(scale=scale)
    return distribution.pdf(x)


for group_name, group_df in grouped_delta:
    #grouping by a one-element list gives 1-tuples as keys in recent pandas and
    #plain labels in older versions: both are accepted, so that [0] is never
    #applied to the label itself (it would take its first character)
    group_label = group_name[0] if isinstance(group_name, tuple) else group_name
    for state in states:
        #the sample mean is the MLE of the scale of an exponential distribution
        state_scale = group_df[state+"Time"].mean()
        #the mean is NaN when the group has no observation for this state:
        #the parameter is then left as NaN
        if not pd.isna(state_scale):
            statetime_df.loc[statetime_df["ProductionType"] == group_label, state+"Time"] = state_scale


            
#MARKOV CHAIN MODELING
print("Starting Markov chain modeling")
#rows whose Reason is "Shutdown" are left out of the chains
is_shutdown = delta_df["Reason"] == "Shutdown"
df = delta_df[~is_shutdown]

#markov_d maps each production type to its transition probabilities, a
#dictionary keyed by tuples representing the state changes (x,y) := x --> y
markov_d = dict()

GenTypeGroup_df = df.groupby("ProductionType")

#We can use different states
#one markov chain for each type of generator
for production_type, data in GenTypeGroup_df:
    #data holds the rows of a single production type
    transitions_probs = get_markov_probs(data, "Type")
    markov_d[production_type] = transitions_probs
    #uncomment to plot:
    #markov_graph(transitions_probs, title = f"{production_type} Markov Chain")

#t_probs = dict(transitions_probs)
#for key, value in transitions_probs.items():
#    if value < 0.05:
#        del t_probs[key]
#print graph


#CAPACITY MODELING - the parameters for KDE correspond to the input data.
print("Starting Capacity Modeling")
#available capacity as a fraction of the installed capacity; assign() builds a
#new dataframe instead of writing into the filtered selection held by df
df = df.assign(**{"p.u.": df["AvailableCapacity"] / df["InstalledCapacity"]})
#rows without a defined ratio are dropped (the column name is "p.u.")
df = df[df["p.u."].notna()]
kernel_df = df[["ProductionType", "Type", "p.u."]]

#SAVE CALCULATED PARAMETERS:
kernel_df.to_csv("kernel_capacity_df.csv", index = False)
statetime_df.to_csv("exponential_statetime_df.csv", index = False)
#markov_d is a dictionary, so np.save stores it pickled: reading it back
#requires np.load(..., allow_pickle=True)
np.save("markov_state_change_d.npy",markov_d)


Starting Markov chain modeling
Starting Capacity Modeling


In [8]:
#Save models
#to_csv is called with its default index setting here, so the row index is
#written as the first column of both files
for frame, csv_name in ((kernel_df, "kernel_capacity_df.csv"),
                        (statetime_df, "exponential_statetime_df.csv")):
    frame.to_csv(csv_name)
np.save("markov_state_change_d.npy",markov_d)
