In [None]:
import powerplantmatching as pm
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
from scipy.stats import expon
from scipy.optimize import curve_fit
from astropy.visualization import hist
#for fitting:
from scipy.stats import expon, rv_discrete
from scipy.optimize import curve_fit
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import StandardScaler
#for plotting Markov Chain graph
import networkx as nx
import matplotlib.pyplot as plt


#helper functions

def import_df(path, cols=None):
    """
    Import and preprocess an outages table.

    path: string containing the path of the tab-separated csv file
    cols: iterable of column names to keep in the dataframe; when None the
          default outage columns listed below are used
    returns: dataframe restricted to `cols`, with one row per
             (UnitName, StartTS) pair (the first occurrence is kept)

    Note: no filter on "Reason" is applied, every outage reason is kept.
    """
    if cols is None:
        # Built on every call so the default can never be shared or mutated
        # between calls.
        cols = ['StartTS', 'EndTS', 'TimeZone', 'Status', 'Type', 'AreaCode',
                'AreaTypeCode', 'AreaName', 'MapCode', 'PowerResourceEIC',
                'UnitName', 'ProductionType', 'InstalledCapacity',
                'AvailableCapacity', 'Reason']

    # The first two columns of the file are parsed as dates.
    df = pd.read_csv(path, sep="\t", parse_dates=[0, 1])
    # list(...) lets callers pass a tuple or any other iterable of names.
    df = df[list(cols)]
    # Deletes redundant rows describing the same unit at the same start time.
    df = df.drop_duplicates(subset=["UnitName", "StartTS"])
    # To restrict to a given reason, filter the result, e.g.
    # df[df["Reason"] == "Failure"] or df[df["Reason"].isin(reasons)].
    return df

def get_week(date):
    """
    Return the week number of a date.

    date: object exposing a ``week`` attribute (e.g. a pandas Timestamp)
    returns: the value of ``date.week``
    """
    week_of_year = date.week
    return week_of_year


def truncate(number, digits) -> float:
    """
    Truncate (towards zero) a number to a given amount of decimals.

    The decimal expansion is taken from str(number), which avoids floating
    point artefacts such as truncate(16.4, 2) = 16.39 or
    truncate(-1.13, 2) = -1.12.

    number: int or float to truncate
    digits: int, number of decimals to keep
    returns: `number` itself when it has at most `digits` decimals (or is
             NaN / infinite), otherwise the truncated value as a float
    """
    # Local import: decimal is not part of the file-level imports.
    from decimal import Decimal, ROUND_DOWN

    if not math.isfinite(number):
        return number

    # Decimal(str(...)) also understands integers ("3") and scientific
    # notation ("1e-05"), which a split on "." cannot handle.
    exact = Decimal(str(number))
    if exact.as_tuple().exponent >= -digits:
        # Already has no more than `digits` decimals.
        return number

    quantum = Decimal(1).scaleb(-digits)
    return float(exact.quantize(quantum, rounding=ROUND_DOWN))

def markov_graph(transitions, seed = 42, digits = 4, title = ""):
    """
    Draw a Markov chain as a directed graph.

    input:
    transitions: a dictionary having as
        keys: tuples with 2 elements being the from state and the to state
        values: the transition probability
    seed: seed of the spring layout, so node positions are reproducible
    digits: number of decimals kept in the edge labels
    title: title of the figure
    output: None, the markov chain graph is shown with plt.show()
    """
    G = nx.MultiDiGraph()

    for (state_from, state_to), probability in transitions.items():
        if probability != 0:
            # only transitions with non-zero probability become edges
            G.add_edge(state_from, state_to, weight=truncate(probability, digits))

    # create positions of nodes: dictionary with coordinates.
    # seed must be passed by keyword: the second positional parameter of
    # spring_layout is k (the node spacing), not the seed.
    pos = nx.spring_layout(G, seed=seed)

    # Increase the scale to avoid overlap
    pos = {k: [v[0] * 2, v[1] * 2] for k, v in pos.items()}

    labels = nx.get_edge_attributes(G, 'weight')
    nx.draw(G, pos, with_labels=True, node_size=700, node_color='skyblue', font_size=8, font_color='black',
            connectionstyle='arc3,rad=0.1')

    # Annotate edges manually with adjusted positions to avoid overlap
    for edge, weight in labels.items():
        (x, y) = pos[edge[0]]
        # the label sits at 1/4 of the way from the source to the target node
        text_x = 3/4*x + 1/4*pos[edge[1]][0]
        text_y = 3/4*y + 1/4*pos[edge[1]][1]
        # shift the label of self loops, otherwise it lands on the node
        text_y += 0.2 if edge[0] == edge[1] else 0

        plt.text(text_x, text_y, f"{weight}", fontsize=8, color='blue', verticalalignment='center',
                 horizontalalignment='center')
    plt.title(title)
    plt.show()
    
    
def combine_overlaps(df):
    """
    this function combines any time overlaps present in the dataframe for each generator
    so that for every time t there is at most one row describing the generator at time t.
    df: dataframe containing UnitName, StartTS, EndTS
    returns: new dataframe with the kept rows, ordered by UnitName and StartTS;
             the input dataframe is left untouched
    """

    # Step 1: Sort a copy of the DataFrame (the caller's frame is not modified)
    ordered = df.sort_values(by=["UnitName", "StartTS"])
    n_rows = ordered.shape[0]
    if n_rows == 0:
        return ordered

    # Report progress roughly every 5% of the rows; never 0, so frames with
    # fewer than 100 rows do not trigger a modulo by zero.
    report_every = max(1, n_rows // 100 * 5)

    # Step 2 and 3: Combine overlapping intervals
    result = []
    current_interval = None

    # Progress is tracked by position: index labels are not sequential
    # once the frame has been sorted.
    for position, (_, row) in enumerate(ordered.iterrows()):
        if position % report_every == 0:
            print(f"percentage of rows parsed = {position / n_rows *100:.2f}%")
        if current_interval is None:
            current_interval = row.copy()
        elif row["StartTS"] >= current_interval["EndTS"] or row["UnitName"] != current_interval["UnitName"]:
            # No overlap or new UnitName: the current interval is complete
            result.append(current_interval)
            current_interval = row.copy()
        else:
            # Overlapping intervals, update the EndTS
            # NOTE(review): the kept row is shortened to end where the
            # overlapping row starts and the overlapping row is discarded;
            # confirm this is the intended merge rule.
            current_interval["EndTS"] = row["StartTS"]

    # The interval still open when the loop ends must be kept as well.
    if current_interval is not None:
        result.append(current_interval)

    result_df = pd.DataFrame(result)

    return result_df


def get_markov_probs(df, states_column):
    """
    Estimate the transition probabilities of a Markov chain from outage records.

    input:
    df: dataframe having as columns: states_column, "UnitName", "StartTS", "RunningTime"
    states_column: string with name of column where the state of the generator is saved
    output: dictionary having as keys tuples with two states (from, to) and as
            value the associated transition probability. Transitions leaving a
            state that was never observed keep the value 0.
    """
    states = list(df[states_column].unique())
    # "Running" is the implicit state between two records; add it only once,
    # otherwise its outgoing transitions would be counted twice below.
    if "Running" not in states:
        states.append("Running")

    transitions_counter = {(x, y): 0 for x in states for y in states}

    # 10 minutes, presumably expressed in days like "RunningTime" - TODO confirm
    running_threshold = 10 / (60 * 24)

    # NOTE(review): previous_state is only reset by a missing RunningTime, so
    # it carries over from one unit to the next when the first row of a unit
    # has a RunningTime value.
    previous_state = "Running"
    for unit_name, unit_df in df.groupby("UnitName"):
        unit_df = unit_df.sort_values(["StartTS"])
        # count transition occurrences for the unit
        for index, row in unit_df.iterrows():
            uptime = row["RunningTime"]
            current_state = row[states_column]

            if pd.isna(uptime):
                # a missing RunningTime marks the first recorded instance of
                # the generator, so before it the generator was running
                previous_state = "Running"
            elif uptime > running_threshold:
                # the generator ran between the previous row and this one:
                # count previous_state -> Running, then continue from Running
                transitions_counter[(previous_state, "Running")] += 1
                previous_state = "Running"

            transitions_counter[(previous_state, current_state)] += 1
            # the current state becomes the previous_state
            previous_state = current_state

    # total number of transitions leaving each state
    counter_dict = dict.fromkeys(states, 0)
    for (state_from, _), counter in transitions_counter.items():
        counter_dict[state_from] += counter

    # get the transition probabilities (the counter dictionary is reused)
    transitions_probs = transitions_counter
    for transition in transitions_probs:
        total = counter_dict[transition[0]]
        if total != 0:
            # transition[0] occurs at least one time
            transitions_probs[transition] = transitions_probs[transition] / total
    return transitions_probs

def weighted_values(values, probabilities, size):
    """
    Draw random elements of `values` according to `probabilities`.

    values: numpy array of candidate values
    probabilities: array-like with one weight per value, expected to sum to 1
    size: number of draws (passed to np.random.random_sample)
    returns: numpy array with the drawn values
    """
    # cumulative probabilities act as the upper edges of each value's bin
    bins = np.add.accumulate(probabilities)
    picks = np.digitize(np.random.random_sample(size), bins)
    # When rounding leaves the last edge slightly below 1, a draw can fall
    # past it; assign it to the last value instead of indexing out of range.
    picks = np.minimum(picks, len(values) - 1)
    return values[picks]

def next_state_markov(markov, current_state):
    """
    Draw the state following `current_state` in a Markov chain.

    markov: dictionary mapping (state_from, state_to) tuples to the
            transition probability
    current_state: state the chain is currently in
    returns: one state drawn with weighted_values among the states reachable
             from current_state with non-zero probability

    Note: a state without any outgoing transition is not handled here; the
    draw is then made on empty arrays.
    """
    outgoing = [
        (transition[1], prob)
        for transition, prob in markov.items()
        if transition[0] == current_state and prob != 0
    ]
    reachable_states = np.array([target for target, _ in outgoing])
    reachable_probs = np.array([prob for _, prob in outgoing])
    drawn = weighted_values(reachable_states, reachable_probs, 1)
    return drawn[0]

def get_gen_type(geo_df):
    """
    Translate the generator type of a unit into the ENTSO-E production type.

    geo_df: row (or any mapping) of the geo dataframe with the keys
            "Fueltype" and "Technology"
    returns: string with the ENTSO-E production type (note the trailing
             space of every label); "Other " when no match is found
    """
    geo_to_entsoe_gen_d = {
        "Hard Coal": "Fossil Hard coal ",
        'Lignite': 'Fossil Brown coal/Lignite ',
        'Oil': "Fossil Oil ",
        'Waste': "Waste ",
        'Natural Gas': "Fossil Gas ",  # or maybe something else belongs here
        'Nuclear': "Nuclear ",
        'Other': "Other ",
        'Solar': "Solar ",
        # geo has a single wind type (no onshore/offshore split); it is mapped to onshore
        'Wind': "Wind Onshore ",
        'Geothermal': "Geothermal ",
    }
    # Hydro units are told apart by their technology instead of their fuel
    hydro_to_entsoe_gen_d = {
        "Reservoir": 'Hydro Water Reservoir ',
        "Run-Of-River": 'Hydro Run-of-river and poundage ',
        "Pumped Storage": 'Hydro Pumped Storage ',
    }

    fuel_type = geo_df["Fueltype"]
    gen_type = geo_df["Technology"]
    if fuel_type == "Hydro":
        entsoe_type = hydro_to_entsoe_gen_d.get(gen_type)
    else:
        entsoe_type = geo_to_entsoe_gen_d.get(fuel_type)

    if entsoe_type is None:
        # Also reached by Hydro units with an unknown technology, which would
        # otherwise yield None instead of a production type.
        print(f"Generetor {gen_type},{fuel_type} not found classfied as Other")
        return "Other "
    return entsoe_type





In [None]:
# Load the outages dataframe (ENTSO-E format) used to construct the model
data_path = "../outagesmodelingdata/"
outages_csv = data_path + "deltaWithEverything_df.csv"
df = pd.read_csv(outages_csv, parse_dates=[0, 1])
for ts_column in ("StartTS", "EndTS"):
    df[ts_column] = pd.to_datetime(df[ts_column])
df = df.sort_values(["UnitName", "StartTS"])

# Combine overlaps so that every generator is in only one state at a time
df = combine_overlaps(df)

# Combine countries: only the first two characters of MapCode are kept
df["MapCode"] = df["MapCode"].apply(lambda code: code[0:2])



In [None]:
# Mean of the scenario columns grouped by (Country, Fueltype), plotted for Germany.
scenario_by_time = scenario.set_index("TimeStamp")
project_info = geo.set_index("projectID")
project_ids = pd.Series(scenario_by_time.columns)

# Columns are grouped through the transposed frame: groupby(axis=1) is
# deprecated in recent pandas versions.
scenario_by_time.T.groupby(
    [
        project_ids.map(project_info.Country).values,
        project_ids.map(project_info.Fueltype).values,
    ]
).mean().T["Germany"].plot()

In [None]:
# Mean of the scenario columns grouped by (Country, Fueltype), shown for Austria.
# columns[1:] skips the first column, presumably "TimeStamp" - TODO confirm
gen_series = pd.Series(scenario.columns[1:])
scenario_by_time = scenario.set_index("TimeStamp")
project_info = geo.set_index("projectID")

# Columns are grouped through the transposed frame: groupby(axis=1) is
# deprecated in recent pandas versions.
scenario_by_time.T.groupby(
    [
        gen_series.map(project_info.Country).values,
        gen_series.map(project_info.Fueltype).values,
    ]
).mean().T["Austria"]

# Plot Mean running capacity by type of generator over time

In [None]:
generators = df.PowerResourceEIC.unique()

# Window of the scenario: it starts 24 months after the first recorded event
# and lasts 6 months. Calendar offsets are used because month-based
# np.timedelta64 values are ambiguous and rejected by current pandas.
start_date = df.StartTS.min() + pd.DateOffset(months=24)
#end_date = df.EndTS.max()
end_date = start_date + pd.DateOffset(months=6)

# NOTE(review): gen_info and geo are not defined in this cell; gen_info is
# built in a later cell of this file, so that cell has to be run first.
print("generators in germany in dataset:",np.sum(gen_info["MapCode"] == "DE"))
print("generators in germany in generated scenarios:", np.sum(geo.Country == "Germany"))

In [1]:
#convert to pypsa format
def _hourly_availability(events, window_start, window_end):
    """
    Build the hourly p.u. series of one generator inside a time window.

    events: rows of one generator sorted by StartTS, with the columns
            "StartTS", "EndTS" and "p.u."
    window_start, window_end: bounds of the series
    returns: list with one value per whole hour of the window; 1 where no
             event is recorded, the "p.u." of the event otherwise
    """
    one_hour = np.timedelta64(1, "h")
    n_hours = int((window_end - window_start) / one_hour)
    profile = [1] * n_hours

    for _, event in events.iterrows():
        # Hour offsets are measured from the window start, so truncating to
        # whole hours cannot accumulate from one event to the next.
        first_hour = int((event["StartTS"] - window_start) / one_hour)
        last_hour = int((event["EndTS"] - window_start) / one_hour)
        if first_hour >= n_hours:
            # events are sorted by StartTS: all the following ones are after the window
            break
        # Clip the event to the window: events starting before window_start
        # or ending after window_end only contribute their inner part.
        first_hour = max(first_hour, 0)
        last_hour = min(last_hour, n_hours)
        if last_hour > first_hour:
            profile[first_hour:last_hour] = [event["p.u."]] * (last_hour - first_hour)
    return profile


# events must be sorted by StartTS inside every generator
df = df.sort_values(["UnitName","StartTS"])
scenarios_d = {
    gen: _hourly_availability(gen_df, start_date, end_date)
    for gen, gen_df in df.groupby("UnitName")
}

# All the series are cut to the same length (at most 50000 hours)
min_l = min([50000, *(len(value) for value in scenarios_d.values())])
scenarios_d = {key: value[:min_l] for key, value in scenarios_d.items()}

NameError: name 'df' is not defined

In [None]:
# Hourly time axis starting at start_date, one entry per row of the scenarios
timestamp = [start_date + np.timedelta64(h, "h") for h in range(min_l)]
scenarios_d["TimeStamp"] = timestamp
data_scenario = pd.DataFrame(scenarios_d)

scenario_by_time = data_scenario.set_index("TimeStamp")
gen_series = pd.Series(scenario_by_time.columns)
# one row per unit: the first record of every UnitName
gen_info = df.groupby("UnitName").first().reset_index()
unit_info = gen_info.set_index("UnitName")

# Mean of the unit columns grouped by (MapCode, ProductionType), plotted for DE.
# Columns are grouped through the transposed frame: groupby(axis=1) is
# deprecated in recent pandas versions.
scenario_by_time.T.groupby(
    [
        gen_series.map(unit_info.MapCode).values,
        gen_series.map(unit_info.ProductionType).values,
    ]
).mean().T["DE"].plot()