In [242]:
import os
import time

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

In [243]:
# Result folders, one per approach compared in this notebook
Folder_A = 'A.The_full_year_MILP'
Folder_B = 'B.Operation_cost'
Folder_D = 'D.Representative_days_based_on_RES_and_Demand'
Folder_E = 'E.Representative_days_based_on_Line_Benefits_OptModel'
Folder_K = 'K.Investments_per_hour'
Folder_L = 'L.Cont_Investments_per_hour'

# Short label for each folder (same order as the folders above)
category_dict = dict(zip(
    (Folder_A, Folder_B, Folder_D, Folder_E, Folder_K, Folder_L),
    ("FYMILP", "OPC", "R&D", "LB", "DHI", "CHI"),
))

### Reading Operating Cost CSV files

In [244]:
def read_oc_files(folder, CaseName_Base, cm, nbc):
    """Load the generation-cost CSV of one reduced case.

    The file is read from
    ``<DirName>/J.Full_year_operational_cost/Results/<folder>/<CaseName_Base>_ByStages_nc<nbc>/3.Out/``
    where ``DirName`` is the notebook-level working directory variable.

    Parameters
    ----------
    folder : str
        Name of the method folder below ``Results``.
    CaseName_Base : str
        Case name used to build both the run directory and the file name.
    cm : str
        Short method label. It does not take part in the path; the parameter
        is kept so existing calls keep working.
    nbc : int
        Number of clusters of the run to load.

    Returns
    -------
    pandas.DataFrame
        The CSV content, indexed by its first three columns.
    """
    # Build every path component from the arguments. The previous version
    # read the global `CaseName` here, so the `CaseName_Base` argument was
    # silently ignored.
    case_tag = f"{CaseName_Base}_ByStages_nc{nbc}"
    filename = f"oT_Result_GenerationCost_{case_tag}.csv"

    csv_path = os.path.join(DirName, 'J.Full_year_operational_cost', 'Results',
                            folder, case_tag, '3.Out', filename)

    return pd.read_csv(csv_path, index_col=[0, 1, 2])

In [245]:
# Working directory the notebook is run from
DirName = os.getcwd()

# Wall-clock timestamp taken at the start of the run
InitialTime = time.time()

In [246]:
# Case study name; it is interpolated into the result paths and dictionary keys used below.
CaseName = 'RTS24'

In [247]:
# Turn off pandas' chained-assignment warning (the pandas default is 'warn')
pd.options.mode.chained_assignment = None

# Loaded CSV files, keyed by '<folder>_<CaseName>_<nc>'
CSV_files = {}

# The reference results live in the full-year folder
folder_ref = Folder_A

ref_csv_path = f"{folder_ref}/{CaseName}/3.Out/oT_Result_GenerationCost_{CaseName}.csv"
df_ref = pd.read_csv(ref_csv_path, index_col=[0, 1, 2])

In [248]:
# Show the reference frame as the cell output.
df_ref

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mEUR
Period,Scenario,LoadLevel,Unnamed: 3_level_1
2030,sc01,01-01 00:00:00+01:00,0.797019
2030,sc01,01-01 01:00:00+01:00,0.797805
2030,sc01,01-01 02:00:00+01:00,0.815489
2030,sc01,01-01 03:00:00+01:00,0.854001
2030,sc01,01-01 04:00:00+01:00,0.969537
2030,sc01,...,...
2030,sc01,12-30 19:00:00+01:00,1.158526
2030,sc01,12-30 20:00:00+01:00,1.055206
2030,sc01,12-30 21:00:00+01:00,0.967965
2030,sc01,12-30 22:00:00+01:00,0.862253


In [249]:
# Cluster counts to evaluate
nbcs = [20, 50, 100, 150, 200, 250, 300, 350, 400]

# Method folders whose results are loaded
folders = [Folder_B, Folder_D, Folder_E, Folder_K, Folder_L]

for folder in folders:
    method_label = category_dict[folder]
    for nc in nbcs:
        # One entry per (folder, cluster count) pair
        run_key = f'{folder}_{CaseName}_{nc}'
        CSV_files[run_key] = read_oc_files(folder, CaseName, method_label, nc)
        print(f'{folder}_{CaseName}_nc{nc}: done')


B.Operation_cost_RTS24_nc20: done
B.Operation_cost_RTS24_nc50: done
B.Operation_cost_RTS24_nc100: done
B.Operation_cost_RTS24_nc150: done
B.Operation_cost_RTS24_nc200: done
B.Operation_cost_RTS24_nc250: done
B.Operation_cost_RTS24_nc300: done
B.Operation_cost_RTS24_nc350: done
B.Operation_cost_RTS24_nc400: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc20: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc50: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc100: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc150: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc200: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc250: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc300: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc350: done
D.Representative_days_based_on_RES_and_Demand_RTS24_nc400: done
E.Representative_days_based_on_Line_Benefits_OptModel_RTS24_nc20: done
E.Representative_days_based_on_Line_Benefi

In [250]:
# Stack every loaded frame row-wise; each dictionary key becomes the outermost index level
df_complete = pd.concat(list(CSV_files.values()), keys=list(CSV_files.keys()))

In [251]:
# NOTE(review): this echoes the whole dictionary of loaded frames; consider inspecting a single entry instead.
CSV_files

{'B.Operation_cost_RTS24_20':                                           mEUR
 Period Scenario LoadLevel                     
 2030   sc01     01-01 00:00:00+01:00  1.296835
                 01-01 01:00:00+01:00  1.297764
                 01-01 02:00:00+01:00  1.508098
                 01-01 03:00:00+01:00  2.475440
                 01-01 04:00:00+01:00  5.794755
 ...                                        ...
                 12-30 19:00:00+01:00  9.919625
                 12-30 20:00:00+01:00  8.686143
                 12-30 21:00:00+01:00  5.741702
                 12-30 22:00:00+01:00  2.682728
                 12-30 23:00:00+01:00  1.305654
 
 [8736 rows x 1 columns],
 'B.Operation_cost_RTS24_50':                                           mEUR
 Period Scenario LoadLevel                     
 2030   sc01     01-01 00:00:00+01:00  1.296835
                 01-01 01:00:00+01:00  1.297764
                 01-01 02:00:00+01:00  1.484915
                 01-01 03:00:00+01:00  2.451978
  

### T-test

In [252]:
# Costs of the reference run, as a plain array
original_costs = df_ref['mEUR'].to_numpy()

# One record per (folder, nc) pair; turned into a DataFrame after the loop
results_list = []

for folder in folders:
    for nc in nbcs:
        # Costs of the reduced run stored under this folder / cluster count
        run_key = f'{folder}_{CaseName}_{nc}'
        reduced_costs = CSV_files[run_key]['mEUR'].to_numpy()

        # Two-sample t-test between reference and reduced costs.
        # NOTE(review): ttest_ind treats both series as independent samples;
        # confirm that is intended, since both frames are indexed by LoadLevel.
        t_statistic, p_value = stats.ttest_ind(original_costs, reduced_costs)

        # Report the outcome for this run
        print(f"{folder}_{CaseName}_nc{nc}")
        print(f"T-statistic: {t_statistic:.4f}")
        print(f"P-value: {p_value:.4f}")

        # Keep the outcome for the summary table
        record = {
            'CaseName': CaseName,
            'folder': folder,
            'nc': nc,
            'T-statistic': t_statistic,
            'P-value': p_value,
        }
        results_list.append(record)

# Summary table with one row per run
results = pd.DataFrame(results_list)

B.Operation_cost_RTS24_nc20
T-statistic: -35.3658
P-value: 0.0000
B.Operation_cost_RTS24_nc50
T-statistic: -34.8439
P-value: 0.0000
B.Operation_cost_RTS24_nc100
T-statistic: -35.4367
P-value: 0.0000
B.Operation_cost_RTS24_nc150
T-statistic: -36.1740
P-value: 0.0000
B.Operation_cost_RTS24_nc200
T-statistic: -36.1740
P-value: 0.0000
B.Operation_cost_RTS24_nc250
T-statistic: -35.5551
P-value: 0.0000
B.Operation_cost_RTS24_nc300
T-statistic: -35.4367
P-value: 0.0000
B.Operation_cost_RTS24_nc350
T-statistic: -36.1740
P-value: 0.0000
B.Operation_cost_RTS24_nc400
T-statistic: -35.5551
P-value: 0.0000
D.Representative_days_based_on_RES_and_Demand_RTS24_nc20
T-statistic: -35.4367
P-value: 0.0000
D.Representative_days_based_on_RES_and_Demand_RTS24_nc50
T-statistic: -35.5551
P-value: 0.0000
D.Representative_days_based_on_RES_and_Demand_RTS24_nc100
T-statistic: -35.5551
P-value: 0.0000
D.Representative_days_based_on_RES_and_Demand_RTS24_nc150
T-statistic: -34.9364
P-value: 0.0000
D.Representative_

In [253]:
# Long format: one row per (folder, CaseName, nc, statistic) with its value
id_columns = ['folder', 'CaseName', 'nc']
# After stack() + reset_index() the statistic name sits in 'level_3' and its value in column 0
long_names = {'level_3': 'Stat', 0: 'Value'}
stacked = results.set_index(id_columns).stack()
df_plot = stacked.reset_index().rename(columns=long_names)

In [254]:
# Swap each folder name for its short label from category_dict.
# replace() leaves values that are not keys of the dict untouched, so running
# this cell a second time keeps the labels (map() would turn them into NaN).
df_plot['folder'] = df_plot['folder'].replace(category_dict)

In [255]:
# # plot the results using altair library with points and different colors for each folder and symbols for t-statistic and p-value
# import altair as alt

# alt.Chart(df_plot).mark_point().encode(
#     x='nc:O',
#     y='Value:Q',
#     color='folder:N',
#     shape='Stat:N'
# ).configure_axis(
#     labelFontSize=12,
#     titleFontSize=12
# ).configure_title(
#     fontSize=20
# ).properties(
#     width=600,
#     height=400,
#     title='T-test results for the different clustering methods of the case {}'.format(CaseName)
# )

In [256]:
# One panel per statistic (column facet on 'Stat'); inside each panel the
# methods ('folder') are told apart by color and marker shape.
# Uses altair under the alias `alt`.
label_font_size = 16
title_font_size = 18
chart = alt.Chart(df_plot).mark_point(size=200).encode(
    x=alt.X('nc:O', title='Number of clusters', axis=alt.Axis(labelAngle=0, labelFontSize=label_font_size, titleFontSize=title_font_size)),
    y='Value:Q',
    color='folder:N',
    shape='folder:N'
).properties(
    width=400,
    height=400,
)

chart.facet(
    column=alt.Column('Stat:N', title='Statistic', header=alt.Header(labelFontSize=label_font_size, titleFontSize=title_font_size))
).resolve_scale(
    # T-statistics and P-values live on very different ranges; with a shared
    # y scale the P-value panel collapses into a flat line.
    y='independent'
).configure_axis(
    labelFontSize=label_font_size,
    titleFontSize=title_font_size
).configure_title(
    fontSize=20
).configure_legend(
    labelFontSize=label_font_size,
    titleFontSize=title_font_size
).properties(
    title='T-test results for the different clustering methods of the case {}'.format(CaseName)
)