# System market clearing volume MWh

## Convert sdv files with correct delimiter to excel files

In [28]:
import pandas as pd
import os

source_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Volume/2023'
target_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Volume/ExcelFiles'

os.makedirs(target_directory, exist_ok=True)

for filename in os.listdir(source_directory):
    if filename.endswith('.sdv'):
        source_file_path = os.path.join(source_directory, filename)
        target_file_path = os.path.join(target_directory, filename.replace('.sdv', '.xlsx'))
        
        with open(source_file_path, 'r', encoding='latin1') as file:
            lines = file.readlines()
        
        # Split each line by ';' and strip newline characters
        data = [line.strip().split(';') for line in lines if not line.startswith('#')]
        
        # Determine the maximum number of columns
        max_cols = max(len(row) for row in data)
        
        # Create a DataFrame with appropriate number of columns
        df = pd.DataFrame(data, columns=[f'Column {i+1}' for i in range(max_cols)])
        
        # Save to Excel
        try:
            df.to_excel(target_file_path, index=False)
            print(f'Successfully converted {filename} to Excel format.')
        except Exception as e:
            print(f'Failed to convert {filename}: {e}')


Successfully converted spot2318.sdv to Excel format.
Successfully converted spot2330.sdv to Excel format.
Successfully converted spot2324.sdv to Excel format.
Successfully converted spot2325.sdv to Excel format.
Successfully converted spot2331.sdv to Excel format.
Successfully converted spot2319.sdv to Excel format.
Successfully converted spot2327.sdv to Excel format.
Successfully converted spot2333.sdv to Excel format.
Successfully converted spot2332.sdv to Excel format.
Successfully converted spot2326.sdv to Excel format.
Successfully converted spot2322.sdv to Excel format.
Successfully converted spot2336.sdv to Excel format.
Successfully converted spot2337.sdv to Excel format.
Successfully converted spot2323.sdv to Excel format.
Successfully converted spot2335.sdv to Excel format.
Successfully converted spot2321.sdv to Excel format.
Successfully converted spot2309.sdv to Excel format.
Successfully converted spot2308.sdv to Excel format.
Successfully converted spot2320.sdv to Excel f

## Extract data 

In [29]:
import pandas as pd
import os

# Define the directory containing the Excel file
directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Volume/ExcelFiles'

# Prepare an empty DataFrame to compile all the data
compiled_data = pd.DataFrame()

# Assuming previous parts of the script remain the same

# Correct calculation for columns_to_extract to include F and I to AH excluding L
# Correct indices for columns F, and I to AH excluding L
columns_to_extract = [5] + [i for i in range(8, 33) if i != 11]

# Ensure we have 25 labels (1 for Date + 24 for Hours)
hour_labels = ['Date'] + [f'Hour {i}' for i in range(1, 25)]

print(f"Columns to extract indices: {columns_to_extract}")


# Adding an assertion to ensure the lengths match before assigning
assert len(columns_to_extract) == len(hour_labels), "Columns and labels count mismatch"

# Iterate through each Excel file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.xlsx'):
        file_path = os.path.join(directory, filename)
        # Read the Excel file without headers
        df = pd.read_excel(file_path, header=None)
        
        # Filter rows where column 1 is 'SK' and column 6 is 'SP1'
        filtered_df = df[(df.iloc[:, 1] == 'SK') & (df.iloc[:, 6] == 'SP1')]
        
        # Extract the relevant data using the column positions
        extracted_data = filtered_df.iloc[:, columns_to_extract]
        
        # Check if the number of extracted columns matches the number of custom headers
        if len(extracted_data.columns) != len(hour_labels):
            raise ValueError("Extracted data columns and header labels count do not match.")
        
        # Assign the custom headers
        extracted_data.columns = hour_labels
        
        # Append the extracted data to the compiled DataFrame
        compiled_data = pd.concat([compiled_data, extracted_data], ignore_index=True)

# Optionally, save the compiled data to a new Excel file
compiled_data.to_excel('compiled_data.xlsx', index=False)

Columns to extract indices: [5, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]


In [30]:
compiled_data

Unnamed: 0,Date,Hour 1,Hour 2,Hour 3,Hour 4,Hour 5,Hour 6,Hour 7,Hour 8,Hour 9,...,Hour 15,Hour 16,Hour 17,Hour 18,Hour 19,Hour 20,Hour 21,Hour 22,Hour 23,Hour 24
0,23.08.2021,306773,301519,299555,297255,303932,318981,351608,373395,388559,...,383310,384617,378820,384481,385685,384117,379320,372936,355921,337910
1,24.08.2021,317696,311388,310274,311624,313457,328696,357128,385724,395033,...,380967,380761,380057,384969,382265,377689,373706,366270,349330,333443
2,25.08.2021,316818,317404,319832,320878,323976,336947,367560,386005,398836,...,366775,369887,374010,379410,382160,377238,371777,368768,350615,328378
3,26.08.2021,307856,297293,289439,294554,304252,322220,355743,381337,393097,...,388676,385560,386090,393293,392094,391560,388940,383061,368729,346954
4,27.08.2021,327988,319201,315615,313342,317766,334262,365276,389465,401633,...,385104,384895,386307,390594,391231,388142,383162,377368,366731,348586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2186,17.05.2023,367648,361311,361050,359568,360646,371574,390308,406793,415594,...,373170,379684,391920,404738,411795,414358,417698,416306,407126,393750
2187,18.05.2023,380176,373924,364651,358449,356580,357705,364568,371320,377470,...,326345,329630,346518,367605,384340,382603,379992,379326,376790,371867
2188,19.05.2023,363882,355273,349493,345119,348953,359841,374737,390178,393706,...,333071,340802,354016,370879,386288,387605,384941,382639,378620,364500
2189,20.05.2023,347724,340137,328185,324248,323505,323740,321787,319269,319039,...,305945,309659,308089,328854,344792,359906,367959,362450,356674,345330


In [31]:
# Convert the "Date" column to datetime
compiled_data['Date'] = pd.to_datetime(compiled_data['Date'], dayfirst=True)
# Set the "Date" column as the index
compiled_data.set_index('Date', inplace=True)
# Sort the DataFrame by the index (i.e., the "Date")
compiled_data.sort_index(inplace=True)

In [32]:
compiled_data

Unnamed: 0_level_0,Hour 1,Hour 2,Hour 3,Hour 4,Hour 5,Hour 6,Hour 7,Hour 8,Hour 9,Hour 10,...,Hour 15,Hour 16,Hour 17,Hour 18,Hour 19,Hour 20,Hour 21,Hour 22,Hour 23,Hour 24
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,396765,394922,383818,374923,374431,377423,395227,411838,418077,425689,...,456178,468599,482034,488985,484245,473900,462487,454036,437357,416843
2018-01-02,406974,398090,395763,397429,406403,429294,478235,538223,567389,571806,...,571734,576197,581862,583221,575281,561231,535353,503964,482715,460691
2018-01-03,428054,415787,410488,409808,417972,440364,491140,529401,542673,545591,...,547949,557311,565299,565436,554480,540756,521098,504078,480506,449197
2018-01-04,422618,411955,407325,407584,415290,431744,484441,536988,564025,571504,...,576641,579743,579635,578117,573613,564217,544208,528304,507625,461396
2018-01-05,423950,413632,410628,411400,419057,442050,490517,539320,564970,572463,...,576093,581570,585872,585757,581401,569586,549470,537399,514795,491305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-27,452265,447056,443807,441045,438885,452219,479723,521806,540570,538506,...,528626,547171,555529,562824,557704,536276,515300,488636,469173,449776
2023-12-28,446214,437711,430868,428501,430868,445063,457662,477723,490949,493671,...,503550,514088,518111,524945,512723,494140,476774,463608,447085,436839
2023-12-29,421625,415193,410358,408443,410717,418639,433211,455094,467188,475315,...,493065,498725,504639,509644,503397,489929,479026,465835,450278,434532
2023-12-30,427038,424725,419987,420787,418934,421275,428880,445837,464593,475532,...,488046,494103,511994,529535,520765,509079,499624,489681,476726,455889


In [37]:
import pandas as pd

# Ensure the 'Date' column is in datetime format (if not already done)
compiled_data.index = pd.to_datetime(compiled_data.index)

# Melt the DataFrame
melted = compiled_data.reset_index().melt(id_vars=['Date'], var_name='Hour', value_name='Volume traded MWh')

# Convert 'Hour X' to a proper timedelta (hour)
melted['Hour'] = pd.to_timedelta(melted['Hour'].str.extract('(\d+)$')[0].astype(int) - 1, unit='h')

# Combine 'Date' and 'Hour' into a single datetime index
melted['DateTime'] = melted['Date'] + melted['Hour']
melted.set_index('DateTime', inplace=True)

# Drop the now unnecessary 'Date' and 'Hour' columns
melted.drop(columns=['Date', 'Hour'], inplace=True)

# Your final DataFrame is now 'melted' with datetime index and a single 'Volume' column


In [38]:
melted.sort_index(inplace=True)

In [39]:
melted

Unnamed: 0_level_0,Volume traded MWh
DateTime,Unnamed: 1_level_1
2018-01-01 00:00:00,396765
2018-01-01 01:00:00,394922
2018-01-01 02:00:00,383818
2018-01-01 03:00:00,374923
2018-01-01 04:00:00,374431
...,...
2023-12-31 19:00:00,498730
2023-12-31 20:00:00,482699
2023-12-31 21:00:00,475541
2023-12-31 22:00:00,468307


In [40]:
# Convert "Volume traded MWh" from strings with "," as decimal point to floats with "."
if melted['Volume traded MWh'].dtype == object:
    melted['Volume traded MWh'] = melted['Volume traded MWh'].str.replace(',', '.').astype(float)

# Now, melted['Volume traded MWh'] contains numeric values with "." as the decimal separator
melted

Unnamed: 0_level_0,Volume traded MWh
DateTime,Unnamed: 1_level_1
2018-01-01 00:00:00,39676.5
2018-01-01 01:00:00,39492.2
2018-01-01 02:00:00,38381.8
2018-01-01 03:00:00,37492.3
2018-01-01 04:00:00,37443.1
...,...
2023-12-31 19:00:00,49873.0
2023-12-31 20:00:00,48269.9
2023-12-31 21:00:00,47554.1
2023-12-31 22:00:00,46830.7


### Export df to excel file

In [41]:
# Specify the path to the Excel file you want to create
excel_file_path = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/tradingvolume.xlsx'

# Export the melted DataFrame to an Excel file
melted.to_excel(excel_file_path, index=True)

print(f'DataFrame successfully saved to {excel_file_path}')


DataFrame successfully saved to /Users/evenbakke/Documents/Master Thesis/MasterThesis-/tradingvolume.xlsx


# Operating data (consumption, production, etc.)

## Converting sdv to excel, first for NO, then SE, DK, FI

In [78]:
import pandas as pd
import os

base_source_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Operating data /SE'
base_target_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Operating data /SE/ExcelFiles'

# Iterate over each subfolder in the base source directory
for folder_name in os.listdir(base_source_directory):
    # Construct the path to the current subfolder
    current_source_directory = os.path.join(base_source_directory, folder_name)
    
    # Skip if not a directory
    if not os.path.isdir(current_source_directory):
        continue

    # Make a corresponding subfolder in the target directory
    current_target_directory = os.path.join(base_target_directory, folder_name)
    os.makedirs(current_target_directory, exist_ok=True)
    
    # Process each .sdv file in the current subfolder
    for filename in os.listdir(current_source_directory):
        if filename.endswith('.sdv'):
            source_file_path = os.path.join(current_source_directory, filename)
            target_file_path = os.path.join(current_target_directory, filename.replace('.sdv', '.xlsx'))
            
            with open(source_file_path, 'r', encoding='latin1') as file:
                lines = file.readlines()
            
            # Split each line by ';' and strip newline characters
            data = [line.strip().split(';') for line in lines if not line.startswith('#')]
            
            # Determine the maximum number of columns
            max_cols = max(len(row) for row in data)
            
            # Create a DataFrame with appropriate number of columns
            df = pd.DataFrame(data, columns=[f'Column {i+1}' for i in range(max_cols)])
            
            # Save to Excel
            try:
                df.to_excel(target_file_path, index=False)
                print(f'Successfully converted {filename} to Excel format in folder {folder_name}.')
            except Exception as e:
                print(f'Failed to convert {filename} in folder {folder_name}: {e}')


Successfully converted pose2249.sdv to Excel format in folder 2022.
Successfully converted pose2248.sdv to Excel format in folder 2022.
Successfully converted pose2238.sdv to Excel format in folder 2022.
Successfully converted pose2204.sdv to Excel format in folder 2022.
Successfully converted pose2210.sdv to Excel format in folder 2022.
Successfully converted pose2211.sdv to Excel format in folder 2022.
Successfully converted pose2205.sdv to Excel format in folder 2022.
Successfully converted pose2239.sdv to Excel format in folder 2022.
Successfully converted pose2213.sdv to Excel format in folder 2022.
Successfully converted pose2207.sdv to Excel format in folder 2022.
Successfully converted pose2206.sdv to Excel format in folder 2022.
Successfully converted pose2212.sdv to Excel format in folder 2022.
Successfully converted pose2216.sdv to Excel format in folder 2022.
Successfully converted pose2202.sdv to Excel format in folder 2022.
Successfully converted pose2203.sdv to Excel for

## Extracting data, first for norway 

In [141]:
import pandas as pd
import os

# Define the base directory containing the year folders with Excel files
base_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Operating data /NO/ExcelFiles'

# Define the years you want to process
years = ['2018', '2019', '2020', '2021', '2022', '2023']

# Define the categories to be extracted
#categories = ['E', 'P', 'F', 'PE', 'WS', 'WE']
categories = ['F', 'E', 'P', 'PE', 'WS', 'WE']

# Prepare an empty DataFrame to compile all the data for each category across all years
category_compiled_data = {cat: pd.DataFrame() for cat in categories}

# Define the columns to extract, which corresponds to F, and H to AF excluding K in Excel
columns_to_extract = [5] + [i for i in range(7, 32) if i != 10]  # Adjusted to skip column K which is index 10

# Ensure we have 25 labels (1 for Date + 24 for Hours)
hour_labels = ['Date'] + [f'Hour {i}' for i in range(1, 25)]

# Iterate through each year's directory
for year in years:
    directory = os.path.join(base_directory, year)
    # Check if the year directory exists
    if os.path.exists(directory):
        # Iterate through each Excel file in the year directory
        for filename in os.listdir(directory):
            if filename.endswith('.xlsx'):
                file_path = os.path.join(directory, filename)
                # Read the Excel file without headers
                df = pd.read_excel(file_path, header=None, engine='openpyxl')

                # Process each category
                for category in categories:
                    # Filter rows for the current category where column 6 is 'NO'
                    filtered_df = df[(df.iloc[:, 1] == category) & (df.iloc[:, 6] == 'NO')]

                    # Extract the relevant data using the column positions
                    extracted_data = filtered_df.iloc[:, columns_to_extract]
                    extracted_data.columns = hour_labels  # Set the custom headers

                    # Append the extracted data to the corresponding category DataFrame
                    category_compiled_data[category] = pd.concat([category_compiled_data[category], extracted_data], ignore_index=True)

# Now merge all the category dataframes on Date and Hour.
# Initialize the final DataFrame with the first category DataFrame
final_df = category_compiled_data[categories[0]]

# Merge the rest of the category DataFrames into final_df on 'Date'
for category in categories[1:]:
    final_df = final_df.merge(category_compiled_data[category], on='Date', suffixes=('', f'_{category}'))

In [142]:
# Renaming hour columns to include category prefix (assuming no duplicate hours across categories)
for category in categories:  # Skip the first category as it's already in the final_df
    final_df.rename(columns={f'Hour {i}': f'Hour {i}_{category}' for i in range(1, 25)}, inplace=True)

In [143]:
final_df

Unnamed: 0,Date,Hour 1_F,Hour 2_F,Hour 3_F,Hour 4_F,Hour 5_F,Hour 6_F,Hour 7_F,Hour 8_F,Hour 9_F,...,Hour 15_WE,Hour 16_WE,Hour 17_WE,Hour 18_WE,Hour 19_WE,Hour 20_WE,Hour 21_WE,Hour 22_WE,Hour 23_WE,Hour 24_WE
0,13.06.2022,11500,11014,10860,10906,10909,11366,12788,13482,13656,...,1962,2099,2153,2204,2130,2015,1983,1921,1895,1800
1,14.06.2022,12224,11711,11547,11717,11593,11681,12538,13432,13791,...,515,547,642,754,861,914,926,1059,972,881
2,15.06.2022,12230,11691,11376,11276,11171,11401,12383,13578,13967,...,691,694,625,569,498,482,464,414,371,379
3,16.06.2022,12069,11730,11559,11438,11326,11443,12330,13315,13634,...,424,462,531,527,508,469,408,384,389,370
4,17.06.2022,12050,11558,11440,11269,11215,11385,12188,13221,13440,...,1241,1371,1563,1642,1704,1742,1581,1390,1337,1265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,15.02.2023,16380,16042,15899,16065,16004,16330,17353,18510,18661,...,1938,1987,2066,2196,2370,2524,2778,2929,2940,2832
374,16.02.2023,16452,16025,15447,15368,15610,15909,17016,18023,18536,...,2730,2434,2503,2438,2076,1919,1859,1847,1872,2007
375,17.02.2023,16446,16072,15765,15556,15446,15821,17106,18357,18704,...,2176,2059,1845,1544,1501,1697,2079,2514,2676,2810
376,18.02.2023,16420,16097,15806,15818,15661,15770,15982,16589,17264,...,2890,2971,2876,2871,2718,2715,2575,2481,2465,2431


In [144]:
# Convert the "Date" column to datetime
final_df['Date'] = pd.to_datetime(final_df['Date'], dayfirst=True)
# Set the "Date" column as the index
final_df.set_index('Date', inplace=True)
# Sort the DataFrame by the index (i.e., the "Date")
final_df.sort_index(inplace=True)

In [145]:
# Continuing from the previous code...

final_df = final_df.reset_index()
# Initialize an empty DataFrame to hold the melted data
melted_df = pd.DataFrame()

# We will perform the melting process for each category
for category in ['F', 'E', 'P', 'PE', 'WS', 'WE']:
    # Extract the columns for the current category
    hour_columns = [f'Hour {i}_{category}' for i in range(1, 25)]
    
    # Melt the DataFrame to go from wide to long format
    melted_category_df = final_df.melt(id_vars='Date', value_vars=hour_columns, var_name='Hour_Category', value_name=category)
    
    # Extract the hour number from the column name and convert to integer
    melted_category_df['Hour'] = melted_category_df['Hour_Category'].str.extract('(\d+)').astype(int)
    
    # Drop the temporary 'Hour_Category' as it's no longer needed
    melted_category_df = melted_category_df.drop('Hour_Category', axis=1)
    
    # If it's the first category, initialize melted_df
    if melted_df.empty:
        melted_df = melted_category_df
    else:
        # Merge the current category's melted data with the main melted_df on 'Date' and 'Hour'
        melted_df = pd.merge(melted_df, melted_category_df, on=['Date', 'Hour'], how='outer')

# The 'Date' column should be converted back to a datetime index
melted_df['Date'] = pd.to_datetime(melted_df['Date'])
melted_df.set_index('Date', inplace=True)

# Sort by the index (Date) and 'Hour' column
melted_df.sort_values(by=['Date', 'Hour'], inplace=True)

# Ensure the 'Hour' column is before the category values
columns = ['Hour'] + [cat for cat in categories]
melted_df = melted_df[columns]

# The final DataFrame will have 7 columns: 'Hour', 'E', 'P', 'F', 'PE', 'WS', 'WE'
print(melted_df.head())


            Hour      F      E      P     PE    WS    WE
Date                                                    
2022-06-13     1  11500  12752  14458  15171  1376  1380
2022-06-13     2  11014  12640  13141  13852  1247  1345
2022-06-13     3  10860  11387  12314  13048  1221  1259
2022-06-13     4  10906  10774  12177  12930  1341  1246
2022-06-13     5  10909  10778  12437  13028  1283  1274


In [146]:
melted_df

Unnamed: 0_level_0,Hour,F,E,P,PE,WS,WE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-06-13,1,11500,12752,14458,15171,1376,1380
2022-06-13,2,11014,12640,13141,13852,1247,1345
2022-06-13,3,10860,11387,12314,13048,1221,1259
2022-06-13,4,10906,10774,12177,12930,1341,1246
2022-06-13,5,10909,10778,12437,13028,1283,1274
...,...,...,...,...,...,...,...
2023-12-31,20,20049,20119,15660,16027,3279,3318
2023-12-31,21,19641,19852,15288,15570,3250,3405
2023-12-31,22,19155,19639,15301,15184,3293,3392
2023-12-31,23,18784,19342,14829,14762,3384,3402


In [147]:
# Rename the columns as per the provided dictionary
rename_dict = {
    'F': 'Total Consumption MWh',
    'E': 'Day-ahead consumption prognosis MWh',
    'P': 'Total Production MWh',
    'PE': 'Day-ahead production prognosis MWh',
    'WS': 'Settled wind production MWh',
    'WE': 'Day-ahead wind production prognosis MWh'
}

melted_df = melted_df.rename(columns=rename_dict)

# Convert the 'Hour' column into a string representing HH:MM:SS format
melted_df['Hour'] = melted_df['Hour'].apply(lambda x: f"{(x-1):02}:00:00")

# Convert 'Hour' from string to a timedelta
melted_df['Hour'] = pd.to_timedelta(melted_df['Hour'])

# If the 'Date' index is not already a datetime type, convert it
melted_df.index = pd.to_datetime(melted_df.index)

# Now add the 'Hour' timedelta to the 'Date' to create a datetime index
melted_df.index = melted_df.index + melted_df['Hour']

# Drop the 'Hour' column as its information has been incorporated into the index
melted_df = melted_df.drop('Hour', axis=1)




In [148]:
melted_df

Unnamed: 0,Total Consumption MWh,Day-ahead consumption prognosis MWh,Total Production MWh,Day-ahead production prognosis MWh,Settled wind production MWh,Day-ahead wind production prognosis MWh
2022-06-13 00:00:00,11500,12752,14458,15171,1376,1380
2022-06-13 01:00:00,11014,12640,13141,13852,1247,1345
2022-06-13 02:00:00,10860,11387,12314,13048,1221,1259
2022-06-13 03:00:00,10906,10774,12177,12930,1341,1246
2022-06-13 04:00:00,10909,10778,12437,13028,1283,1274
...,...,...,...,...,...,...
2023-12-31 19:00:00,20049,20119,15660,16027,3279,3318
2023-12-31 20:00:00,19641,19852,15288,15570,3250,3405
2023-12-31 21:00:00,19155,19639,15301,15184,3293,3392
2023-12-31 22:00:00,18784,19342,14829,14762,3384,3402


In [149]:
melted_df.isna().sum()

Total Consumption MWh                      1
Day-ahead consumption prognosis MWh        1
Total Production MWh                       1
Day-ahead production prognosis MWh         1
Settled wind production MWh                1
Day-ahead wind production prognosis MWh    1
dtype: int64

### Extract to excel file 

In [38]:
# Specify the path to the Excel file you want to create
excel_file_path = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_NO.xlsx'

# Export the melted DataFrame to an Excel file
melted_df.to_excel(excel_file_path, index=True)

print(f'DataFrame successfully saved to {excel_file_path}')

DataFrame successfully saved to /Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_NO.xlsx


# Sweden

In [255]:
'''
import pandas as pd
import os
import numpy as np

# Directory containing your Excel files
excel_files_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Operating data /SE/ExcelFiles/2018'

# Prepare an empty DataFrame to store concatenated results
all_files_data = pd.DataFrame()

# Categories and areas of interest
categories = ['P', 'E', 'PE', 'F', 'WS', 'WE']
areas = ['SE1', 'SE2', 'SE3', 'SE4']

# Columns for hours, taking into account skipping column 'K'
# Generate a dictionary mapping Excel column letters (or indexes in melted_df) to hour numbers
hour_columns = {df.columns[i]: i-6 for i in range(7, 32) if i != 10}  # Skip column K (index 10), adjust index for hours

# Iterate over each file in the directory
for filename in os.listdir(excel_files_directory):
    if filename.endswith('.xlsx'):  # or '.xls' for older Excel files
        file_path = os.path.join(excel_files_directory, filename)
        
        # Read the Excel file
        df = pd.read_excel(file_path)
        
        # Filter rows based on your conditions
        filtered_df = df[(df.iloc[:, 1].isin(categories)) & (df.iloc[:, 6].isin(areas))]
        
        # Melt the DataFrame to long format for the hour columns
        melted_df = pd.melt(filtered_df, id_vars=[df.columns[5], df.columns[1], df.columns[6]], value_vars=list(hour_columns.keys()), var_name='Hour', value_name='Value')
        
        # Map 'Hour' to hour numbers using our hour_columns mapping
        melted_df['Hour'] = melted_df['Hour'].apply(lambda x: hour_columns[x])

        # Combine category and area into a single column
        melted_df['Category_Area'] = melted_df.iloc[:, 1] + "_" + melted_df.iloc[:, 2]
        
        # Drop the original category and area columns
        melted_df.drop(columns=[melted_df.columns[1], melted_df.columns[2]], inplace=True)

        # Append to the all_files_data DataFrame
        all_files_data = pd.concat([all_files_data, melted_df])

# Pivot the DataFrame to have each category-area as a separate column, with DateTime and Hour as indexes
final_df = all_files_data.pivot_table(index=[all_files_data.columns[0], 'Hour'], columns='Category_Area', values='Value').reset_index()

# Optionally, convert the 'Date' column to a datetime type and sort
final_df[final_df.columns[0]] = pd.to_datetime(final_df[final_df.columns[0]])
final_df.sort_values(by=[final_df.columns[0], 'Hour'], inplace=True)

#Reset index and display the DataFrame
final_df.reset_index(drop=True, inplace=True)
#print(final_df.head())
'''

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

In [271]:
import pandas as pd
import os
import numpy as np

# Main directory containing subdirectories with your Excel files
main_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Operating data /SE/ExcelFiles/'

# Prepare an empty DataFrame to store concatenated results
all_files_data = pd.DataFrame()

# Categories and areas of interest
categories = ['P', 'E', 'PE', 'F', 'WS', 'WE']
areas = ['SE1', 'SE2', 'SE3', 'SE4']

# Iterate over each subdirectory in the main directory
for folder_name in os.listdir(main_directory):
    folder_path = os.path.join(main_directory, folder_name)
    if os.path.isdir(folder_path):  # Check if it is a folder
        # Iterate over each file in the folder
        for filename in os.listdir(folder_path):
            if filename.endswith('.xlsx'):  # or '.xls' for older Excel files
                file_path = os.path.join(folder_path, filename)
                
                # Read the Excel file
                df = pd.read_excel(file_path)
                
                # Assuming the first operation on df to create 'hour_columns' depends on df being defined
                hour_columns = {df.columns[i]: i-6 for i in range(7, 32) if i != 10}  # Adjust for skipping 'K'
                
                # Filter rows based on your conditions
                filtered_df = df[(df.iloc[:, 1].isin(categories)) & (df.iloc[:, 6].isin(areas))]
                
                # Melt the DataFrame to long format for the hour columns
                melted_df = pd.melt(filtered_df, id_vars=[df.columns[5], df.columns[1], df.columns[6]], value_vars=list(hour_columns.keys()), var_name='Hour', value_name='Value')
                
                # Map 'Hour' to hour numbers using our hour_columns mapping
                melted_df['Hour'] = melted_df['Hour'].apply(lambda x: hour_columns[x])

                # Combine category and area into a single column
                melted_df['Category_Area'] = melted_df.iloc[:, 1] + "_" + melted_df.iloc[:, 2]
                
                # Drop the original category and area columns
                melted_df.drop(columns=[melted_df.columns[1], melted_df.columns[2]], inplace=True)

                # Append to the all_files_data DataFrame
                all_files_data = pd.concat([all_files_data, melted_df])

# Pivot the DataFrame to have each category-area as a separate column, with DateTime and Hour as indexes
final_df = all_files_data.pivot_table(index=[all_files_data.columns[0], 'Hour'], columns='Category_Area', values='Value').reset_index()

# Optionally, convert the 'Date' column to a datetime type and sort
final_df[final_df.columns[0]] = pd.to_datetime(final_df[final_df.columns[0]])
final_df.sort_values(by=[final_df.columns[0], 'Hour'], inplace=True)

# Reset index and display the DataFrame
final_df.reset_index(drop=True, inplace=True)
print(final_df.head())


Category_Area   Column 6  Hour   E_SE1   E_SE2   E_SE3   E_SE4   F_SE1  \
0             2018-01-01     1  1190.0  2051.0  9562.0  2616.0  1141.0   
1             2018-01-01     2  1172.0  2044.0  9412.0  2543.0  1016.0   
2             2018-01-01     3  1151.0  2030.0  9229.0  2499.0  1206.0   
3             2018-01-01     5  1149.0  1992.0  9205.0  2467.0  1195.0   
4             2018-01-01     6  1153.0  1959.0  9146.0  2455.0  1201.0   

Category_Area   F_SE2   F_SE3   F_SE4  ...    P_SE3   P_SE4  WE_SE1  WE_SE2  \
0              2189.0  9735.0  2498.0  ...  11752.0  1054.0   233.0   656.0   
1              2198.0  9584.0  2431.0  ...  11573.0  1034.0   247.0   687.0   
2              2213.0  9438.0  2402.0  ...  11522.0  1028.0   275.0   761.0   
3              2092.0  9381.0  2358.0  ...  11425.0  1021.0   256.0   772.0   
4              2023.0  9235.0  2385.0  ...  11313.0  1051.0   227.0   788.0   

Category_Area  WE_SE3  WE_SE4  WS_SE1  WS_SE2  WS_SE3  WS_SE4  
0               

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

In [272]:
final_df

Category_Area,Column 6,Hour,E_SE1,E_SE2,E_SE3,E_SE4,F_SE1,F_SE2,F_SE3,F_SE4,...,P_SE3,P_SE4,WE_SE1,WE_SE2,WE_SE3,WE_SE4,WS_SE1,WS_SE2,WS_SE3,WS_SE4
0,2018-01-01,1,1190.0,2051.0,9562.0,2616.0,1141.0,2189.0,9735.0,2498.0,...,11752.0,1054.0,233.0,656.0,871.0,446.0,248.0,466.0,765.0,438.0
1,2018-01-01,2,1172.0,2044.0,9412.0,2543.0,1016.0,2198.0,9584.0,2431.0,...,11573.0,1034.0,247.0,687.0,833.0,430.0,238.0,450.0,656.0,428.0
2,2018-01-01,3,1151.0,2030.0,9229.0,2499.0,1206.0,2213.0,9438.0,2402.0,...,11522.0,1028.0,275.0,761.0,777.0,432.0,229.0,497.0,589.0,387.0
3,2018-01-01,5,1149.0,1992.0,9205.0,2467.0,1195.0,2092.0,9381.0,2358.0,...,11425.0,1021.0,256.0,772.0,723.0,431.0,219.0,589.0,536.0,366.0
4,2018-01-01,6,1153.0,1959.0,9146.0,2455.0,1201.0,2023.0,9235.0,2385.0,...,11313.0,1051.0,227.0,788.0,676.0,463.0,187.0,639.0,477.0,361.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52573,2023-12-31,21,1618.0,2401.0,11877.0,2846.0,1655.0,2561.0,11947.0,2754.0,...,10114.0,2132.0,255.0,768.0,2268.0,1655.0,116.0,431.0,2137.0,1670.0
52574,2023-12-31,22,1586.0,2346.0,11563.0,2737.0,1632.0,2405.0,11561.0,2653.0,...,10111.0,2142.0,238.0,804.0,2335.0,1691.0,113.0,440.0,2199.0,1691.0
52575,2023-12-31,23,1569.0,2302.0,11350.0,2648.0,1638.0,2384.0,11276.0,2503.0,...,10126.0,2117.0,230.0,818.0,2392.0,1706.0,100.0,484.0,2267.0,1693.0
52576,2023-12-31,24,1534.0,2270.0,11099.0,2591.0,1540.0,2252.0,11035.0,2406.0,...,10129.0,2103.0,225.0,833.0,2447.0,1683.0,92.0,536.0,2311.0,1689.0


In [273]:
# Assuming final_df is your DataFrame and it contains an 'Hour' column that needs to be corrected

# Correct the 'Hour' column
# The approach here is to ensure the 'Hour' column starts at 1 and ends at 24
unique_hours = sorted(final_df['Hour'].unique())
hour_correction_map = {old_hour: new_hour for new_hour, old_hour in enumerate(unique_hours, start=1)}

# Apply the correction
final_df['Hour'] = final_df['Hour'].map(hour_correction_map)

# Verify the correction by checking the unique values in the 'Hour' column
print(sorted(final_df['Hour'].unique()))


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


In [274]:

# Rename 'Column 6' to 'Date'
final_df.rename(columns={'Column 6': 'Date'}, inplace=True)

# Convert 'Date' to datetime and 'Hour' to int, and adjust 'Hour' to proper format
final_df['Date'] = pd.to_datetime(final_df['Date'])
final_df['Hour'] = final_df['Hour'].astype(int) - 1

# Create a 'DateTime' index
final_df['DateTime'] = final_df.apply(lambda row: row['Date'] + pd.Timedelta(hours=row['Hour']), axis=1)

# Set the 'DateTime' as the index of the DataFrame
final_df.set_index('DateTime', inplace=True)

# Drop the 'Date' and 'Hour' columns since they are now part of the index
final_df.drop(['Date', 'Hour'], axis=1, inplace=True)

# Ensure there is no index name set (if you want it unnamed)
final_df.index.name = None

# Or if you want to name the index 'DateTime'
# final_df.index.name = 'DateTime'


In [275]:
final_df

Category_Area,E_SE1,E_SE2,E_SE3,E_SE4,F_SE1,F_SE2,F_SE3,F_SE4,PE_SE1,PE_SE2,...,P_SE3,P_SE4,WE_SE1,WE_SE2,WE_SE3,WE_SE4,WS_SE1,WS_SE2,WS_SE3,WS_SE4
2018-01-01 00:00:00,1190.0,2051.0,9562.0,2616.0,1141.0,2189.0,9735.0,2498.0,1586.0,2933.0,...,11752.0,1054.0,233.0,656.0,871.0,446.0,248.0,466.0,765.0,438.0
2018-01-01 01:00:00,1172.0,2044.0,9412.0,2543.0,1016.0,2198.0,9584.0,2431.0,1268.0,2707.0,...,11573.0,1034.0,247.0,687.0,833.0,430.0,238.0,450.0,656.0,428.0
2018-01-01 02:00:00,1151.0,2030.0,9229.0,2499.0,1206.0,2213.0,9438.0,2402.0,1277.0,2713.0,...,11522.0,1028.0,275.0,761.0,777.0,432.0,229.0,497.0,589.0,387.0
2018-01-01 03:00:00,1149.0,1992.0,9205.0,2467.0,1195.0,2092.0,9381.0,2358.0,1252.0,2710.0,...,11425.0,1021.0,256.0,772.0,723.0,431.0,219.0,589.0,536.0,366.0
2018-01-01 04:00:00,1153.0,1959.0,9146.0,2455.0,1201.0,2023.0,9235.0,2385.0,1218.0,2714.0,...,11313.0,1051.0,227.0,788.0,676.0,463.0,187.0,639.0,477.0,361.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,1618.0,2401.0,11877.0,2846.0,1655.0,2561.0,11947.0,2754.0,3352.0,4892.0,...,10114.0,2132.0,255.0,768.0,2268.0,1655.0,116.0,431.0,2137.0,1670.0
2023-12-31 20:00:00,1586.0,2346.0,11563.0,2737.0,1632.0,2405.0,11561.0,2653.0,3080.0,4402.0,...,10111.0,2142.0,238.0,804.0,2335.0,1691.0,113.0,440.0,2199.0,1691.0
2023-12-31 21:00:00,1569.0,2302.0,11350.0,2648.0,1638.0,2384.0,11276.0,2503.0,2985.0,3917.0,...,10126.0,2117.0,230.0,818.0,2392.0,1706.0,100.0,484.0,2267.0,1693.0
2023-12-31 22:00:00,1534.0,2270.0,11099.0,2591.0,1540.0,2252.0,11035.0,2406.0,2851.0,3667.0,...,10129.0,2103.0,225.0,833.0,2447.0,1683.0,92.0,536.0,2311.0,1689.0


In [276]:
# Assuming 'final_df' is the final DataFrame you've been working with
column_names = final_df.columns.tolist()
print(column_names)


['E_SE1', 'E_SE2', 'E_SE3', 'E_SE4', 'F_SE1', 'F_SE2', 'F_SE3', 'F_SE4', 'PE_SE1', 'PE_SE2', 'PE_SE3', 'PE_SE4', 'P_SE1', 'P_SE2', 'P_SE3', 'P_SE4', 'WE_SE1', 'WE_SE2', 'WE_SE3', 'WE_SE4', 'WS_SE1', 'WS_SE2', 'WS_SE3', 'WS_SE4']


In [277]:
# Create new columns by summing up the values for each category across all areas
final_df['E'] = final_df[['E_SE1', 'E_SE2', 'E_SE3', 'E_SE4']].sum(axis=1)
final_df['F'] = final_df[['F_SE1', 'F_SE2', 'F_SE3', 'F_SE4']].sum(axis=1)
final_df['PE'] = final_df[['PE_SE1', 'PE_SE2', 'PE_SE3', 'PE_SE4']].sum(axis=1)
final_df['P'] = final_df[['P_SE1', 'P_SE2', 'P_SE3', 'P_SE4']].sum(axis=1)
final_df['WE'] = final_df[['WE_SE1', 'WE_SE2', 'WE_SE3', 'WE_SE4']].sum(axis=1)
final_df['WS'] = final_df[['WS_SE1', 'WS_SE2', 'WS_SE3', 'WS_SE4']].sum(axis=1)



In [278]:
# Select only the summed columns to be present in final_df
final_df = final_df[['E', 'F', 'PE', 'P', 'WE', 'WS']]


In [279]:
final_df

Category_Area,E,F,PE,P,WE,WS
2018-01-01 00:00:00,15419.0,15563.0,16874.0,18029.0,2206.0,1917.0
2018-01-01 01:00:00,15171.0,15229.0,16274.0,16989.0,2197.0,1772.0
2018-01-01 02:00:00,14909.0,15259.0,16234.0,16910.0,2245.0,1702.0
2018-01-01 03:00:00,14813.0,15026.0,16104.0,16675.0,2182.0,1710.0
2018-01-01 04:00:00,14713.0,14844.0,15990.0,16539.0,2154.0,1664.0
...,...,...,...,...,...,...
2023-12-31 19:00:00,18742.0,18917.0,20679.0,20448.0,4946.0,4354.0
2023-12-31 20:00:00,18232.0,18251.0,19923.0,19655.0,5068.0,4443.0
2023-12-31 21:00:00,17869.0,17801.0,19339.0,19064.0,5146.0,4544.0
2023-12-31 22:00:00,17494.0,17233.0,18917.0,18673.0,5188.0,4628.0


In [280]:
rename_dict = {
    'F': 'Total Consumption MWh',
    'E': 'Day-ahead consumption prognosis MWh',
    'P': 'Total Production MWh',
    'PE': 'Day-ahead production prognosis MWh',
    'WS': 'Settled wind production MWh',
    'WE': 'Day-ahead wind production prognosis MWh'
}

final_df = final_df.rename(columns=rename_dict)

In [281]:
final_df

Category_Area,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
2018-01-01 00:00:00,15419.0,15563.0,16874.0,18029.0,2206.0,1917.0
2018-01-01 01:00:00,15171.0,15229.0,16274.0,16989.0,2197.0,1772.0
2018-01-01 02:00:00,14909.0,15259.0,16234.0,16910.0,2245.0,1702.0
2018-01-01 03:00:00,14813.0,15026.0,16104.0,16675.0,2182.0,1710.0
2018-01-01 04:00:00,14713.0,14844.0,15990.0,16539.0,2154.0,1664.0
...,...,...,...,...,...,...
2023-12-31 19:00:00,18742.0,18917.0,20679.0,20448.0,4946.0,4354.0
2023-12-31 20:00:00,18232.0,18251.0,19923.0,19655.0,5068.0,4443.0
2023-12-31 21:00:00,17869.0,17801.0,19339.0,19064.0,5146.0,4544.0
2023-12-31 22:00:00,17494.0,17233.0,18917.0,18673.0,5188.0,4628.0


In [282]:
# Specify the path to the Excel file you want to create
excel_file_path = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_SE.xlsx'

# Export the melted DataFrame to an Excel file
final_df.to_excel(excel_file_path, index=True)

print(f'DataFrame successfully saved to {excel_file_path}')

DataFrame successfully saved to /Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_SE.xlsx


## Denmark

In [287]:
import pandas as pd
import os
import numpy as np

# Main directory containing subdirectories with your Excel files
main_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Operating data /DK/ExcelFiles'

# Prepare an empty DataFrame to store concatenated results
all_files_data = pd.DataFrame()

# Categories and areas of interest
categories = ['P', 'E', 'PE', 'F', 'WS', 'WE']
areas = ['JY', 'SJ']

# Iterate over each subdirectory in the main directory
for folder_name in os.listdir(main_directory):
    folder_path = os.path.join(main_directory, folder_name)
    if os.path.isdir(folder_path):  # Check if it is a folder
        # Iterate over each file in the folder
        for filename in os.listdir(folder_path):
            if filename.endswith('.xlsx'):  # or '.xls' for older Excel files
                file_path = os.path.join(folder_path, filename)
                
                # Read the Excel file
                df = pd.read_excel(file_path)
                
                # Assuming the first operation on df to create 'hour_columns' depends on df being defined
                hour_columns = {df.columns[i]: i-6 for i in range(7, 32) if i != 10}  # Adjust for skipping 'K'
                
                # Filter rows based on your conditions
                filtered_df = df[(df.iloc[:, 1].isin(categories)) & (df.iloc[:, 6].isin(areas))]
                
                # Melt the DataFrame to long format for the hour columns
                melted_df = pd.melt(filtered_df, id_vars=[df.columns[5], df.columns[1], df.columns[6]], value_vars=list(hour_columns.keys()), var_name='Hour', value_name='Value')
                
                # Map 'Hour' to hour numbers using our hour_columns mapping
                melted_df['Hour'] = melted_df['Hour'].apply(lambda x: hour_columns[x])

                # Combine category and area into a single column
                melted_df['Category_Area'] = melted_df.iloc[:, 1] + "_" + melted_df.iloc[:, 2]
                
                # Drop the original category and area columns
                melted_df.drop(columns=[melted_df.columns[1], melted_df.columns[2]], inplace=True)

                # Append to the all_files_data DataFrame
                all_files_data = pd.concat([all_files_data, melted_df])

# Pivot the DataFrame to have each category-area as a separate column, with DateTime and Hour as indexes
final_df = all_files_data.pivot_table(index=[all_files_data.columns[0], 'Hour'], columns='Category_Area', values='Value').reset_index()

# Optionally, convert the 'Date' column to a datetime type and sort
final_df[final_df.columns[0]] = pd.to_datetime(final_df[final_df.columns[0]])
final_df.sort_values(by=[final_df.columns[0], 'Hour'], inplace=True)

# Reset index and display the DataFrame
final_df.reset_index(drop=True, inplace=True)
print(final_df.head())

Category_Area   Column 6  Hour    E_JY    E_SJ    F_JY    F_SJ  PE_JY  PE_SJ  \
0             2018-01-01     1  1831.0  1591.0  1813.0  1607.0  730.0  594.0   
1             2018-01-01     2  1748.0  1541.0  1741.0  1565.0  734.0  595.0   
2             2018-01-01     3  1659.0  1497.0  1659.0  1457.0  613.0  593.0   
3             2018-01-01     5  1600.0  1426.0  1595.0  1426.0  616.0  559.0   
4             2018-01-01     6  1561.0  1378.0  1537.0  1382.0  639.0  513.0   

Category_Area    P_JY    P_SJ   WE_JY  WE_SJ   WS_JY  WS_SJ  
0              2335.0  1248.0  1513.0  279.0  1729.0  649.0  
1              2332.0  1204.0  1361.0  340.0  1763.0  613.0  
2              2117.0  1252.0  1441.0  503.0  1508.0  663.0  
3              2081.0  1313.0  1570.0  627.0  1514.0  723.0  
4              2093.0  1296.0  1681.0  684.0  1523.0  713.0  


  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

In [288]:
final_df

Category_Area,Column 6,Hour,E_JY,E_SJ,F_JY,F_SJ,PE_JY,PE_SJ,P_JY,P_SJ,WE_JY,WE_SJ,WS_JY,WS_SJ
0,2018-01-01,1,1831.0,1591.0,1813.0,1607.0,730.0,594.0,2335.0,1248.0,1513.0,279.0,1729.0,649.0
1,2018-01-01,2,1748.0,1541.0,1741.0,1565.0,734.0,595.0,2332.0,1204.0,1361.0,340.0,1763.0,613.0
2,2018-01-01,3,1659.0,1497.0,1659.0,1457.0,613.0,593.0,2117.0,1252.0,1441.0,503.0,1508.0,663.0
3,2018-01-01,5,1600.0,1426.0,1595.0,1426.0,616.0,559.0,2081.0,1313.0,1570.0,627.0,1514.0,723.0
4,2018-01-01,6,1561.0,1378.0,1537.0,1382.0,639.0,513.0,2093.0,1296.0,1681.0,684.0,1523.0,713.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52573,2023-12-31,21,2690.0,1751.0,2683.0,1777.0,3126.0,1850.0,3681.0,1933.0,2554.0,1149.0,3302.0,1219.0
52574,2023-12-31,22,2545.0,1644.0,2508.0,1655.0,2965.0,1788.0,3346.0,1843.0,2458.0,1100.0,2990.0,1171.0
52575,2023-12-31,23,2444.0,1573.0,2442.0,1591.0,2789.0,1730.0,3129.0,1756.0,2354.0,1027.0,2784.0,1113.0
52576,2023-12-31,24,2344.0,1517.0,2329.0,1542.0,2648.0,1604.0,3101.0,1524.0,2249.0,923.0,2713.0,920.0


In [289]:
# Assuming final_df is your DataFrame and it contains an 'Hour' column that needs to be corrected

# Correct the 'Hour' column
# The approach here is to ensure the 'Hour' column starts at 1 and ends at 24
unique_hours = sorted(final_df['Hour'].unique())
hour_correction_map = {old_hour: new_hour for new_hour, old_hour in enumerate(unique_hours, start=1)}

# Apply the correction
final_df['Hour'] = final_df['Hour'].map(hour_correction_map)

# Verify the correction by checking the unique values in the 'Hour' column
print(sorted(final_df['Hour'].unique()))


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


In [290]:
final_df

Category_Area,Column 6,Hour,E_JY,E_SJ,F_JY,F_SJ,PE_JY,PE_SJ,P_JY,P_SJ,WE_JY,WE_SJ,WS_JY,WS_SJ
0,2018-01-01,1,1831.0,1591.0,1813.0,1607.0,730.0,594.0,2335.0,1248.0,1513.0,279.0,1729.0,649.0
1,2018-01-01,2,1748.0,1541.0,1741.0,1565.0,734.0,595.0,2332.0,1204.0,1361.0,340.0,1763.0,613.0
2,2018-01-01,3,1659.0,1497.0,1659.0,1457.0,613.0,593.0,2117.0,1252.0,1441.0,503.0,1508.0,663.0
3,2018-01-01,4,1600.0,1426.0,1595.0,1426.0,616.0,559.0,2081.0,1313.0,1570.0,627.0,1514.0,723.0
4,2018-01-01,5,1561.0,1378.0,1537.0,1382.0,639.0,513.0,2093.0,1296.0,1681.0,684.0,1523.0,713.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52573,2023-12-31,20,2690.0,1751.0,2683.0,1777.0,3126.0,1850.0,3681.0,1933.0,2554.0,1149.0,3302.0,1219.0
52574,2023-12-31,21,2545.0,1644.0,2508.0,1655.0,2965.0,1788.0,3346.0,1843.0,2458.0,1100.0,2990.0,1171.0
52575,2023-12-31,22,2444.0,1573.0,2442.0,1591.0,2789.0,1730.0,3129.0,1756.0,2354.0,1027.0,2784.0,1113.0
52576,2023-12-31,23,2344.0,1517.0,2329.0,1542.0,2648.0,1604.0,3101.0,1524.0,2249.0,923.0,2713.0,920.0


In [291]:

# Rename 'Column 6' to 'Date'
final_df.rename(columns={'Column 6': 'Date'}, inplace=True)

# Convert 'Date' to datetime and 'Hour' to int, and adjust 'Hour' to proper format
final_df['Date'] = pd.to_datetime(final_df['Date'])
final_df['Hour'] = final_df['Hour'].astype(int) - 1

# Create a 'DateTime' index
final_df['DateTime'] = final_df.apply(lambda row: row['Date'] + pd.Timedelta(hours=row['Hour']), axis=1)

# Set the 'DateTime' as the index of the DataFrame
final_df.set_index('DateTime', inplace=True)

# Drop the 'Date' and 'Hour' columns since they are now part of the index
final_df.drop(['Date', 'Hour'], axis=1, inplace=True)

# Ensure there is no index name set (if you want it unnamed)
final_df.index.name = None

# Or if you want to name the index 'DateTime'
# final_df.index.name = 'DateTime'

In [292]:
final_df

Category_Area,E_JY,E_SJ,F_JY,F_SJ,PE_JY,PE_SJ,P_JY,P_SJ,WE_JY,WE_SJ,WS_JY,WS_SJ
2018-01-01 00:00:00,1831.0,1591.0,1813.0,1607.0,730.0,594.0,2335.0,1248.0,1513.0,279.0,1729.0,649.0
2018-01-01 01:00:00,1748.0,1541.0,1741.0,1565.0,734.0,595.0,2332.0,1204.0,1361.0,340.0,1763.0,613.0
2018-01-01 02:00:00,1659.0,1497.0,1659.0,1457.0,613.0,593.0,2117.0,1252.0,1441.0,503.0,1508.0,663.0
2018-01-01 03:00:00,1600.0,1426.0,1595.0,1426.0,616.0,559.0,2081.0,1313.0,1570.0,627.0,1514.0,723.0
2018-01-01 04:00:00,1561.0,1378.0,1537.0,1382.0,639.0,513.0,2093.0,1296.0,1681.0,684.0,1523.0,713.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,2690.0,1751.0,2683.0,1777.0,3126.0,1850.0,3681.0,1933.0,2554.0,1149.0,3302.0,1219.0
2023-12-31 20:00:00,2545.0,1644.0,2508.0,1655.0,2965.0,1788.0,3346.0,1843.0,2458.0,1100.0,2990.0,1171.0
2023-12-31 21:00:00,2444.0,1573.0,2442.0,1591.0,2789.0,1730.0,3129.0,1756.0,2354.0,1027.0,2784.0,1113.0
2023-12-31 22:00:00,2344.0,1517.0,2329.0,1542.0,2648.0,1604.0,3101.0,1524.0,2249.0,923.0,2713.0,920.0


In [293]:
# Create new columns by summing up the values for each category across all areas
final_df['E'] = final_df[['E_JY', 'E_SJ']].sum(axis=1)
final_df['F'] = final_df[['F_JY', 'F_SJ']].sum(axis=1)
final_df['PE'] = final_df[['PE_JY', 'PE_SJ']].sum(axis=1)
final_df['P'] = final_df[['P_JY', 'P_SJ']].sum(axis=1)
final_df['WE'] = final_df[['WE_JY', 'WE_SJ']].sum(axis=1)
final_df['WS'] = final_df[['WS_JY', 'WS_SJ']].sum(axis=1)

In [294]:
# Select only the summed columns to be present in final_df
final_df = final_df[['E', 'F', 'PE', 'P', 'WE', 'WS']]

In [295]:
rename_dict = {
    'F': 'Total Consumption MWh',
    'E': 'Day-ahead consumption prognosis MWh',
    'P': 'Total Production MWh',
    'PE': 'Day-ahead production prognosis MWh',
    'WS': 'Settled wind production MWh',
    'WE': 'Day-ahead wind production prognosis MWh'
}

final_df = final_df.rename(columns=rename_dict)

In [296]:
final_df

Category_Area,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
2018-01-01 00:00:00,3422.0,3420.0,1324.0,3583.0,1792.0,2378.0
2018-01-01 01:00:00,3289.0,3306.0,1329.0,3536.0,1701.0,2376.0
2018-01-01 02:00:00,3156.0,3116.0,1206.0,3369.0,1944.0,2171.0
2018-01-01 03:00:00,3026.0,3021.0,1175.0,3394.0,2197.0,2237.0
2018-01-01 04:00:00,2939.0,2919.0,1152.0,3389.0,2365.0,2236.0
...,...,...,...,...,...,...
2023-12-31 19:00:00,4441.0,4460.0,4976.0,5614.0,3703.0,4521.0
2023-12-31 20:00:00,4189.0,4163.0,4753.0,5189.0,3558.0,4161.0
2023-12-31 21:00:00,4017.0,4033.0,4519.0,4885.0,3381.0,3897.0
2023-12-31 22:00:00,3861.0,3871.0,4252.0,4625.0,3172.0,3633.0


In [297]:
# Specify the path to the Excel file you want to create
excel_file_path = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_DK.xlsx'

# Export the melted DataFrame to an Excel file
final_df.to_excel(excel_file_path, index=True)

print(f'DataFrame successfully saved to {excel_file_path}')

DataFrame successfully saved to /Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_DK.xlsx


# Finland

In [298]:
import pandas as pd
import os
import numpy as np

# Main directory containing subdirectories with your Excel files
main_directory = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Operating data /FI/ExcelFiles'

# Prepare an empty DataFrame to store concatenated results
all_files_data = pd.DataFrame()

# Categories and areas of interest
categories = ['P', 'E', 'PE', 'F', 'WS', 'WE']
areas = ['FI']

# Iterate over each subdirectory in the main directory
for folder_name in os.listdir(main_directory):
    folder_path = os.path.join(main_directory, folder_name)
    if os.path.isdir(folder_path):  # Check if it is a folder
        # Iterate over each file in the folder
        for filename in os.listdir(folder_path):
            if filename.endswith('.xlsx'):  # or '.xls' for older Excel files
                file_path = os.path.join(folder_path, filename)
                
                # Read the Excel file
                df = pd.read_excel(file_path)
                
                # Assuming the first operation on df to create 'hour_columns' depends on df being defined
                hour_columns = {df.columns[i]: i-6 for i in range(7, 32) if i != 10}  # Adjust for skipping 'K'
                
                # Filter rows based on your conditions
                filtered_df = df[(df.iloc[:, 1].isin(categories)) & (df.iloc[:, 6].isin(areas))]
                
                # Melt the DataFrame to long format for the hour columns
                melted_df = pd.melt(filtered_df, id_vars=[df.columns[5], df.columns[1], df.columns[6]], value_vars=list(hour_columns.keys()), var_name='Hour', value_name='Value')
                
                # Map 'Hour' to hour numbers using our hour_columns mapping
                melted_df['Hour'] = melted_df['Hour'].apply(lambda x: hour_columns[x])

                # Combine category and area into a single column
                melted_df['Category_Area'] = melted_df.iloc[:, 1] + "_" + melted_df.iloc[:, 2]
                
                # Drop the original category and area columns
                melted_df.drop(columns=[melted_df.columns[1], melted_df.columns[2]], inplace=True)

                # Append to the all_files_data DataFrame
                all_files_data = pd.concat([all_files_data, melted_df])

# Pivot the DataFrame to have each category-area as a separate column, with DateTime and Hour as indexes
final_df = all_files_data.pivot_table(index=[all_files_data.columns[0], 'Hour'], columns='Category_Area', values='Value').reset_index()

# Optionally, convert the 'Date' column to a datetime type and sort
final_df[final_df.columns[0]] = pd.to_datetime(final_df[final_df.columns[0]])
final_df.sort_values(by=[final_df.columns[0], 'Hour'], inplace=True)

# Reset index and display the DataFrame
final_df.reset_index(drop=True, inplace=True)
print(final_df.head())

Category_Area   Column 6  Hour    E_FI    F_FI   PE_FI    P_FI  WE_FI  WS_FI
0             2018-01-01     1  9752.0  9715.0  7875.0  8050.0    NaN    NaN
1             2018-01-01     2  9498.0  9410.0  7874.0  7953.0    NaN    NaN
2             2018-01-01     3  9368.0  9242.0  7819.0  7849.0    NaN    NaN
3             2018-01-01     5  9362.0  9260.0  7768.0  7875.0    NaN    NaN
4             2018-01-01     6  9435.0  9364.0  7789.0  7850.0    NaN    NaN


  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

In [299]:
final_df

Category_Area,Column 6,Hour,E_FI,F_FI,PE_FI,P_FI,WE_FI,WS_FI
0,2018-01-01,1,9752.0,9715.0,7875.0,8050.0,,
1,2018-01-01,2,9498.0,9410.0,7874.0,7953.0,,
2,2018-01-01,3,9368.0,9242.0,7819.0,7849.0,,
3,2018-01-01,5,9362.0,9260.0,7768.0,7875.0,,
4,2018-01-01,6,9435.0,9364.0,7789.0,7850.0,,
...,...,...,...,...,...,...,...,...
52573,2023-12-31,21,11864.0,12586.0,9536.0,11345.0,1411.0,3189.0
52574,2023-12-31,22,11636.0,12542.0,9187.0,10750.0,1341.0,2829.0
52575,2023-12-31,23,11897.0,12790.0,9112.0,10381.0,1246.0,2572.0
52576,2023-12-31,24,11784.0,12770.0,8913.0,9925.0,1228.0,2249.0


In [300]:
# Assuming final_df is your DataFrame and it contains an 'Hour' column that needs to be corrected

# Correct the 'Hour' column
# The approach here is to ensure the 'Hour' column starts at 1 and ends at 24
unique_hours = sorted(final_df['Hour'].unique())
hour_correction_map = {old_hour: new_hour for new_hour, old_hour in enumerate(unique_hours, start=1)}

# Apply the correction
final_df['Hour'] = final_df['Hour'].map(hour_correction_map)

# Verify the correction by checking the unique values in the 'Hour' column
print(sorted(final_df['Hour'].unique()))


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


In [301]:
final_df

Category_Area,Column 6,Hour,E_FI,F_FI,PE_FI,P_FI,WE_FI,WS_FI
0,2018-01-01,1,9752.0,9715.0,7875.0,8050.0,,
1,2018-01-01,2,9498.0,9410.0,7874.0,7953.0,,
2,2018-01-01,3,9368.0,9242.0,7819.0,7849.0,,
3,2018-01-01,4,9362.0,9260.0,7768.0,7875.0,,
4,2018-01-01,5,9435.0,9364.0,7789.0,7850.0,,
...,...,...,...,...,...,...,...,...
52573,2023-12-31,20,11864.0,12586.0,9536.0,11345.0,1411.0,3189.0
52574,2023-12-31,21,11636.0,12542.0,9187.0,10750.0,1341.0,2829.0
52575,2023-12-31,22,11897.0,12790.0,9112.0,10381.0,1246.0,2572.0
52576,2023-12-31,23,11784.0,12770.0,8913.0,9925.0,1228.0,2249.0


In [302]:

# Rename 'Column 6' to 'Date'
final_df.rename(columns={'Column 6': 'Date'}, inplace=True)

# Convert 'Date' to datetime and 'Hour' to int, and adjust 'Hour' to proper format
final_df['Date'] = pd.to_datetime(final_df['Date'])
final_df['Hour'] = final_df['Hour'].astype(int) - 1

# Create a 'DateTime' index
final_df['DateTime'] = final_df.apply(lambda row: row['Date'] + pd.Timedelta(hours=row['Hour']), axis=1)

# Set the 'DateTime' as the index of the DataFrame
final_df.set_index('DateTime', inplace=True)

# Drop the 'Date' and 'Hour' columns since they are now part of the index
final_df.drop(['Date', 'Hour'], axis=1, inplace=True)

# Ensure there is no index name set (if you want it unnamed)
final_df.index.name = None

# Or if you want to name the index 'DateTime'
# final_df.index.name = 'DateTime'

In [303]:
final_df

Category_Area,E_FI,F_FI,PE_FI,P_FI,WE_FI,WS_FI
2018-01-01 00:00:00,9752.0,9715.0,7875.0,8050.0,,
2018-01-01 01:00:00,9498.0,9410.0,7874.0,7953.0,,
2018-01-01 02:00:00,9368.0,9242.0,7819.0,7849.0,,
2018-01-01 03:00:00,9362.0,9260.0,7768.0,7875.0,,
2018-01-01 04:00:00,9435.0,9364.0,7789.0,7850.0,,
...,...,...,...,...,...,...
2023-12-31 19:00:00,11864.0,12586.0,9536.0,11345.0,1411.0,3189.0
2023-12-31 20:00:00,11636.0,12542.0,9187.0,10750.0,1341.0,2829.0
2023-12-31 21:00:00,11897.0,12790.0,9112.0,10381.0,1246.0,2572.0
2023-12-31 22:00:00,11784.0,12770.0,8913.0,9925.0,1228.0,2249.0


In [305]:
# Create new columns by summing up the values for each category across all areas
final_df['E'] = final_df[['E_FI']].sum(axis=1)
final_df['F'] = final_df[['F_FI']].sum(axis=1)
final_df['PE'] = final_df[['PE_FI']].sum(axis=1)
final_df['P'] = final_df[['P_FI']].sum(axis=1)
final_df['WE'] = final_df[['WE_FI']].sum(axis=1)
final_df['WS'] = final_df[['WS_FI']].sum(axis=1)

In [306]:
# Select only the summed columns to be present in final_df
final_df = final_df[['E', 'F', 'PE', 'P', 'WE', 'WS']]

In [307]:
rename_dict = {
    'F': 'Total Consumption MWh',
    'E': 'Day-ahead consumption prognosis MWh',
    'P': 'Total Production MWh',
    'PE': 'Day-ahead production prognosis MWh',
    'WS': 'Settled wind production MWh',
    'WE': 'Day-ahead wind production prognosis MWh'
}

final_df = final_df.rename(columns=rename_dict)

In [308]:
final_df

Category_Area,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
2018-01-01 00:00:00,9752.0,9715.0,7875.0,8050.0,0.0,0.0
2018-01-01 01:00:00,9498.0,9410.0,7874.0,7953.0,0.0,0.0
2018-01-01 02:00:00,9368.0,9242.0,7819.0,7849.0,0.0,0.0
2018-01-01 03:00:00,9362.0,9260.0,7768.0,7875.0,0.0,0.0
2018-01-01 04:00:00,9435.0,9364.0,7789.0,7850.0,0.0,0.0
...,...,...,...,...,...,...
2023-12-31 19:00:00,11864.0,12586.0,9536.0,11345.0,1411.0,3189.0
2023-12-31 20:00:00,11636.0,12542.0,9187.0,10750.0,1341.0,2829.0
2023-12-31 21:00:00,11897.0,12790.0,9112.0,10381.0,1246.0,2572.0
2023-12-31 22:00:00,11784.0,12770.0,8913.0,9925.0,1228.0,2249.0


In [309]:
# Specify the path to the Excel file you want to create
excel_file_path = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_FI.xlsx'

# Export the melted DataFrame to an Excel file
final_df.to_excel(excel_file_path, index=True)

print(f'DataFrame successfully saved to {excel_file_path}')

DataFrame successfully saved to /Users/evenbakke/Documents/Master Thesis/MasterThesis-/production_consumption_FI.xlsx
