In [26]:
import pandas as pd 

# NordPool prices (System price and area prices)

## Day-ahead System price (EUR)

In [230]:
systemprice_df = pd.read_excel("System_combined.xlsx")
systemprice_df.set_index("DateTime", inplace=True)
systemprice_df.sort_index(ascending=True)
systemprice_df = systemprice_df.drop("Unnamed: 0", axis = 1)
systemprice_df

Unnamed: 0_level_0,systemPrice
DateTime,Unnamed: 1_level_1
2018-01-01 00:00:00,26.263485
2018-01-01 01:00:00,26.382834
2018-01-01 02:00:00,26.043561
2018-01-01 03:00:00,24.585894
2018-01-01 04:00:00,24.626124
...,...
2023-12-31 19:00:00,49.594203
2023-12-31 20:00:00,44.985186
2023-12-31 21:00:00,44.755875
2023-12-31 22:00:00,43.986141


In [231]:
missing_values_count = systemprice_df.isnull().sum()
print(missing_values_count)

systemPrice    6
dtype: int64


Linear interpolation:

Why? Despite the seasonality and trends in your data, because the amount of missing data is minimal, linear interpolation can be an effective and simple method to fill these gaps without introducing significant errors. It assumes a straight-line relationship between the points before and after the missing data, which, given the small number of consecutive missing values, is unlikely to distort any seasonal or trending behavior significantly.

In [232]:
# There are six missing values for each of the columns
# Linear Interpolation
systemprice_df = systemprice_df.interpolate(method='linear')
missing_values_count = systemprice_df.isnull().sum()
print(missing_values_count)

systemPrice    0
dtype: int64


## Bidding Prices NO, SE, DK, FI (EUR)

In [233]:
bidding_df = pd.read_excel("bidding_prices.xlsx")
bidding_df.set_index("DateTime", inplace=True)
bidding_df.sort_index(ascending=True)
bidding_df

Unnamed: 0_level_0,NO1 EUR/MWh,NO2 EUR/MWh,NO3 EUR/MWh,NO4 EUR/MWh,NO5 EUR/MWh,SE1 EUR/MWh,SE2 EUR/MWh,SE3 EUR/MWh,SE4 EUR/MWh,DK1 EUR/MWh,DK2 EUR/MWh,FIN EUR/MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-01 00:00:00,26.33,26.33,26.33,26.33,26.33,26.33,26.33,26.33,26.33,21.80,26.33,26.33
2018-01-01 01:00:00,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43
2018-01-01 02:00:00,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10
2018-01-01 03:00:00,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70
2018-01-01 04:00:00,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,62.88,62.88,39.64,38.14,62.88,39.64,39.64,39.64,39.64,39.64,39.64,59.99
2023-12-31 20:00:00,62.60,62.60,34.89,34.89,62.60,34.89,34.89,34.89,34.89,34.89,34.89,40.99
2023-12-31 21:00:00,62.25,62.25,29.60,29.60,62.25,29.60,29.60,29.60,29.60,29.60,29.60,53.83
2023-12-31 22:00:00,61.77,61.77,28.67,28.67,61.77,28.67,28.67,28.67,28.67,28.67,28.67,55.49


In [234]:
# There are 52 560 rows, but there should be 52 584. 

# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
bidding_df.index = pd.to_datetime(bidding_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(bidding_df.index)

# Print the missing dates
print(missing_dates)

DatetimeIndex(['2020-12-31 00:00:00', '2020-12-31 01:00:00',
               '2020-12-31 02:00:00', '2020-12-31 03:00:00',
               '2020-12-31 04:00:00', '2020-12-31 05:00:00',
               '2020-12-31 06:00:00', '2020-12-31 07:00:00',
               '2020-12-31 08:00:00', '2020-12-31 09:00:00',
               '2020-12-31 10:00:00', '2020-12-31 11:00:00',
               '2020-12-31 12:00:00', '2020-12-31 13:00:00',
               '2020-12-31 14:00:00', '2020-12-31 15:00:00',
               '2020-12-31 16:00:00', '2020-12-31 17:00:00',
               '2020-12-31 18:00:00', '2020-12-31 19:00:00',
               '2020-12-31 20:00:00', '2020-12-31 21:00:00',
               '2020-12-31 22:00:00', '2020-12-31 23:00:00'],
              dtype='datetime64[ns]', freq=None)


In [235]:
#Creating missing rows with NaN values for the specific date 

# Create a date range for the missing day, from 00:00 to 23:00
missing_day_range = pd.date_range(start='2020-12-31 00:00:00', end='2020-12-31 23:00:00', freq='H')

# Create a temporary DataFrame with the missing datetime range
temp_df = pd.DataFrame(index=missing_day_range)

# Combine the temporary DataFrame with your original DataFrame
bidding_df = bidding_df.combine_first(temp_df)


In [236]:
missing_values_count = bidding_df.isnull().sum()
print(missing_values_count)

NO1 EUR/MWh    30
NO2 EUR/MWh    30
NO3 EUR/MWh    30
NO4 EUR/MWh    30
NO5 EUR/MWh    30
SE1 EUR/MWh    30
SE2 EUR/MWh    30
SE3 EUR/MWh    30
SE4 EUR/MWh    30
DK1 EUR/MWh    30
DK2 EUR/MWh    30
FIN EUR/MWh    30
dtype: int64


In [237]:
bidding_df = bidding_df.interpolate(method='linear')
missing_values_count = bidding_df.isnull().sum()
print(missing_values_count)

NO1 EUR/MWh    0
NO2 EUR/MWh    0
NO3 EUR/MWh    0
NO4 EUR/MWh    0
NO5 EUR/MWh    0
SE1 EUR/MWh    0
SE2 EUR/MWh    0
SE3 EUR/MWh    0
SE4 EUR/MWh    0
DK1 EUR/MWh    0
DK2 EUR/MWh    0
FIN EUR/MWh    0
dtype: int64


## Merging the datafames 

In [238]:
# Merge the DataFrames on their indexes
nordpool_prices_df = pd.merge(systemprice_df, bidding_df, left_index=True, right_index=True)

In [239]:
nordpool_prices_df

Unnamed: 0_level_0,systemPrice,NO1 EUR/MWh,NO2 EUR/MWh,NO3 EUR/MWh,NO4 EUR/MWh,NO5 EUR/MWh,SE1 EUR/MWh,SE2 EUR/MWh,SE3 EUR/MWh,SE4 EUR/MWh,DK1 EUR/MWh,DK2 EUR/MWh,FIN EUR/MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-01 00:00:00,26.263485,26.33,26.33,26.33,26.33,26.33,26.33,26.33,26.33,26.33,21.80,26.33,26.33
2018-01-01 01:00:00,26.382834,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43
2018-01-01 02:00:00,26.043561,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10
2018-01-01 03:00:00,24.585894,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70
2018-01-01 04:00:00,24.626124,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,49.594203,62.88,62.88,39.64,38.14,62.88,39.64,39.64,39.64,39.64,39.64,39.64,59.99
2023-12-31 20:00:00,44.985186,62.60,62.60,34.89,34.89,62.60,34.89,34.89,34.89,34.89,34.89,34.89,40.99
2023-12-31 21:00:00,44.755875,62.25,62.25,29.60,29.60,62.25,29.60,29.60,29.60,29.60,29.60,29.60,53.83
2023-12-31 22:00:00,43.986141,61.77,61.77,28.67,28.67,61.77,28.67,28.67,28.67,28.67,28.67,28.67,55.49


In [240]:
# Check for missing values in each column
missing_values_count = nordpool_prices_df.isnull().sum()
print(missing_values_count)

systemPrice    0
NO1 EUR/MWh    0
NO2 EUR/MWh    0
NO3 EUR/MWh    0
NO4 EUR/MWh    0
NO5 EUR/MWh    0
SE1 EUR/MWh    0
SE2 EUR/MWh    0
SE3 EUR/MWh    0
SE4 EUR/MWh    0
DK1 EUR/MWh    0
DK2 EUR/MWh    0
FIN EUR/MWh    0
dtype: int64


In [241]:
# Assume you have a new list of column names
new_columns = ['System Price EUR/MWh', 'NO1 Price EUR/MWh', 'NO2 Price EUR/MWh', 'NO3 Price EUR/MWh', 'NO4 Price EUR/MWh', 'NO5 Price EUR/MWh', 
               'SE1 Price EUR/MWh', 'SE2 Price EUR/MWh', 'SE3 Price EUR/MWh', 'SE4 Price EUR/MWh', 'DK1 Price EUR/MWh', 'DK2 Price EUR/MWh', 'FI Price EUR/MWh']  # List all new column names

# Replace all column names
nordpool_prices_df.columns = new_columns

# Now, df has all its columns renamed.


In [242]:
nordpool_prices_df

Unnamed: 0_level_0,System Price EUR/MWh,NO1 Price EUR/MWh,NO2 Price EUR/MWh,NO3 Price EUR/MWh,NO4 Price EUR/MWh,NO5 Price EUR/MWh,SE1 Price EUR/MWh,SE2 Price EUR/MWh,SE3 Price EUR/MWh,SE4 Price EUR/MWh,DK1 Price EUR/MWh,DK2 Price EUR/MWh,FI Price EUR/MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-01 00:00:00,26.263485,26.33,26.33,26.33,26.33,26.33,26.33,26.33,26.33,26.33,21.80,26.33,26.33
2018-01-01 01:00:00,26.382834,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43,26.43
2018-01-01 02:00:00,26.043561,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10,26.10
2018-01-01 03:00:00,24.585894,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70,24.70
2018-01-01 04:00:00,24.626124,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74,24.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,49.594203,62.88,62.88,39.64,38.14,62.88,39.64,39.64,39.64,39.64,39.64,39.64,59.99
2023-12-31 20:00:00,44.985186,62.60,62.60,34.89,34.89,62.60,34.89,34.89,34.89,34.89,34.89,34.89,40.99
2023-12-31 21:00:00,44.755875,62.25,62.25,29.60,29.60,62.25,29.60,29.60,29.60,29.60,29.60,29.60,53.83
2023-12-31 22:00:00,43.986141,61.77,61.77,28.67,28.67,61.77,28.67,28.67,28.67,28.67,28.67,28.67,55.49


In [243]:
# Extract to excel 
# Export the DataFrame to an Excel file
nordpool_prices_df.to_excel('NordPool Prices.xlsx', sheet_name='NordPool Prices')

# If you run into any issues related to Excel file size or performance, consider breaking your data into multiple sheets or saving in a different format.


# Market data (Volume, Production, Consumption) (Need to fix EEX, EPX, OMEL)

## NordPool Volume 

In [246]:
volume_df = pd.read_excel("tradingvolume.xlsx")
volume_df.set_index("DateTime", inplace=True)
volume_df.sort_index(ascending=True)
volume_df

Unnamed: 0_level_0,Volume traded MWh
DateTime,Unnamed: 1_level_1
2018-01-01 00:00:00,39676.5
2018-01-01 01:00:00,39492.2
2018-01-01 02:00:00,38381.8
2018-01-01 03:00:00,37492.3
2018-01-01 04:00:00,37443.1
...,...
2023-12-31 19:00:00,49873.0
2023-12-31 20:00:00,48269.9
2023-12-31 21:00:00,47554.1
2023-12-31 22:00:00,46830.7


In [247]:
missing_values_count = volume_df.isnull().sum()
print(missing_values_count)

Volume traded MWh    6
dtype: int64


In [248]:
# There are six missing values for each of the columns
# Linear Interpolation
volume_df = volume_df.interpolate(method='linear')
missing_values_count = volume_df.isnull().sum()
print(missing_values_count)

Volume traded MWh    0
dtype: int64


In [249]:
volume_df

Unnamed: 0_level_0,Volume traded MWh
DateTime,Unnamed: 1_level_1
2018-01-01 00:00:00,39676.5
2018-01-01 01:00:00,39492.2
2018-01-01 02:00:00,38381.8
2018-01-01 03:00:00,37492.3
2018-01-01 04:00:00,37443.1
...,...
2023-12-31 19:00:00,49873.0
2023-12-31 20:00:00,48269.9
2023-12-31 21:00:00,47554.1
2023-12-31 22:00:00,46830.7


## Production & Consumption 

### NO

In [256]:
operating_data_NO_df = pd.read_excel("production_consumption_NO.xlsx")
# Assuming operating_data_NO_df is already read from the Excel file
# Rename the "Unnamed: 0" column to "DateTime"
operating_data_NO_df.rename(columns={'Unnamed: 0': 'DateTime'}, inplace=True)

# Set the "DateTime" column as the index
operating_data_NO_df.set_index('DateTime', inplace=True)

# Optionally, you can sort the DataFrame by the index (DateTime) if needed
operating_data_NO_df.sort_index(ascending=True, inplace=True)

# Display the DataFrame to verify changes
operating_data_NO_df

Unnamed: 0_level_0,Total Consumption MWh,Day-ahead consumption prognosis MWh,Total Production MWh,Day-ahead production prognosis MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:00:00,16989.0,16422.0,14131.0,13710.0
2018-01-01 01:00:00,16762.0,16272.0,14297.0,13935.0
2018-01-01 02:00:00,16509.0,16101.0,13365.0,12926.0
2018-01-01 03:00:00,16109.0,16011.0,12076.0,11698.0
2018-01-01 04:00:00,16152.0,15975.0,12083.0,11710.0
...,...,...,...,...
2023-12-31 19:00:00,20049.0,20119.0,15660.0,16027.0
2023-12-31 20:00:00,19641.0,19852.0,15288.0,15570.0
2023-12-31 21:00:00,19155.0,19639.0,15301.0,15184.0
2023-12-31 22:00:00,18784.0,19342.0,14829.0,14762.0


In [257]:
missing_values_count = operating_data_NO_df.isnull().sum()
print(missing_values_count)

Total Consumption MWh                  6
Day-ahead consumption prognosis MWh    6
Total Production MWh                   6
Day-ahead production prognosis MWh     6
dtype: int64


In [258]:

# Count the number of zeros in each column
zero_count = (operating_data_NO_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

Total Consumption MWh                  0
Day-ahead consumption prognosis MWh    0
Total Production MWh                   0
Day-ahead production prognosis MWh     0
dtype: int64


In [259]:
operating_data_NO_df = operating_data_NO_df.interpolate(method='linear')
missing_values_count = operating_data_NO_df.isnull().sum()
print(missing_values_count)

Total Consumption MWh                  0
Day-ahead consumption prognosis MWh    0
Total Production MWh                   0
Day-ahead production prognosis MWh     0
dtype: int64


In [260]:
operating_data_NO_df

Unnamed: 0_level_0,Total Consumption MWh,Day-ahead consumption prognosis MWh,Total Production MWh,Day-ahead production prognosis MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:00:00,16989.0,16422.0,14131.0,13710.0
2018-01-01 01:00:00,16762.0,16272.0,14297.0,13935.0
2018-01-01 02:00:00,16509.0,16101.0,13365.0,12926.0
2018-01-01 03:00:00,16109.0,16011.0,12076.0,11698.0
2018-01-01 04:00:00,16152.0,15975.0,12083.0,11710.0
...,...,...,...,...
2023-12-31 19:00:00,20049.0,20119.0,15660.0,16027.0
2023-12-31 20:00:00,19641.0,19852.0,15288.0,15570.0
2023-12-31 21:00:00,19155.0,19639.0,15301.0,15184.0
2023-12-31 22:00:00,18784.0,19342.0,14829.0,14762.0


In [261]:
# Assume you have a new list of column names
new_columns = ['Total Consumption MWh NO', 'Day-ahead Consumption Prognosis MWh NO', 'Total Production MWh NO', 'Day-ahead Production Prognosis MWh NO']  # List all new column names

# Replace all column names
operating_data_NO_df.columns = new_columns

operating_data_NO_df

Unnamed: 0_level_0,Total Consumption MWh NO,Day-ahead Consumption Prognosis MWh NO,Total Production MWh NO,Day-ahead Production Prognosis MWh NO
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:00:00,16989.0,16422.0,14131.0,13710.0
2018-01-01 01:00:00,16762.0,16272.0,14297.0,13935.0
2018-01-01 02:00:00,16509.0,16101.0,13365.0,12926.0
2018-01-01 03:00:00,16109.0,16011.0,12076.0,11698.0
2018-01-01 04:00:00,16152.0,15975.0,12083.0,11710.0
...,...,...,...,...
2023-12-31 19:00:00,20049.0,20119.0,15660.0,16027.0
2023-12-31 20:00:00,19641.0,19852.0,15288.0,15570.0
2023-12-31 21:00:00,19155.0,19639.0,15301.0,15184.0
2023-12-31 22:00:00,18784.0,19342.0,14829.0,14762.0


### SE

In [262]:
operating_data_SE_df = pd.read_excel("production_consumption_SE.xlsx")
# Assuming operating_data_NO_df is already read from the Excel file
# Rename the "Unnamed: 0" column to "DateTime"
operating_data_SE_df.rename(columns={'Unnamed: 0': 'DateTime'}, inplace=True)

# Set the "DateTime" column as the index
operating_data_SE_df.set_index('DateTime', inplace=True)

# Optionally, you can sort the DataFrame by the index (DateTime) if needed
operating_data_SE_df.sort_index(ascending=True, inplace=True)

# Display the DataFrame to verify changes
operating_data_SE_df

Unnamed: 0_level_0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,15419,15563,16874,18029,2206,1917
2018-01-01 01:00:00,15171,15229,16274,16989,2197,1772
2018-01-01 02:00:00,14909,15259,16234,16910,2245,1702
2018-01-01 03:00:00,14813,15026,16104,16675,2182,1710
2018-01-01 04:00:00,14713,14844,15990,16539,2154,1664
...,...,...,...,...,...,...
2023-12-31 19:00:00,18742,18917,20679,20448,4946,4354
2023-12-31 20:00:00,18232,18251,19923,19655,5068,4443
2023-12-31 21:00:00,17869,17801,19339,19064,5146,4544
2023-12-31 22:00:00,17494,17233,18917,18673,5188,4628


In [263]:
# Assuming your DataFrame is named operating_data_SE_df
# Replace 'column_name1' and 'column_name2' with the actual names of the columns you want to drop

operating_data_SE_df.drop(['Day-ahead wind production prognosis MWh', 'Settled wind production MWh'], axis=1, inplace=True)

In [264]:
missing_values_count = operating_data_SE_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh    0
Total Consumption MWh                  0
Day-ahead production prognosis MWh     0
Total Production MWh                   0
dtype: int64


In [265]:
# Count the number of zeros in each column
zero_count = (operating_data_SE_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

Day-ahead consumption prognosis MWh    48
Total Consumption MWh                   4
Day-ahead production prognosis MWh      0
Total Production MWh                    1
dtype: int64


In [266]:
import numpy as np
# Replace all zeros with NaN across the DataFrame
operating_data_SE_df = operating_data_SE_df.replace(0, np.nan)
missing_values_count = operating_data_SE_df.isnull().sum()
print(missing_values_count)


Day-ahead consumption prognosis MWh    48
Total Consumption MWh                   4
Day-ahead production prognosis MWh      0
Total Production MWh                    1
dtype: int64


In [267]:
# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
operating_data_SE_df.index = pd.to_datetime(operating_data_SE_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(operating_data_SE_df.index)

# Print the missing dates
print(missing_dates)

DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
               '2020-03-29 02:00:00', '2021-03-28 02:00:00',
               '2022-03-27 02:00:00', '2023-03-26 02:00:00'],
              dtype='datetime64[ns]', freq=None)


In [268]:
import pandas as pd

# Provided missing DatetimeIndex
missing_dates = pd.DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
                                  '2020-03-29 02:00:00', '2021-03-28 02:00:00',
                                  '2022-03-27 02:00:00', '2023-03-26 02:00:00'])

# Create a temporary DataFrame with the missing datetime range
# Since these are specific timestamps, we directly use them as the index for the temp_df
temp_df = pd.DataFrame(index=missing_dates)

# Combine the temporary DataFrame with your original DataFrame
# This will add the missing timestamps with NaN values for all columns
operating_data_SE_df = operating_data_SE_df.combine_first(temp_df)

# Ensure the combined DataFrame is sorted
operating_data_SE_df.sort_index(inplace=True)


In [269]:
missing_values_count = operating_data_SE_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh    54
Total Consumption MWh                  10
Day-ahead production prognosis MWh      6
Total Production MWh                    7
dtype: int64


In [270]:
operating_data_SE_df = operating_data_SE_df.interpolate(method='linear')
missing_values_count = operating_data_SE_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh    0
Total Consumption MWh                  0
Day-ahead production prognosis MWh     0
Total Production MWh                   0
dtype: int64


In [271]:
operating_data_SE_df

Unnamed: 0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh
2018-01-01 00:00:00,15419.0,15563.0,16874.0,18029.0
2018-01-01 01:00:00,15171.0,15229.0,16274.0,16989.0
2018-01-01 02:00:00,14909.0,15259.0,16234.0,16910.0
2018-01-01 03:00:00,14813.0,15026.0,16104.0,16675.0
2018-01-01 04:00:00,14713.0,14844.0,15990.0,16539.0
...,...,...,...,...
2023-12-31 19:00:00,18742.0,18917.0,20679.0,20448.0
2023-12-31 20:00:00,18232.0,18251.0,19923.0,19655.0
2023-12-31 21:00:00,17869.0,17801.0,19339.0,19064.0
2023-12-31 22:00:00,17494.0,17233.0,18917.0,18673.0


In [272]:
# Assume you have a new list of column names
new_columns = ['Day-ahead Consumption Prognosis MWh SE', 'Total Consumption MWh SE', 'Day-ahead Production Prognosis MWh SE', 'Total Production MWh SE']  # List all new column names

# Replace all column names
operating_data_SE_df.columns = new_columns

operating_data_SE_df

Unnamed: 0,Day-ahead Consumption Prognosis MWh SE,Total Consumption MWh SE,Day-ahead Production Prognosis MWh SE,Total Production MWh SE
2018-01-01 00:00:00,15419.0,15563.0,16874.0,18029.0
2018-01-01 01:00:00,15171.0,15229.0,16274.0,16989.0
2018-01-01 02:00:00,14909.0,15259.0,16234.0,16910.0
2018-01-01 03:00:00,14813.0,15026.0,16104.0,16675.0
2018-01-01 04:00:00,14713.0,14844.0,15990.0,16539.0
...,...,...,...,...
2023-12-31 19:00:00,18742.0,18917.0,20679.0,20448.0
2023-12-31 20:00:00,18232.0,18251.0,19923.0,19655.0
2023-12-31 21:00:00,17869.0,17801.0,19339.0,19064.0
2023-12-31 22:00:00,17494.0,17233.0,18917.0,18673.0


### DK

In [273]:
operating_data_DK_df = pd.read_excel("production_consumption_DK.xlsx")
# Assuming operating_data_NO_df is already read from the Excel file
# Rename the "Unnamed: 0" column to "DateTime"
operating_data_DK_df.rename(columns={'Unnamed: 0': 'DateTime'}, inplace=True)

# Set the "DateTime" column as the index
operating_data_DK_df.set_index('DateTime', inplace=True)

# Optionally, you can sort the DataFrame by the index (DateTime) if needed
operating_data_DK_df.sort_index(ascending=True, inplace=True)

# Display the DataFrame to verify changes
operating_data_DK_df

Unnamed: 0_level_0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,3422,3420,1324,3583,1792,2378
2018-01-01 01:00:00,3289,3306,1329,3536,1701,2376
2018-01-01 02:00:00,3156,3116,1206,3369,1944,2171
2018-01-01 03:00:00,3026,3021,1175,3394,2197,2237
2018-01-01 04:00:00,2939,2919,1152,3389,2365,2236
...,...,...,...,...,...,...
2023-12-31 19:00:00,4441,4460,4976,5614,3703,4521
2023-12-31 20:00:00,4189,4163,4753,5189,3558,4161
2023-12-31 21:00:00,4017,4033,4519,4885,3381,3897
2023-12-31 22:00:00,3861,3871,4252,4625,3172,3633


In [274]:
# Assuming your DataFrame is named operating_data_SE_df
# Replace 'column_name1' and 'column_name2' with the actual names of the columns you want to drop

operating_data_DK_df.drop(['Day-ahead wind production prognosis MWh', 'Settled wind production MWh'], axis=1, inplace=True)

In [275]:
missing_values_count = operating_data_DK_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh    0
Total Consumption MWh                  0
Day-ahead production prognosis MWh     0
Total Production MWh                   0
dtype: int64


In [276]:
# Count the number of zeros in each column
zero_count = (operating_data_DK_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

Day-ahead consumption prognosis MWh     0
Total Consumption MWh                   3
Day-ahead production prognosis MWh     24
Total Production MWh                    1
dtype: int64


In [277]:
import numpy as np
# Replace all zeros with NaN across the DataFrame
operating_data_DK_df = operating_data_DK_df.replace(0, np.nan)
missing_values_count = operating_data_DK_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh     0
Total Consumption MWh                   3
Day-ahead production prognosis MWh     24
Total Production MWh                    1
dtype: int64


In [278]:
# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
operating_data_DK_df.index = pd.to_datetime(operating_data_DK_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(operating_data_DK_df.index)

# Print the missing dates
print(missing_dates)

DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
               '2020-03-29 02:00:00', '2021-03-28 02:00:00',
               '2022-03-27 02:00:00', '2023-03-26 02:00:00'],
              dtype='datetime64[ns]', freq=None)


In [279]:
import pandas as pd

# Provided missing DatetimeIndex
missing_dates = pd.DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
                                  '2020-03-29 02:00:00', '2021-03-28 02:00:00',
                                  '2022-03-27 02:00:00', '2023-03-26 02:00:00'])

# Create a temporary DataFrame with the missing datetime range
# Since these are specific timestamps, we directly use them as the index for the temp_df
temp_df = pd.DataFrame(index=missing_dates)

# Combine the temporary DataFrame with your original DataFrame
# This will add the missing timestamps with NaN values for all columns
operating_data_DK_df = operating_data_DK_df.combine_first(temp_df)

# Ensure the combined DataFrame is sorted
operating_data_DK_df.sort_index(inplace=True)

In [280]:
operating_data_DK_df = operating_data_DK_df.interpolate(method='linear')
missing_values_count = operating_data_DK_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh    0
Total Consumption MWh                  0
Day-ahead production prognosis MWh     0
Total Production MWh                   0
dtype: int64


In [281]:
operating_data_DK_df

Unnamed: 0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh
2018-01-01 00:00:00,3422.0,3420.0,1324.0,3583.0
2018-01-01 01:00:00,3289.0,3306.0,1329.0,3536.0
2018-01-01 02:00:00,3156.0,3116.0,1206.0,3369.0
2018-01-01 03:00:00,3026.0,3021.0,1175.0,3394.0
2018-01-01 04:00:00,2939.0,2919.0,1152.0,3389.0
...,...,...,...,...
2023-12-31 19:00:00,4441.0,4460.0,4976.0,5614.0
2023-12-31 20:00:00,4189.0,4163.0,4753.0,5189.0
2023-12-31 21:00:00,4017.0,4033.0,4519.0,4885.0
2023-12-31 22:00:00,3861.0,3871.0,4252.0,4625.0


In [282]:
# Assume you have a new list of column names
new_columns = ['Day-ahead Consumption Prognosis MWh DK', 'Total Consumption MWh DK', 'Day-ahead Production Prognosis MWh DK', 'Total Production MWh DK']  # List all new column names

# Replace all column names
operating_data_DK_df.columns = new_columns

operating_data_DK_df

Unnamed: 0,Day-ahead Consumption Prognosis MWh DK,Total Consumption MWh DK,Day-ahead Production Prognosis MWh DK,Total Production MWh DK
2018-01-01 00:00:00,3422.0,3420.0,1324.0,3583.0
2018-01-01 01:00:00,3289.0,3306.0,1329.0,3536.0
2018-01-01 02:00:00,3156.0,3116.0,1206.0,3369.0
2018-01-01 03:00:00,3026.0,3021.0,1175.0,3394.0
2018-01-01 04:00:00,2939.0,2919.0,1152.0,3389.0
...,...,...,...,...
2023-12-31 19:00:00,4441.0,4460.0,4976.0,5614.0
2023-12-31 20:00:00,4189.0,4163.0,4753.0,5189.0
2023-12-31 21:00:00,4017.0,4033.0,4519.0,4885.0
2023-12-31 22:00:00,3861.0,3871.0,4252.0,4625.0


### FI

In [283]:
operating_data_FI_df = pd.read_excel("production_consumption_FI.xlsx")
# Assuming operating_data_NO_df is already read from the Excel file
# Rename the "Unnamed: 0" column to "DateTime"
operating_data_FI_df.rename(columns={'Unnamed: 0': 'DateTime'}, inplace=True)

# Set the "DateTime" column as the index
operating_data_FI_df.set_index('DateTime', inplace=True)

# Optionally, you can sort the DataFrame by the index (DateTime) if needed
operating_data_FI_df.sort_index(ascending=True, inplace=True)

# Display the DataFrame to verify changes
operating_data_FI_df

Unnamed: 0_level_0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,9752,9715,7875,8050,0,0
2018-01-01 01:00:00,9498,9410,7874,7953,0,0
2018-01-01 02:00:00,9368,9242,7819,7849,0,0
2018-01-01 03:00:00,9362,9260,7768,7875,0,0
2018-01-01 04:00:00,9435,9364,7789,7850,0,0
...,...,...,...,...,...,...
2023-12-31 19:00:00,11864,12586,9536,11345,1411,3189
2023-12-31 20:00:00,11636,12542,9187,10750,1341,2829
2023-12-31 21:00:00,11897,12790,9112,10381,1246,2572
2023-12-31 22:00:00,11784,12770,8913,9925,1228,2249


In [284]:
# Assuming your DataFrame is named operating_data_SE_df
# Replace 'column_name1' and 'column_name2' with the actual names of the columns you want to drop

operating_data_FI_df.drop(['Day-ahead wind production prognosis MWh', 'Settled wind production MWh'], axis=1, inplace=True)

In [285]:
missing_values_count = operating_data_FI_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh    0
Total Consumption MWh                  0
Day-ahead production prognosis MWh     0
Total Production MWh                   0
dtype: int64


In [286]:
# Count the number of zeros in each column
zero_count = (operating_data_FI_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

Day-ahead consumption prognosis MWh     0
Total Consumption MWh                   1
Day-ahead production prognosis MWh     23
Total Production MWh                    2
dtype: int64


In [287]:
import numpy as np
# Replace all zeros with NaN across the DataFrame
operating_data_FI_df = operating_data_FI_df.replace(0, np.nan)
missing_values_count = operating_data_FI_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh     0
Total Consumption MWh                   1
Day-ahead production prognosis MWh     23
Total Production MWh                    2
dtype: int64


In [288]:
# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
operating_data_FI_df.index = pd.to_datetime(operating_data_FI_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(operating_data_FI_df.index)

# Print the missing dates
print(missing_dates)

DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
               '2020-03-29 02:00:00', '2021-03-28 02:00:00',
               '2022-03-27 02:00:00', '2023-03-26 02:00:00'],
              dtype='datetime64[ns]', freq=None)


In [289]:
import pandas as pd

# Provided missing DatetimeIndex
missing_dates = pd.DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
                                  '2020-03-29 02:00:00', '2021-03-28 02:00:00',
                                  '2022-03-27 02:00:00', '2023-03-26 02:00:00'])

# Create a temporary DataFrame with the missing datetime range
# Since these are specific timestamps, we directly use them as the index for the temp_df
temp_df = pd.DataFrame(index=missing_dates)

# Combine the temporary DataFrame with your original DataFrame
# This will add the missing timestamps with NaN values for all columns
operating_data_FI_df = operating_data_FI_df.combine_first(temp_df)

# Ensure the combined DataFrame is sorted
operating_data_FI_df.sort_index(inplace=True)

In [290]:
operating_data_FI_df = operating_data_FI_df.interpolate(method='linear')
missing_values_count = operating_data_FI_df.isnull().sum()
print(missing_values_count)

Day-ahead consumption prognosis MWh    0
Total Consumption MWh                  0
Day-ahead production prognosis MWh     0
Total Production MWh                   0
dtype: int64


In [291]:
operating_data_FI_df

Unnamed: 0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh
2018-01-01 00:00:00,9752.0,9715.0,7875.0,8050.0
2018-01-01 01:00:00,9498.0,9410.0,7874.0,7953.0
2018-01-01 02:00:00,9368.0,9242.0,7819.0,7849.0
2018-01-01 03:00:00,9362.0,9260.0,7768.0,7875.0
2018-01-01 04:00:00,9435.0,9364.0,7789.0,7850.0
...,...,...,...,...
2023-12-31 19:00:00,11864.0,12586.0,9536.0,11345.0
2023-12-31 20:00:00,11636.0,12542.0,9187.0,10750.0
2023-12-31 21:00:00,11897.0,12790.0,9112.0,10381.0
2023-12-31 22:00:00,11784.0,12770.0,8913.0,9925.0


In [292]:
# Assume you have a new list of column names
new_columns = ['Day-ahead Consumption Prognosis MWh FI', 'Total Consumption MWh FI', 'Day-ahead Production Prognosis MWh FI', 'Total Production MWh FI']  # List all new column names

# Replace all column names
operating_data_FI_df.columns = new_columns

operating_data_FI_df

Unnamed: 0,Day-ahead Consumption Prognosis MWh FI,Total Consumption MWh FI,Day-ahead Production Prognosis MWh FI,Total Production MWh FI
2018-01-01 00:00:00,9752.0,9715.0,7875.0,8050.0
2018-01-01 01:00:00,9498.0,9410.0,7874.0,7953.0
2018-01-01 02:00:00,9368.0,9242.0,7819.0,7849.0
2018-01-01 03:00:00,9362.0,9260.0,7768.0,7875.0
2018-01-01 04:00:00,9435.0,9364.0,7789.0,7850.0
...,...,...,...,...
2023-12-31 19:00:00,11864.0,12586.0,9536.0,11345.0
2023-12-31 20:00:00,11636.0,12542.0,9187.0,10750.0
2023-12-31 21:00:00,11897.0,12790.0,9112.0,10381.0
2023-12-31 22:00:00,11784.0,12770.0,8913.0,9925.0


## Merging the dataframes

In [293]:
# Assuming volume_df, operating_data_NO_df, operating_data_SE_df, operating_data_DK_df, operating_data_FI_df are your DataFrames
market_data_df = pd.concat([volume_df, operating_data_NO_df, operating_data_SE_df, operating_data_DK_df, operating_data_FI_df], axis=1)

# Now merged_df contains all the columns from the individual DataFrames, aligned by the DateTime index
market_data_df

Unnamed: 0,Volume traded MWh,Total Consumption MWh NO,Day-ahead Consumption Prognosis MWh NO,Total Production MWh NO,Day-ahead Production Prognosis MWh NO,Day-ahead Consumption Prognosis MWh SE,Total Consumption MWh SE,Day-ahead Production Prognosis MWh SE,Total Production MWh SE,Day-ahead Consumption Prognosis MWh DK,Total Consumption MWh DK,Day-ahead Production Prognosis MWh DK,Total Production MWh DK,Day-ahead Consumption Prognosis MWh FI,Total Consumption MWh FI,Day-ahead Production Prognosis MWh FI,Total Production MWh FI
2018-01-01 00:00:00,39676.5,16989.0,16422.0,14131.0,13710.0,15419.0,15563.0,16874.0,18029.0,3422.0,3420.0,1324.0,3583.0,9752.0,9715.0,7875.0,8050.0
2018-01-01 01:00:00,39492.2,16762.0,16272.0,14297.0,13935.0,15171.0,15229.0,16274.0,16989.0,3289.0,3306.0,1329.0,3536.0,9498.0,9410.0,7874.0,7953.0
2018-01-01 02:00:00,38381.8,16509.0,16101.0,13365.0,12926.0,14909.0,15259.0,16234.0,16910.0,3156.0,3116.0,1206.0,3369.0,9368.0,9242.0,7819.0,7849.0
2018-01-01 03:00:00,37492.3,16109.0,16011.0,12076.0,11698.0,14813.0,15026.0,16104.0,16675.0,3026.0,3021.0,1175.0,3394.0,9362.0,9260.0,7768.0,7875.0
2018-01-01 04:00:00,37443.1,16152.0,15975.0,12083.0,11710.0,14713.0,14844.0,15990.0,16539.0,2939.0,2919.0,1152.0,3389.0,9435.0,9364.0,7789.0,7850.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,49873.0,20049.0,20119.0,15660.0,16027.0,18742.0,18917.0,20679.0,20448.0,4441.0,4460.0,4976.0,5614.0,11864.0,12586.0,9536.0,11345.0
2023-12-31 20:00:00,48269.9,19641.0,19852.0,15288.0,15570.0,18232.0,18251.0,19923.0,19655.0,4189.0,4163.0,4753.0,5189.0,11636.0,12542.0,9187.0,10750.0
2023-12-31 21:00:00,47554.1,19155.0,19639.0,15301.0,15184.0,17869.0,17801.0,19339.0,19064.0,4017.0,4033.0,4519.0,4885.0,11897.0,12790.0,9112.0,10381.0
2023-12-31 22:00:00,46830.7,18784.0,19342.0,14829.0,14762.0,17494.0,17233.0,18917.0,18673.0,3861.0,3871.0,4252.0,4625.0,11784.0,12770.0,8913.0,9925.0


In [294]:
# Extract to excel
# Export the DataFrame to an Excel file
market_data_df.to_excel('Market Data REAL.xlsx', sheet_name='Market Data')

# If you run into any issues related to Excel file size or performance, consider breaking your data into multiple sheets or saving in a different format.


# Production data

## Hydro reservoir levels (NO, SE, FI)

In [191]:
waterreservoir_df = pd.read_excel("waterreservoirREAL.xlsx")
waterreservoir_df.set_index("DateTime", inplace=True)
waterreservoir_df.sort_index(ascending=True)
waterreservoir_df

Unnamed: 0_level_0,100% Norge GWh,Fylling Norge GWh,100% Sverige GWh,Fylling Sverige GWh,100%Finland GWh,Fylling Finland GWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,86920,60513,33680,21437,5530,4008
2018-01-01 01:00:00,86920,60513,33680,21437,5530,4008
2018-01-01 02:00:00,86920,60513,33680,21437,5530,4008
2018-01-01 03:00:00,86920,60513,33680,21437,5530,4008
2018-01-01 04:00:00,86920,60513,33680,21437,5530,4008
...,...,...,...,...,...,...
2023-12-31 19:00:00,87412,54857,34030,18943,5530,3586
2023-12-31 20:00:00,87412,54857,34030,18943,5530,3586
2023-12-31 21:00:00,87412,54857,34030,18943,5530,3586
2023-12-31 22:00:00,87412,54857,34030,18943,5530,3586


In [192]:
# Assuming waterreservoir_df is your DataFrame and all values are in GWh
waterreservoir_df = waterreservoir_df * 1000

# Now all values in waterreservoir_df are in MWh
# Assume you have a new list of column names
new_columns = ['Max Cap. Hydro levels NO MWh', 'Hydro levels NO MWh', 'Max Cap. Hydro levels SE MWh', 'Hydro levels SE MWh', 'Max Cap. Hydro levels FI MWh', 'Hydro levels FI MWh']  # List all new column names

# Replace all column names
waterreservoir_df.columns = new_columns
waterreservoir_df

Unnamed: 0_level_0,Max Cap. Hydro levels NO MWh,Hydro levels NO MWh,Max Cap. Hydro levels SE MWh,Hydro levels SE MWh,Max Cap. Hydro levels FI MWh,Hydro levels FI MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,86920000,60513000,33680000,21437000,5530000,4008000
2018-01-01 01:00:00,86920000,60513000,33680000,21437000,5530000,4008000
2018-01-01 02:00:00,86920000,60513000,33680000,21437000,5530000,4008000
2018-01-01 03:00:00,86920000,60513000,33680000,21437000,5530000,4008000
2018-01-01 04:00:00,86920000,60513000,33680000,21437000,5530000,4008000
...,...,...,...,...,...,...
2023-12-31 19:00:00,87412000,54857000,34030000,18943000,5530000,3586000
2023-12-31 20:00:00,87412000,54857000,34030000,18943000,5530000,3586000
2023-12-31 21:00:00,87412000,54857000,34030000,18943000,5530000,3586000
2023-12-31 22:00:00,87412000,54857000,34030000,18943000,5530000,3586000


In [193]:
missing_values_count = waterreservoir_df.isnull().sum()
print(missing_values_count)

Max Cap. Hydro levels NO MWh    0
Hydro levels NO MWh             0
Max Cap. Hydro levels SE MWh    0
Hydro levels SE MWh             0
Max Cap. Hydro levels FI MWh    0
Hydro levels FI MWh             0
dtype: int64


In [194]:
# Count the number of zeros in each column
zero_count = (operating_data_FI_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

Day-ahead Consumption Prognosis MWh FI    0
Total Consumption MWh FI                  0
Day-ahead Production Prognosis MWh FI     0
Total Production MWh FI                   0
dtype: int64


In [195]:
# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date + ' 23:00:00', freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
waterreservoir_df.index = pd.to_datetime(waterreservoir_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(waterreservoir_df.index)

# Print the missing dates
print(missing_dates)
print(len(missing_dates))

DatetimeIndex(['2020-12-28 00:00:00', '2020-12-28 01:00:00',
               '2020-12-28 02:00:00', '2020-12-28 03:00:00',
               '2020-12-28 04:00:00', '2020-12-28 05:00:00',
               '2020-12-28 06:00:00', '2020-12-28 07:00:00',
               '2020-12-28 08:00:00', '2020-12-28 09:00:00',
               ...
               '2021-01-03 14:00:00', '2021-01-03 15:00:00',
               '2021-01-03 16:00:00', '2021-01-03 17:00:00',
               '2021-01-03 18:00:00', '2021-01-03 19:00:00',
               '2021-01-03 20:00:00', '2021-01-03 21:00:00',
               '2021-01-03 22:00:00', '2021-01-03 23:00:00'],
              dtype='datetime64[ns]', length=168, freq=None)
168


In [196]:
# Reindex the DataFrame to include the complete datetime range
waterreservoir_df = waterreservoir_df.reindex(complete_date_range)

# After this operation, waterreservoir_df will include rows for all missing dates with NaN values for their columns.


In [197]:
waterreservoir_df

Unnamed: 0,Max Cap. Hydro levels NO MWh,Hydro levels NO MWh,Max Cap. Hydro levels SE MWh,Hydro levels SE MWh,Max Cap. Hydro levels FI MWh,Hydro levels FI MWh
2018-01-01 00:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 01:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 02:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 03:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 04:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
...,...,...,...,...,...,...
2023-12-31 19:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0
2023-12-31 20:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0
2023-12-31 21:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0
2023-12-31 22:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0


In [198]:
missing_values_count = waterreservoir_df.isnull().sum()
print(missing_values_count)

Max Cap. Hydro levels NO MWh    168
Hydro levels NO MWh             168
Max Cap. Hydro levels SE MWh    168
Hydro levels SE MWh             168
Max Cap. Hydro levels FI MWh    168
Hydro levels FI MWh             168
dtype: int64


In [199]:
# Fill missing values with the last known value
waterreservoir_df.fillna(method='ffill', inplace=True)

# If there are any leading NaN values before the first valid observation in any column,
# they would remain unchanged because there's no previous value to carry forward.
# You might want to check for and handle such cases if they exist.
missing_values_count = waterreservoir_df.isnull().sum()
print(missing_values_count)

Max Cap. Hydro levels NO MWh    0
Hydro levels NO MWh             0
Max Cap. Hydro levels SE MWh    0
Hydro levels SE MWh             0
Max Cap. Hydro levels FI MWh    0
Hydro levels FI MWh             0
dtype: int64


In [200]:
waterreservoir_df

Unnamed: 0,Max Cap. Hydro levels NO MWh,Hydro levels NO MWh,Max Cap. Hydro levels SE MWh,Hydro levels SE MWh,Max Cap. Hydro levels FI MWh,Hydro levels FI MWh
2018-01-01 00:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 01:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 02:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 03:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
2018-01-01 04:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0
...,...,...,...,...,...,...
2023-12-31 19:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0
2023-12-31 20:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0
2023-12-31 21:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0
2023-12-31 22:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0


## Wind Production

## SE

In [201]:
operating_data_SE_df = pd.read_excel("production_consumption_SE.xlsx")
# Assuming operating_data_NO_df is already read from the Excel file
# Rename the "Unnamed: 0" column to "DateTime"
operating_data_SE_df.rename(columns={'Unnamed: 0': 'DateTime'}, inplace=True)

# Set the "DateTime" column as the index
operating_data_SE_df.set_index('DateTime', inplace=True)

# Optionally, you can sort the DataFrame by the index (DateTime) if needed
operating_data_SE_df.sort_index(ascending=True, inplace=True)

# Display the DataFrame to verify changes
operating_data_SE_df

Unnamed: 0_level_0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,15419,15563,16874,18029,2206,1917
2018-01-01 01:00:00,15171,15229,16274,16989,2197,1772
2018-01-01 02:00:00,14909,15259,16234,16910,2245,1702
2018-01-01 03:00:00,14813,15026,16104,16675,2182,1710
2018-01-01 04:00:00,14713,14844,15990,16539,2154,1664
...,...,...,...,...,...,...
2023-12-31 19:00:00,18742,18917,20679,20448,4946,4354
2023-12-31 20:00:00,18232,18251,19923,19655,5068,4443
2023-12-31 21:00:00,17869,17801,19339,19064,5146,4544
2023-12-31 22:00:00,17494,17233,18917,18673,5188,4628


In [202]:
# Assuming your DataFrame is named operating_data_SE_df
# Replace 'column_name1' and 'column_name2' with the actual names of the columns you want to drop

operating_data_SE_df.drop(['Day-ahead consumption prognosis MWh', 'Total Consumption MWh', 'Day-ahead production prognosis MWh', 'Total Production MWh' ], axis=1, inplace=True)
operating_data_SE_df

Unnamed: 0_level_0,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,2206,1917
2018-01-01 01:00:00,2197,1772
2018-01-01 02:00:00,2245,1702
2018-01-01 03:00:00,2182,1710
2018-01-01 04:00:00,2154,1664
...,...,...
2023-12-31 19:00:00,4946,4354
2023-12-31 20:00:00,5068,4443
2023-12-31 21:00:00,5146,4544
2023-12-31 22:00:00,5188,4628


In [203]:
# Count the number of zeros in each column
zero_count = (operating_data_SE_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

import numpy as np
# Replace all zeros with NaN across the DataFrame
operating_data_SE_df = operating_data_SE_df.replace(0, np.nan)
missing_values_count = operating_data_SE_df.isnull().sum()
print(missing_values_count)


Day-ahead wind production prognosis MWh       48
Settled wind production MWh                12958
dtype: int64
Day-ahead wind production prognosis MWh       48
Settled wind production MWh                12958
dtype: int64


In [204]:
# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
operating_data_SE_df.index = pd.to_datetime(operating_data_SE_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(operating_data_SE_df.index)

# Print the missing dates
print(missing_dates)

DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
               '2020-03-29 02:00:00', '2021-03-28 02:00:00',
               '2022-03-27 02:00:00', '2023-03-26 02:00:00'],
              dtype='datetime64[ns]', freq=None)


In [205]:
import pandas as pd

# Provided missing DatetimeIndex
missing_dates = pd.DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
                                  '2020-03-29 02:00:00', '2021-03-28 02:00:00',
                                  '2022-03-27 02:00:00', '2023-03-26 02:00:00'])

# Create a temporary DataFrame with the missing datetime range
# Since these are specific timestamps, we directly use them as the index for the temp_df
temp_df = pd.DataFrame(index=missing_dates)

# Combine the temporary DataFrame with your original DataFrame
# This will add the missing timestamps with NaN values for all columns
operating_data_SE_df = operating_data_SE_df.combine_first(temp_df)

# Ensure the combined DataFrame is sorted
operating_data_SE_df.sort_index(inplace=True)


In [206]:
missing_values_count = operating_data_SE_df.isnull().sum()
print(missing_values_count)

Day-ahead wind production prognosis MWh       54
Settled wind production MWh                12964
dtype: int64


In [208]:
# Interpolating missing values for both columns using the 'time' method
operating_data_SE_df[['Settled wind production MWh', 'Day-ahead wind production prognosis MWh']] = operating_data_SE_df[['Settled wind production MWh', 'Day-ahead wind production prognosis MWh']].interpolate(method='time')

# Check if there are any remaining missing values in these columns after interpolation
remaining_na_settled = operating_data_SE_df['Settled wind production MWh'].isna().sum()
remaining_na_prognosis = operating_data_SE_df['Day-ahead wind production prognosis MWh'].isna().sum()

print(f"Remaining missing values in 'Settled wind production MWh': {remaining_na_settled}")
print(f"Remaining missing values in 'Day-ahead wind production prognosis MWh': {remaining_na_prognosis}")


Remaining missing values in 'Settled wind production MWh': 0
Remaining missing values in 'Day-ahead wind production prognosis MWh': 0


In [209]:
operating_data_SE_df

Unnamed: 0,Day-ahead wind production prognosis MWh,Settled wind production MWh
2018-01-01 00:00:00,2206.0,1917.0
2018-01-01 01:00:00,2197.0,1772.0
2018-01-01 02:00:00,2245.0,1702.0
2018-01-01 03:00:00,2182.0,1710.0
2018-01-01 04:00:00,2154.0,1664.0
...,...,...
2023-12-31 19:00:00,4946.0,4354.0
2023-12-31 20:00:00,5068.0,4443.0
2023-12-31 21:00:00,5146.0,4544.0
2023-12-31 22:00:00,5188.0,4628.0


In [210]:
# Assume you have a new list of column names
new_columns = ['Day-ahead wind production prognosis MWh SE', 'Settled wind production MWh SE']  # List all new column names

# Replace all column names
operating_data_SE_df.columns = new_columns

operating_data_SE_df

Unnamed: 0,Day-ahead wind production prognosis MWh SE,Settled wind production MWh SE
2018-01-01 00:00:00,2206.0,1917.0
2018-01-01 01:00:00,2197.0,1772.0
2018-01-01 02:00:00,2245.0,1702.0
2018-01-01 03:00:00,2182.0,1710.0
2018-01-01 04:00:00,2154.0,1664.0
...,...,...
2023-12-31 19:00:00,4946.0,4354.0
2023-12-31 20:00:00,5068.0,4443.0
2023-12-31 21:00:00,5146.0,4544.0
2023-12-31 22:00:00,5188.0,4628.0


## DK

In [211]:
operating_data_DK_df = pd.read_excel("production_consumption_DK.xlsx")
# Assuming operating_data_NO_df is already read from the Excel file
# Rename the "Unnamed: 0" column to "DateTime"
operating_data_DK_df.rename(columns={'Unnamed: 0': 'DateTime'}, inplace=True)

# Set the "DateTime" column as the index
operating_data_DK_df.set_index('DateTime', inplace=True)

# Optionally, you can sort the DataFrame by the index (DateTime) if needed
operating_data_DK_df.sort_index(ascending=True, inplace=True)

# Display the DataFrame to verify changes
operating_data_DK_df

Unnamed: 0_level_0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,3422,3420,1324,3583,1792,2378
2018-01-01 01:00:00,3289,3306,1329,3536,1701,2376
2018-01-01 02:00:00,3156,3116,1206,3369,1944,2171
2018-01-01 03:00:00,3026,3021,1175,3394,2197,2237
2018-01-01 04:00:00,2939,2919,1152,3389,2365,2236
...,...,...,...,...,...,...
2023-12-31 19:00:00,4441,4460,4976,5614,3703,4521
2023-12-31 20:00:00,4189,4163,4753,5189,3558,4161
2023-12-31 21:00:00,4017,4033,4519,4885,3381,3897
2023-12-31 22:00:00,3861,3871,4252,4625,3172,3633


In [212]:
operating_data_DK_df.drop(['Day-ahead consumption prognosis MWh', 'Total Consumption MWh', 'Day-ahead production prognosis MWh', 'Total Production MWh' ], axis=1, inplace=True)
operating_data_DK_df

Unnamed: 0_level_0,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,1792,2378
2018-01-01 01:00:00,1701,2376
2018-01-01 02:00:00,1944,2171
2018-01-01 03:00:00,2197,2237
2018-01-01 04:00:00,2365,2236
...,...,...
2023-12-31 19:00:00,3703,4521
2023-12-31 20:00:00,3558,4161
2023-12-31 21:00:00,3381,3897
2023-12-31 22:00:00,3172,3633


In [213]:
# Count the number of zeros in each column
zero_count = (operating_data_DK_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

import numpy as np
# Replace all zeros with NaN across the DataFrame
operating_data_DK_df = operating_data_DK_df.replace(0, np.nan)
missing_values_count = operating_data_DK_df.isnull().sum()
print(missing_values_count)


Day-ahead wind production prognosis MWh    13
Settled wind production MWh                 4
dtype: int64
Day-ahead wind production prognosis MWh    13
Settled wind production MWh                 4
dtype: int64


In [214]:
# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
operating_data_DK_df.index = pd.to_datetime(operating_data_DK_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(operating_data_DK_df.index)

# Print the missing dates
print(missing_dates)

DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
               '2020-03-29 02:00:00', '2021-03-28 02:00:00',
               '2022-03-27 02:00:00', '2023-03-26 02:00:00'],
              dtype='datetime64[ns]', freq=None)


In [215]:
import pandas as pd

# Provided missing DatetimeIndex
missing_dates = pd.DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
                                  '2020-03-29 02:00:00', '2021-03-28 02:00:00',
                                  '2022-03-27 02:00:00', '2023-03-26 02:00:00'])

# Create a temporary DataFrame with the missing datetime range
# Since these are specific timestamps, we directly use them as the index for the temp_df
temp_df = pd.DataFrame(index=missing_dates)

# Combine the temporary DataFrame with your original DataFrame
# This will add the missing timestamps with NaN values for all columns
operating_data_DK_df = operating_data_DK_df.combine_first(temp_df)

# Ensure the combined DataFrame is sorted
operating_data_DK_df.sort_index(inplace=True)

In [216]:
# Interpolating missing values for both columns using the 'time' method
operating_data_DK_df[['Settled wind production MWh', 'Day-ahead wind production prognosis MWh']] = operating_data_DK_df[['Settled wind production MWh', 'Day-ahead wind production prognosis MWh']].interpolate(method='time')

# Check if there are any remaining missing values in these columns after interpolation
remaining_na_settled = operating_data_DK_df['Settled wind production MWh'].isna().sum()
remaining_na_prognosis = operating_data_DK_df['Day-ahead wind production prognosis MWh'].isna().sum()

print(f"Remaining missing values in 'Settled wind production MWh': {remaining_na_settled}")
print(f"Remaining missing values in 'Day-ahead wind production prognosis MWh': {remaining_na_prognosis}")

Remaining missing values in 'Settled wind production MWh': 0
Remaining missing values in 'Day-ahead wind production prognosis MWh': 0


In [217]:
# Assume you have a new list of column names
new_columns = ['Day-ahead wind production prognosis MWh DK', 'Settled wind production MWh DK']  # List all new column names

# Replace all column names
operating_data_DK_df.columns = new_columns

operating_data_DK_df

Unnamed: 0,Day-ahead wind production prognosis MWh DK,Settled wind production MWh DK
2018-01-01 00:00:00,1792.0,2378.0
2018-01-01 01:00:00,1701.0,2376.0
2018-01-01 02:00:00,1944.0,2171.0
2018-01-01 03:00:00,2197.0,2237.0
2018-01-01 04:00:00,2365.0,2236.0
...,...,...
2023-12-31 19:00:00,3703.0,4521.0
2023-12-31 20:00:00,3558.0,4161.0
2023-12-31 21:00:00,3381.0,3897.0
2023-12-31 22:00:00,3172.0,3633.0


## FI

In [218]:
operating_data_FI_df = pd.read_excel("production_consumption_FI.xlsx")
# Assuming operating_data_NO_df is already read from the Excel file
# Rename the "Unnamed: 0" column to "DateTime"
operating_data_FI_df.rename(columns={'Unnamed: 0': 'DateTime'}, inplace=True)

# Set the "DateTime" column as the index
operating_data_FI_df.set_index('DateTime', inplace=True)

# Optionally, you can sort the DataFrame by the index (DateTime) if needed
operating_data_FI_df.sort_index(ascending=True, inplace=True)

# Display the DataFrame to verify changes
operating_data_FI_df

Unnamed: 0_level_0,Day-ahead consumption prognosis MWh,Total Consumption MWh,Day-ahead production prognosis MWh,Total Production MWh,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00,9752,9715,7875,8050,0,0
2018-01-01 01:00:00,9498,9410,7874,7953,0,0
2018-01-01 02:00:00,9368,9242,7819,7849,0,0
2018-01-01 03:00:00,9362,9260,7768,7875,0,0
2018-01-01 04:00:00,9435,9364,7789,7850,0,0
...,...,...,...,...,...,...
2023-12-31 19:00:00,11864,12586,9536,11345,1411,3189
2023-12-31 20:00:00,11636,12542,9187,10750,1341,2829
2023-12-31 21:00:00,11897,12790,9112,10381,1246,2572
2023-12-31 22:00:00,11784,12770,8913,9925,1228,2249


In [219]:
operating_data_FI_df.drop(['Day-ahead consumption prognosis MWh', 'Total Consumption MWh', 'Day-ahead production prognosis MWh', 'Total Production MWh' ], axis=1, inplace=True)
operating_data_FI_df

Unnamed: 0_level_0,Day-ahead wind production prognosis MWh,Settled wind production MWh
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,0,0
2018-01-01 01:00:00,0,0
2018-01-01 02:00:00,0,0
2018-01-01 03:00:00,0,0
2018-01-01 04:00:00,0,0
...,...,...
2023-12-31 19:00:00,1411,3189
2023-12-31 20:00:00,1341,2829
2023-12-31 21:00:00,1246,2572
2023-12-31 22:00:00,1228,2249


In [220]:
# Count the number of zeros in each column
zero_count = (operating_data_FI_df == 0).sum()

# Print the count of zeros in each column
print(zero_count)

import numpy as np
# Replace all zeros with NaN across the DataFrame
operating_data_FI_df = operating_data_FI_df.replace(0, np.nan)
missing_values_count = operating_data_FI_df.isnull().sum()
print(missing_values_count)

Day-ahead wind production prognosis MWh    42907
Settled wind production MWh                 2723
dtype: int64
Day-ahead wind production prognosis MWh    42907
Settled wind production MWh                 2723
dtype: int64


In [221]:
operating_data_FI_df.drop(['Day-ahead wind production prognosis MWh'], axis=1, inplace=True)
operating_data_FI_df

Unnamed: 0_level_0,Settled wind production MWh
DateTime,Unnamed: 1_level_1
2018-01-01 00:00:00,
2018-01-01 01:00:00,
2018-01-01 02:00:00,
2018-01-01 03:00:00,
2018-01-01 04:00:00,
...,...
2023-12-31 19:00:00,3189.0
2023-12-31 20:00:00,2829.0
2023-12-31 21:00:00,2572.0
2023-12-31 22:00:00,2249.0


In [222]:
# Create a complete datetime range for your data
start_date = '2018-01-01'
end_date = '2023-12-31'
complete_date_range = pd.date_range(start=start_date, end=end_date, freq='H')

# Convert your DataFrame's index to a DatetimeIndex if it's not already
operating_data_FI_df.index = pd.to_datetime(operating_data_FI_df.index)

# Find the missing dates by comparing the complete date range with your DataFrame's index
missing_dates = complete_date_range.difference(operating_data_FI_df.index)

# Print the missing dates
print(missing_dates)

DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
               '2020-03-29 02:00:00', '2021-03-28 02:00:00',
               '2022-03-27 02:00:00', '2023-03-26 02:00:00'],
              dtype='datetime64[ns]', freq=None)


In [223]:
import pandas as pd

# Provided missing DatetimeIndex
missing_dates = pd.DatetimeIndex(['2018-03-25 02:00:00', '2019-03-31 02:00:00',
                                  '2020-03-29 02:00:00', '2021-03-28 02:00:00',
                                  '2022-03-27 02:00:00', '2023-03-26 02:00:00'])

# Create a temporary DataFrame with the missing datetime range
# Since these are specific timestamps, we directly use them as the index for the temp_df
temp_df = pd.DataFrame(index=missing_dates)

# Combine the temporary DataFrame with your original DataFrame
# This will add the missing timestamps with NaN values for all columns
operating_data_FI_df = operating_data_FI_df.combine_first(temp_df)

# Ensure the combined DataFrame is sorted
operating_data_FI_df.sort_index(inplace=True)

In [224]:
# Interpolating missing values using the 'time' method
operating_data_FI_df['Settled wind production MWh'] = operating_data_FI_df['Settled wind production MWh'].interpolate(method='time')

# Check if there are any remaining missing values in the column after interpolation
remaining_na = operating_data_FI_df['Settled wind production MWh'].isna().sum()
print(f"Remaining missing values in 'Settled wind production MWh': {remaining_na}")


Remaining missing values in 'Settled wind production MWh': 48


In [225]:
operating_data_FI_df['Settled wind production MWh'] = operating_data_FI_df['Settled wind production MWh'].fillna(method='bfill')


In [226]:
missing_values_count = operating_data_FI_df.isnull().sum()
print(missing_values_count)

Settled wind production MWh    0
dtype: int64


In [227]:
# Assume you have a new list of column names
new_columns = ['Settled wind production MWh FI']  # List all new column names

# Replace all column names
operating_data_FI_df.columns = new_columns

operating_data_FI_df

Unnamed: 0,Settled wind production MWh FI
2018-01-01 00:00:00,672.0
2018-01-01 01:00:00,672.0
2018-01-01 02:00:00,672.0
2018-01-01 03:00:00,672.0
2018-01-01 04:00:00,672.0
...,...
2023-12-31 19:00:00,3189.0
2023-12-31 20:00:00,2829.0
2023-12-31 21:00:00,2572.0
2023-12-31 22:00:00,2249.0


## Merging the dataframes

In [228]:
import pandas as pd

# Concatenate the DataFrames along columns
production_data_df = pd.concat([waterreservoir_df, operating_data_SE_df, operating_data_DK_df, operating_data_FI_df], axis=1)
production_data_df


Unnamed: 0,Max Cap. Hydro levels NO MWh,Hydro levels NO MWh,Max Cap. Hydro levels SE MWh,Hydro levels SE MWh,Max Cap. Hydro levels FI MWh,Hydro levels FI MWh,Day-ahead wind production prognosis MWh SE,Settled wind production MWh SE,Day-ahead wind production prognosis MWh DK,Settled wind production MWh DK,Settled wind production MWh FI
2018-01-01 00:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0,2206.0,1917.0,1792.0,2378.0,672.0
2018-01-01 01:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0,2197.0,1772.0,1701.0,2376.0,672.0
2018-01-01 02:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0,2245.0,1702.0,1944.0,2171.0,672.0
2018-01-01 03:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0,2182.0,1710.0,2197.0,2237.0,672.0
2018-01-01 04:00:00,86920000.0,60513000.0,33680000.0,21437000.0,5530000.0,4008000.0,2154.0,1664.0,2365.0,2236.0,672.0
...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0,4946.0,4354.0,3703.0,4521.0,3189.0
2023-12-31 20:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0,5068.0,4443.0,3558.0,4161.0,2829.0
2023-12-31 21:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0,5146.0,4544.0,3381.0,3897.0,2572.0
2023-12-31 22:00:00,87412000.0,54857000.0,34030000.0,18943000.0,5530000.0,3586000.0,5188.0,4628.0,3172.0,3633.0,2249.0


In [229]:
# Export the DataFrame to an Excel file
production_data_df.to_excel('Production Data REAL.xlsx', sheet_name='Production Data')

# If you run into any issues related to Excel file size or performance, consider breaking your data into multiple sheets or saving in a different format.


### Production and Consumption in MWh for Norway?

In [3]:

load_df = pd.read_excel("load_combined.xlsx")
load_df.set_index("Time(Local)", inplace=True)
load_df.sort_index(ascending=True)
load_df = load_df.drop("Unnamed: 0", axis = 1)
load_df

Unnamed: 0_level_0,Production,Consumption
Time(Local),Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,14131,16989
2018-01-01 01:00:00,14299,16764
2018-01-01 02:00:00,13365,16509
2018-01-01 03:00:00,12076,16109
2018-01-01 04:00:00,12083,16152
...,...,...
2023-12-31 19:00:00,15691,20049
2023-12-31 20:00:00,15390,19641
2023-12-31 21:00:00,15316,19156
2023-12-31 22:00:00,14817,18785
