In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
# Collect the raw CSV exports that are later concatenated into one large CSV
# file containing only the relevant columns.

src_path = '../dat/bundesnetzagentur'
dst_path = '../dat/bundesnetzagentur/pre_processed_data'

price_paths = []
production_paths = []

# Scan the source directory and group its entries by file-name prefix.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# order of both lists (and of everything built from them) reproducible.
for file in sorted(os.listdir(src_path)):
    if file.startswith('Grosshandelspreise_'):
        price_paths.append(file)
    elif file.startswith('Realisierte_Erzeugung_'):
        production_paths.append(file)

# Make sure the output directory exists before any cell writes into it.
# Done after the listing so that a wrong src_path still fails loudly above
# instead of being silently created as a parent of dst_path.
os.makedirs(dst_path, exist_ok=True)

# Prepare data for bidding zone DE/LU (2019-2021)

## Extract prices

In [None]:
price_files_2018_2021 = ['Grosshandelspreise_2018_2019.csv',
                         'Grosshandelspreise_2020_2021.csv']

# Read every price file and stack the results with a single concat call
# (DataFrame.append was removed in pandas 2.0 and re-copied the growing
# frame on every iteration).
prices_de_lu = pd.concat(
    [pd.read_csv(os.path.join(src_path, file),
                 delimiter=';',
                 # Use European decimal and thousands symbols
                 decimal=',',
                 thousands='.',
                 # read '-' as np.nan
                 na_values=['-'],
                 # Merge 'Datum' and 'Uhrzeit' into one 'Datum_Uhrzeit' timestamp column
                 parse_dates=[['Datum', 'Uhrzeit']],
                 # Dates are assumed to be written day-first (DD.MM.YYYY), in line
                 # with the German number format configured above; without this,
                 # ambiguous dates such as 01.02.2019 are read month-first.
                 # NOTE(review): confirm against the raw files.
                 dayfirst=True)
     for file in price_files_2018_2021]
)

# Chronological DatetimeIndex, required for the label-based slice below.
prices_de_lu = prices_de_lu.set_index('Datum_Uhrzeit').sort_index()

# Extract prices from 2019 onwards for bidding zone DE/LU
prices_de_lu = prices_de_lu.loc['2019':, 'Deutschland/Luxemburg[€/MWh]']
prices_de_lu

## Extract production data

In [None]:
erzeugung_in_regelzonen = os.path.join(src_path, 'erzeugung_in_regelzonen')
prod_files_de_lu = ['Realisierte_Erzeugung_DE_LU_2018-10-01_2019.csv',
                    'Realisierte_Erzeugung_DE_LU_2020_2021.csv']

# Read every production file and stack the results with a single concat call
# (DataFrame.append was removed in pandas 2.0 and re-copied the growing
# frame on every iteration).
prod_de_lu = pd.concat(
    [pd.read_csv(os.path.join(erzeugung_in_regelzonen, file),
                 delimiter=';',
                 # Use European decimal and thousands symbols
                 decimal=',',
                 thousands='.',
                 # read '-' as np.nan
                 na_values=['-'],
                 # Merge 'Datum' and 'Uhrzeit' into one 'Datum_Uhrzeit' timestamp column
                 parse_dates=[['Datum', 'Uhrzeit']],
                 # Dates are assumed to be written day-first (DD.MM.YYYY), in line
                 # with the German number format configured above; without this,
                 # ambiguous dates such as 01.02.2019 are read month-first.
                 # NOTE(review): confirm against the raw files.
                 dayfirst=True)
     for file in prod_files_de_lu]
)

# Chronological DatetimeIndex, required for the slice and resample below.
prod_de_lu = prod_de_lu.set_index('Datum_Uhrzeit').sort_index()

# Extract production from 2019 onwards
prod_de_lu = prod_de_lu.loc['2019':]

# Resample to hours (same as prices); each hourly value is the mean of the
# readings that fall into that hour.
prod_de_lu = prod_de_lu.resample('H').mean()

prod_de_lu

In [None]:
# Total wind generation: row-wise sum of the offshore and onshore columns.
wind_columns = ['Wind Offshore[MWh]', 'Wind Onshore[MWh]']
prod_de_lu['Wind'] = prod_de_lu[wind_columns].sum(axis=1)

In [None]:
# Attach the price series to the production table, aligned on the timestamp
# index. how='left' (the default) keeps every production timestamp.
combined = prod_de_lu.join(prices_de_lu, how='left')

In [None]:
# Translation of the German source column headers into short English names.
column_names_map = {
    'Biomasse[MWh]': 'Biomass',
    'Wasserkraft[MWh]': 'Hydropower',
    'Wind Offshore[MWh]': 'Offshore Wind',
    'Wind Onshore[MWh]': 'Onshore Wind',
    'Photovoltaik[MWh]': 'Solar',
    'Sonstige Erneuerbare[MWh]': 'Other Renewables',
    'Kernenergie[MWh]': 'Nuclear',
    'Braunkohle[MWh]': 'Brown Coal',
    'Steinkohle[MWh]': 'Black Coal',
    'Erdgas[MWh]': 'Natural Gas',
    'Pumpspeicher[MWh]': 'Pumped-Storage Hydro',
    'Sonstige Konventionelle[MWh]': 'Other Conventional Sources',
    'Deutschland/Luxemburg[€/MWh]': 'Price'
}

# Apply the English column names and call the timestamp index 'time'.
combined = (
    combined
    .rename(columns=column_names_map)
    .rename_axis(index='time')
)

# Row-wise total of solar and (onshore + offshore) wind generation.
combined['Solar & Wind'] = combined[['Solar', 'Wind']].sum(axis=1)
combined

In [None]:
# Columns of `combined` that are treated as renewable generation.
renewable = [
    'Biomass',
    'Hydropower',
    'Wind',
    'Solar',
    'Other Renewables',
    'Pumped-Storage Hydro',
]

# TODO: settle the definition of 'renewable' - does Nuclear count?
# Draft code, currently disabled:
# combined['Total Renewable'] = combined[renewable].sum(axis=1)
# combined
# conventional = ['Nuclear', 'Brown Coal']

In [None]:
# Persist the combined production/price table; the index is written as the
# first column.
combined_csv_path = os.path.join(dst_path, 'prod_price_de_lu_2019_2021.csv')
combined.to_csv(combined_csv_path)

# Join data

In [None]:
# Price

# The price zones seem to have changed on 1.10.2018
# DE/AT/LU till 30.9.2018, DE/LU from 1.10.2018 onwards

price_cols = ['Datum_Uhrzeit', 'Deutschland/Luxemburg[€/MWh]']

# Read the relevant columns of every discovered price file. The files are
# visited in sorted name order so the row order of the output does not depend
# on the order in which the directory listing returned them.
price_frames = []
for file in sorted(price_paths):
    df = pd.read_csv(os.path.join(src_path, file),
                     delimiter=';',
                     decimal=',',
                     thousands='.',
                     # read '-' as np.nan; otherwise a column containing '-'
                     # stays a string column and its numbers are not converted
                     na_values=['-'],
                     # Merge 'Datum' and 'Uhrzeit' into one 'Datum_Uhrzeit' column
                     parse_dates=[['Datum', 'Uhrzeit']],
                     # Dates are assumed to be written day-first (DD.MM.YYYY), in
                     # line with the German number format configured above.
                     # NOTE(review): confirm against the raw files.
                     dayfirst=True)
    price_frames.append(df[price_cols])

# Stack the per-file frames directly; this keeps the column dtypes and gives
# flat column labels. pd.concat rejects an empty list, so fall back to an
# empty frame with the expected columns when no price file was found.
if price_frames:
    price_df = pd.concat(price_frames, ignore_index=True)
else:
    price_df = pd.DataFrame(columns=price_cols)

price_df = price_df.rename(columns={'Deutschland/Luxemburg[€/MWh]': 'Preis[€/MWh]'})

price_df.to_csv(os.path.join(dst_path, 'Grosshandelspreise_2015_2021.csv'), sep=';', index=False)

In [None]:
# Production

production_cols = ['Datum_Uhrzeit',
                   'Wasserkraft[MWh]',
                   'Wind Offshore[MWh]',
                   'Wind Onshore[MWh]',
                   'Photovoltaik[MWh]',
                   'Sonstige Erneuerbare[MWh]']

# Read the relevant columns of every discovered production file. The files are
# visited in sorted name order so the row order of the output does not depend
# on the order in which the directory listing returned them.
production_frames = []
for file in sorted(production_paths):
    df = pd.read_csv(os.path.join(src_path, file),
                     delimiter=';',
                     decimal=',',
                     thousands='.',
                     # read '-' as np.nan; otherwise a column containing '-'
                     # stays a string column and cannot be summed reliably
                     na_values=['-'],
                     # Merge 'Datum' and 'Uhrzeit' into one 'Datum_Uhrzeit' column
                     parse_dates=[['Datum', 'Uhrzeit']],
                     # Dates are assumed to be written day-first (DD.MM.YYYY), in
                     # line with the German number format configured above.
                     # NOTE(review): confirm against the raw files.
                     dayfirst=True)
    production_frames.append(df[production_cols])

# Stack the per-file frames directly; this keeps the column dtypes and gives
# flat column labels. pd.concat rejects an empty list, so fall back to an
# empty frame with the expected columns when no production file was found.
if production_frames:
    production_df = pd.concat(production_frames, ignore_index=True)
else:
    production_df = pd.DataFrame(columns=production_cols)

# Columns to add up: everything that was selected except the timestamp.
# Built as a separate list (not an alias that is modified in place) so that
# production_cols keeps describing the columns that were read.
sum_cols = [col for col in production_cols if col != 'Datum_Uhrzeit']

production_df['Total[MWh]'] = production_df[sum_cols].sum(axis=1)

production_df.to_csv(os.path.join(dst_path, 'Realisierte_Erzeugung_2015_2021.csv'), sep=';', index=False)