In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook
import datetime

import time
from tqdm.auto import tqdm

import requests
import glob
import os
import shutil


# 1. Weather data: 

Source: [Deutscher Wetterdienst (German Weather Service)](https://www.dwd.de/DE/Home/home_node.html)

[Python API](https://wetterdienst.readthedocs.io/en/latest/overview.html), 
[Rest API](https://dwd.api.bund.dev/)

More information about the data can be found [here](https://www.dwd.de/DE/Home/home_node.html).


In [13]:
from wetterdienst import Wetterdienst
from wetterdienst.provider.dwd.observation import DwdObservationRequest, DwdObservationDataset, DwdObservationPeriod, DwdObservationResolution

# Handle to the DWD observation network (not referenced again in the cells shown here)
API = Wetterdienst(provider="dwd", network="observation")

# Names of the parameters that are extracted from the query results later on
parameters = ['wind_speed', 'sunshine_duration', 'radiation_global']

# Request definition: hourly solar and wind datasets, 2015-01-01 through 2023-11-19
request = DwdObservationRequest(
    parameter=[DwdObservationDataset.SOLAR, DwdObservationDataset.WIND],
    resolution=DwdObservationResolution.HOURLY,
    start_date=datetime.datetime(2015, 1, 1),
    end_date=datetime.datetime(2023, 11, 19),
)

# Table of all stations matching the request, reduced to id, federal state and name
d_stations = request.all().df.to_pandas()[['station_id', 'state', 'name']]

# Look up the parameters available at hourly resolution
# (the trailing ';' suppresses the cell output; remove it to display the listing)
DwdObservationRequest.discover()['hourly'];


In [14]:
fetch_data = True
if fetch_data:
    # Loop through all stations and add values for all parameters to dictionary
    # Structure of dictionary: key: parameter_state (e.g. wind_speed_Berlin)
    # Note: For each key, values are 2d since there are multiple stations per state
    data = {}
    station_iter = zip(d_stations['station_id'], d_stations['state'])
    for station_id, state in tqdm(station_iter, total=len(d_stations)):
        station_request = request.filter_by_station_id(station_id=(station_id, ))

        # Do the actual query; as before, the last yielded result is the one that is used
        station_df = None
        for result in station_request.values.query():
            station_df = result.df.to_pandas()
        if station_df is None:
            # The query returned nothing for this station, so there is nothing to collect
            continue

        # The shared time axis is taken from the first station that returns data
        # NOTE(review): assumes every station covers the same hourly timestamps - confirm
        if 'time_unix' not in data:
            date = station_df['date'][station_df['parameter']=='end_of_interval']
            data['time_unix'] = [dt.timestamp() for dt in date]

        # Stations without a federal state (None/NaN) cannot be assigned to a column
        if pd.isna(state):
            continue

        # Get values for each parameter
        for param in parameters:
            values = station_df[station_df['parameter']==param]['value']
            # Keep a series only if less than 20% of it is missing
            # (an empty series never passes, because 0 < 0 is False)
            if values.isna().sum() < 0.2*len(values):
                data.setdefault(param+'_'+state, []).append(list(values))
                        

  0%|          | 0/446 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Average the per-station series of every parameter/state column (NaN-aware, axis 0 is
# the station axis). 'time_unix' is a plain 1d list of timestamps and is taken over
# unchanged; averaging it would collapse it to a single number.
df = pd.DataFrame({
    key: values if key == 'time_unix' else np.nanmean(values, axis=0)
    for key, values in data.items()
})

# Intermediate file; written with the default index column
df.to_csv('Weather_data.csv')
df.head()

# 2. Energy data: 

Source: [Energy-charts.info](https://energy-charts.info/)

[Rest API](https://api.energy-charts.info/)

Here, I collect `public_power`; note that this does not include power produced for industrial self-supply.

The website is maintained by the Fraunhofer Institute for Solar Energy Systems (ISE), a German organization for applied research.
ISE collects data from both federal agencies and private companies; see [here](https://energy-charts.info/sources.html?l=en&c=DE) for more information.

Further information about the different categories of power production can be found [here](https://energy-charts.info/explanations.html?l=en&c=DE)

In [26]:
# Format: YYYY-MM-DD
date_start = '2015-01-01'
date_end = '2023-11-20'

# 'de' for Germany
country_code = 'de'

# Query window: start of the first day to 23:45 of the last day, at UTC offset +01:00.
# requests percent-encodes ':' and '+' itself, so the values are written in plain form.
query = {
    'country': country_code,
    'start': f'{date_start}T00:00+01:00',
    'end': f'{date_end}T23:45+01:00',
}

# Get request
# The full range took about 70 s when this notebook was last run, hence the generous
# timeout; without one, a stalled connection would block the kernel indefinitely.
t0 = time.time()
response = requests.get('https://api.energy-charts.info/public_power', params=query, timeout=600)
print(response)
print(time.time()-t0)

# Fail here with a clear HTTP error instead of later while parsing an error body
response.raise_for_status()

<Response [200]>
69.57104015350342


In [None]:
# Decode the JSON body once, then lay it out as one column per production type,
# with the shared unix timestamps as the first column
payload = response.json()
data = {'time_unix': payload['unix_seconds']}
data.update({source['name']: source['data'] for source in payload['production_types']})

df = pd.DataFrame(data)

# Intermediate file; written with the default index column
df.to_csv('Energy_data.csv')
df.head()

# Combine and modify data

In [8]:
# Two-letter codes of the 16 German federal states, keyed by their German names
_STATE_ABBREVIATIONS = dict([
    ('Baden-Württemberg', 'BW'),
    ('Bayern', 'BY'),
    ('Berlin', 'BE'),
    ('Brandenburg', 'BB'),
    ('Bremen', 'HB'),
    ('Hamburg', 'HH'),
    ('Hessen', 'HE'),
    ('Mecklenburg-Vorpommern', 'MV'),
    ('Niedersachsen', 'NI'),
    ('Nordrhein-Westfalen', 'NW'),
    ('Rheinland-Pfalz', 'RP'),
    ('Saarland', 'SL'),
    ('Sachsen', 'SN'),
    ('Sachsen-Anhalt', 'ST'),
    ('Schleswig-Holstein', 'SH'),
    ('Thüringen', 'TH'),
])


def get_state_abr(state):
    """Return the two-letter abbreviation for a German federal state name.

    Parameters
    ----------
    state : str
        Full German state name, e.g. 'Bayern'.

    Returns
    -------
    str
        The matching two-letter code, e.g. 'BY'.

    Raises
    ------
    KeyError
        If `state` is not one of the 16 names in the lookup table.
    """
    return _STATE_ABBREVIATIONS[state]

# Load the weather table and key it by hour label ('YYYY-MM-DD-HH').
# datetime.fromtimestamp interprets the unix seconds in the local timezone of the
# machine running the notebook.
weather_raw = pd.read_csv('Weather_data.csv')
hour_labels = weather_raw['time_unix'].apply(datetime.datetime.fromtimestamp).dt.strftime('%Y-%m-%d-%H')
df_weather = (
    weather_raw
    .assign(date_time=hour_labels)
    .drop(columns=['time_unix', 'Unnamed: 0'])  # 'Unnamed: 0' is the index column saved with the csv
    .set_index('date_time')
)

def rename(string):
    """Convert a '<parameter>_<state name>' column label to '<parameter>_<state code>'.

    The state name is the part after the last underscore; everything before it is kept
    verbatim as the parameter name, so parameters may contain any number of
    underscores (e.g. 'wind_speed_Bayern' -> 'wind_speed_BY').

    Raises KeyError (from get_state_abr) if the trailing part is not a known state name.
    """
    old_key, _, state = string.rpartition('_')
    return old_key+'_'+get_state_abr(state)

# Replace the full state names in the weather column labels by their abbreviations
df_weather = df_weather.rename(rename, axis='columns')

# Import energy data and average all samples that fall into the same hour.
# NOTE: datetime.fromtimestamp converts to the local timezone of the machine running the
# notebook; the weather table is keyed the same way, so both sides of the join agree.
df_energy = pd.read_csv('Energy_data.csv')
df_energy['date_time'] = df_energy['time_unix'].apply(datetime.datetime.fromtimestamp).dt.strftime('%Y-%m-%d-%H')
df_energy = df_energy.drop(['time_unix', 'Unnamed: 0'], axis='columns')
df_energy = df_energy.groupby('date_time').mean()

# Join data frames along the date_time index (left join on the energy rows)
df = df_energy.join(df_weather)

# Parse the 'YYYY-MM-DD-HH' index labels once and derive some extra calendar features
timestamps = pd.Series(pd.to_datetime(df.index, format='%Y-%m-%d-%H'), index=df.index)
df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month
df['day'] = timestamps.dt.weekday                   # day of the week (Monday=0), not day of month
df['date_year'] = timestamps.dt.strftime('%Y-%m-%d')
df['hour'] = timestamps.dt.strftime('%H')           # zero-padded string, e.g. '07'
df['date'] = timestamps.dt.strftime('%m-%d')

# Replace missing values in numeric columns by the column median
for key in df.select_dtypes(include=np.number):
    df[key] = df[key].fillna(np.nanmedian(df[key]))

df.to_csv('Energy_production_weather.csv')

# Remove the intermediate files; after this the cell cannot be re-run until
# 'Weather_data.csv' and 'Energy_data.csv' have been regenerated
os.remove('Weather_data.csv')
os.remove('Energy_data.csv')