In [13]:
from imports import *

# data gathering and preparation

<center><b>This notebook collects weather and soil data from different sources at the level of German districts.<br>The resulting data set can then be joined with the official crop yield statistics.</b></center>

## identify relevant weather stations

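The list of weather stations in Germany can be found here (the code below reads a local copy of the monthly variant, `KL_Monatswerte_Beschreibung_Stationen.txt`):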

https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/annual/kl/historical/KL_Jahreswerte_Beschreibung_Stationen.txt

In [14]:
# Read the fixed-width DWD station list; colspecs are the character offsets of
# the eight columns named below.
w_stations = pd.read_fwf('data sources/KL_Monatswerte_Beschreibung_Stationen.txt',
                         colspecs=[(0, 5), (6, 14), (15, 23), (24, 38), (43, 49), (53, 60), (61, 100), (102, 123)],
                         encoding='iso8859_15')

w_stations.columns = ['Stations_id', 'von_datum', 'bis_datum', 'Stationshoehe',
                      'geoBreite', 'geoLaenge', 'Stationsname', 'Bundesland']

# Row 0 is not a station (presumably the dashed separator line below the
# header of the DWD file -- TODO confirm), so it is removed.
w_stations = w_stations.drop(0, axis=0)

# Stations_id is deliberately left unconverted: it is later concatenated into
# a file-name pattern. All other id-independent columns become numeric.
for col in ['von_datum', 'bis_datum', 'Stationshoehe', 'geoBreite', 'geoLaenge']:
    w_stations[col] = pd.to_numeric(w_stations[col])

# keep only those weather stations providing data from 1999 onwards;
# .copy() makes the result an independent frame so that a column can be added
# to it later without triggering pandas' SettingWithCopyWarning
w_stations = w_stations[w_stations['bis_datum'] > 19990000].copy()

w_stations.head()

Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland
2,3,18510101,20110331,202,50.782,6.0941,Aachen,Nordrhein-Westfalen
3,44,19710301,20210930,44,52.933,8.237,Großenkneten,Niedersachsen
4,52,19730101,20011231,46,53.662,10.199,Ahrensburg-Wulfsdorf,Schleswig-Holstein
7,71,19861101,20191231,759,48.215,8.9784,Albstadt-Badkap,Baden-Württemberg
9,73,19520701,20210930,340,48.615,13.0506,Aldersbach-Kriestorf,Bayern


The list of weather stations provides the geographical position and the federal state, but not the district, which is needed to combine the weather data with the official crop data. The district of each weather station is therefore retrieved from OpenStreetMap using the Overpass API.

_The following cell needs some time to execute._

In [15]:
overpass_url = 'http://overpass-api.de/api/interpreter'

# requesting district code of lat/lon position of weather stations
lk = []

for lat, lon in zip(w_stations['geoBreite'], w_stations['geoLaenge']):

    # Overpass QL: every area that contains the point, restricted to
    # admin_level 6. The query is plain text, so it is assembled directly
    # instead of being built as source code and passed through eval().
    overpass_query = 'is_in(' + str(lat) + ',' + str(lon) + ');area._[admin_level~"6"];out;'
    response = requests.get(overpass_url, params={'data': overpass_query})

    soup = BeautifulSoup(response.text, 'lxml')
    try:
        # only the first five digits for district
        lk.append(soup.find(k="de:amtlicher_gemeindeschluessel")['v'][0:5])
    except (TypeError, KeyError):
        # find() returned None (no such tag in the response) or the tag has no
        # 'v' attribute: store a placeholder to be treated as missing value
        lk.append('99999')

# adding district code to data frame (it is written to disk in the next cell)
w_stations['GemKey5'] = lk
w_stations.head()

In [17]:
# write the station table to CSV so that it can be reused without repeating
# the web requests of the previous cell
w_stations.to_csv('data sources/weather_stations_GemKey.csv')

## retrieve weather data for each district

Now, we collect the relevant weather data from all listed stations at the district level. If there is more than one weather station in a district, the mean of their values is computed.

In [19]:
# path to DWD open data portal
path = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/monthly/kl/historical/'

# download the directory listing and collect the target of every link in it
# (the weather data itself is provided as zip files)
site_content = requests.get(path).text
soup = BeautifulSoup(site_content, 'html.parser').find_all('a')
files = [anchor.attrs['href'] for anchor in soup]

# clean up the list: keep zip files only
zipfiles = [name for name in files if name.endswith('zip')]

# empty data frame for merging masked data
empty_df = pd.DataFrame(index=np.arange(1999, 2021), columns=np.arange(1, 13))

In [20]:
# this function opens the zip file of a given weather station (Station ID),
# extracts all relevant weather indicators, and returns them as dictionary

def get_indicators(statID):
    """Download the monthly DWD record of one station and tabulate four indicators.

    The station's zip archive is looked up in the module-level ``zipfiles``
    list, downloaded from ``path`` and its 'produkt_klima_monat' file is parsed.

    Parameters
    ----------
    statID : str
        Station id exactly as it appears between two underscores in the zip
        file names.

    Returns
    -------
    dict
        Keys 'TT', 'SD', 'FK' and 'RR'. Each value is a DataFrame with the
        years 1999-2020 as index and the months 1-12 as columns; cells without
        an observation are NaN.

    Raises
    ------
    ValueError
        If no zip archive matches the station id, or the archive contains no
        'produkt_klima_monat' file.
    """
    tag = '_' + statID + '_'

    # the last matching archive wins when several names contain the tag
    archives = [name for name in zipfiles if tag in name]
    if not archives:
        raise ValueError('no zip archive found for station ' + statID)

    # complete URL
    filepath = path + archives[-1]

    # download into memory; the context managers close the connection and the
    # archive as soon as the text has been extracted
    with urlopen(filepath) as goal:
        payload = BytesIO(goal.read())

    with zipfile.ZipFile(payload) as zippedfiles:
        members = [member for member in zippedfiles.namelist()
                   if 'produkt_klima_monat' in member]
        if not members:
            raise ValueError('no produkt_klima_monat file in archive ' + archives[-1])
        data = zippedfiles.read(members[-1]).decode()

    df = pd.read_csv(StringIO(data), sep=';')

    # MO_TT: monthly mean of the daily air temperature at 2 m height
    # MO_SD_S: monthly sum of sunshine duration
    # MO_FK: monthly mean of the daily wind force
    # MO_RR: monthly sum of precipitation height
    wanted = ['MESS_DATUM_BEGINN', 'MO_TT', 'MO_SD_S', 'MO_FK', 'MO_RR']
    in_period = (df['MESS_DATUM_BEGINN'] > 19990000) & (df['MESS_DATUM_BEGINN'] < 20210000)
    # .copy() so that the column assignments below act on an independent frame
    df = df.loc[in_period, wanted].copy()

    # get year and month from the date stamp (first four / next two characters)
    stamp = df['MESS_DATUM_BEGINN'].astype(str)
    df['Jahr'] = pd.to_numeric(stamp.str[0:4])
    df['Monat'] = pd.to_numeric(stamp.str[4:6])
    df = df.drop(columns=['MESS_DATUM_BEGINN'])

    # missing values are coded as -999 in the file
    df = df.replace({-999: np.nan})

    years = np.arange(1999, 2021)
    months = np.arange(1, 13)

    def monthly_table(column):
        # year x month table, expanded to the full 1999-2020 x 1-12 grid;
        # reindex keeps the numeric dtype and inserts NaN for absent cells
        table = df.pivot(index='Jahr', columns='Monat', values=column)
        table = table.reindex(index=years, columns=months)
        return table.rename_axis(index=None, columns=None)

    # as dictionary
    station_indicators = {'TT': monthly_table('MO_TT'),
                          'SD': monthly_table('MO_SD_S'),
                          'FK': monthly_table('MO_FK'),
                          'RR': monthly_table('MO_RR')}

    return station_indicators

In [21]:
# if there is more than one weather station in a particular district,
# this functions comines the values

def combine_stations(stat_ids):
    """Average the weather indicators of all stations of one district.

    Parameters
    ----------
    stat_ids : list of str
        Station ids of the district; each is passed to ``get_indicators``.

    Returns
    -------
    dict
        Keys 'TT', 'SD', 'FK' and 'RR'. For a single station the tables of
        ``get_indicators`` are returned unchanged. For several stations each
        value is a year x month DataFrame holding the mean over the stations;
        stations without a value for a cell (NaN) are ignored for that cell.

    Raises
    ------
    IndexError
        If ``stat_ids`` is empty.
    """
    indicator_keys = ('TT', 'SD', 'FK', 'RR')

    # every station is downloaded once, even if its id is listed repeatedly
    station_ids = list(dict.fromkeys(stat_ids))
    if not station_ids:
        raise IndexError('combine_stations needs at least one station id')

    per_station = [get_indicators(station) for station in station_ids]

    # if just one weather station in district
    if len(per_station) == 1:
        return {key: per_station[0][key] for key in indicator_keys}

    # if more than one weather station in district: stack the tables of all
    # stations on top of each other and average the rows that share a year
    combined = {}
    for key in indicator_keys:
        stacked = pd.concat([tables[key] for tables in per_station])
        # cast to float so that the mean does not depend on the dtype of the
        # incoming tables
        combined[key] = stacked.astype(float).groupby(level=0).mean()

    return combined

In [22]:
# create dictionary with district keys and weather stations id
# (stations whose district lookup failed all share the placeholder key '99999')
gemkeys = w_stations['GemKey5'].unique().tolist()
gemkey_stat = {x: w_stations.loc[w_stations['GemKey5'] == x, 'Stations_id'].values.tolist()
               for x in gemkeys}

# create dictionary of dictionaries with weather data for each district as the mean
# of all data from the weather stations of this district
weather_district = {}
for i, district_stations in gemkey_stat.items():
    try:
        weather_district[i] = combine_stations(district_stations)
    except Exception as err:
        # best effort: a district whose data cannot be loaded gets empty tables,
        # but the reason is reported instead of being swallowed silently
        # (KeyboardInterrupt is no longer caught, so the loop can be aborted)
        print('district ' + str(i) + ': no weather data (' + type(err).__name__ + ': ' + str(err) + ')')
        # one independent copy per indicator, so that the placeholder tables
        # do not all refer to the same DataFrame object
        weather_district[i] = {key: empty_df.copy() for key in ('TT', 'SD', 'FK', 'RR')}

# save to file
with open('data sources/weather_district.pkl', 'wb') as file:
    pickle.dump(weather_district, file)

## retrieve soil data for each district

Soil data (soil moisture index, SMI) is collected from the "UFZ Dürremonitor". UFZ provides the SMI for the surface (20cm) and deeper ground (180cm). The datasets can be downloaded here: https://www.ufz.de/index.php?de=37937

However, the data is stored in netCDF files and the grid is georeferenced in Gauß-Krüger coordinates (northing/easting). So we transform the latitude/longitude of each weather station into Gauß-Krüger coordinates and look up the nearest grid cell.

In [32]:
import netCDF4 as nc
from pyproj import Transformer
from geopy.geocoders import Nominatim

# empty data frame to fill later: years 1999-2020 as index, months 1-12 as
# columns, no values
empty_df = pd.DataFrame(index=(np.arange(1999, 2021)), columns=(np.arange(1, 13)))

# reading UFZ data set: one netCDF file for the topsoil ("Oberboden") and one
# for the total soil column ("Gesamtboden")
ds_ob = nc.Dataset('data sources/248980_SMI_SM_L02_Oberboden_monatlich_1951-2020_inv.nc')
ds_gb = nc.Dataset('data sources/248981_SMI_SM_Lall_Gesamtboden_monatlich_1951-2020_inv.nc')

# UFZ data is grid data
# transform from lat/lon (EPSG:4326) to Gauß-Krüger Zone 4 (EPSG: 31468);
# the transformer is created once here and reused for every lookup
transformer = Transformer.from_crs(4326, 31468)

In [33]:
def get_smi(ds, lat, lon, first_year=1951):
    """Return the monthly SMI series of the grid cell nearest to a position.

    Parameters
    ----------
    ds : mapping-like data set
        Must provide the one-dimensional variables 'northing' and 'easting'
        and the variable 'SMI', which is indexed as [time, northing, easting].
    lat, lon : float
        Position; converted with the module-level ``transformer``.
    first_year : int, optional
        Calendar year of the first twelve time steps (default 1951).

    Returns
    -------
    pandas.DataFrame
        One row per year starting at ``first_year``, columns 1-12 for the
        months. The stored raw values are returned as they are; any fill
        value used for missing cells is NOT replaced here.

    Raises
    ------
    ValueError
        If the number of time steps is not a multiple of twelve.
    """
    # transform
    nor, eas = transformer.transform(lat, lon)

    # index of the nearest grid coordinate along each axis
    # (argmin returns the first minimum, like a linear search would)
    northing = np.asarray(ds['northing'][:])
    easting = np.asarray(ds['easting'][:])
    ix_northing = int(np.abs(northing - nor).argmin())
    ix_easting = int(np.abs(easting - eas).argmin())

    # read SMI for the grid cell; getdata() yields the underlying values for
    # masked and for plain arrays alike
    smi_raw = np.ma.getdata(ds['SMI'][:, ix_northing, ix_easting])

    # one row per year; the number of years follows from the series length
    smi_years = np.reshape(smi_raw, (-1, 12))
    years = np.arange(first_year, first_year + smi_years.shape[0])

    # plain month labels as columns: wrapping the array in a list would
    # create a one-level MultiIndex instead
    smi = pd.DataFrame(smi_years, columns=np.arange(1, 13), index=years)

    return smi

In [34]:
def combine_smi(ds, GemKey5):
    """Average the 1999-2020 SMI tables of all station positions of a district.

    Parameters
    ----------
    ds : data set passed through to ``get_smi``
    GemKey5 : str
        District key; the positions are taken from the rows of the
        module-level ``w_stations`` with this value in column 'GemKey5'.

    Returns
    -------
    pandas.DataFrame
        Years 1999-2020 as index, months 1-12 as columns. Each cell is the
        mean over the district's positions, ignoring missing values; -9999 is
        treated as missing. If no position yields data, an all-empty copy of
        ``empty_df`` is returned.

    Raises
    ------
    KeyError
        If no station is registered for ``GemKey5``.
    """
    in_district = w_stations['GemKey5'] == GemKey5
    dstrct_stations = w_stations.loc[in_district, ['geoBreite', 'geoLaenge']].to_numpy()
    if len(dstrct_stations) == 0:
        raise KeyError(GemKey5)

    smi_values = []
    for lat, lon in dstrct_stations:
        try:
            smi = get_smi(ds, lat, lon).loc[1999:2020].replace(-9999, np.nan)
        except Exception as err:
            # best effort: a position that cannot be read contributes nothing,
            # but the reason is reported instead of being swallowed
            print('district ' + str(GemKey5) + ': SMI lookup failed (' + type(err).__name__ + ': ' + str(err) + ')')
            continue
        # normalise to float values with plain month columns so that all
        # tables line up regardless of the column index get_smi produced
        smi_values.append(pd.DataFrame(smi.to_numpy(dtype=float),
                                       index=smi.index,
                                       columns=empty_df.columns))

    if not smi_values:
        return empty_df.copy()

    # stack all positions and average the rows that share a year; reindex
    # guarantees the full 1999-2020 layout
    ob_full = pd.concat(smi_values).groupby(level=0).mean().reindex(empty_df.index)

    return ob_full

In [35]:
# map every district key to the ids of the weather stations located in it
gemkeys = w_stations['GemKey5'].unique().tolist()
gemkey_stat = {
    key: w_stations['Stations_id'][w_stations['GemKey5'] == key].values.tolist()
    for key in gemkeys
}

# SMI tables per district: topsoil (ob) and total soil (gb)
smi_ob, smi_gb = {}, {}
for key in gemkey_stat:
    smi_ob[key] = combine_smi(ds_ob, key)
    smi_gb[key] = combine_smi(ds_gb, key)

# attach independent copies of the SMI tables to the weather tables
for key, district in weather_district.items():
    district['SMI_OB'] = smi_ob[key].copy()
    district['SMI_GB'] = smi_gb[key].copy()

# write the final data set (same file as the weather-only version)
with open('data sources/weather_district.pkl', 'wb') as file:
    pickle.dump(weather_district, file)

## create filter

* filter is used for crop-specific growth periods later
    * <tt>df</tt>: data frame with weather data
    * <tt>months</tt>: list of months of the growing season, e.g. <tt>[10, 11, 12, 1, 2, 3]</tt>
    * <tt>cropyear</tt>: year of the harvest

In [16]:
def apply_growingseason(df, months, cropyear):
    """Return the monthly values of ``df`` that fall into a growing season.

    Parameters
    ----------
    df : pandas.DataFrame
        Years as index and twelve columns that are read by position, the
        first column being month 1.
    months : sequence of int
        Months (1-12) of the growing season. Only the first and the last
        entry are evaluated; the season is taken to be the contiguous span
        between them. If the first month is larger than the last one, the
        season starts in the year before ``cropyear``.
    cropyear : int
        Year of the harvest.

    Returns
    -------
    numpy.ndarray
        One-dimensional float array in chronological order. Months without a
        value (NaN) are left out, so the array can be shorter than the season.

    Raises
    ------
    ValueError
        If the season spans two years and ``df`` does not provide exactly one
        row for each of ``cropyear - 1`` and ``cropyear``.
    KeyError
        If the season lies within one year and ``cropyear`` is not in ``df``.
    """
    start = months[0] - 1
    stop = months[-1]

    if months[0] > months[-1]:
        # season wraps around the turn of the year
        both_years = df.loc[(cropyear - 1):cropyear].to_numpy(dtype=float)
        if both_years.shape[0] != 2:
            raise ValueError('growing season needs data for the years '
                             + str(cropyear - 1) + ' and ' + str(cropyear))
        season = np.concatenate([both_years[0, start:], both_years[1, :stop]])
    else:
        # season lies within the crop year
        season = df.loc[cropyear].to_numpy(dtype=float)[start:stop]

    # drop missing months only; a "> 0" comparison would also discard valid
    # zero and negative values such as sub-zero winter temperatures
    dx = season[~np.isnan(season)]

    return dx

In [230]:
# example: monthly precipitation sums (RR) of district '09275' for a growing
# season from November of the previous year to March of the crop year 2000
x = weather_district['09275']['RR']
croptime = np.array([11, 12, 1, 2, 3])
apply_growingseason(x, croptime, 2000)

array([ 47.2 , 101.  ,  55.  ,  99.85, 152.5 ])

# done