# Data Transformation

### Ideen für Zukunft des Projekts
- Map Plots

In [1]:
from pathlib import Path
import pandas as pd
import math

In [2]:
path_base = Path.cwd()

# export path
path_export = Path.joinpath(path_base, "exports")
path_export.mkdir(parents=True, exist_ok=True)

In [3]:
# import the data from Notebook 01
df_temp = pd.read_pickle(Path.joinpath(path_export, "temp.pkl"))
df_temp_stations = pd.read_pickle(Path.joinpath(path_export, "temp_stations.pkl"))
df_prec = pd.read_pickle(Path.joinpath(path_export, "prec.pkl"))
df_prec_stations = pd.read_pickle(Path.joinpath(path_export, "prec_stations.pkl"))
df_sun = pd.read_pickle(Path.joinpath(path_export, "sun.pkl"))
df_sun_stations = pd.read_pickle(Path.joinpath(path_export, "sun_stations.pkl"))

## Integrate Geodata into Landkreise Frame

In [4]:
# load RKI Covid-19 data in order to build a Landkreis-ID lookup table
df_rki = pd.read_csv("https://www.arcgis.com/sharing/rest/content/items/f10774f1c63e40168479a1feb6c7ca74/data")
df_landkreise = df_rki.drop_duplicates('Landkreis')[['Landkreis', 'IdLandkreis', 'Bundesland', 'IdBundesland']]
df_landkreise

Unnamed: 0,Landkreis,IdLandkreis,Bundesland,IdBundesland
0,LK Steinburg,1061,Schleswig-Holstein,1
61,LK Stormarn,1062,Schleswig-Holstein,1
366,SK Hamburg,2000,Hamburg,2
1000,LK Pinneberg,1056,Schleswig-Holstein,1
1339,LK Plön,1057,Schleswig-Holstein,1
...,...,...,...,...
116746,LK Jerichower Land,15086,Sachsen-Anhalt,15
116779,LK Mansfeld-Südharz,15087,Sachsen-Anhalt,15
116813,LK Saalekreis,15088,Sachsen-Anhalt,15
116912,LK Salzlandkreis,15089,Sachsen-Anhalt,15


In [5]:
# load geographical data of the Landkreise in Germany
df_districts_geo = pd.read_csv("https://public.opendatasoft.com/explore/dataset/landkreise-in-germany/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B", ";")
df_districts_geo
# Our districtId is in column "Cca 2"

Unnamed: 0,Geo Point,Geo Shape,Id 0,ISO,Name 0,Id 1,Name 1,Id 2,Name 2,Hasc 2,Ccn 2,Cca 2,Type 2,Engtype 2,Nl Name 2,Varname 2
0,"47.9925229956,7.81807596197","{""type"": ""Polygon"", ""coordinates"": [[[7.790447...",86,DEU,Germany,1,Baden-Württemberg,12,Freiburg im Breisgau,DE.BW.FB,0,8311.0,Stadtkreis,District,,
1,"48.5964037974,10.527764168","{""type"": ""Polygon"", ""coordinates"": [[[10.61448...",86,DEU,Germany,2,Bayern,68,Dillingen an der Donau,DE.BY.DD,0,9773.0,Landkreis,District,,
2,"49.4362114486,11.0827553426","{""type"": ""MultiPolygon"", ""coordinates"": [[[[11...",86,DEU,Germany,2,Bayern,107,Nürnberg,DE.BY.NR,0,9564.0,Kreisfreie Stadt,District,,
3,"49.2159614099,11.5665579197","{""type"": ""Polygon"", ""coordinates"": [[[11.46063...",86,DEU,Germany,2,Bayern,110,Neumarkt in der Oberpfalz,DE.BY.NO,0,9373.0,Landkreis,District,,
4,"47.8443777181,12.1087247511","{""type"": ""Polygon"", ""coordinates"": [[[12.05431...",86,DEU,Germany,2,Bayern,122,Rosenheim,DE.BY.RH,0,9163.0,Kreisfreie Stadt,District,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,"49.416649982,8.36068108599","{""type"": ""Polygon"", ""coordinates"": [[[8.353994...",86,DEU,Germany,11,Rheinland-Pfalz,323,Rhein-Pfalz-Kreis,DE.RP.RZ,0,7338.0,Landkreis,District,,
399,"49.762938832,6.65505102152","{""type"": ""Polygon"", ""coordinates"": [[[6.734675...",86,DEU,Germany,11,Rheinland-Pfalz,328,Trier,DE.RP.TI,0,7211.0,Kreisfreie Stadt,District,,
400,"51.4256710773,11.865474038","{""type"": ""MultiPolygon"", ""coordinates"": [[[[11...",86,DEU,Germany,13,Sachsen-Anhalt,349,Saalekreis,DE.ST.SL,0,15088.0,Landkreis,District,,
401,"51.8201400674,12.7015882396","{""type"": ""Polygon"", ""coordinates"": [[[12.42280...",86,DEU,Germany,13,Sachsen-Anhalt,352,Wittenberg,DE.ST.WT,0,15091.0,Landkreis,District,,


In [6]:
df_lk = pd.merge(df_landkreise, df_districts_geo, left_on="IdLandkreis", right_on="Cca 2")[['Landkreis', 'Name 2', 'Type 2','IdLandkreis', 'Bundesland', 'Geo Point']]
df_lk = df_lk.rename(columns={'Name 2': 'Name kurz', 'Type 2': 'Typ'})
df_lk

Unnamed: 0,Landkreis,Name kurz,Typ,IdLandkreis,Bundesland,Geo Point
0,LK Steinburg,Steinburg,Kreis,1061,Schleswig-Holstein,"53.9289451889,9.51938189615"
1,LK Stormarn,Stormarn,Kreis,1062,Schleswig-Holstein,"53.7208005726,10.3316398811"
2,SK Hamburg,Hamburg,Kreisfreie Stadt,2000,Hamburg,"53.5463773461,10.0207944156"
3,LK Pinneberg,Pinneberg,Kreis,1056,Schleswig-Holstein,"53.7180840651,9.73686669973"
4,LK Plön,Plön,Kreis,1057,Schleswig-Holstein,"54.2433885939,10.3636951573"
...,...,...,...,...,...,...
394,LK Jerichower Land,Jerichower Land,Landkreis,15086,Sachsen-Anhalt,"52.2607898277,12.0265188373"
395,LK Mansfeld-Südharz,Mansfeld-Südharz,Landkreis,15087,Sachsen-Anhalt,"51.5356901778,11.3563584729"
396,LK Saalekreis,Saalekreis,Landkreis,15088,Sachsen-Anhalt,"51.4256710773,11.865474038"
397,LK Salzlandkreis,Salzlandkreis,Landkreis,15089,Sachsen-Anhalt,"51.8518591913,11.6427256195"


In [7]:
# split up column "Geo Point" into two seperate numerical columns
df_lk['latitude'], df_lk['longitude'] = df_lk['Geo Point'].str.split(',', 1).str
df_lk[['latitude', 'longitude']] = df_lk[['latitude', 'longitude']].apply(pd.to_numeric)
df_lk.drop(columns=['Geo Point'], inplace=True)
df_lk

  


Unnamed: 0,Landkreis,Name kurz,Typ,IdLandkreis,Bundesland,latitude,longitude
0,LK Steinburg,Steinburg,Kreis,1061,Schleswig-Holstein,53.928945,9.519382
1,LK Stormarn,Stormarn,Kreis,1062,Schleswig-Holstein,53.720801,10.331640
2,SK Hamburg,Hamburg,Kreisfreie Stadt,2000,Hamburg,53.546377,10.020794
3,LK Pinneberg,Pinneberg,Kreis,1056,Schleswig-Holstein,53.718084,9.736867
4,LK Plön,Plön,Kreis,1057,Schleswig-Holstein,54.243389,10.363695
...,...,...,...,...,...,...,...
394,LK Jerichower Land,Jerichower Land,Landkreis,15086,Sachsen-Anhalt,52.260790,12.026519
395,LK Mansfeld-Südharz,Mansfeld-Südharz,Landkreis,15087,Sachsen-Anhalt,51.535690,11.356358
396,LK Saalekreis,Saalekreis,Landkreis,15088,Sachsen-Anhalt,51.425671,11.865474
397,LK Salzlandkreis,Salzlandkreis,Landkreis,15089,Sachsen-Anhalt,51.851859,11.642726


In [8]:
df_temp_stations

Unnamed: 0,station_id,start_date,end_date,altitude,latitude,longitude,name,state
0,3,1950-04-01,2011-03-31,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen
1,44,2007-04-01,2020-04-23,44,52.9336,8.2370,Großenkneten,Niedersachsen
2,52,1976-01-01,1988-01-01,46,53.6623,10.1990,Ahrensburg-Wulfsdorf,Schleswig-Holstein
3,71,2009-12-01,2020-01-02,759,48.2156,8.9784,Albstadt-Badkap,Baden-Württemberg
4,73,2007-04-01,2020-04-23,340,48.6159,13.0506,Aldersbach-Kriestorf,Bayern
...,...,...,...,...,...,...,...,...
653,14138,2009-09-15,2015-12-31,73,52.1655,14.1224,Falkenberg (Grenzschichtmessfeld),Brandenburg
654,15000,2011-04-01,2020-04-23,231,50.7983,6.0244,Aachen-Orsbach,Nordrhein-Westfalen
655,15207,2013-11-01,2020-04-23,317,51.2835,9.3590,Schauenburg-Elgershausen,Hessen
656,15444,2014-09-01,2020-04-23,593,48.4418,9.9216,Ulm-Mähringen,Baden-Württemberg


### Landkreise that are not covered by this dataset
The RKI dataset gives data for 412 Landkreise, however, the dataset from _opendatasoft_ provides geospatial coordinates only for 399 of them.

The Landkreise for which no geospatial data exists will be neglected in the following. If we have a look at them, we see that its mostly the districts of Berlin that are special, so we treat Berlin as a whole in the future:

In [9]:
# some of the Landkreise are not covered by BOTH datasets, so they will be omitted
pd.concat([df_lk, df_landkreise]).drop_duplicates(['IdLandkreis'], keep=False)

Unnamed: 0,Landkreis,Name kurz,Typ,IdLandkreis,Bundesland,latitude,longitude,IdBundesland
2988,LK Göttingen,,,3159,Niedersachsen,,,3.0
99504,SK Berlin Mitte,,,11001,Berlin,,,11.0
101184,SK Berlin Friedrichshain-Kreuzberg,,,11002,Berlin,,,11.0
101581,SK Berlin Pankow,,,11003,Berlin,,,11.0
102000,SK Berlin Neukölln,,,11008,Berlin,,,11.0
102329,SK Berlin Treptow-Köpenick,,,11009,Berlin,,,11.0
102592,SK Berlin Marzahn-Hellersdorf,,,11010,Berlin,,,11.0
102846,SK Berlin Lichtenberg,,,11011,Berlin,,,11.0
108049,SK Berlin Reinickendorf,,,11012,Berlin,,,11.0
109000,SK Berlin Steglitz-Zehlendorf,,,11006,Berlin,,,11.0


## Strategy

- iterate over all Landkreise
- assign every Landkreis the weather station that is closest to it

In [10]:
def assign_weather_station_to_landkreis(df_stations, df_lk):
    """Compares the center of each Landkreis with the location of each weather station
       and finds the one station that is closest to a particular Landkreis center.
    """
    closest_station_dict = {}
    for lk_idx, lk_row in df_lk.iterrows():
        idLandkreis = lk_row['IdLandkreis']
        for idx, row in df_stations.iterrows():    
            # calculate distance between station and landkreis center
            lk_lat = lk_row['latitude']
            lk_lon = lk_row['longitude']

            station_lat = row['latitude']
            station_lon = row['longitude']

            a = station_lat - lk_lat
            b = station_lon - lk_lon
            distance = math.sqrt(a*a + b*b)

            if idLandkreis not in closest_station_dict.keys():
                closest_station_dict[idLandkreis] = {'station_id': row['station_id'], 'distance': distance}
            else:
                # check if current station is closer to landkreis
                if distance < closest_station_dict[idLandkreis]['distance']:
                    closest_station_dict[idLandkreis] = {'station_id': row['station_id'], 'distance': distance}
    df = pd.DataFrame.from_dict(closest_station_dict, orient='index').reset_index()
    df.rename(columns={'index': "IdLandkreis"}, inplace=True)
    return df

In [11]:
# these variables contain the assiciation of Landkreis to weather station
temp_lk_stations = assign_weather_station_to_landkreis(df_temp_stations, df_lk)
prec_lk_stations = assign_weather_station_to_landkreis(df_prec_stations, df_lk)
sun_lk_stations = assign_weather_station_to_landkreis(df_sun_stations, df_lk)
sun_lk_stations

Unnamed: 0,IdLandkreis,station_id,distance
0,1061,2429,0.078823
1,1062,5058,0.046394
2,2000,1975,0.092774
3,1056,4039,0.141532
4,1057,6163,0.078876
...,...,...,...
394,15086,1052,0.144261
395,15087,5814,0.048906
396,15088,2878,0.037144
397,15089,445,0.074507


## Plot Landkreis-Weather Station Assignment

In [25]:
temp_stations = temp_lk_stations.merge(df_lk, on='IdLandkreis').merge(df_temp_stations, on='station_id').drop(columns=['Name kurz', 'Typ', 'Bundesland', 'start_date', 'end_date', 'state'])
temp_stations.rename(columns={'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude'}, inplace=True)
temp_stations

Unnamed: 0,IdLandkreis,station_id,distance,Landkreis,lk_latitude,lk_longitude,altitude,station_latitude,station_longitude,name
0,1061,2429,0.078823,LK Steinburg,53.928945,9.519382,21,53.9897,9.5696,Itzehoe
1,1062,52,0.144968,LK Stormarn,53.720801,10.331640,46,53.6623,10.1990,Ahrensburg-Wulfsdorf
2,2000,6254,0.068223,SK Hamburg,53.546377,10.020794,19,53.5408,9.9528,Hamburg-Lotsenhöft
3,1056,4039,0.141532,LK Pinneberg,53.718084,9.736867,11,53.7331,9.8776,Quickborn
4,1057,6163,0.078876,LK Plön,54.243389,10.363695,27,54.1654,10.3519,Dörnick
...,...,...,...,...,...,...,...,...,...,...
394,15086,1052,0.144261,LK Jerichower Land,52.260790,12.026519,80,52.2174,12.1641,Drewitz bei Burg
395,15087,198,0.173563,LK Mansfeld-Südharz,51.535690,11.356358,164,51.3745,11.2920,Artern
396,15088,2878,0.037144,LK Saalekreis,51.425671,11.865474,118,51.3910,11.8788,"Lauchstädt, Bad"
397,15089,445,0.074507,LK Salzlandkreis,51.851859,11.642726,84,51.8218,11.7109,Bernburg/Saale (Nord)


In [37]:
import plotly.express as px
import plotly.graph_objects as go
token = "pk.eyJ1IjoiZnlubndpIiwiYSI6ImNrODk3YmF6MzAzcDczbWs5NXdhaGpyNzYifQ.vHweJb-1hjDeE21tTs7tGQ"
#px.set_mapbox_access_token(token)
#fig = px.scatter_mapbox(temp_stations, lat='lk_latitude', lon='lk_longitude', color='IdLandkreis', size='distance', size_max=12)

fig = go.Figure()
fig.add_trace(go.Scattermapbox(lat=''))



#fig.show()

In [13]:
#df_temp.groupby(['station_id', pd.Grouper(key='date', freq='D')]).mean().reset_index().merge(temp_lk_stations, on='station_id').merge(df_lk, on="IdLandkreis")
temp = df_temp.groupby(['station_id', pd.Grouper(key='date', freq='D')]).mean().reset_index()
prec = df_prec.groupby(['station_id', pd.Grouper(key='date', freq='D')]).cumsum().reset_index()
sun = df_sun.groupby(['station_id', pd.Grouper(key='date', freq='D')]).cumsum().reset_index()

In [14]:
sun

Unnamed: 0,index,quality,SD_SO
0,7826,3,0.0
1,7827,6,0.0
2,7828,9,0.0
3,7829,12,0.0
4,7830,15,0.0
...,...,...,...
716503,10390,20,802.0
716504,10391,21,802.0
716505,10392,22,802.0
716506,10393,23,802.0


In [62]:
df_combined = df_lk.merge(df_lk_stations, on='IdLandkreis').merge(df_temp_stations, on='station_id')
df_combined

Unnamed: 0,Landkreis,Name kurz,Typ,IdLandkreis,Bundesland,latitude_x,longitude_x,station_id,distance,start_date,end_date,altitude,latitude_y,longitude_y,name,state
0,LK Steinburg,Steinburg,Kreis,1061,Schleswig-Holstein,53.928945,9.519382,2429,0.078823,2002-01-01,2020-04-23,21,53.9897,9.5696,Itzehoe,Schleswig-Holstein
1,LK Stormarn,Stormarn,Kreis,1062,Schleswig-Holstein,53.720801,10.331640,52,0.144968,1976-01-01,1988-01-01,46,53.6623,10.1990,Ahrensburg-Wulfsdorf,Schleswig-Holstein
2,SK Hamburg,Hamburg,Kreisfreie Stadt,2000,Hamburg,53.546377,10.020794,6254,0.068223,2002-01-01,2007-07-02,19,53.5408,9.9528,Hamburg-Lotsenhöft,Hamburg
3,LK Pinneberg,Pinneberg,Kreis,1056,Schleswig-Holstein,53.718084,9.736867,4039,0.141532,1988-01-11,2020-04-23,11,53.7331,9.8776,Quickborn,Schleswig-Holstein
4,LK Plön,Plön,Kreis,1057,Schleswig-Holstein,54.243389,10.363695,6163,0.078876,2001-04-01,2020-04-23,27,54.1654,10.3519,Dörnick,Schleswig-Holstein
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,LK Jerichower Land,Jerichower Land,Landkreis,15086,Sachsen-Anhalt,52.260790,12.026519,1052,0.144261,2004-07-01,2020-04-23,80,52.2174,12.1641,Drewitz bei Burg,Sachsen-Anhalt
395,LK Mansfeld-Südharz,Mansfeld-Südharz,Landkreis,15087,Sachsen-Anhalt,51.535690,11.356358,198,0.173563,1961-01-01,2020-04-23,164,51.3745,11.2920,Artern,Thüringen
396,LK Saalekreis,Saalekreis,Landkreis,15088,Sachsen-Anhalt,51.425671,11.865474,2878,0.037144,2017-08-01,2020-04-23,118,51.3910,11.8788,"Lauchstädt, Bad",Sachsen-Anhalt
397,LK Salzlandkreis,Salzlandkreis,Landkreis,15089,Sachsen-Anhalt,51.851859,11.642726,445,0.074507,2005-07-01,2020-04-23,84,51.8218,11.7109,Bernburg/Saale (Nord),Sachsen-Anhalt
