# Data Transformation

I have:
- data frame containing Landkreis IDs
- data frame containing Landkreis Locations
- data frame containing Weather Data

Goal:
- assign every Landkreis ONE weather station
- combine the three data frames into one

In [1]:
from pathlib import Path
import pandas as pd
import math
from tqdm import tqdm

In [2]:
path_base = Path.cwd()

# export path
path_export = Path.joinpath(path_base, "exports")
path_export.mkdir(parents=True, exist_ok=True)

In [3]:
# import the data from Notebook 01
df_temp = pd.read_pickle(Path.joinpath(path_export, "temp.pkl"))
df_temp_stations = pd.read_pickle(Path.joinpath(path_export, "temp_stations.pkl"))

df_prec = pd.read_pickle(Path.joinpath(path_export, "prec.pkl"))
df_prec_stations = pd.read_pickle(Path.joinpath(path_export, "prec_stations.pkl"))

df_sun = pd.read_pickle(Path.joinpath(path_export, "sun.pkl"))
df_sun_stations = pd.read_pickle(Path.joinpath(path_export, "sun_stations.pkl"))

df_wind = pd.read_pickle(Path.joinpath(path_export, "wind.pkl"))
df_wind_stations = pd.read_pickle(Path.joinpath(path_export, "wind_stations.pkl"))

## Integrate Geodata into Landkreise Frame

In [4]:
# load RKI Covid-19 data in order to build a Landkreis-ID lookup table
df_rki = pd.read_csv("https://www.arcgis.com/sharing/rest/content/items/f10774f1c63e40168479a1feb6c7ca74/data")
df_landkreise = df_rki.drop_duplicates('Landkreis')[['Landkreis', 'IdLandkreis', 'Bundesland', 'IdBundesland']]
df_landkreise

Unnamed: 0,Landkreis,IdLandkreis,Bundesland,IdBundesland
0,SK Flensburg,1001,Schleswig-Holstein,1
34,SK Kiel,1002,Schleswig-Holstein,1
297,SK Lübeck,1003,Schleswig-Holstein,1
449,SK Neumünster,1004,Schleswig-Holstein,1
524,LK Dithmarschen,1051,Schleswig-Holstein,1
...,...,...,...,...
137473,LK Saalfeld-Rudolstadt,16073,Thüringen,16
137539,LK Saale-Holzland-Kreis,16074,Thüringen,16
137599,LK Saale-Orla-Kreis,16075,Thüringen,16
137725,LK Greiz,16076,Thüringen,16


In [5]:
# load geographical data of the Landkreise in Germany
df_districts_geo = pd.read_csv("https://public.opendatasoft.com/explore/dataset/landkreise-in-germany/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B", ";")
df_districts_geo
# Our districtId is in column "Cca 2"

Unnamed: 0,Geo Point,Geo Shape,Id 0,ISO,Name 0,Id 1,Name 1,Id 2,Name 2,Hasc 2,Ccn 2,Cca 2,Type 2,Engtype 2,Nl Name 2,Varname 2
0,"47.9925229956,7.81807596197","{""type"": ""Polygon"", ""coordinates"": [[[7.790447...",86,DEU,Germany,1,Baden-Württemberg,12,Freiburg im Breisgau,DE.BW.FB,0,8311.0,Stadtkreis,District,,
1,"48.5964037974,10.527764168","{""type"": ""Polygon"", ""coordinates"": [[[10.61448...",86,DEU,Germany,2,Bayern,68,Dillingen an der Donau,DE.BY.DD,0,9773.0,Landkreis,District,,
2,"49.4362114486,11.0827553426","{""type"": ""MultiPolygon"", ""coordinates"": [[[[11...",86,DEU,Germany,2,Bayern,107,Nürnberg,DE.BY.NR,0,9564.0,Kreisfreie Stadt,District,,
3,"49.2159614099,11.5665579197","{""type"": ""Polygon"", ""coordinates"": [[[11.46063...",86,DEU,Germany,2,Bayern,110,Neumarkt in der Oberpfalz,DE.BY.NO,0,9373.0,Landkreis,District,,
4,"47.8443777181,12.1087247511","{""type"": ""Polygon"", ""coordinates"": [[[12.05431...",86,DEU,Germany,2,Bayern,122,Rosenheim,DE.BY.RH,0,9163.0,Kreisfreie Stadt,District,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,"52.6800481605,11.2270452746","{""type"": ""Polygon"", ""coordinates"": [[[11.51008...",86,DEU,Germany,13,Sachsen-Anhalt,339,Altmarkkreis Salzwedel,DE.ST.AS,0,15081.0,Landkreis,District,,
399,"51.7954450255,12.1432020671","{""type"": ""MultiPolygon"", ""coordinates"": [[[[11...",86,DEU,Germany,13,Sachsen-Anhalt,340,Anhalt-Bitterfeld,DE.ST.AB,0,15082.0,Landkreis,District,,
400,"52.2207122989,11.3478384337","{""type"": ""Polygon"", ""coordinates"": [[[11.06190...",86,DEU,Germany,13,Sachsen-Anhalt,341,Börde,DE.ST.BR,0,15083.0,Landkreis,District,,
401,"50.9558166403,13.1375068135","{""type"": ""Polygon"", ""coordinates"": [[[13.18351...",86,DEU,Germany,14,Sachsen,361,Mittelsachsen,DE.SN.MT,0,14522.0,Landkreis,District,,


In [6]:
df_lk = pd.merge(df_landkreise, df_districts_geo, left_on="IdLandkreis", right_on="Cca 2")[['Landkreis', 'Name 2', 'Type 2','IdLandkreis', 'Bundesland', 'Geo Point']]
df_lk = df_lk.rename(columns={'Name 2': 'Name kurz', 'Type 2': 'Typ'})
df_lk

Unnamed: 0,Landkreis,Name kurz,Typ,IdLandkreis,Bundesland,Geo Point
0,SK Flensburg,Flensburg,Kreisfreie Stadt,1001,Schleswig-Holstein,"54.7849933768,9.43852835486"
1,SK Kiel,Kiel,Kreisfreie Stadt,1002,Schleswig-Holstein,"54.3248406926,10.1322443646"
2,SK Lübeck,Lübeck,Kreisfreie Stadt,1003,Schleswig-Holstein,"53.8723167338,10.7272831058"
3,SK Neumünster,Neumünster,Kreisfreie Stadt,1004,Schleswig-Holstein,"54.0811244365,9.98448195474"
4,LK Dithmarschen,Dithmarschen,Kreis,1051,Schleswig-Holstein,"54.1329109614,9.10781447873"
...,...,...,...,...,...,...
394,LK Saalfeld-Rudolstadt,Saalfeld-Rudolstadt,Landkreis,16073,Thüringen,"50.637797959,11.3091162493"
395,LK Saale-Holzland-Kreis,Saale-Holzland-Kreis,Landkreis,16074,Thüringen,"50.904172137,11.7315307817"
396,LK Saale-Orla-Kreis,Saale-Orla-Kreis,Landkreis,16075,Thüringen,"50.5808480206,11.7105737336"
397,LK Greiz,Greiz,Landkreis,16076,Thüringen,"50.7484595538,12.0740705739"


In [7]:
# split up column "Geo Point" into two seperate numerical columns
df_lk['latitude'], df_lk['longitude'] = df_lk['Geo Point'].str.split(',', 1).str
df_lk[['latitude', 'longitude']] = df_lk[['latitude', 'longitude']].apply(pd.to_numeric)
df_lk.drop(columns=['Geo Point'], inplace=True)
df_lk

  


Unnamed: 0,Landkreis,Name kurz,Typ,IdLandkreis,Bundesland,latitude,longitude
0,SK Flensburg,Flensburg,Kreisfreie Stadt,1001,Schleswig-Holstein,54.784993,9.438528
1,SK Kiel,Kiel,Kreisfreie Stadt,1002,Schleswig-Holstein,54.324841,10.132244
2,SK Lübeck,Lübeck,Kreisfreie Stadt,1003,Schleswig-Holstein,53.872317,10.727283
3,SK Neumünster,Neumünster,Kreisfreie Stadt,1004,Schleswig-Holstein,54.081124,9.984482
4,LK Dithmarschen,Dithmarschen,Kreis,1051,Schleswig-Holstein,54.132911,9.107814
...,...,...,...,...,...,...,...
394,LK Saalfeld-Rudolstadt,Saalfeld-Rudolstadt,Landkreis,16073,Thüringen,50.637798,11.309116
395,LK Saale-Holzland-Kreis,Saale-Holzland-Kreis,Landkreis,16074,Thüringen,50.904172,11.731531
396,LK Saale-Orla-Kreis,Saale-Orla-Kreis,Landkreis,16075,Thüringen,50.580848,11.710574
397,LK Greiz,Greiz,Landkreis,16076,Thüringen,50.748460,12.074071


In [8]:
df_wind_stations

Unnamed: 0,station_id,start_date,end_date,altitude,latitude,longitude,name,state
0,3,1937-01-01,2011-03-31,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen
1,11,1980-09-01,2020-05-17,680,47.9737,8.5205,Donaueschingen (Landeplatz),Baden-Württemberg
2,44,1969-01-01,1995-11-30,44,52.9336,8.2370,Großenkneten,Niedersachsen
3,52,1969-01-01,2001-12-31,46,53.6623,10.1990,Ahrensburg-Wulfsdorf,Schleswig-Holstein
4,72,1978-09-01,1995-05-31,794,48.2766,9.0001,Albstadt-Onstmettingen,Baden-Württemberg
...,...,...,...,...,...,...,...,...
514,15444,2014-09-01,2020-05-17,593,48.4418,9.9216,Ulm-Mähringen,Baden-Württemberg
515,15520,2015-09-01,2020-05-17,656,47.8190,12.2956,Frasdorf-Greimelberg,Bayern
516,15547,2016-10-01,2020-05-17,215,49.1259,9.1428,Heilbronn/Neckar,Baden-Württemberg
517,15976,2020-04-01,2020-05-17,216,49.9529,8.2107,Ober-Olm/Bellem,Rheinland-Pfalz


### Landkreise that are not covered by this dataset
The RKI dataset gives data for 412 Landkreise, however, the dataset from _opendatasoft_ provides geospatial coordinates only for 399 of them.

The Landkreise for which no geospatial data exists will be neglected in the following. If we have a look at them, we see that its mostly the districts of Berlin that are special, so we treat Berlin as a whole in the future:

In [9]:
# some of the Landkreise are not covered by BOTH datasets, so they will be omitted
pd.concat([df_lk, df_landkreise]).drop_duplicates(['IdLandkreis'], keep=False)

Unnamed: 0,Landkreis,Name kurz,Typ,IdLandkreis,Bundesland,latitude,longitude,IdBundesland
6877,LK Göttingen,,,3159,Niedersachsen,,,3.0
120750,SK Berlin Mitte,,,11001,Berlin,,,11.0
121534,SK Berlin Friedrichshain-Kreuzberg,,,11002,Berlin,,,11.0
121979,SK Berlin Pankow,,,11003,Berlin,,,11.0
122573,SK Berlin Charlottenburg-Wilmersdorf,,,11004,Berlin,,,11.0
123221,SK Berlin Spandau,,,11005,Berlin,,,11.0
123485,SK Berlin Steglitz-Zehlendorf,,,11006,Berlin,,,11.0
123972,SK Berlin Tempelhof-Schöneberg,,,11007,Berlin,,,11.0
124571,SK Berlin Neukölln,,,11008,Berlin,,,11.0
125220,SK Berlin Treptow-Köpenick,,,11009,Berlin,,,11.0


### Add Göttingen
As _Geo Point_ I take the coordinates of the city of Göttingen.

In [10]:
df_lk = pd.concat([df_lk, df_landkreise[df_landkreise['IdLandkreis'] == 3159]])
df_lk.set_index('IdLandkreis', inplace=True)
df_lk

Unnamed: 0_level_0,Landkreis,Name kurz,Typ,Bundesland,latitude,longitude,IdBundesland
IdLandkreis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1001,SK Flensburg,Flensburg,Kreisfreie Stadt,Schleswig-Holstein,54.784993,9.438528,
1002,SK Kiel,Kiel,Kreisfreie Stadt,Schleswig-Holstein,54.324841,10.132244,
1003,SK Lübeck,Lübeck,Kreisfreie Stadt,Schleswig-Holstein,53.872317,10.727283,
1004,SK Neumünster,Neumünster,Kreisfreie Stadt,Schleswig-Holstein,54.081124,9.984482,
1051,LK Dithmarschen,Dithmarschen,Kreis,Schleswig-Holstein,54.132911,9.107814,
...,...,...,...,...,...,...,...
16074,LK Saale-Holzland-Kreis,Saale-Holzland-Kreis,Landkreis,Thüringen,50.904172,11.731531,
16075,LK Saale-Orla-Kreis,Saale-Orla-Kreis,Landkreis,Thüringen,50.580848,11.710574,
16076,LK Greiz,Greiz,Landkreis,Thüringen,50.748460,12.074071,
16077,LK Altenburger Land,Altenburger Land,Landkreis,Thüringen,50.956425,12.399131,


In [11]:
df_lk.loc[3159, 'Name kurz'] = "Göttingen"
df_lk.loc[3159, 'Typ'] = "Landkreis"
df_lk.loc[3159, 'latitude'] = 51.540120
df_lk.loc[3159, 'longitude'] = 9.930627

df_lk.loc[3159]

Landkreis        LK Göttingen
Name kurz           Göttingen
Typ                 Landkreis
Bundesland      Niedersachsen
latitude              51.5401
longitude             9.93063
IdBundesland                3
Name: 3159, dtype: object

In [12]:
df_lk.reset_index(inplace=True)
df_lk

Unnamed: 0,IdLandkreis,Landkreis,Name kurz,Typ,Bundesland,latitude,longitude,IdBundesland
0,1001,SK Flensburg,Flensburg,Kreisfreie Stadt,Schleswig-Holstein,54.784993,9.438528,
1,1002,SK Kiel,Kiel,Kreisfreie Stadt,Schleswig-Holstein,54.324841,10.132244,
2,1003,SK Lübeck,Lübeck,Kreisfreie Stadt,Schleswig-Holstein,53.872317,10.727283,
3,1004,SK Neumünster,Neumünster,Kreisfreie Stadt,Schleswig-Holstein,54.081124,9.984482,
4,1051,LK Dithmarschen,Dithmarschen,Kreis,Schleswig-Holstein,54.132911,9.107814,
...,...,...,...,...,...,...,...,...
395,16074,LK Saale-Holzland-Kreis,Saale-Holzland-Kreis,Landkreis,Thüringen,50.904172,11.731531,
396,16075,LK Saale-Orla-Kreis,Saale-Orla-Kreis,Landkreis,Thüringen,50.580848,11.710574,
397,16076,LK Greiz,Greiz,Landkreis,Thüringen,50.748460,12.074071,
398,16077,LK Altenburger Land,Altenburger Land,Landkreis,Thüringen,50.956425,12.399131,


## Matching – Old Approach
- iterate over all Landkreise
- assign every Landkreis the weather station that is closest to it

A new approach can be found in notebook _04 - Advanced Station Assignment_

In [13]:
def assign_weather_station_to_landkreis(df_stations, df_lk, df_weather):
    """Compares the center of each Landkreis with the location of each weather station
       and finds the ONE station that is closest to a particular Landkreis center.
    """
    
    # filter out stations that don't provide data in df_weather
    not_allowed = pd.concat([df_stations, df_weather]).drop_duplicates('station_id', keep=False)
    df_stations = df_stations[~df_stations['station_id'].isin(not_allowed['station_id'])]
    
    closest_station_dict = {}
    for lk_idx, lk_row in tqdm(df_lk.iterrows(), total=df_lk.shape[0]):
        idLandkreis = lk_row['IdLandkreis']
        for idx, row in df_stations.iterrows():
            
            # calculate distance between station and landkreis center
            lk_lat = lk_row['latitude']
            lk_lon = lk_row['longitude']

            station_lat = row['latitude']
            station_lon = row['longitude']

            a = station_lat - lk_lat
            b = station_lon - lk_lon
            distance = math.sqrt(a*a + b*b)

            if idLandkreis not in closest_station_dict.keys():
                closest_station_dict[idLandkreis] = {'station_id': row['station_id'], 'distance': distance}
            else:
                # check if current station is closer to landkreis
                if distance < closest_station_dict[idLandkreis]['distance']:
                    closest_station_dict[idLandkreis] = {'station_id': row['station_id'], 'distance': distance}
    df = pd.DataFrame.from_dict(closest_station_dict, orient='index').reset_index()
    df.rename(columns={'index': "IdLandkreis"}, inplace=True)
    return df

In [14]:
# these variables contain the assiciation of Landkreis to weather station
temp_lk_stations = assign_weather_station_to_landkreis(df_temp_stations, df_lk, df_temp)
prec_lk_stations = assign_weather_station_to_landkreis(df_prec_stations, df_lk, df_prec)
sun_lk_stations = assign_weather_station_to_landkreis(df_sun_stations, df_lk, df_sun)
wind_lk_stations = assign_weather_station_to_landkreis(df_wind_stations, df_lk, df_wind)
temp_lk_stations

100%|██████████| 400/400 [00:57<00:00,  7.01it/s]
100%|██████████| 400/400 [02:29<00:00,  2.68it/s]
100%|██████████| 400/400 [00:39<00:00, 10.16it/s]
100%|██████████| 400/400 [00:21<00:00, 18.75it/s]


Unnamed: 0,IdLandkreis,station_id,distance
0,1001,1666,0.079469
1,1002,2564,0.053728
2,1003,3086,0.075366
3,1004,7427,0.085809
4,1051,1200,0.116370
...,...,...,...
395,16074,2444,0.149998
396,16075,4464,0.094418
397,16076,7419,0.087473
398,16077,4997,0.060851


In [15]:
# merge data
temp_stations = temp_lk_stations.merge(df_lk, on='IdLandkreis').merge(df_temp_stations, on='station_id').drop(columns=['Name kurz', 'Typ', 'Bundesland', 'start_date', 'end_date', 'state'])
temp_stations.rename(columns={'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude'}, inplace=True)

prec_stations = prec_lk_stations.merge(df_lk, on='IdLandkreis').merge(df_prec_stations, on='station_id').drop(columns=['Name kurz', 'Typ', 'Bundesland', 'start_date', 'end_date', 'state'])
prec_stations.rename(columns={'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude'}, inplace=True)

sun_stations = sun_lk_stations.merge(df_lk, on='IdLandkreis').merge(df_sun_stations, on='station_id').drop(columns=['Name kurz', 'Typ', 'Bundesland', 'start_date', 'end_date', 'state'])
sun_stations.rename(columns={'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude'}, inplace=True)

wind_stations = wind_lk_stations.merge(df_lk, on='IdLandkreis').merge(df_wind_stations, on='station_id').drop(columns=['Name kurz', 'Typ', 'Bundesland', 'start_date', 'end_date', 'state'])
wind_stations.rename(columns={'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude'}, inplace=True)


prec_stations

Unnamed: 0,IdLandkreis,station_id,distance,Landkreis,lk_latitude,lk_longitude,IdBundesland,altitude,station_latitude,station_longitude,name
0,1001,1130,0.173220,SK Flensburg,54.784993,9.438528,,17,54.6282,9.3649,Eggebek
1,1002,2564,0.053728,SK Kiel,54.324841,10.132244,,28,54.3776,10.1424,Kiel-Holtenau
2,1003,4602,0.072251,SK Lübeck,53.872317,10.727283,,26,53.9385,10.6983,"Schwartau,Bad -Groß Parin"
3,1004,7427,0.085809,SK Neumünster,54.081124,9.984482,,17,54.0188,9.9255,Padenstedt (Pony-Park)
4,1051,1200,0.116370,LK Dithmarschen,54.132911,9.107814,,3,54.0691,9.0105,Elpersbüttel
...,...,...,...,...,...,...,...,...,...,...,...
395,16074,550,0.066969,LK Saale-Holzland-Kreis,50.904172,11.731531,,344,50.9041,11.7985,Bobeck
396,16075,2992,0.073163,LK Saale-Orla-Kreis,50.580848,11.710574,,495,50.5276,11.6604,Remptendorf
397,16076,7419,0.087473,LK Greiz,50.748460,12.074071,,389,50.6610,12.0756,Langenwetzendorf-Göttendorf
398,16077,4997,0.060851,LK Altenburger Land,50.956425,12.399131,,196,50.9771,12.3419,Starkenberg-Tegkwitz


## Build Final Dataframe
Eventually, we want to have weather parameters for each day for each Landkreis. So far, the measurements are on an hourly resolution. I take the daily mean of the temperatures, and the sum of the precipitatino and sunshine hour data per day. 

Finally, all data is merged into a single dataframe that holds the temperature, precipitation and sunshine measurements for a particular day in a given Landkreis in one row.

In [26]:
temp = df_temp.groupby(['station_id', pd.Grouper(key='date', freq='D')]).mean().reset_index()
prec = df_prec.groupby(['station_id', pd.Grouper(key='date', freq='D')]).sum().reset_index()
sun = df_sun.groupby(['station_id', pd.Grouper(key='date', freq='D')]).sum().reset_index()
wind = df_wind.groupby(['station_id', pd.Grouper(key='date', freq='D')]).mean().reset_index()

In [37]:
prec.sort_values('R1_IND', ascending=False)

Unnamed: 0,station_id,date,quality,R1,R1_IND,WRTR
39545,2171,2020-03-11,24,19.5,24.0,-7896
66081,3513,2020-02-10,72,33.0,24.0,-7876
9625,460,2020-03-05,72,26.8,24.0,-7896
32157,1684,2020-02-23,72,22.1,24.0,-7896
69787,3722,2020-02-03,72,43.9,24.0,-23976
...,...,...,...,...,...,...
58290,3175,2020-04-01,24,0.0,0.0,-23976
58288,3175,2020-03-30,24,0.0,0.0,-23976
58286,3175,2020-03-28,24,0.0,0.0,-23976
58285,3175,2020-03-27,24,0.0,0.0,-23976


In [28]:
temp_lk_stations

Unnamed: 0,IdLandkreis,station_id,distance
0,1001,1666,0.079469
1,1002,2564,0.053728
2,1003,3086,0.075366
3,1004,7427,0.085809
4,1051,1200,0.116370
...,...,...,...
395,16074,2444,0.149998
396,16075,4464,0.094418
397,16076,7419,0.087473
398,16077,4997,0.060851


In [29]:
temp_final = temp.merge(temp_lk_stations, on="station_id").merge(df_lk, on="IdLandkreis").merge(df_temp_stations, on="station_id")
temp_final.drop(columns=['quality', 'station_id', 'Name kurz', 'start_date', 'end_date', 'altitude', 'state', 'Typ', 'Bundesland'], inplace=True)
temp_final.rename(columns={'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude', 'name': 'station_name'}, inplace=True)
temp_final = temp_final[['date', 'temperature', 'humidity', 'IdLandkreis', 'Landkreis', 'lk_latitude', 'lk_longitude', 'station_name', 'station_latitude', 'station_longitude']]
temp_final

Unnamed: 0,date,temperature,humidity,IdLandkreis,Landkreis,lk_latitude,lk_longitude,station_name,station_latitude,station_longitude
0,2020-01-01,-1.045833,98.625000,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
1,2020-01-02,-0.045833,97.458333,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
2,2020-01-03,6.495833,92.666667,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
3,2020-01-04,4.762500,88.916667,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
4,2020-01-05,4.162500,92.625000,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
...,...,...,...,...,...,...,...,...,...,...
55109,2020-05-13,5.670833,93.791667,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848
55110,2020-05-14,6.083333,95.750000,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848
55111,2020-05-15,6.262500,92.291667,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848
55112,2020-05-16,9.383333,73.458333,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848


In [30]:
prec_final = prec.merge(prec_lk_stations, on="station_id").merge(df_lk, on="IdLandkreis").merge(df_prec_stations, on="station_id")
prec_final.drop(columns=['R1_IND', 'quality', 'station_id', 'WRTR', 'Name kurz', 'start_date', 'end_date', 'altitude', 'state', 'Typ', 'Bundesland'], inplace=True)
prec_final.rename(columns={'R1': 'precipitation', 'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude', 'name': 'station_name'}, inplace=True)
prec_final = prec_final[['date', 'precipitation', 'IdLandkreis', 'Landkreis', 'lk_latitude', 'lk_longitude', 'station_name', 'station_latitude', 'station_longitude']]
prec_final

Unnamed: 0,date,precipitation,IdLandkreis,Landkreis,lk_latitude,lk_longitude,station_name,station_latitude,station_longitude
0,2020-01-01,0.0,6535,LK Vogelsbergkreis,50.638119,9.271380,Alsfeld-Eifa,50.7446,9.3450
1,2020-01-02,0.0,6535,LK Vogelsbergkreis,50.638119,9.271380,Alsfeld-Eifa,50.7446,9.3450
2,2020-01-03,0.7,6535,LK Vogelsbergkreis,50.638119,9.271380,Alsfeld-Eifa,50.7446,9.3450
3,2020-01-04,2.7,6535,LK Vogelsbergkreis,50.638119,9.271380,Alsfeld-Eifa,50.7446,9.3450
4,2020-01-05,0.4,6535,LK Vogelsbergkreis,50.638119,9.271380,Alsfeld-Eifa,50.7446,9.3450
...,...,...,...,...,...,...,...,...,...
54933,2020-05-13,2.3,9777,LK Ostallgäu,47.769959,10.639732,Kaufbeuren-Oberbeuren,47.8761,10.5848
54934,2020-05-14,4.7,9777,LK Ostallgäu,47.769959,10.639732,Kaufbeuren-Oberbeuren,47.8761,10.5848
54935,2020-05-15,5.9,9777,LK Ostallgäu,47.769959,10.639732,Kaufbeuren-Oberbeuren,47.8761,10.5848
54936,2020-05-16,0.0,9777,LK Ostallgäu,47.769959,10.639732,Kaufbeuren-Oberbeuren,47.8761,10.5848


In [31]:
sun_final = sun.merge(sun_lk_stations, on="station_id").merge(df_lk, on="IdLandkreis").merge(df_sun_stations, on="station_id")
sun_final.drop(columns=['quality', 'station_id', 'Name kurz', 'start_date', 'end_date', 'altitude', 'state', 'Typ', 'Bundesland'], inplace=True)
sun_final.rename(columns={'SD_SO': 'sunshine', 'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude', 'name': 'station_name'}, inplace=True)
sun_final = sun_final[['date', 'sunshine', 'IdLandkreis', 'Landkreis', 'lk_latitude', 'lk_longitude', 'station_name', 'station_latitude', 'station_longitude']]
sun_final

Unnamed: 0,date,sunshine,IdLandkreis,Landkreis,lk_latitude,lk_longitude,station_name,station_latitude,station_longitude
0,2020-01-01,0.0,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
1,2020-01-02,0.0,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
2,2020-01-03,0.0,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
3,2020-01-04,0.0,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
4,2020-01-05,0.0,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.2370
...,...,...,...,...,...,...,...,...,...
54079,2020-05-13,3.0,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848
54080,2020-05-14,0.0,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848
54081,2020-05-15,0.0,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848
54082,2020-05-16,712.0,9778,LK Unterallgäu,48.039813,10.389233,Kaufbeuren-Oberbeuren,47.8761,10.5848


In [32]:
wind_final = wind.merge(wind_lk_stations, on="station_id").merge(df_lk, on="IdLandkreis").merge(df_wind_stations, on="station_id")
wind_final.drop(columns=['quality', 'station_id', 'Name kurz', 'start_date', 'end_date', 'altitude', 'state', 'Typ', 'Bundesland'], inplace=True)
wind_final.rename(columns={'latitude_x': 'lk_latitude', 'longitude_x': 'lk_longitude', 'latitude_y': 'station_latitude', 'longitude_y': 'station_longitude', 'name': 'station_name'}, inplace=True)
wind_final = wind_final[['date', 'velocity','direction', 'IdLandkreis', 'Landkreis', 'lk_latitude', 'lk_longitude', 'station_name', 'station_latitude', 'station_longitude']]
wind_final

Unnamed: 0,date,velocity,direction,IdLandkreis,Landkreis,lk_latitude,lk_longitude,station_name,station_latitude,station_longitude
0,2020-01-01,1.383333,189.583333,8326,LK Schwarzwald-Baar-Kreis,48.019120,8.410964,Donaueschingen (Landeplatz),47.9737,8.5205
1,2020-01-02,2.762500,172.083333,8326,LK Schwarzwald-Baar-Kreis,48.019120,8.410964,Donaueschingen (Landeplatz),47.9737,8.5205
2,2020-01-03,3.091667,172.500000,8326,LK Schwarzwald-Baar-Kreis,48.019120,8.410964,Donaueschingen (Landeplatz),47.9737,8.5205
3,2020-01-04,3.008333,262.500000,8326,LK Schwarzwald-Baar-Kreis,48.019120,8.410964,Donaueschingen (Landeplatz),47.9737,8.5205
4,2020-01-05,1.537500,233.333333,8326,LK Schwarzwald-Baar-Kreis,48.019120,8.410964,Donaueschingen (Landeplatz),47.9737,8.5205
...,...,...,...,...,...,...,...,...,...,...
53824,2020-05-13,2.666667,81.666667,7340,LK Südwestpfalz,49.208275,7.658867,Sembach,49.5044,7.8618
53825,2020-05-14,4.433333,90.416667,7340,LK Südwestpfalz,49.208275,7.658867,Sembach,49.5044,7.8618
53826,2020-05-15,3.520833,92.500000,7340,LK Südwestpfalz,49.208275,7.658867,Sembach,49.5044,7.8618
53827,2020-05-16,1.862500,129.583333,7340,LK Südwestpfalz,49.208275,7.658867,Sembach,49.5044,7.8618


In [33]:
prec_final.groupby('IdLandkreis').count()

Unnamed: 0_level_0,date,precipitation,Landkreis,lk_latitude,lk_longitude,station_name,station_latitude,station_longitude
IdLandkreis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1001,138,138,138,138,138,138,138,138
1002,138,138,138,138,138,138,138,138
1003,138,138,138,138,138,138,138,138
1004,138,138,138,138,138,138,138,138
1051,138,138,138,138,138,138,138,138
...,...,...,...,...,...,...,...,...
16073,138,138,138,138,138,138,138,138
16074,138,138,138,138,138,138,138,138
16075,138,138,138,138,138,138,138,138
16076,138,138,138,138,138,138,138,138


## Export

In [34]:
temp_final.to_pickle(Path.joinpath(path_export, "02_temp_final.pkl"))
prec_final.to_pickle(Path.joinpath(path_export, "02_prec_final.pkl"))
sun_final.to_pickle(Path.joinpath(path_export, "02_sun_final.pkl"))
wind_final.to_pickle(Path.joinpath(path_export, "02_wind_final.pkl"))

df_lk.to_pickle(Path.joinpath(path_export, "02_landkreise.pkl"))

temp_stations.to_pickle(Path.joinpath(path_export, "02_temp_stations_assigned.pkl"))
prec_stations.to_pickle(Path.joinpath(path_export, "02_prec_stations_assigned.pkl"))
sun_stations.to_pickle(Path.joinpath(path_export, "02_sun_stations_assigned.pkl"))
wind_stations.to_pickle(Path.joinpath(path_export, "02_wind_stations_assigned.pkl"))

In [25]:
temp_final[temp_final['IdLandkreis'] == 3403].head(20)

Unnamed: 0,date,temperature,humidity,IdLandkreis,Landkreis,lk_latitude,lk_longitude,station_name,station_latitude,station_longitude
0,2020-01-01,-1.045833,98.625,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
1,2020-01-02,-0.045833,97.458333,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
2,2020-01-03,6.495833,92.666667,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
3,2020-01-04,4.7625,88.916667,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
4,2020-01-05,4.1625,92.625,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
5,2020-01-06,4.55,86.791667,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
6,2020-01-07,6.2625,87.583333,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
7,2020-01-08,9.316667,91.583333,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
8,2020-01-09,10.7875,94.083333,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
9,2020-01-10,7.333333,89.583333,3403,SK Oldenburg,53.144578,8.224359,Großenkneten,52.9336,8.237
