# Inspection of data from the Deutscher Wetter Dienst (german weather service)

## Collecting data on stations

In [1]:
from six.moves import urllib

In [2]:
dwd_station_url = "https://www.dwd.de/DE/leistungen/klimadatendeutschland/statliste/statlex_html.html?view=nasPublication&nn=16102"

res = urllib.request.urlopen(dwd_station_url)
print("HTML code:", res.code)

HTML code: 200


In [3]:
file_path = "../data/dwd_stations.html"

print("Writing content to", file_path)
with open(file_path, "w") as f:
    content = res.read().decode("utf-8")
    
    f.write(content)

Writing content to ../data/dwd_stations.html


**Parsing html file**

In [29]:
from bs4 import BeautifulSoup
import pandas as pd

In [117]:
file_path = "../data/dwd_stations.html"

print("Reading", file_path)
with open(file_path, "r") as f:
    content = f.readlines()
    
soup = BeautifulSoup("".join(content))

Reading ../data/dwd_stations.html


In [118]:
table = soup.find_all("tr")

columns = [e.text for e in table[1] if e.name == "th"]
station_data = pd.DataFrame(columns=columns)

for i,row in enumerate(table[2:]):
    
    if i % 1000 == 0: 
        print("Processed {:.2f} %".format(i/(len(table)-2)*100))
        
    values = [e.text for e in row.find_all("td")]
    
    station_data.loc[i] = values
    
station_data.head()

Processed 0.00 %
Processed 5.31 %
Processed 10.62 %
Processed 15.94 %
Processed 21.25 %
Processed 26.56 %
Processed 31.87 %
Processed 37.19 %
Processed 42.50 %
Processed 47.81 %
Processed 53.12 %
Processed 58.44 %
Processed 63.75 %
Processed 69.06 %
Processed 74.37 %
Processed 79.69 %
Processed 85.00 %
Processed 90.31 %
Processed 95.62 %


Unnamed: 0,Stationsname,Stations_ID,Kennung,Stations-kennung,Breite,Länge,Stations-höhe,Flussgebiet,Bundesland,Beginn,Ende
0,Aach,1,KL,2783,47.841,8.849,478,,BW,01.01.1937,30.06.1986
1,Aach,1,RR,70191,47.841,8.849,478,,BW,01.01.1912,30.06.1986
2,Aach/Hegau,10771,PE,10771,47.85,8.85,480,,BW,,
3,Aachen,3,EB,2205,50.7827,6.0941,202,803100.0,NW,01.01.1951,31.03.2011
4,Aachen,3,FF,2205,50.7827,6.0941,202,803100.0,NW,01.01.1937,31.03.2011


**Doing some tidying up**

In [114]:
import re

pattern = r'^[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]' # string begins with a special character
station_data.replace(to_replace={"Beginn": pattern, "Ende": pattern, "Flussgebiet": pattern}, 
                     regex=True, value='', inplace=True)

In [None]:
schema = {
    "Stationsname": str, 
    "Stations_ID": str, 
    "Kennung": str,
    "Stations-kennung": str,
    "Breite": float,
    "Länge": float,
    "Stations-höhe": float,
    "Flussgebiet": int,
    "Bundesland": str,
    "Beginn": str,
    "Ende": str
}

for key, var_type in schema.items():
    station_data[key] = station_data[key].apply(var_type)

In [115]:
file_path = "../data/dwd_stations.csv"
print("Writing content to", file_path)
station_data.to_csv(file_path, index=False, sep=",")

Writing content to ../data/dwd_stations.csv


## Analyzing data on stations

In [124]:
file_path = "../data/dwd_stations.csv"

schema = {
    "Stationsname": str, 
    "Stations_ID": str, 
    "Kennung": str,
    "Stations-kennung": str,
    "Breite": float,
    "Länge": float,
    "Stations-höhe": float,
    "Flussgebiet": 'Int64',
    "Bundesland": str,
    "Beginn": str,
    "Ende": str
}

station_data = pd.read_csv(file_path, sep=",", dtype=schema)

station_data.head()

Unnamed: 0,Stationsname,Stations_ID,Kennung,Stations-kennung,Breite,Länge,Stations-höhe,Flussgebiet,Bundesland,Beginn,Ende
0,Aach,1,KL,2783,47.841,8.849,478.0,,BW,01.01.1937,30.06.1986
1,Aach,1,RR,70191,47.841,8.849,478.0,,BW,01.01.1912,30.06.1986
2,Aach/Hegau,10771,PE,10771,47.85,8.85,480.0,,BW,,
3,Aachen,3,EB,2205,50.7827,6.0941,202.0,803100.0,NW,01.01.1951,31.03.2011
4,Aachen,3,FF,2205,50.7827,6.0941,202.0,803100.0,NW,01.01.1937,31.03.2011


In [125]:
print(station_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18824 entries, 0 to 18823
Data columns (total 11 columns):
Stationsname        18824 non-null object
Stations_ID         18824 non-null object
Kennung             18824 non-null object
Stations-kennung    18824 non-null object
Breite              18824 non-null float64
Länge               18824 non-null float64
Stations-höhe       18824 non-null float64
Flussgebiet         9984 non-null Int64
Bundesland          18824 non-null object
Beginn              12238 non-null object
Ende                12238 non-null object
dtypes: Int64(1), float64(3), object(7)
memory usage: 1.6+ MB
None


In [133]:
n = 3
num_rows = len(station_data)

for col in station_data.columns.values:
    lowest = sorted(station_data[col].unique())[:n]
    highest = sorted(station_data[col].unique())[-n:]
    
    s = """
    column = '{col}'
        # unique values = {num_unique}
        sparsity = {sparsity:.2f} %
        lowest {n} = {lowest}
        highest {n} = {highest}""".format(col=col, num_unique=station_data[col].nunique(), n=n, 
                                          lowest=lowest, highest=highest,
                                          sparsity=(1-station_data[col].nunique()/num_rows)*100)
    print(s)


    column = 'Stationsname'
        # unique values = 12504
        sparsity = 33.57 %
        lowest 3 = ['Aach', 'Aach/Hegau', 'Aachen']
        highest 3 = ['Ülsby', 'Ürzig', 'Üschersdorf']

    column = 'Stations_ID'
        # unique values = 12704
        sparsity = 32.51 %
        lowest 3 = ['1', '10', '100']
        highest 3 = ['9997', '9998', '9999']

    column = 'Kennung'
        # unique values = 13
        sparsity = 99.93 %
        lowest 3 = ['AE', 'EB', 'EF']
        highest 3 = ['SO', 'SY', 'TU']

    column = 'Stations-kennung'
        # unique values = 15219
        sparsity = 19.15 %
        lowest 3 = ['00001', '01007', '01020']
        highest 3 = ['Z002', 'Z003', 'Z004']

    column = 'Breite'
        # unique values = 4055
        sparsity = 78.46 %
        lowest 3 = [47.271, 47.339, 47.36]
        highest 3 = [54.923, 55.0, 55.011]

    column = 'Länge'
        # unique values = 4392
        sparsity = 76.67 %
        lowest 3 = [5.907, 5.983, 6.0]
        h

  """
  


TypeError: '<' not supported between instances of 'float' and 'str'

In [111]:
col = "Bundesland"
print(sorted(station_data[col].unique()[:5]))
print(sorted(station_data[col].unique()[-5:]))

['BW', 'BY', 'HE', 'NI', 'NW']
['BE', 'HB', 'HH', 'SL', 'TH']


In [126]:
print(station_data["Beginn"].apply(pd.to_datetime).unique())

['1937-01-01T00:00:00.000000000' '1912-01-01T00:00:00.000000000'
                           'NaT' ... '2000-09-11T00:00:00.000000000'
 '1900-01-08T00:00:00.000000000' '1990-05-17T00:00:00.000000000']


In [119]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers:

read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)
    Read a comma-separated values (csv) file into DataFrame.
    
    Also supports option

In [None]:
# distinct_station_names = 