# Tides

Notebook for analysing high and low tide measurement data for several locations in Germany.

Data source is Wasser- und Schifffahrtsverwaltung des Bundes (WSV) provided by Bundesanstalt für Gewässerkunde (BfG).

In [1]:
%matplotlib inline

import glob

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

matplotlib.rcParams['svg.fonttype'] = 'none'



Read in tide `.zrx` files. It's basically CSV with some weird meta data stuff on top.

Drop bad datasets and convert columns into categoricals.

In [2]:
dateparse = lambda x: pd.datetime.strptime(x, '%Y%m%d%H%M%S')

def parse_meta_data(filename):
    meta_info = []
    with open(filename, encoding='latin1') as f:
        for line in f:
            if not line.startswith('#'):
                break
            meta_info.extend(line[1:].split('|*|'))
    return [m for m in meta_info if m]


def read_tide(filename):
    print(filename)
    df = pd.read_csv(filename, comment='#', sep=' ',
                     header=None, names=['timestamp', 'value', ''],
                     usecols=['timestamp', 'value'],
                     parse_dates=[0], date_parser=dateparse)

    meta_info = parse_meta_data(filename)
    name = [m for m in meta_info if m.startswith('SNAME')]
    if name:
        name = name[0][len('SNAME'):]
        df['name'] = name
    else:
        df['name'] = None
    file_type = filename.split('.')[0].split('-')[-1]
    df['type'] = file_type
    
    df['filename'] = filename
    return df
    

filenames = glob.glob('Tnw+Thw-*/*.zrx')

# Drop Zehnerloch, because it's broken
filenames.remove('Tnw+Thw-2/Zehnerloch-Thw.zrx')

# Drop duplicate file Krautsand
filenames.remove('Tnw+Thw-2/Krautsand-Thw.zrx')
filenames.remove('Tnw+Thw-2/Krautsand-Tnw.zrx')

df = pd.concat([read_tide(f) for f in filenames])

# Drop Tönning from data, because it doesn't make sense, quote WSV:
# „Da die Wasserstandsdaten des Pegels Tönning durch das Eidersperrwerk
# beeinflusst sind, erscheint eine Darstellung in dem
# Diagramm „Hochwasser an deutschen Küsten von 1950 bis 2014“ hydrologisch
# nicht sinnvoll. Ich schlage vor, die MThw-Ganglinie des Pegels Tönning aus
# dem Diagramm zu entfernen.
df = df[df['name'] != 'TÖNNING']

df['type'] = df['type'].astype('category')
df['filename'] = df['filename'].astype('category')
df['name'] = df['name'].astype('category')
df.describe()
df.dtypes

Tnw+Thw-1/BÅsum-Thw.zrx
Tnw+Thw-1/BÅsum-Tnw.zrx
Tnw+Thw-1/DagebÅll-Thw.zrx
Tnw+Thw-1/DagebÅll-Tnw.zrx
Tnw+Thw-1/Helgoland-Binnenhafen-Thw.zrx
Tnw+Thw-1/Helgoland-Binnenhafen-Tnw.zrx
Tnw+Thw-1/Husum-Thw.zrx
Tnw+Thw-1/Husum-Tnw.zrx
Tnw+Thw-2/BHV-Alter-Leuchtturm-Thw.zrx
Tnw+Thw-2/BHV-Alter-Leuchtturm-Tnw.zrx
Tnw+Thw-2/Borkum-Fischerbalje-Thw.zrx
Tnw+Thw-2/Borkum-Fischerbalje-Tnw.zrx
Tnw+Thw-2/Borkum-SÅdstrand-Thw.zrx
Tnw+Thw-2/Borkum-SÅdstrand-Tnw.zrx
Tnw+Thw-2/Brokdorf-Thw.zrx
Tnw+Thw-2/Brokdorf-Tnw.zrx
Tnw+Thw-2/Cuxhaven-Steubenhîft-Thw.zrx
Tnw+Thw-2/Cuxhaven-Steubenhîft-Tnw.zrx
Tnw+Thw-2/Eider-Sperrwerk-AP-Thw.zrx
Tnw+Thw-2/Eider-Sperrwerk-AP-Tnw.zrx
Tnw+Thw-2/Emshîrn-Thw.zrx
Tnw+Thw-2/Emshîrn-Tnw.zrx
Tnw+Thw-2/HH-St-Pauli-Thw.zrx
Tnw+Thw-2/HH-St-Pauli-Tnw.zrx
Tnw+Thw-2/Hîrnum-Thw.zrx
Tnw+Thw-2/Hîrnum-Tnw.zrx
Tnw+Thw-2/Kollmar-Thw.zrx
Tnw+Thw-2/Kollmar-Tnw.zrx
Tnw+Thw-2/List-Thw.zrx
Tnw+Thw-2/List-Tnw.zrx
Tnw+Thw-2/LT-Alte-Weser-Thw.zrx
Tnw+Thw-2/LT-Alte-Weser-Tnw.zrx
Tnw+

timestamp    datetime64[ns]
value                object
name               category
type               category
filename           category
dtype: object

In [3]:
# Assumption is: these values are incorrect
INVALID_VALUES = [-777, 0]
df = df[~df.value.isin(INVALID_VALUES)]
df.describe()

Unnamed: 0,timestamp,value,name,type,filename
count,3509635,3509635.0,3509635,3509635,3509635
unique,3330633,980.0,45,2,89
top,1997-11-19 02:40:00,640.0,CUXHAVEN/STEUBENHÖFT,Tnw,Tnw+Thw-2/Cuxhaven-Steubenhîft-Thw.zrx
freq,5,18831.0,162672,1774365,81336
first,1900-01-01 00:16:00,,,,
last,2015-10-31 21:08:00,,,,


In [11]:
df['timestamp'] = pd.to_datetime(df["timestamp"])

In [12]:
df.name.value_counts()

CUXHAVEN/STEUBENHÖFT      162672
BORKUM SÜDSTRAND          112806
WHV ALTER VORHAFEN        112493
WITTDÜN                   111751
HUSUM                     111749
DAGEBÜLL                  111693
BÜSUM                     111281
LIST                      110924
HÖRNUM                    110924
HELGOLAND BINNENHAFEN     101772
HAMBURG-ST.PAULI           92925
PAPENBURG                  92804
EMDEN NEUE SEESCHLEUSE     92804
POGUM                      92804
Zollenspieker              92687
SCHULAU                    92687
STADERSAND                 92448
KOLLMAR                    91749
BROKDORF                   91748
NORDENHAM                  84572
GLÜCKSTADT                 78570
SCHARHÖRN                  78102
MELLUMPLATE                74574
NORDERNEY RIFFGAT          73279
BORKUM-FISCHERBALJE        72226
LEUCHTTURM ALTE WESER      71865
BHV ALTER LEUCHTTURM       70455
WEENER                     64218
HECHTHAUSEN                62583
MITTELGRUND                62575
BELUM     

In [13]:
len(df)

3509635

In [14]:
len(df.columns)

5

In [15]:
# Check for duplicate values
df_thw = df[(df.type == 'Thw')]
duplicated_thw = df_thw[['name', 'timestamp']].duplicated()
assert len(df_thw[duplicated_thw].sort_values(['name', 'timestamp'])) == 0

## Plotting

To quickly plot the different locations we groupby name and timestamp (which is a unique criteria for each row so the mean is not a real mean), unstack and transpose, and resample to create a table as seen below.

In [16]:
plot_data = df[(df.timestamp.dt.year >= 1950) & 
               (df.type == 'Thw')
              ].groupby(['name', 'timestamp']).value.mean().unstack().T.resample('A', how='mean')

# Try out different means
# plot_data = df.groupby(['name', 'timestamp']).value.mean().unstack().T.rolling_mean()
# plot_data = pd.rolling_max(plot_data, window=3, min_periods=1)
# plot_data = df.groupby(['timestamp', 'name']).value.resample('1W', how='mean')
plot_data.tail()


DataError: No numeric types to aggregate

In [None]:
fig  = plt.figure(figsize=(5, 10))
ax = plt.subplot(111)

plot_data.plot(ax=ax, color='gray', legend=False)
plt.tick_params(axis='y', which='both', labelleft='on', labelright='on')

In [None]:
# Check the mean of first 20 values and last twenty values
pd.DataFrame([plot_data[:20].mean(), plot_data[-20:].mean()]).T

To properly plot, let's check out when measurements started/ended for the locations.

In [None]:
first_measurement = df.groupby(['name']).timestamp.first()
stations_1970 = first_measurement[first_measurement.dt.year <= 1970]
stations_1950 = first_measurement[first_measurement.dt.year <= 1950]
# stations_1970
first_measurement.sort_values()

Cuxhaven location has the most data, trend seems pretty clear.

In [17]:
cuxhaven = df[df.name == 'CUXHAVEN/STEUBENHÖFT'].set_index('timestamp').value.resample('1A', how='mean')

cuxhaven

the new syntax is .resample(...).mean()
  if __name__ == '__main__':


DataError: No numeric types to aggregate

Let's take all high tide data between 1971 and 2014 (inclusive) from all stations that provide data since 1970.
Resample to one year and round.

In [None]:
data = df[(df.type == 'Thw') &
          (df.timestamp.dt.year >= 1971) &
          (df.timestamp.dt.year <= 2014) &
          (df.name.isin(stations_1970.index))].groupby(['name', 'timestamp']).value.mean().unstack().T
data = data.resample('1A', how='mean').apply(np.round)
# data = pd.rolling_mean(data.resample("1M", how='mean'), window=12, min_period=1)
data.head()

### Export different data subsets to CSV

In [None]:
data_1950 = df[(df.type == 'Thw')
          & (df.timestamp.dt.year >= 1951)
          & (df.timestamp.dt.year <= 2014)
          & (df.name.isin(stations_1950.index))]
len(data_1950)

In [None]:
data_1950_grouped = data_1950.groupby(['name', 'timestamp']).value.mean().unstack().T
data_1950_grouped = data_1950_grouped.resample('1A', how='mean').apply(np.round)
print(len(data_1950_grouped.columns))
data_1950_grouped.T.to_csv('../chart/data/data_1950.csv', encoding='utf-8')

In [None]:
data_1970 = df[(df.type == 'Thw')
          & (df.timestamp.dt.year >= 1971)
          & (df.timestamp.dt.year <= 2014)
          & (df.name.isin(stations_1970.index))]
len(data_1970)

In [None]:
data_1970_grouped = data_1970.groupby(['name', 'timestamp']).value.mean().unstack().T
data_1970_grouped = data_1970_grouped.resample('1A', how='mean').apply(np.round)
print(len(data.columns))
data_1970_grouped.T.to_csv('../chart/data/data_1970.csv', encoding='utf-8')

In [None]:
data_1970_grouped_5a = data_1970.groupby(['name', 'timestamp']).value.mean().unstack().T
# data = data.resample('5A', how='mean').apply(np.round)
data_1970_grouped_5a = pd.rolling_mean(data_1970_grouped_5a.resample("1A", how='mean'), window=10, min_periods=1).apply(np.round)
print(len(data_1970_grouped_5a.columns))
data_1970_grouped_5a.T.to_csv('../chart/data/data_1970_5a.csv', encoding='utf-8')

In [None]:
data = df[(df.type == 'Thw')
          & (df.timestamp.dt.year >= 1951)
          & (df.timestamp.dt.year <= 2014)
          & (df.name.isin(stations_1950.index))
         ].groupby(['name', 'timestamp']).value.mean().unstack().T
# data = data.resample('5A', how='mean').apply(np.round)
data = pd.rolling_mean(data.resample("1A", how='mean'), window=10, min_periods=1).apply(np.round)
data.T.to_csv('../chart/data/data_1950_5a.csv', encoding='utf-8')

In [None]:
# Cuxhaven data
cuxhaven_data = df[(df.type == 'Thw') & (df.name == 'CUXHAVEN/STEUBENHÖFT')]
print(len(cuxhaven_data))
(cuxhaven_data.groupby(['name', 'timestamp']).value.mean().unstack().T
         .resample('1A', how='mean').apply(np.round)
         .T.to_csv('../chart/data/cuxhaven.csv', encoding='utf-8'))

### Plot a grid of charts

In [None]:
def plot_grid(df, cols=4, **kwargs):
    fig, axes = plt.subplots(nrows=int(np.ceil(len(df.columns) / cols)), ncols=cols)
    fig.set_figheight(6)
    fig.set_figwidth(8)

    for i, x in enumerate(df.columns):
        ax = axes[i // cols, i % cols]
        df[x].plot(ax=ax, color='blue', legend=False, label=False,
                   kind='line', stacked=False, sharex=False,
                   figsize=(25, 50))
        ax.set_title(x)
        ax.xaxis.set_visible(False)
    return fig

fig = plot_grid(data)
fig.show()