In [1]:
%matplotlib notebook

import datetime

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import pandas as pd

import utide

print(utide.__version__)

0.2.6


Look at the data file to see what structure it has.

In [2]:
with open('can1998.dtf') as f:
    lines = f.readlines()
print(''.join(lines[:30]))

         0 1998  1  1  0.0000     1.200 0
      3600 1998  1  1  1.0000     1.430 0
      7200 1998  1  1  2.0000     1.730 0
     10800 1998  1  1  3.0000     2.030 0
     14400 1998  1  1  4.0000     2.380 0
     18000 1998  1  1  5.0000     2.540 0
     21600 1998  1  1  6.0000     2.460 0
     25200 1998  1  1  7.0000     2.270 0
     28800 1998  1  1  8.0000     1.980 0
     32400 1998  1  1  9.0000     1.670 0
     36000 1998  1  1 10.0000     1.550 0
     39600 1998  1  1 11.0000     1.630 0
     43200 1998  1  1 12.0000     1.810 0
     46800 1998  1  1 13.0000     1.980 0
     50400 1998  1  1 14.0000     2.010 0
     54000 1998  1  1 15.0000     1.980 0
     57600 1998  1  1 16.0000     2.130 0
     61200 1998  1  1 17.0000     2.280 0
     64800 1998  1  1 18.0000     2.330 0
     68400 1998  1  1 19.0000     2.280 0
     72000 1998  1  1 20.0000     1.930 0
     75600 1998  1  1 21.0000     1.650 0
     79200 1998  1  1 22.0000     1.380 0
     82800 1998  1  1 23.0000     

It looks like the fields are seconds, year, month, day, hour, elevation, flag.  We need a date parser function to combine the date and time fields into a single value to be used as the datetime index.

In [3]:

def date_parser(year, month, day, hour):
    year, month, day, hour = map(int, (year, month, day, hour))
    return datetime.datetime(year, month, day, hour)

# Names of the columns that will be used to make a "datetime" column:
parse_dates = dict(datetime=['year', 'month', 'day','hour'])

# Names of the original columns in the file, including only
# the ones we will use; we are skipping the first, which appears
# to be seconds from the beginning.
names = ['year', 'month', 'day', 'hour', 'elev', 'flag']

obs = pd.read_table('can1998.dtf',
                    names=names,
                    skipinitialspace=True,
                    delim_whitespace=True,
                    index_col='datetime',
                    usecols=range(1, 7),
                    na_values='9.990',
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                   )
obs.head(6)

        Use pd.to_datetime instead.

  return generic_parser(date_parser, *date_cols)


Unnamed: 0_level_0,elev,flag
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
1998-01-01 00:00:00,1.2,0
1998-01-01 01:00:00,1.43,0
1998-01-01 02:00:00,1.73,0
1998-01-01 03:00:00,2.03,0
1998-01-01 04:00:00,2.38,0
1998-01-01 05:00:00,2.54,0


Although there are no elevations marked bad via special value, which should be `nan` after reading the file, the flag value of 2 indicates the values are unreliable, so we will mark them with `nan`, calculate the deviations of the elevations from their mean (stored in a new column called "anomaly"), and then interpolate to fill in the `nan` values in the anomaly.

In [4]:
bad = obs['flag'] == 2
corrected = obs['flag'] == 1

obs.loc[bad, 'elev'] = np.nan
obs['anomaly'] = obs['elev'] - obs['elev'].mean()
obs['anomaly'] = obs['anomaly'].interpolate()
print('{} points were flagged "bad" and interpolated'.format(bad.sum()))
print('{} points were flagged "corrected" and left unchanged'.format(corrected.sum()))


10 points were flagged "bad" and interpolated
212 points were flagged "corrected" and left unchanged


The utide package works with ordinary numpy arrays, not with Pandas Series or Dataframes, so we need to make a `time` variable in floating point days since a given epoch, and use the `values` attribute of the elevation anomaly (a Pandas Series) to extract the underlying numpy ndarray.

In [5]:
obs.index.to_pydatetime()

array([datetime.datetime(1998, 1, 1, 0, 0),
       datetime.datetime(1998, 1, 1, 1, 0),
       datetime.datetime(1998, 1, 1, 2, 0), ...,
       datetime.datetime(1998, 12, 31, 21, 0),
       datetime.datetime(1998, 12, 31, 22, 0),
       datetime.datetime(1998, 12, 31, 23, 0)], dtype=object)

In [None]:
time = mdates.date2num(obs.index.to_pydatetime())

coef = utide.solve(time, obs['anomaly'].values,
                   lat=-25,
                   method='ols',
                   conf_int='MC')

The amplitudes and phases from the fit are now in the `coef` data structure (a Bunch), which can be used directly in the `reconstruct` function to generate a hindcast or forecast of the tides at the times specified in the `time` array.

In [None]:
print(coef.keys())

In [None]:
tide = utide.reconstruct(time, coef)

The output from the reconstruction is also a Bunch:

In [None]:
print(tide.keys())

In [None]:
#t = obs.index.values  # dtype is '<M8[ns]' (numpy datetime64)
# It is more efficient to supply the time directly as matplotlib
# datenum floats:
t = tide.t_mpl

fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, sharey=True, sharex=True)

ax0.plot(t, obs.anomaly, label=u'Observations', color='C0')
ax1.plot(t, tide.h, label=u'Tide Fit', color='C1')
ax2.plot(t, obs.anomaly - tide.h, label=u'Residual', color='C2')
ax2.xaxis_date()
fig.legend(ncol=3, loc='upper center')
fig.autofmt_xdate()