## Fit several models to seaflow + Zinser, save parameter samples as .csv for analysis

### load data from files and plot

In [1]:
#%matplotlib notebook
import netCDF4 as nc4
import numpy as np
import pandas as pd
import dateutil.parser
import sys
import pystan

# load data
datafiles = {
    'seaflow':'../data/SeaFlow_SizeDist_regrid-25-8.nc',
    'zinser':'../data/Zinser_SizeDist_calibrated-26-6.nc',
}

itestfiles = {
    'seaflow':'../data/Zinser_SizeDist_calibrated-26-6-itest.csv', # same as zinser
    'zinser':'../data/Zinser_SizeDist_calibrated-26-6-itest.csv',         
}

desc = {
    'seaflow':'SeaFlow dataset',
    'zinser':'Zinser dataset',    
}

data_gridded = {}
for dataname in datafiles:
    data_gridded[dataname] = {}
    with nc4.Dataset(datafiles[dataname]) as nc:
        for var in nc.variables:
            data_gridded[dataname][var] = nc.variables[var][:]

# Now we load in count data
for dataname in datafiles:

    if 'seaflow' in dataname:
        # Extract SeaFlow cell counts
        seaflow = pd.read_csv('../data/SeaFlow_PSD_hourlyCOUNT_m32.csv')
        seaflow_counts = seaflow.values[:, 2:].T.astype(int)

        # Redefine parameters to match 32-size class count data. We only keep PAR from the 25-size class data.
        data_gridded[dataname]['m'] = seaflow_counts.shape[0]
        data_gridded[dataname]['size_bounds'] = seaflow.columns[2:].values.astype(float)/1000 # extract size classes from dataframe
        data_gridded[dataname]['v_min'] = data_gridded[dataname]['size_bounds'][0] # note these seem to be on a different scale
        data_gridded[dataname]['delta_v_inv'] = int(np.round(1.0/np.log2(data_gridded[dataname]['size_bounds'][1]/data_gridded[dataname]['size_bounds'][0])))
        data_gridded[dataname]['w_obs'] = (seaflow_counts/np.sum(seaflow_counts, axis=0)[None, :]).astype(float)
        data_gridded[dataname]['counts'] = seaflow_counts
        data_gridded[dataname]['obs_time'] = np.empty(shape=seaflow_counts.shape[1])

        # Extract time stamps for each observation from SeaFlow data
        ii = 0
        for timestamp in np.asarray(seaflow['time'], dtype=str):
            datetime = dateutil.parser.isoparse(timestamp)
            if ii == 0:
                initial = datetime
            data_gridded['seaflow']['obs_time'][ii] = (datetime - initial).total_seconds()/60
            ii += 1

    elif 'zinser' in dataname:

        # Extract Zinser cell counts
        zinser = pd.read_csv('../data/Zinser_Figure2A.csv')
        #zinser_counts = zinser.values[:,1].astype(int) # cells A column
        #zinser_counts = zinser.values[:,2].astype(int) # cells B column
        zinser_counts = np.mean(zinser.values, axis=1).astype(int) # mean of both columns
        
        # Add counts to Zinser data
        data_gridded[dataname]['counts'] = (data_gridded[dataname]['w_obs'] * zinser_counts).astype(int)
        data_gridded[dataname]['obs_time'] = data_gridded[dataname]['time']

    desc[dataname] += ' (m={data[m]}, $\Delta_v^{{-1}}$={data[delta_v_inv]})'.format(data=data_gridded[dataname])

In [2]:
# prepare data for Stan model

if 'data' not in globals():
    data = {}
if 'mcmcs' not in globals():
    mcmcs = {}
if 'models' not in globals():
    models = {}

for dataname in data_gridded:
    dt = 20 # in units of minutes
    
    data[dataname] = {'dt':dt}
    for v in ('m','v_min','delta_v_inv'):
        data[dataname][v] = data_gridded[dataname][v]

    if 'seaflow' in dataname:
        limit_days = 1
        data[dataname]['obs'] = data_gridded[dataname]['w_obs']
        data[dataname]['t_obs'] = data_gridded[dataname]['obs_time']
                
        # median filter PAR
        # see: medianfilter_par.ipynb
        n = len(data_gridded[dataname]['PAR'])
        wsh = 30 # half of median filter window size; window size is 2*wsh+1
        par = np.array([np.median(data_gridded[dataname]['PAR'][max(0,i-wsh):min(n,i+wsh+1)]) for i in range(n)])
    else:
        limit_days = 2
        
        data[dataname]['obs'] = data_gridded[dataname]['w_obs']
        data[dataname]['t_obs'] = data_gridded[dataname]['obs_time']
        par = data_gridded[dataname]['PAR']
        
    if limit_days > 0:
        limit_minutes = limit_days*1440
        
        ind_obs = data[dataname]['t_obs'] < limit_minutes
        data[dataname]['t_obs'] = data[dataname]['t_obs'][ind_obs]
        data[dataname]['obs'] = data[dataname]['obs'][:,ind_obs]
        
        data[dataname]['nt'] = int(limit_minutes//data[dataname]['dt'])

    data[dataname]['nt_obs'] = data[dataname]['t_obs'].size
    
    # load cross-validation testing indices and add them to data
    data[dataname]['i_test'] = np.loadtxt(itestfiles[dataname]).astype(int)
    # remove last index, so that dimensions agree
    data[dataname]['i_test'] = data[dataname]['i_test'][:-1]
    
    # add light data
    t = np.arange(data[dataname]['nt'])*data[dataname]['dt']
    data[dataname]['E'] = np.interp(t, xp=data_gridded[dataname]['time'], fp=par)
    
    # real count data
    data[dataname]['obs_count'] = data_gridded[dataname]['counts'][:, ind_obs]
    
    # consistency check
    if len(data[dataname]['i_test']) != data[dataname]['nt_obs']:
        raise ValueError('Invalid number of testing indices for "{}" (expected {}, got {}).'.format(dataname,data[dataname]['nt_obs'],len(data[dataname]['i_test'])))

  a.partition(kth, axis=axis, kind=kind, order=order)


Compile models

In [3]:
modelfiles_prior = {
#     'm1p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_freedelta_normparam_trackgrowth_xval_PRIOR.stan',
#     'm2p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta_normparam_trackgrowth_xval_PRIOR.stan',
#     'm3p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta_gammaiv6_normparam_trackgrowth_xval_PRIOR.stan',
#     'm4p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta_respv1_normparam_trackgrowth_xval_PRIOR.stan',
#     'm5p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta_respv2_normparam_trackgrowth_xval_PRIOR.stan',
#     'm6p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta_respiv6_normparam_trackgrowth_xval_PRIOR.stan',
#     'm7p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta_respiv7_normparam_trackgrowth_xval_PRIOR.stan',
#     'm8p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta-lightsig_respiv6_normparam_trackgrowth_xval_PRIOR.stan', 
#     'm9p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta-lightsig_respiv7_normparam_trackgrowth_xval_PRIOR.stan',
#     'm10p':'../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_monodelta-lightsig_respv2_normparam_trackgrowth_xval_PRIOR.stan',
    'm5update':'../stancode_gallery1_priors/matrixmodel_mlmultinom_estinilnorm2_monodelta2_respv2_normparam_trackgrowth_xval2_prioronly.stan'
}

In [4]:
#models = {}
#for name in modelfiles:
#        models[name] = pystan.StanModel(file=modelfiles[name], model_name=name, obfuscate_model_name=False)

In [5]:
models_prior = {}
for name in modelfiles_prior:
    models_prior[name] = pystan.StanModel(file=modelfiles_prior[name], model_name=name, obfuscate_model_name=False)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL m5update NOW.


In [6]:
mcmcs_prior = {}

for name in models_prior:
    mcmcs_prior[name] = models_prior[name].sampling(data=data['zinser'],iter=2000)
    df = pystan.misc.to_dataframe(mcmcs_prior[name])
    df.to_csv("prior_{name}.csv".format(name=name),index=False)

To run all diagnostics call pystan.check_hmc_diagnostics(fit)


In [7]:
data

{'seaflow': {'dt': 20,
  'm': 32,
  'v_min': 0.013573,
  'delta_v_inv': 10,
  'obs': array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
         [4.34720396e-03, 4.05943006e-03, 3.81017545e-03, 3.28914828e-03,
          3.25653559e-03, 3.22385913e-03, 3.84346749e-03, 3.70390727e-03,
          3.75848192e-03, 3.99292230e-03, 4.52378706e-03, 5.95169387e-03,
          7.79996620e-03, 9.13787563e-03, 9.96103257e-03, 1.13888784e-02,
          1.18531265e-02, 9.45830725e-03, 9.02943361e-03, 7.35245522e-03,
          5.64268050e-03, 5.40238626e-03, 5.17307018e-03, 3.99656625e-03],
         [2.15247920e-02, 1