## Fit several models to the SeaFlow and Zinser datasets and save the parameter samples as .csv files for analysis

### load data from files and plot

In [14]:
#%matplotlib notebook
import netCDF4 as nc4
import numpy as np
import pystan

# load data
# netCDF files with the gridded size distributions, keyed by dataset name
datafiles = {
    #'seaflow':'../data/SeaFlow_SizeDist_regrid-25-8.nc',
    'zinser':'../data/Zinser_SizeDist_calibrated-26-6.nc',
}
# files with the cross-validation testing indices, keyed like datafiles
itestfiles = {
    #'seaflow':'../data/Zinser_SizeDist_calibrated-26-6-itest.csv', # same as zinser
    'zinser':'../data/Zinser_SizeDist_calibrated-26-6-itest.csv',
}
# dataset descriptions; extended below with m and delta_v_inv read from each file
desc = {
    #'seaflow':'SeaFlow dataset',
    'zinser':'Zinser dataset',
}

# read every variable of every netCDF file into memory:
# data_gridded[dataset name][variable name] -> array
data_gridded = {}
for k in datafiles:
    data_gridded[k] = {}
    with nc4.Dataset(datafiles[k]) as nc:
        for var in nc.variables:
            data_gridded[k][var] = nc.variables[var][:]
    # raw string so that "\D" is kept literally for the LaTeX label instead of
    # being parsed as an (invalid) escape sequence; the doubled braces are
    # str.format escapes for literal "{" and "}"
    desc[k] += r' (m={data[m]}, $\Delta_v^{{-1}}$={data[delta_v_inv]})'.format(data=data_gridded[k])

# NOTE(review): k is the variable left over from the loop above, so this rescales
# only the dataset listed last in datafiles -- confirm that is the intended one
data_gridded[k]['PAR'] *= 200.0/22.0 # make light similar for this experiment (it is later normalized by E_star)

In [18]:
# Assemble, for every loaded dataset, the dictionary that is handed to the
# samplers as model data.
data = {}

for k in data_gridded:
    gridded_k = data_gridded[k]     # arrays read from the netCDF file

    dt = 20 # in units of minutes
    data[k] = {'dt':dt}
    data_k = data[k]                # dictionary being filled for this dataset
    data_k['prior_only'] = 1
    for v in ('m','v_min','delta_v_inv'):
        data_k[v] = gridded_k[v]

    if 'seaflow' in k:
        limit_days = 1
        # average the observations in hourly bins
        binsize = 60 # in minutes
        numbins = int(np.ceil(gridded_k['time'][-1]/binsize))
        # allocate for the maximum number of bins; trimmed to the filled ones below
        data_k['obs'] = np.full((data_k['m'],numbins), fill_value=np.nan)
        data_k['t_obs'] = np.full(numbins, fill_value=np.nan)

        i = 0   # number of non-empty bins written so far
        for ibin in range(numbins):
            binind = np.logical_and(gridded_k['time'] >= ibin*binsize,
                                    gridded_k['time'] < (ibin+1)*binsize)
            if not np.any(binind):
                continue    # no observations in this bin: it gets no output column
            # TODO we may want to make this a sum when dealing with counts
            data_k['obs'][:,i] = np.mean(gridded_k['w_obs'][:,binind], axis=1)
            data_k['t_obs'][i] = (ibin+0.5) * binsize   # bin midpoint
            i += 1
        data_k['obs'] = data_k['obs'][:,:i]
        data_k['t_obs'] = data_k['t_obs'][:i]

        # median filter PAR; see: medianfilter_par.ipynb
        n = len(gridded_k['PAR'])
        wsh = 30 # half of median filter window size; window size is 2*wsh+1
        par = np.array([np.median(gridded_k['PAR'][max(0,c-wsh):min(n,c+wsh+1)])
                        for c in range(n)])
    else:
        limit_days = 2
        data_k['obs'] = gridded_k['w_obs']
        data_k['t_obs'] = gridded_k['time']
        par = gridded_k['PAR']

    if limit_days > 0:
        # keep only observations from the first limit_days days
        limit_minutes = limit_days*1440
        ind_obs = data_k['t_obs'] < limit_minutes
        data_k['t_obs'] = data_k['t_obs'][ind_obs]
        data_k['obs'] = data_k['obs'][:,ind_obs]
        # number of time steps of length dt within the retained period
        data_k['nt'] = int(limit_minutes//data_k['dt'])

    data_k['nt_obs'] = data_k['t_obs'].size

    # load cross-validation testing indices and add them to data;
    # the last index is removed so that dimensions agree
    data_k['i_test'] = np.loadtxt(itestfiles[k]).astype(int)[:-1]

    # add light data, interpolated onto the regular dt grid
    t = np.arange(data_k['nt'])*data_k['dt']
    data_k['E'] = np.interp(t, xp=gridded_k['time'], fp=par)

    # for now, add pseudo-count data
    data_k['obs_count'] = (1000*data_k['obs']).astype(int)

    # consistency check
    #if len(data[k]['i_test']) != data[k]['nt_obs']:
    #    raise ValueError('Invalid number of testing indices for "{}" (expected {}, got {}).'.format(k,data[k]['nt_obs'],len(data[k]['i_test'])))

### Compile models

In [36]:
# All prior-only Stan files live in one directory and share one naming scheme;
# only the model-variant part of the file name differs between them.
_prior_template = '../stancode_gallery1_priors/matrixmodel_multinom_estinilnorm_{}_normparam_trackgrowth_xval_PRIOR.stan'
_prior_variants = {
    'm1p':'freedelta',
    'm2p':'monodelta',
    'm3p':'monodelta_gammaiv6',
    'm4p':'monodelta_respv1',
    'm5p':'monodelta_respv2',
    'm6p':'monodelta_respiv6',
    'm7p':'monodelta_respiv7',
    'm8p':'monodelta-lightsig_respiv6',
    'm9p':'monodelta-lightsig_respiv7',
    'm10p':'monodelta-lightsig_respv2',
}
# model name -> path of the Stan source file
modelfiles_prior = {
    model: _prior_template.format(variant)
    for model, variant in _prior_variants.items()
}

In [17]:
#models = {}
#for name in modelfiles:
#        models[name] = pystan.StanModel(file=modelfiles[name], model_name=name, obfuscate_model_name=False)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL m1 NOW.


In [37]:
# Compile every prior-only Stan file once; the compiled models are kept under
# the same names as their source files. obfuscate_model_name=False keeps the
# given name unchanged.
models_prior = {}
for name, stanfile in modelfiles_prior.items():
    models_prior[name] = pystan.StanModel(
        file=stanfile,
        model_name=name,
        obfuscate_model_name=False,
    )

INFO:pystan:COMPILING THE C++ CODE FOR MODEL m1p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m2p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m3p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m4p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m5p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m6p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m7p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m8p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m9p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m10p NOW.


In [38]:
# Sample every compiled model using the 'zinser' data dictionary, keep the fit
# objects, and write each model's samples to prior_<model name>.csv.
mcmcs_prior = {}

for name, stanmodel in models_prior.items():
    mcmcs_prior[name] = stanmodel.sampling(data=data['zinser'], iter=2000)
    df = pystan.misc.to_dataframe(mcmcs_prior[name])
    csv_name = "prior_{name}.csv".format(name=name)
    df.to_csv(csv_name, index=False)



In [26]:
# show the assembled data dictionary as the cell output for inspection
data

{'zinser': {'dt': 20, 'prior_only': 1, 'm': masked_array(data=26,
               mask=False,
         fill_value=999999,
              dtype=int64), 'v_min': masked_array(data=0.03,
               mask=False,
         fill_value=1e+20), 'delta_v_inv': masked_array(data=6,
               mask=False,
         fill_value=999999,
              dtype=int64), 'obs': masked_array(
    data=[[0.0030273 , 0.00129376, 0.00062941, 0.00056829, 0.00048896,
           0.00039184, 0.00076247, 0.00085491, 0.00109103, 0.00158558,
           0.00243662, 0.00332245, 0.00248628, 0.00140978, 0.00065242,
           0.00048815, 0.00038208, 0.00031133, 0.00076548, 0.00086003,
           0.00104404, 0.00165561, 0.00291454, 0.00451158],
          [0.00308053, 0.00180923, 0.00126479, 0.0006429 , 0.0006278 ,
           0.00061959, 0.00089868, 0.00099805, 0.00121806, 0.00211702,
           0.00366083, 0.00516376, 0.00369057, 0.00211076, 0.00128778,
           0.00043778, 0.00045678, 0.00043797, 0.00123938, 0.00131