## Fit several models to the SeaFlow and Zinser datasets and save the parameter samples as .csv files for analysis

### load data from files and plot

In [12]:
#%matplotlib notebook
import netCDF4 as nc4
import numpy as np

# load data
# gridded size-distribution data (netCDF), one file per dataset
datafiles = {
    'seaflow':'../data/SeaFlow_SizeDist_regrid-25-8.nc',
    'zinser':'../data/Zinser_SizeDist_calibrated-26-6.nc',
}
# cross-validation testing indices (CSV), one file per dataset
itestfiles = {
    'seaflow':'../data/Zinser_SizeDist_calibrated-26-6-itest.csv', # same as zinser
    'zinser':'../data/Zinser_SizeDist_calibrated-26-6-itest.csv',         
}
# human-readable dataset descriptions; grid information is appended below
desc = {
    'seaflow':'SeaFlow dataset',
    'zinser':'Zinser dataset',    
}
# data_gridded[dataset][variable] holds every variable of the dataset's netCDF file
data_gridded = {}
for k in datafiles:
    data_gridded[k] = {}
    with nc4.Dataset(datafiles[k]) as nc:
        for var in nc.variables:
            data_gridded[k][var] = nc.variables[var][:]
    # raw string: '\D' is not a valid escape sequence in a normal string literal
    # (deprecated, scheduled to become a syntax error); the resulting text is unchanged
    desc[k] += r' (m={data[m]}, $\Delta_v^{{-1}}$={data[delta_v_inv]})'.format(data=data_gridded[k])

# NOTE(review): this statement is outside the loop, so `k` is the leaked loop
# variable, i.e. the last key of `datafiles` ('zinser' with the dict above).
# Only that dataset's PAR is rescaled -- confirm this is the intended dataset.
data_gridded[k]['PAR'] *= 200.0/22.0 # make light similar for this experiment (it is later normalized by E_star) 

In [13]:
# Build the per-dataset input dictionaries that are passed to the Stan models.
data = {}

dt = 20       # in units of minutes (same for every dataset)
binsize = 60  # width of the SeaFlow averaging bins, in minutes
wsh = 30      # half of median filter window size; window size is 2*wsh+1

for k in data_gridded:
    data[k] = {'dt':dt}
    for v in ('m','v_min','delta_v_inv'):
        data[k][v] = data_gridded[k][v]

    if 'seaflow' in k:
        limit_days = 1
        # new: average SeaFlow data in hourly bins
        # floor(...)+1 rather than ceil(...): bin `ibin` covers the half-open interval
        # [ibin*binsize, (ibin+1)*binsize), so a final observation lying exactly on a
        # bin edge needs one extra bin, otherwise it would be silently dropped
        numbins = int(np.floor(data_gridded[k]['time'][-1]/binsize)) + 1
        data[k]['obs'] = np.full((data[k]['m'],numbins), fill_value=np.nan)
        data[k]['t_obs'] = np.full(numbins, fill_value=np.nan)

        i = 0  # number of non-empty bins stored so far
        for ibin in range(numbins):
            binind = np.logical_and(data_gridded[k]['time'] >= ibin*binsize,
                                    data_gridded[k]['time'] < (ibin+1)*binsize)
            if np.any(binind):
                # TODO we may want to make this a sum when dealing with counts
                data[k]['obs'][:,i] = np.mean(data_gridded[k]['w_obs'][:,binind], axis=1)
                data[k]['t_obs'][i] = (ibin+0.5) * binsize  # bin centre
                i += 1
        # empty bins are skipped above, so only the first i columns are filled
        data[k]['obs'] = data[k]['obs'][:,:i]
        data[k]['t_obs'] = data[k]['t_obs'][:i]

        n = len(data_gridded[k]['PAR'])
        # median filter PAR; see: medianfilter_par.ipynb
        # NOTE(review): if PAR is a masked array, np.median ignores the mask -- confirm
        # that PAR contains no masked values
        par = np.array([np.median(data_gridded[k]['PAR'][max(0,j-wsh):min(n,j+wsh+1)]) for j in range(n)])
    else:
        limit_days = 2
        data[k]['obs'] = data_gridded[k]['w_obs']
        data[k]['t_obs'] = data_gridded[k]['time']
        par = data_gridded[k]['PAR']

    # restrict observations to the first `limit_days` days
    # (data[k]['nt'] is only set in this branch and is required further below)
    if limit_days > 0:
        limit_minutes = limit_days*1440
        ind_obs = data[k]['t_obs'] < limit_minutes
        data[k]['t_obs'] = data[k]['t_obs'][ind_obs]
        data[k]['obs'] = data[k]['obs'][:,ind_obs]
        data[k]['nt'] = int(limit_minutes//data[k]['dt'])

    data[k]['nt_obs'] = data[k]['t_obs'].size

    # load cross-validation testing indices and add them to data
    data[k]['i_test'] = np.loadtxt(itestfiles[k]).astype(int)
    data[k]['i_test'] = data[k]['i_test'][:-1]                          # remove last index, so that dimensions agree

    # interpolate light onto the model time grid (nt steps of dt minutes)
    t = np.arange(data[k]['nt'])*data[k]['dt']
    data[k]['E'] = np.interp(t, xp=data_gridded[k]['time'], fp=par)     # add light data

    # for now, add pseudo-count data (astype(int) truncates towards zero)
    data[k]['obs_count'] = (1000*data[k]['obs']).astype(int)

    # consistency check
    if len(data[k]['i_test']) != data[k]['nt_obs']:
        raise ValueError('Invalid number of testing indices for "{}" (expected {}, got {}).'.format(k,data[k]['nt_obs'],len(data[k]['i_test'])))

  a.partition(kth, axis=axis, kind=kind, order=order)


### Compile models

In [3]:
import pystan

# Stan source files of the models that are fitted to the data, keyed by a short model name.
modelfiles = dict(
    m1='../stancode/matrixmodel_estinilnorm_monodelta_respv2_normparam_trackgrowth_xval.stan',
    m2='../stancode/matrixmodel_estinilnorm_monodelta-lightsig_respv2_normparam_trackgrowth_xval.stan',
)

# The corresponding "PRIOR_ONLY" Stan files, keyed by model name with a trailing "p".
modelfiles_prior = dict(
    m1p='../stancode/matrixmodel_estinilnorm_monodelta_respv2_normparam_trackgrowth_xval_PRIOR_ONLY.stan',
    m2p='../stancode/matrixmodel_estinilnorm_monodelta-lightsig_respv2_normparam_trackgrowth_xval_PRIOR_ONLY.stan',
)

#modelfiles = {
#    'm1': '../stancode/matrixmodel_estinilnorm_monodelta_respv2_normparam_trackgrowth_xval.stan',
#    'm2': '../stancode/matrixmodel_estinilnorm_monodelta-lightsig_respv2_normparam_trackgrowth_xval.stan',
#    'm3': '../stancode/matrixmodel_multinom_estinilnorm_monodelta_respiv6_normparam_trackgrowth_xval.stan',
#    'm4': '../stancode/matrixmodel_multinom_estinilnorm_monodelta-lightsig_respiv6_normparam_trackgrowth_xval.stan',
#    'm5': '../stancode/matrixmodel_multinom_estinilnorm_monodelta_respiv7_normparam_trackgrowth_xval.stan',
#    'm6': '../stancode/matrixmodel_multinom_estinilnorm_monodelta-lightsig_respiv7_normparam_trackgrowth_xval.stan',
#}

In [None]:
# Compile every Stan file listed in `modelfiles`; the compiled models are stored
# under the same keys. The key doubles as the (non-obfuscated) Stan model name.
models = {}
for name, stanfile in modelfiles.items():
    models[name] = pystan.StanModel(
        file=stanfile,
        model_name=name,
        obfuscate_model_name=False,
    )

In [8]:
# Compile every Stan file listed in `modelfiles_prior`; the compiled models are stored
# under the same keys. The key doubles as the (non-obfuscated) Stan model name.
models_prior = {}
for name, stanfile in modelfiles_prior.items():
    models_prior[name] = pystan.StanModel(
        file=stanfile,
        model_name=name,
        obfuscate_model_name=False,
    )

INFO:pystan:COMPILING THE C++ CODE FOR MODEL m1p NOW.
INFO:pystan:COMPILING THE C++ CODE FOR MODEL m2p NOW.


### Fit models and save samples to .csv

In [43]:
# Fit every compiled model to every dataset and write the samples to
# "fit_<model>_<dataset>.csv" in the current working directory.
# mcmcs[model][dataset] holds the resulting fit object.
MCMC_SEED = 1  # fixed seed so that the saved samples are reproducible across runs

mcmcs = {}

for name in models:
    mcmcs[name] = {}
    for k in data:
        mcmcs[name][k] = models[name].sampling(data=data[k], iter=2000, seed=MCMC_SEED)
        df = pystan.misc.to_dataframe(mcmcs[name][k])
        df.to_csv("fit_{name}_{dataset}.csv".format(name=name,dataset=k),index=False)



In [18]:
# Sample from every compiled prior-only model and write the samples to
# "prior_<model>.csv" in the current working directory.
# The 'seaflow' data dictionary is passed as the model input.
MCMC_SEED = 1  # fixed seed so that the saved samples are reproducible across runs

mcmcs_prior = {}

for name in models_prior:
    mcmcs_prior[name] = models_prior[name].sampling(data=data['seaflow'], iter=2000, seed=MCMC_SEED)
    df = pystan.misc.to_dataframe(mcmcs_prior[name])
    df.to_csv("prior_{name}.csv".format(name=name),index=False)



In [48]:
# Display the fit summary of one model/dataset combination.
# NOTE(review): the fitting cell fills `mcmcs` with the keys of `models`, which the
# visible cells create from `modelfiles` ('m1', 'm2'); no visible cell produces the
# key 'mp'. Presumably this relies on state from an earlier session and would raise
# a KeyError after Restart & Run All -- confirm which model is meant.
mcmcs['mp']['seaflow']




For the full summary use 'print(fit)'

Inference for Stan model: mp.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

                      mean se_mean     sd     2.5%     25%    50%    75%  97.5%  n_eff   Rhat
delta_lambda          0.34    0.07    0.3     0.07    0.14   0.23   0.46   1.17     20   1.17
delta_max_incr[1]     5.44    1.98   7.56     0.16     0.9   2.45   7.05  27.16     15   1.17
delta_max_incr[2]      5.9    1.12   7.45     0.17    1.18   3.09   8.26  26.34     44   1.09
delta_max_incr[3]     2.54    0.55   4.31     0.04    0.39   1.19    3.1  13.81     61   1.08
delta_max_incr[4]     4.47    1.09   7.07     0.08    0.59   1.85   5.04  25.31     42   1.13
delta_max_incr[5]     4.73    1.11   7.42     0.23    0.99    2.5   5.14  24.48     45    1.1
delta_max_incr[6]     4.47    0.81   5.23     0.11    1.07   3.03   5.63  20.81     41   1.12
delta_max_incr[7]     4.42    0.64   5.24     0.27    1.49  