In [None]:
import sys; sys.path.append("/Users/dtolpin/venv/stan/lib/python3.7/site-packages/")
import pystan
import numpy
import matplotlib.pyplot as plt
%matplotlib inline

Simulated data:

In [None]:
# Simulation settings.
NUMBER_OF_PAGES = 10            # upper bound applied to the simulated page counts
PAGES_PER_SESSION_PRIOR = 5     # value of the trend at the first session
NUMBER_OF_SESSIONS = 100        # number of simulated sessions

# Decay of the trend: the exponent goes from 0 at the first session
# to -DECAY * (NUMBER_OF_SESSIONS - 1) / NUMBER_OF_SESSIONS at the last one.
DECAY = 2

# Fixed seed so that restarting the kernel and re-running the notebook
# reproduces exactly the same simulated data.
SIMULATION_SEED = 1
rng = numpy.random.RandomState(SIMULATION_SEED)

# Simulate some trend: exponentially decaying mean number of pages per session.
TREND = PAGES_PER_SESSION_PRIOR * numpy.exp(
    -numpy.arange(NUMBER_OF_SESSIONS) * DECAY / NUMBER_OF_SESSIONS)

# Sample data around the trend: exponentially distributed draws with the trend
# as the scale, rounded to whole pages and clipped to [1, NUMBER_OF_PAGES].
DATA = numpy.clip(numpy.round(rng.exponential(TREND)), 1, NUMBER_OF_PAGES)

print("Trend from {:.2f} to {:.2f}".format(TREND[0], TREND[-1]))
print("Data:", DATA)

The model in Stan:

In [None]:
# Stan program for the pages-per-session model, kept as a Python string and
# compiled later with pystan.StanModel.
#
# data:
#   npages          -- number of pages
#   nsessions       -- number of sessions
#   pps             -- page count of each session (vector of length nsessions)
#   prior_bandwidth -- scale of the prior on `bandwidth`
# parameters:
#   bandwidth       -- non-negative cap on the accumulated evidence per page
#
# model:
#   * Every page j keeps a pair of pseudo-counts beliefs[j] = (churn, no churn),
#     initialised to 2 * churn_probability and 2 * (1 - churn_probability),
#     with churn_probability = 2 / npages.
#   * Prior: `target += -bandwidth / prior_bandwidth` is the unnormalised
#     log-density of an exponential distribution with mean prior_bandwidth.
#   * For each session, pages are visited in order; page j counts as churned
#     when j >= pps[i]. The log-probability of the observed outcome under the
#     current beliefs is added to target, and the matching count is incremented.
#   * If the evidence (sum of the two counts, taken before the increment) is
#     at least `bandwidth`, both counts are scaled by bandwidth / evidence.
#   * The page loop stops at the first churned page.
#
# NOTE(review): the discount factor uses the pre-increment evidence, so after
# discounting the counts sum to bandwidth * (1 + 1 / evidence) rather than to
# bandwidth -- confirm this is intended.
pps_code = """
data {
    int npages;                // number of pages
    int nsessions;            // number of sessions    
    vector[nsessions] pps;    // page counts (per session)
    real prior_bandwidth;
}

parameters {
    real<lower=0> bandwidth;
}

model {
    // initialize beliefs
    real beliefs[npages, 2];
    real churn_probability = 2. / npages;
    int churned;
    
    for(i in 1:npages) {
        beliefs[i][1] = 2. * churn_probability;
        beliefs[i][2] = 2. * (1 - churn_probability);
    }
    
    // put a prior on the bandwidth
    target += -bandwidth / prior_bandwidth;
    

    for (i in 1:nsessions) {
        for(j in 1:npages) {
            if(j < pps[i]) {
                churned = 0;
            } else {
                churned = 1;
            }

            // observe the pps and update the belief
            {
                real evidence = beliefs[j, 1] + beliefs[j, 2];
                if(churned) {
                    target += log(beliefs[j, 1] / evidence);
                    beliefs[j, 1] += 1;
                } else {
                    target += log(beliefs[j, 2] / evidence);
                    beliefs[j, 2] += 1;
                }

                // discount the beliefs based on the bandwidth
                if(evidence >= bandwidth) {
                    real discount = bandwidth / evidence;
                    beliefs[j, 1] *= discount;
                    beliefs[j, 2] *= discount;
                }
            }
            
            if(churned)
                break;
        }
    }
}
"""

In [None]:
# Data handed to the Stan model. It is built outside the rebuild guard below so
# that it always exists and always reflects the current DATA, even when the
# model compilation is skipped.
pps_data = {'npages': NUMBER_OF_PAGES,
            'nsessions': NUMBER_OF_SESSIONS,
            'pps': DATA.tolist(),
            'prior_bandwidth': 100}  # scale of the prior put on the bandwidth

# Set REBUILD_MODEL to False to reuse a model already compiled in this kernel
# session. The model is compiled regardless when no `sm` exists yet, so the
# cell also works on a fresh kernel.
REBUILD_MODEL = True
if REBUILD_MODEL or 'sm' not in globals():
    sm = pystan.StanModel(model_code=pps_code)

Running the inference:

In [None]:
%%time
fit = sm.sampling(data=pps_data, iter=1000, chains=4)

In [None]:
# Show the textual representation of the fit object returned by the sampler
# (presumably PyStan's posterior summary table -- confirm against the output).
print(fit)

We draw the histogram of the predicted bandwidth directly with pyplot, because `fit.plot()` is buggy and ArviZ is not working.

In [None]:
# Pull the sampled values out of the fit and summarise the bandwidth samples.
la = fit.extract()
bandwidth_samples = la['bandwidth']
mean, std = bandwidth_samples.mean(), bandwidth_samples.std()

# Normalised histogram of the bandwidth samples on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title('predicted bandwidth')
ax.set_xlabel('bandwidth')
ax.set_ylabel('density')
hist = ax.hist(bandwidth_samples, density=True, color='lightblue')

# Height of the tallest bar; the marker lines below are sized relative to it.
peak = hist[0].max()

# Solid line at the mean, slightly taller than the histogram.
ax.vlines(x=[mean], ymin=0, ymax=peak * 1.05,
          lw=2, color='darkgray', label='mean')

# Dashed lines one standard deviation either side of the mean, at half height.
ax.vlines(x=[mean - std, mean + std], ymin=0, ymax=peak / 2,
          lw=1.25, linestyles='dashed', color='darkgray',
          label='mean ± std')

# Dotted lines two standard deviations either side, at quarter height.
ax.vlines(x=[mean - 2 * std, mean + 2 * std], ymin=0, ymax=peak / 4,
          lw=0.75, linestyles='dotted', color='darkgray',
          label='mean ± 2 * std')

ax.legend()