In [1]:
import stan
import pandas as pd
import numpy as np
import nest_asyncio
# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because PyStan drives its own asyncio loop
# while the Jupyter kernel already has one running — confirm against PyStan docs.
nest_asyncio.apply()

ModuleNotFoundError: No module named 'stan'

In [None]:
# Load the raw ride records and derive the two modelling columns:
#   amount    -> replaced by its absolute value
#   logAmount -> natural log of the (absolute) amount
# NOTE: np.log maps an amount of exactly 0 to -inf; no such rows are removed here.
df = (
    pd.read_csv('../data/uberRides.csv')
    .assign(amount=lambda rides: rides['amount'].abs())
    .assign(logAmount=lambda rides: np.log(rides['amount']))
)


In [None]:
# Filter data: keep only users with at least minTrips trips and give every
# retained user a contiguous 1-based index (userIndex) for use in the Stan model.
minTrips = 3

# Trips per user. rename_axis + reset_index(name=...) names both columns
# explicitly, so the result is always ['userId', 'nTrips'] no matter how the
# installed pandas labels the output of value_counts() (the labels changed in
# pandas 2.0, which made a rename-by-old-label approach lose the userId column).
nTripsUser = (
    df['userId']
    .value_counts()
    .rename_axis('userId')
    .reset_index(name='nTrips')
)

# drop=True discards the old positional index instead of turning it into a
# stray 'index' column that would otherwise be merged into dfIncl below.
nTripsUser = nTripsUser[nTripsUser.nTrips >= minTrips].reset_index(drop=True)
nTripsUser['userIndex'] = nTripsUser.index + 1  # 1-based: 1..nUsers

# Inner join keeps only the rides that belong to retained users.
dfIncl = df.merge(nTripsUser, on="userId", how='inner')

nObs = dfIncl.shape[0]        # number of retained rides (rows)
nUsers = nTripsUser.shape[0]  # number of retained users

# Downcast every int64 column to int32.
# NOTE(review): presumably done so the integer data is accepted when passed to
# Stan — confirm; values above the int32 range would overflow here.
dfIncl = dfIncl.astype({col: 'int32' for col in dfIncl.select_dtypes('int64').columns})

In [None]:
# Inspect the per-row user index that will be handed to the model.
dfIncl['userIndex'].to_numpy()

In [None]:
# Assemble the inputs for the Stan program; the keys must match the names
# declared in its `data` block (nObs, nUsers, userID, y).
model_data = dict(
    nObs=nObs,
    nUsers=nUsers,
    userID=dfIncl['userIndex'].to_numpy(),
    y=dfIncl['logAmount'].to_numpy(),
)


In [None]:
# Re-display the user index array (same values as stored under model_data['userID']).
dfIncl['userIndex'].to_numpy()

In [None]:
# Stan program: varying-intercept normal model for the log ride amount.
#   y[i]     ~ normal(alpha[userID[i]], sigma_y)   observation model
#   alpha[u] ~ normal(mu, sigma)                   user effects share one prior
# with half-Cauchy(0, 2.5) priors on both scales (the <lower=0> constraint
# truncates the Cauchy) and a normal(0, 5) prior on mu.
#
# userID is declared with the `array[N] type name` syntax: the older
# `type name[N]` array declaration was removed in Stan 2.33 and is rejected by
# the compiler bundled with current PyStan/httpstan releases.
model_code = """
data {
  int<lower=0> nObs;                         // number of rows in full data 
  int<lower=0> nUsers;                       // number of users
  array[nObs] int<lower=1,upper=nUsers> userID;           // user index for each row
  vector[nObs] y;                   // log amount
}

parameters {
  real<lower=0> sigma;         // sd alpha
  real mu;                     // mean alpha
  vector[nUsers] alpha;        // user effects
  real<lower=0> sigma_y;       // sd data
}

model {
  sigma ~ cauchy(0, 2.5);
  mu ~ normal(0,5);
  alpha ~ normal(mu, sigma);
  sigma_y ~ cauchy(0, 2.5);

  y ~ normal(alpha[userID], sigma_y);
}
"""

In [None]:
# Compile the Stan program and bind the data to it.
# random_seed fixes the seed used for sampling so that a fresh
# Restart & Run All reproduces the same draws (PyStan 3 takes the seed at
# build time rather than in sample()).
theModel = stan.build(model_code, data=model_data, random_seed=1234)

In [None]:
# Draw posterior samples: 4 chains, num_samples=1000.
# NOTE(review): num_samples is presumably per chain (after warmup) — confirm in the PyStan docs.
fit = theModel.sample(num_samples=1000, num_chains=4)

In [None]:
# Wrap the draws of alpha in a DataFrame for easier summarising.
# NOTE(review): with PyStan 3 the array is presumably (nUsers, total draws),
# i.e. one row per user — confirm via mcmcAlpha.shape.
mcmcAlpha = pd.DataFrame(data=fit['alpha'])

In [None]:
# Posterior mean of alpha: average across the columns, giving one value per row of mcmcAlpha.
alphaMean = mcmcAlpha.mean(axis='columns')

In [None]:
# Posterior mean of mu: average over every draw of the population-level mean.
muMean = np.mean(fit['mu'])