In [1]:
import numpy as np
import pandas as pd
from scipy.special import gammaln, gamma
from collections import Counter
from scipy.optimize import basinhopping
np.set_printoptions(suppress=True)

In [2]:
# Load the data and process it.
# Reads the survey answers, assigns each answer to a fixed-length time period,
# and counts, per worker, the number of distinct periods in which they appear.

df = pd.read_csv("../mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
# Keep only the calendar day of each answer (time of day dropped), re-wrapped as a datetime.
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs.
# reset_index() adds an integer 'index' column, so `workers` maps worker_id -> integer.
# NOTE(review): the integer assigned to a worker follows set() iteration order,
# which is not guaranteed to be stable across runs if worker_id values are strings.
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')

# Calculate the time period of each survey answer.
# Below we take our time period to be 30-day periods.
# We can change this to be weeks, months, or anything else.
window_length = 30
minDate = min(df['date'])
# Period 0 starts at the earliest answer date; integer division buckets days into windows.
df['period'] = (df['date'] - minDate).dt.days // window_length 

# Fix the issue of duplicate records within the same period.
# drop_duplicates() leaves one row per (worker, period); the pivot then aggregates
# 'period' per worker with count / min / max, and the join swaps the worker_id
# index for the integer 'index' column from `workers`.
data = df[['worker_id', 'period']].drop_duplicates().pivot_table(
    index = 'worker_id',
    values = 'period',                    
    aggfunc=('count','min','max')
).join(workers).set_index('index').sort_index()

# Aggregate the capture histories
#data = data.groupby(['min', 'max', 'count']).size().reset_index(name='count_history')
# dat[k] = number of workers whose 'count' equals k; entries with no workers stay 0.
# NOTE(review): S is hard-coded. Presumably it is the number of periods in the data;
# confirm against df['period'].max() + 1. A count larger than S would raise
# IndexError in `dat[k] = v` below.
S = 35
dat = np.zeros(S + 1)
out = Counter(data['count'].values)
for k,v in out.items():
    dat[k] = v
# V = number of distinct workers.
V = len(workers)
print(S, V)

35 46145


In [3]:
# Alpha Beta version
def BB(S, k, alpha, beta):
    """Beta-binomial kernel B(k + alpha, S - k + beta) / B(alpha, beta).

    Evaluates

        Gamma(k+alpha) * Gamma(S-k+beta) * Gamma(alpha+beta)
        ----------------------------------------------------
           Gamma(alpha) * Gamma(beta) * Gamma(S+alpha+beta)

    The binomial coefficient C(S, k) is not included.

    The ratio is computed in log space with ``gammaln`` and exponentiated once
    at the end. Calling ``gamma`` directly overflows to ``inf`` for arguments
    above roughly 171, which turns the ratio into ``inf / inf = nan`` for
    large ``S`` or large shape parameters.

    Parameters
    ----------
    S : number of trials.
    k : number of successes, ``0 <= k <= S``.
    alpha, beta : positive shape parameters.

    Returns
    -------
    float (or array, if the inputs broadcast) with the value of the ratio.
    """
    log_up = gammaln(k + alpha) + gammaln(S - k + beta) + gammaln(alpha + beta)
    log_down = gammaln(alpha) + gammaln(beta) + gammaln(S + alpha + beta)
    return np.exp(log_up - log_down)

In [50]:
def mll(initParams):
    """Negative log-likelihood for a two-component mixture of BB kernels.

    ``initParams`` is indexed at positions 0..5:
        [0] log of f0 (f0 weights the k = 0 term and is added to V;
            presumably the number of never-observed workers)
        [1], [2] alpha and beta of the first component
        [3], [4] alpha and beta of the second component
        [5] mixing weight of the first component

    Reads the module-level ``V``, ``S``, ``dat`` and ``BB``.
    Returns the negated objective so that a minimiser can be applied to it.
    """
    log_f0, alpha1, beta1, alpha2, beta2, mix = [initParams[i] for i in range(6)]
    f0 = np.exp(log_f0)

    def mixture(k):
        # Weighted sum of the two components evaluated at k captures.
        first = mix * BB(S, k, alpha1, beta1)
        second = (1 - mix) * BB(S, k, alpha2, beta2)
        return first + second

    # log of N! / f0!  with N = V + f0
    N = gammaln(V + f0 + 1) - gammaln(f0 + 1)

    # Contribution of the k = 0 class.
    L0 = f0 * np.log(mixture(0))

    # Contribution of the classes k = 1 .. len(dat) - 1, weighted by dat[k].
    Lk = sum(dat[k] * np.log(mixture(k)) for k in range(1, len(dat)))

    negated = -(N + L0 + Lk)
    # Trace parameter sets whose negated objective falls below the threshold.
    if negated < -117000:
        print(f0, alpha1, beta1, alpha2, beta2, mix, negated)
    return negated

In [52]:
# Bounds for L-BFGS-B, in the order mll() indexes its argument:
# (log f0, alpha1, beta1, alpha2, beta2, mix)
B= ((1,None), (0.0001, .1), (1, 10), (1.5, 10), (1, 10), (0.001, .6))

# Arguments forwarded to the local minimiser run at each basin-hopping step.
minimizer_kwargs = dict(method="L-BFGS-B", bounds=B)

# basinhopping takes random steps between local minimisations; fix the seed so
# that re-running the notebook reproduces the same result.
SEED = 0

# Starting point, same order as the bounds.
initParams = np.array([1, .01, 5, 3, 5, .25])
res = basinhopping(mll, initParams, minimizer_kwargs=minimizer_kwargs, niter = 1000, seed=SEED)
print(res)
# exp(first parameter) + V: the f0 used inside mll() plus the number of distinct workers.
print(np.exp(res['x'][0]) + V)

  This is separate from the ipykernel package so we can avoid doing imports until
  # This is added back by InteractiveShellApp.init_path()


                        fun: -95817.67616309458
 lowest_optimization_result:       fun: -95817.67616309458
 hess_inv: <6x6 LbfgsInvHessProduct with dtype=float64>
      jac: array([     0.04074536, -58254.36674058,    -84.69214663,  17949.80489649,
        -1917.85511561, -30421.57622986])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 126
      nit: 14
   status: 0
  success: True
        x: array([10.97045676,  0.1       , 10.        ,  1.5       , 10.        ,
        0.6       ])
                    message: ['requested number of basinhopping iterations completed successfully']
      minimization_failures: 6
                       nfev: 149758
                        nit: 1000
                          x: array([10.97045676,  0.1       , 10.        ,  1.5       , 10.        ,
        0.6       ])
104276.13937084968


In [None]:
# NOTE(review): stale scratch call. It passes 5 values, but mll as defined in
# this notebook reads initParams[0] through initParams[5], so this list would
# raise IndexError. Presumably left over from an earlier 5-parameter version;
# update or delete.
mll([403608.7895533 ,      0.02039742,      3.7429387 ,      0.01037844,
            0.20782324])

In [62]:
# NOTE(review): stale scratch call. It passes 5 values, but mll as defined in
# this notebook reads initParams[0] through initParams[5], so this list would
# raise IndexError. The recorded output presumably came from an earlier
# definition of mll; update or delete.
mll([83793.7773379 ,     0.19999895,     4.07278073,     0.01197954,
           0.70142692])

-120977.7095533876

In [64]:
# NOTE(review): stale scratch call. It passes 5 values, but mll as defined in
# this notebook reads initParams[0] through initParams[5], so this list would
# raise IndexError. The recorded output presumably came from an earlier
# definition of mll; update or delete.
mll([234776.72579006,      0.03929604,      3.78447354,      0.01050833,
            0.33356468])

-121053.78927293565

In [67]:
# NOTE(review): stale scratch call. It passes 5 values, but mll as defined in
# this notebook reads initParams[0] through initParams[5], so this list would
# raise IndexError. The recorded output presumably came from an earlier
# definition of mll; update or delete.
mll([57392.28189185,     0.38756928,     4.32406151,     0.01459061,
           0.80091354])

-120812.86576105905

In [114]:
# NOTE(review): stale scratch call. It passes 5 values, but mll as defined in
# this notebook reads initParams[0] through initParams[5], so this list would
# raise IndexError. The recorded output presumably came from an earlier
# definition of mll; update or delete.
mll([125104.16766101,      0.09702091,      3.89744013,      0.01093061,
            0.54677371])

-121029.53788561595

In [43]:
# NOTE(review): stale scratch call. It passes 3 values, but mll as defined in
# this notebook reads initParams[0] through initParams[5], so this list would
# raise IndexError. The recorded output presumably came from an earlier
# single-component definition of mll; update or delete.
mll([12.8,      0.09702091,      3.89744013])

-97002.804319622

In [22]:
# Display V, the number of distinct workers (set as len(workers) when the data is loaded).
V

46145

In [47]:
# Evaluate the objective at a hand-picked 6-value point, in the order
# (log f0, alpha1, beta1, alpha2, beta2, mix) that mll() indexes.
# NOTE(review): alpha2 = 1.133 and beta2 = 53.76 lie outside the bounds B given
# to the optimiser, so the bounded search cannot reach this point.
mll([13.0382216 ,  0.00733834,  2.92808285,  1.133036  , 53.75774862,
        0.82462193])

459650.4553792034 0.00733834 2.92808285 1.133036 53.75774862 0.82462193 -119369.73341259564


-119369.73341259564