In [1]:
import numpy as np
import pandas as pd
from scipy.special import gammaln, gamma
from collections import Counter
from scipy.optimize import basinhopping
np.set_printoptions(suppress=True)

In [2]:
# Load the data and process it.

df = pd.read_csv("../mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs 
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')

# Calculate the time period of each survey answer. 
# Below we take out time period to be 30-day periods. 
# We can change this to be weeks, months, or anything else
window_length = 30
minDate = min(df['date'])
df['period'] = (df['date'] - minDate).dt.days // window_length 

# Fix the issue of duplicate records within the same period.
data = df[['worker_id', 'period']].drop_duplicates().pivot_table(
    index = 'worker_id',
    values = 'period',                    
    aggfunc=('count','min','max')
).join(workers).set_index('index').sort_index()

# Aggregate the capture histories
#data = data.groupby(['min', 'max', 'count']).size().reset_index(name='count_history')
S = 35
dat = np.zeros(S + 1)
out = Counter(data['count'].values)
for k,v in out.items():
    dat[k] = v
V = len(workers)
print(S, V)

35 46145


In [19]:
def mll(initParams):
    # params
    f0 = np.exp(initParams[0])
    p1 = initParams[1]
    p2 = initParams[2]
    mix = initParams[3]
    
    # N! / f0!
    N = gammaln(V + f0 +1) - gammaln(f0 + 1)
    L0 = f0 * np.log( (mix * np.power(1-p1, S)) +  ((1 - mix) * np.power(1-p2, S)) )
    Lk = 0 
    # fk likelihood
    for k in range(1, len(dat)):
        Lk += dat[k] * np.log( (mix * np.power(p1, k) * np.power(1-p1, S-k)) +  ((1 - mix) * np.power(p2, k)  * np.power(1-p2, S - k)) )
    
    obj = N + L0 + Lk
    if -obj < -110000:
        print(f0, p1, p2, mix, -obj)
    return -obj

In [20]:
# Bounds
B= ((1, 14), (0, 1), (0, 1), (0,1))

#args
minimizer_kwargs = dict(method="L-BFGS-B", bounds=B)

#run
initParams = np.array([1, .5, .5, .5])
res = basinhopping(mll, initParams, minimizer_kwargs=minimizer_kwargs, niter = 50000)
print(res)
print(np.exp(res['x'][0]) + V)

  
  
  
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


9224.6828403093 0.2598423319448377 0.03465330516298523 0.041129711761537624 -110787.653211758
9224.682932556138 0.2598423319448377 0.03465330516298523 0.041129711761537624 -110787.65325933252
9224.6828403093 0.2598423419448377 0.03465330516298523 0.041129711761537624 -110787.65347241936
9224.6828403093 0.2598423319448377 0.03465331516298523 0.041129711761537624 -110787.65346191713
9224.6828403093 0.2598423319448377 0.03465330516298523 0.041129721761537626 -110787.65352265845
28666.362651239157 0.24510134134704054 0.03286309087153244 0.03842335015801386 -111489.33205034724
28666.36293790281 0.24510134134704054 0.03286309087153244 0.03842335015801386 -111489.3319788329
28666.362651239157 0.24510135134704053 0.03286309087153244 0.03842335015801386 -111489.33238347893
28666.362651239157 0.24510134134704054 0.03286310087153244 0.03842335015801386 -111489.32607711072
28666.362651239157 0.24510134134704054 0.03286309087153244 0.03842336015801386 -111489.33227036649
14377.452457117137 0.254072

16691.13220735761 0.4674670340167258 0.037313794368510365 0.02284992051182807 -110748.03834274772
16691.132374268946 0.4674670340167258 0.037313794368510365 0.02284992051182807 -110748.03833799792
16691.13220735761 0.4674670440167258 0.037313794368510365 0.02284992051182807 -110748.03811192582
16691.13220735761 0.4674670340167258 0.03731380436851037 0.02284992051182807 -110748.03649741219
16691.13220735761 0.4674670340167258 0.037313794368510365 0.02284993051182807 -110748.03861983621
12836.221440016543 0.4627708459500707 0.03763399140502677 0.024141546757874246 -110622.07364782505
12836.221568378769 0.4627708459500707 0.03763399140502677 0.024141546757874246 -110622.07366808876
12836.221440016543 0.46277085595007067 0.03763399140502677 0.024141546757874246 -110622.0734232876
12836.221440016543 0.4627708459500707 0.03763400140502677 0.024141546757874246 -110622.0729953787
12836.221440016543 0.4627708459500707 0.03763399140502677 0.024141556757874248 -110622.07392417855
16691.1322073576

                        fun: nan
 lowest_optimization_result:       fun: nan
 hess_inv: <4x4 LbfgsInvHessProduct with dtype=float64>
      jac: array([     39.93045539, 1413931.95372075, 1413931.95372075,
             0.        ])
  message: b'ABNORMAL_TERMINATION_IN_LNSRCH'
     nfev: 105
      nit: 0
   status: 2
  success: False
        x: array([1. , 0.5, 0.5, 0.5])
                    message: ['requested number of basinhopping iterations completed successfully']
      minimization_failures: 46972
                       nfev: 4952615
                        nit: 50000
                          x: array([1. , 0.5, 0.5, 0.5])
46147.71828182846
