In [29]:
import numpy as np
import pandas as pd
from scipy.special import gammaln, gamma
from collections import Counter
from scipy.optimize import basinhopping, fmin_slsqp
np.set_printoptions(suppress=True)

In [2]:
# Load the data and process it.

df = pd.read_csv("../mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs 
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')

# Calculate the time period of each survey answer. 
# Below we take out time period to be 30-day periods. 
# We can change this to be weeks, months, or anything else
window_length = 30
minDate = min(df['date'])
df['period'] = (df['date'] - minDate).dt.days // window_length 

# Fix the issue of duplicate records within the same period.
data = df[['worker_id', 'period']].drop_duplicates().pivot_table(
    index = 'worker_id',
    values = 'period',                    
    aggfunc=('count','min','max')
).join(workers).set_index('index').sort_index()

# Aggregate the capture histories
#data = data.groupby(['min', 'max', 'count']).size().reset_index(name='count_history')
S = 35
dat = np.zeros(S + 1)
out = Counter(data['count'].values)
for k,v in out.items():
    dat[k] = v
V = len(workers)
print(S, V)

35 46145


In [3]:
# Alpha Beta version
def BB(S, k, alpha, beta):
    up = gamma(k + alpha) * gamma(S - k + beta) * gamma(alpha + beta)
    down = gamma(alpha) * gamma(beta) * gamma(S + alpha + beta)
    return up / down

def Bin(S, k, prob):
    return np.power(prob, k) * np.power(1-prob, S - k)

In [24]:
def mll(initParams):
    # params
    f0 = np.exp(initParams[0])
    p1 = initParams[1]
    p2 = initParams[2]
    alpha = initParams[3]
    beta = initParams[4]
    m1 = initParams[5]
    m2 = initParams[6]
    m3 = initParams[7]
    
    # N! / f0!
    N = gammaln(V + f0 +1) - gammaln(f0 + 1)
    L0 = f0 * np.log( (m1 * Bin(S,0,p1)) +  (m2 * Bin(S,0,p2)) + (m3 * BB(S, 0, alpha, beta)) )
    Lk = 0 
    # fk likelihood
    for k in range(1, len(dat)):
        Lk += dat[k] * np.log((m1 * Bin(S,k,p1)) +  (m2 * Bin(S,k,p2)) + (m3 * BB(S, k, alpha, beta)) )
    
    obj = N + L0 + Lk
    if -obj < -121000:
        print(f0, p1, p2, alpha, beta, m1, m2, m3, -obj, np.sum(initParams[5:]))
    return -obj

In [89]:
# Bounds
B= ((1, 14), (0, 1), (0, 1), (0.5,10), (1, 10), (0, 1), (0, 1), (0,1))

cons =({'type': 'eq','fun' : lambda x: np.array(x[5]+x[6]+x[7] - 1.4)})

#args
minimizer_kwargs = dict(method="SLSQP", bounds=B, constraints = cons)

#run
initParams = np.array([1, .5, .5, 1, 5, .25, .25, .5])
res = basinhopping(mll, initParams, minimizer_kwargs=minimizer_kwargs)
print(res)
print(np.exp(res['x'][0]) + V)

258663.02252548002 0.0 0.048833292000622086 0.6893142077564646 9.605640393129406 0.5629763311819829 0.0 1.0 -190504.9129484751 1.5629763311819829
258663.02252548002 0.0 0.048833292000622086 0.6893142077564646 9.605640393129406 0.5629763311819829 0.0 1.0 -190504.9129484751 1.5629763311819829
258663.02637985942 0.0 0.048833292000622086 0.6893142077564646 9.605640393129406 0.5629763311819829 0.0 1.0 -190504.91320484434 1.5629763311819829
258663.02252548002 1.4901161193847656e-08 0.048833292000622086 0.6893142077564646 9.605640393129406 0.5629763311819829 0.0 1.0 -190504.87614633527 1.5629763311819829
258663.02252548002 0.0 0.04883330690178328 0.6893142077564646 9.605640393129406 0.5629763311819829 0.0 1.0 -190504.9129484751 1.5629763311819829
258663.02252548002 0.0 0.048833292000622086 0.6893142226576258 9.605640393129406 0.5629763311819829 0.0 1.0 -190504.91093728418 1.5629763311819829
258663.02252548002 0.0 0.048833292000622086 0.6893142077564646 9.605640408030567 0.5629763311819829 0.0



 0.4573943546473821 0.5000000149011612 10.0 1.0 0.44974853589872255 0.8084205797724602 -659035.7089393326 2.2581691156711825
1202604.2841647768 0.0 0.4573943546473821 0.5 10.000000014901161 1.0 0.44974853589872255 0.8084205797724602 -659035.715965553 2.2581691156711825
1202604.2841647768 0.0 0.4573943546473821 0.5 10.0 1.0000000149011612 0.44974853589872255 0.8084205797724602 -659035.7287793957 2.2581691305723437
1202604.2841647768 0.0 0.4573943546473821 0.5 10.0 1.0 0.44974855079988374 0.8084205797724602 -659035.7158193886 2.2581691305723437
1202604.2841647768 0.0 0.4573943546473821 0.5 10.0 1.0 0.44974853589872255 0.8084205946736214 -659035.7226657176 2.2581691305723437
1202604.2841647768 0.0 0.26733748174145044 0.8342941314833262 9.747578725675748 0.9513732542894415 0.6717196026632262 0.8153314637620245 -479090.87544701266 2.438424320714692
1202604.2841647768 0.0 0.26733748174145044 0.8342941314833262 9.747578725675748 0.9513732542894415 0.6717196026632262 0.8153314637620245 -479090

850918.4799444254 0.0 0.48446225461413095 0.5 9.741430607751097 0.7042557674464227 0.7705747876918839 0.9919848137074205 -396215.8456832085 2.4668153688457273
850918.492624099 0.0 0.48446225461413095 0.5 9.741430607751097 0.7042557674464227 0.7705747876918839 0.9919848137074205 -396215.8482621242 2.4668153688457273
850918.4799444254 1.4901161193847656e-08 0.48446225461413095 0.5 9.741430607751097 0.7042557674464227 0.7705747876918839 0.9919848137074205 -396215.63778441824 2.4668153688457273
850918.4799444254 0.0 0.48446226951529214 0.5 9.741430607751097 0.7042557674464227 0.7705747876918839 0.9919848137074205 -396215.8455568891 2.4668153688457273
850918.4799444254 0.0 0.48446225461413095 0.5000000149011612 9.741430607751097 0.7042557674464227 0.7705747876918839 0.9919848137074205 -396215.83860850526 2.4668153688457273
850918.4799444254 0.0 0.48446225461413095 0.5 9.741430622652258 0.7042557674464227 0.7705747876918839 0.9919848137074205 -396215.84589021513 2.4668153688457273
850918.479

KeyboardInterrupt: 

In [23]:
1 - np.sum(initParams[4:])

-0.5

In [86]:
res['x'][5:].sum()

2.0

In [85]:
res[5:].sum()

TypeError: unhashable type: 'slice'