In [1]:
import numpy as np
import pandas as pd
from scipy.special import gammaln, gamma
from collections import Counter
from scipy.optimize import basinhopping
np.set_printoptions(suppress=True)

In [2]:
# Load the data and process it.

df = pd.read_csv("../mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs 
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')

# Calculate the time period of each survey answer. 
# Below we take out time period to be 30-day periods. 
# We can change this to be weeks, months, or anything else
window_length = 30
minDate = min(df['date'])
df['period'] = (df['date'] - minDate).dt.days // window_length 

# Fix the issue of duplicate records within the same period.
data = df[['worker_id', 'period']].drop_duplicates().pivot_table(
    index = 'worker_id',
    values = 'period',                    
    aggfunc=('count','min','max')
).join(workers).set_index('index').sort_index()

# Aggregate the capture histories
#data = data.groupby(['min', 'max', 'count']).size().reset_index(name='count_history')
S = 35
dat = np.zeros(S + 1)
out = Counter(data['count'].values)
for k,v in out.items():
    dat[k] = v
V = len(workers)
print(S, V)

35 46145


In [38]:
def mll(initParams):
    # params
    f0 = np.exp(initParams[0])
    p1 = initParams[1]
    p2 = initParams[2]
    p3 = initParams[3]
    m1 = initParams[4]
    m2 = initParams[5]
    m3 = initParams[6]
    
    # N! / f0!
    N = gammaln(V + f0 +1) - gammaln(f0 + 1)
    L0 = f0 * np.log( (m1 * np.power(1-p1, S)) +  (m2 * np.power(1-p2, S)) + (m3 * np.power(1-p3, S)) )
    Lk = 0 
    # fk likelihood
    for k in range(1, len(dat)):
        Lk += dat[k] * np.log((m1* np.power(p1, k) * np.power(1-p1, S-k)) +  (m2* np.power(p2, k)* np.power(1-p2, S-k)) + (m3* np.power(p3, k) * np.power(1-p3, S-k)))
    
    obj = N + L0 + Lk
    if -obj < -10000:
        print(f0, p1, p2, p3, m1, m2, m3, -obj, np.sum(initParams[4:]))
    return -obj

In [45]:
# Bounds
B= ((1, 14), (0, 1), (0, 1), (0,1), (0, 1), (0, 1), (0,1))

cons = {'type':'eq', 'fun': lambda x: 1- np.sum(x[4:])}

#args
minimizer_kwargs = dict(method="SLSQP", bounds=B, constraints = cons)

#run
initParams = np.array([1, .5, .5, .5, .25, .25, .5])
res = basinhopping(mll, initParams, minimizer_kwargs=minimizer_kwargs)
print(res)
print(np.exp(res['x'][0]) + V)

3.891689428955684 0.08128736209795673 0.44028711251651076 0.7982503358759168 0.4162844262899601 0.49375095463583096 0.0 -54146.27438862511 0.9100353809257911
3.891689428955684 0.08128736209795673 0.44028711251651076 0.7982503358759168 0.4162844262899601 0.49375095463583096 0.0 -54146.27438862511 0.9100353809257911
3.8916894869463756 0.08128736209795673 0.44028711251651076 0.7982503358759168 0.4162844262899601 0.49375095463583096 0.0 -54146.274388939084 0.9100353809257911
3.891689428955684 0.08128737699911792 0.44028711251651076 0.7982503358759168 0.4162844262899601 0.49375095463583096 0.0 -54146.26387659588 0.9100353809257911
3.891689428955684 0.08128736209795673 0.44028712741767195 0.7982503358759168 0.4162844262899601 0.49375095463583096 0.0 -54146.27414961811 0.9100353809257911
3.891689428955684 0.08128736209795673 0.44028711251651076 0.798250350777078 0.4162844262899601 0.49375095463583096 0.0 -54146.27438862511 0.9100353809257911
3.891689428955684 0.08128736209795673 0.44028711251

  del sys.path[0]


                        fun: -9790.951715394855
 lowest_optimization_result:      fun: -9790.951715394855
     jac: array([   55216.640625 , 20248129.0703125, -1765355.140625 ,
         267424.3046875, -1232716.890625 , -6199488.0703125,
         -24602.9609375])
 message: 'Positive directional derivative for linesearch'
    nfev: 104
     nit: 12
    njev: 8
  status: 8
 success: False
       x: array([13.98588385,  0.00223675,  0.01977897,  0.97624122,  0.9930171 ,
        0.00117567,  0.02011238])
                    message: ['requested number of basinhopping iterations completed successfully']
      minimization_failures: 101
                       nfev: 185304
                        nit: 100
                       njev: 9808
                          x: array([13.98588385,  0.00223675,  0.01977897,  0.97624122,  0.9930171 ,
        0.00117567,  0.02011238])
1231892.3977800647


In [23]:
1 - np.sum(initParams[4:])

-0.5