In [1]:
import numpy as np
import pandas as pd
from scipy.special import gammaln, gamma
from collections import Counter
from scipy.optimize import basinhopping
np.set_printoptions(suppress=True)

In [2]:
# Load the data and process it.

df = pd.read_csv("../mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs 
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')

# Calculate the time period of each survey answer. 
# Below we take out time period to be 30-day periods. 
# We can change this to be weeks, months, or anything else
window_length = 30
minDate = min(df['date'])
df['period'] = (df['date'] - minDate).dt.days // window_length 

# Fix the issue of duplicate records within the same period.
data = df[['worker_id', 'period']].drop_duplicates().pivot_table(
    index = 'worker_id',
    values = 'period',                    
    aggfunc=('count','min','max')
).join(workers).set_index('index').sort_index()

# Aggregate the capture histories
#data = data.groupby(['min', 'max', 'count']).size().reset_index(name='count_history')
S = 35
dat = np.zeros(S + 1)
out = Counter(data['count'].values)
for k,v in out.items():
    dat[k] = v
V = len(workers)
print(S, V)

35 46145


In [7]:
# Alpha Beta version
def BB(S, k, alpha, beta):
    up = gamma(k + alpha) * gamma(S - k + beta) * gamma(alpha + beta)
    down = gamma(alpha) * gamma(beta) * gamma(S + alpha + beta)
    return up / down

def Bin(S, k, prob):
    return np.power(prob, k) * np.power(1-prob, S - k)

In [10]:
def mll(initParams):
    # params
    f0 = np.exp(initParams[0])
    alpha = initParams[1]
    beta = initParams[2]
    p = initParams[3]
    mix = initParams[4]
    
    # N! / f0!
    N = gammaln(V + f0 +1) - gammaln(f0 + 1)
    L0 = f0 * np.log(  (mix * np.power(1-p, S)) +  ((1 - mix) * BB(S, 0, alpha, beta)))
    Lk = 0
    
    # fk likelihood
    for k in range(1, len(dat)):
        
        Lk += dat[k] * np.log(  (mix * np.power(p, k) * np.power(1-p, S - k)) +  ((1 - mix) * BB(S, k, alpha, beta)))
    
    obj = N + L0 + Lk
    if -obj < -121000:
        print(f0, alpha, beta, p, mix, -obj)
    return -obj

In [11]:
# Bounds
B= ((1,15), (0, 10), (0, 10), (0, 1), (0,1))

#args
minimizer_kwargs = dict(method="L-BFGS-B", bounds=B)

#run
initParams = np.array([2, 0.2, 5, .005, .8])
res = basinhopping(mll, initParams, minimizer_kwargs=minimizer_kwargs, niter = 500000)
print(res)
print(np.exp(res['x'][0]) + V)

  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()


OverflowError: Range exceeds valid bounds

In [61]:
mll([403608.7895533 ,      0.02039742,      3.7429387 ,      0.01037844,
            0.20782324])

-121061.20699431212

In [62]:
mll([83793.7773379 ,     0.19999895,     4.07278073,     0.01197954,
           0.70142692])

-120977.7095533876

In [64]:
mll([234776.72579006,      0.03929604,      3.78447354,      0.01050833,
            0.33356468])

-121053.78927293565

In [67]:
mll([57392.28189185,     0.38756928,     4.32406151,     0.01459061,
           0.80091354])

-120812.86576105905

In [114]:
mll([125104.16766101,      0.09702091,      3.89744013,      0.01093061,
            0.54677371])

-121029.53788561595

In [188]:
mll([11.03010376,  1.1       ,  6.66479873,  0.01335952,  0.89314851])

-120587.12622553745