In [3]:
%matplotlib inline
from tqdm import tqdm
import numpy as np
import pymc3 as pm
from theano import shared
import theano.tensor as tt
import theano
import pandas as pd
from pymc3.distributions.dist_math import binomln, betaln, bound
np.set_printoptions(suppress=True)

In [4]:
df = pd.read_csv("mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs 
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')

In [7]:
# Calculate the time period of each survey answer. 
# Below we take out time period to be 30-day periods. 
# We can change this to be weeks, months, or anything else
window_length = 30
minDate = min(df['date'])
df['period'] = (df['date'] - minDate).dt.days // window_length 

In [8]:
df

Unnamed: 0.1,Unnamed: 0,gender,hit_answered_date,hit_creation_date,household_income,household_size,location_city,location_country,location_region,marital_status,post_to_completion_secs,worker_id,year_of_birth,date,period
0,0,male,2018-02-08 19:19:56.215,2018-02-08 18:13:16.000,"$75,000-$99,999",2,columbia,US,sc,married,4000.0,cd50724d102fc745c4b0fda263072e39,1964,2018-02-08,35
1,1,male,2018-02-08 19:14:13.308,2018-02-08 19:13:13.000,"Less than $10,000",1,flagstaff,US,az,single,60.0,0dc9dd7aeb958d2af3ba186f2764ad37,1991,2018-02-08,35
2,2,female,2018-02-08 19:00:46.391,2018-02-08 18:58:15.000,"$40,000-$59,999",2,portland,US,tn,single,151.0,2bb8a9251afe89e0cc320d242d043ee0,1984,2018-02-08,35
3,3,male,2018-02-08 18:54:31.768,2018-02-08 17:13:12.000,"$75,000-$99,999",5+,levittown,US,pa,single,6079.0,e462061542157c850a3c2c9c28815174,1987,2018-02-08,35
4,4,male,2018-02-08 18:45:51.804,2018-02-08 18:43:21.000,"$100,000 or more",3,brighton,US,mi,engaged,150.0,a26233f975b341e37934d5d746835277,1988,2018-02-08,35
5,5,male,2018-02-08 18:30:43.987,2018-02-08 18:28:18.000,"$40,000-$59,999",2,state college,US,pa,cohabitating,145.0,8ba9ed995702e837096f18b0cc92668b,1968,2018-02-08,35
6,6,female,2018-02-08 17:59:39.302,2018-02-08 17:43:23.000,"$15,000-$24,999",2,philipsburg,US,pa,divorced,976.0,d05255f6adf41b12016c7e7055563fa7,1971,2018-02-08,35
7,7,male,2018-02-08 17:59:30.498,2018-02-08 17:58:28.000,"$60,000-$74,999",3,memphis,US,tn,married,62.0,4ea9284788fff176178ab543796d02a5,1974,2018-02-08,35
8,8,female,2018-02-08 17:38:32.908,2018-02-08 16:58:10.000,"$75,000-$99,999",5+,high point,US,nc,married,2422.0,dc4d691cea821951b1608766f8d90b2f,1984,2018-02-08,35
9,9,female,2018-02-08 17:38:25.625,,"$75,000-$99,999",5+,high point,US,nc,married,,dc4d691cea821951b1608766f8d90b2f,1984,2018-02-08,35


In [9]:
data = df.pivot_table(
    index = 'worker_id',
    values = 'period',                    
    aggfunc=('count','min','max')
)

In [25]:
data[data['count'] > (data['max']-data['min'] + 1)]

Unnamed: 0_level_0,count,max,min
worker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000314db2adbfc1680937b7a9a7b0564,7,17,17
00973b532a389cae01e4821b74e5966b,2,10,10
0189e5358390fe1a746c581fd0bac904,3,23,23
01ded1b837424aa6400085f5e34205ef,2,6,6
01fe1039c33407583bb75ecdd9b2cfd9,3,25,24
0282473e990c8b97d93b932ba920025c,2,10,10
02908ecb265c8c77b2d3c5724d222acb,2,20,20
02e3c42db87d78664987b9967ca4d923,2,11,11
03711f3dd781f444764139ee93824c0d,2,7,7
0499f064272db6563a606cb3ddd6abc6,4,2,2


In [26]:
df[df.worker_id == 'feb6a5048b5bb4b30306a3afa0f862de']

Unnamed: 0.1,Unnamed: 0,gender,hit_answered_date,hit_creation_date,household_income,household_size,location_city,location_country,location_region,marital_status,post_to_completion_secs,worker_id,year_of_birth,date,period
92741,92741,male,2015-07-16 09:18:56.519,,"Less than $10,000",3,kochi,IN,kl,single,,feb6a5048b5bb4b30306a3afa0f862de,1992,2015-07-16,3
92742,92742,male,2015-07-16 09:18:56.443,,"Less than $10,000",3,kochi,IN,kl,single,,feb6a5048b5bb4b30306a3afa0f862de,1992,2015-07-16,3
92743,92743,male,2015-07-16 09:18:56.383,,"Less than $10,000",3,kochi,IN,kl,single,,feb6a5048b5bb4b30306a3afa0f862de,1992,2015-07-16,3
92744,92744,male,2015-07-16 09:18:56.264,2015-07-16 08:38:11.000,"Less than $10,000",3,kochi,IN,kl,single,2445.0,feb6a5048b5bb4b30306a3afa0f862de,1992,2015-07-16,3
92745,92745,male,2015-07-16 09:18:56.256,,"Less than $10,000",3,kochi,IN,kl,single,,feb6a5048b5bb4b30306a3afa0f862de,1992,2015-07-16,3


In [8]:
df = pd.read_csv("mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs 
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')
I = len(workers)

CH = np.zeros((I,K))

# Calculate the time period of each survey answer. 
# Below we take out time period to be 30-day periods. 
# We can change this to be weeks, months, or anything else
window_length = 30
minDate = min(df['date'])
df['period'] = (df['date'] - minDate).dt.days // window_length 
K = df['period'].max() + 1

# We give each worker an numerical id
df['idx'] = (df['worker_id']).astype('category').cat.codes

# We generate the full capture history for all workers IxK
seq = df[['idx', 'period']].values
seq = np.r_[seq].T
CH[seq[0], seq[1]] = 1

# We compute the capture history patterns and their frequencies
CH, Nh = np.unique(CH, axis=0, return_counts = True)

array([  31, 1092,  865, ...,    1,    1,    1])

In [39]:
df = pd.read_csv("mturk_surveys.csv")
df["hit_answered_date"]=pd.to_datetime(df["hit_answered_date"])
df['date'] = pd.to_datetime(df['hit_answered_date'].apply(lambda x : x.date()))

# Create a list of all unique worker IDs 
workers = pd.DataFrame(list(set(df.worker_id)), columns=['worker_id']).reset_index().set_index('worker_id')
I = len(workers)

# Calculate the time period of each survey answer. 
# Below we take out time period to be 30-day periods. 
# We can change this to be weeks, months, or anything else
window_length = 30
minDate = min(df['date'])
df['period'] = (df['date'] - minDate).dt.days // window_length 

K = df['period'].max() + 1

data = df.pivot_table(
    index = 'worker_id',
    values = 'period',                    
    aggfunc=('count','min','max')
).join(workers).set_index('index').sort_index()

data.head(10)

Unnamed: 0_level_0,count,max,min
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,27,20
1,1,5,5
2,1,28,28
3,4,31,9
4,2,24,17
5,1,4,4
6,7,21,4
7,1,18,18
8,1,5,5
9,1,17,17


In [47]:
data = data.groupby(['count','max', 'min']).size().reset_index(name='count_history')
len(data)

4118

In [18]:
np.unique(data.values.shape, axis = 0)

array([    3, 46145])

In [23]:
np.unique(data.values)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 37])

In [49]:
data.values

array([[ 1,  0,  0,  1],
       [ 1,  1,  1,  1],
       [ 1,  2,  2,  1],
       ...,
       [33, 34,  0,  1],
       [34, 34,  0,  1],
       [37, 34,  0,  1]])

In [27]:
test = np.array([[ 2, 27, 20],
       [ 1, 28, 28],
       [ 1, 28, 28]])

In [50]:
CH, nh = np.unique(data.values, axis = 0, return_counts = True)

In [51]:
len(CH)

4118

In [44]:
data

Unnamed: 0_level_0,count,max,min
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,27,20
1,1,5,5
2,1,28,28
3,4,31,9
4,2,24,17
5,1,4,4
6,7,21,4
7,1,18,18
8,1,5,5
9,1,17,17


In [136]:
def get_imask(S, V, data):
    i_mask = np.zeros((V,S,S))
    for wh in np.arange(V):
        f, l = data[wh, 0] , data[wh, 1]
        i_mask[wh, :f+1, l:] = 1
    return i_mask

In [180]:
tdata = np.array([[1,2], [5,12], [3, 13]])

In [138]:
tdata

array([[ 1,  2],
       [ 5, 12],
       [ 3, 13]])

In [139]:
t1 = get_imask(20, 3, tdata)

In [188]:
nz = np.zeros((3,20,20))
[nz[i, :j+1, k] for i,j,k in tdata]

ValueError: not enough values to unpack (expected 3, got 2)

In [193]:
[i for i in tdata]

[array([1, 2]), array([ 5, 12]), array([ 3, 13])]

In [110]:
r

array([[[0., 0., 1., ..., 1., 1., 1.],
        [0., 0., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.],
        [0., 0., 0., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [142]:
r == t1

array([[[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],

       [[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],

       [[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  Tr

In [145]:
def get_imask2(S, V, data):
    s_mask = np.zeros([V,S])
    q_mask = np.zeros([V,S])
    for w in np.arange(V):
        f, l = data[w, 0] , data[w, 1]
        s_mask[w] = np.concatenate((np.ones(f+1), np.zeros(S - f -1)))
        q_mask[w] = np.concatenate((np.zeros(l), np.ones(S-l)))
    i_mask = np.einsum("is,iq->isq", s_mask, q_mask)
    return i_mask

In [203]:
def get_vmask(S):
    v_mask = np.zeros([S,S,S])
    for s in np.arange(S):
        for q in np.arange(s,S):
            c1 = np.zeros(s)
            c2 = np.ones(q-s)
            c3 = np.zeros(S-q)
            v_mask[s,q]= np.concatenate((c1, c2, c3))
    return v_mask
ocaz = get_vmask(3)

In [204]:
def get_BetaBi(a,b, S, Ni, Nn):
    # The beta binomial
    BBi  =  bound(betaln(Ni + a, Nn - Ni + b) - betaln(a, b),
                     Ni >= 0, Ni <= Nn,
                     a > 0, b > 0)
    # The above is the computation of the log, so we take the exponent
    return np.exp(BBi)

In [195]:
# For a given outcome in {1,S} (Frequency of capture that's not 0) 
# Generate the Binomial parameters n and i   choose(n, i)
# Nn is the duration from arrival to departure
# Ni is the Frequency we are intersted in
# Dim: SxSxS
def make_binmask(S):
    i = np.arange(S) + 1
    i = i[:, np.newaxis, np.newaxis] *  np.ones((S, S, S))
    n = np.broadcast_to(np.triu(np.ones([S,S]), 0).cumsum(axis =1), (S, S, S))
    return i, n
Ni, Nn = make_binmask(5)

In [206]:
get_BetaBi(1,1, 5, Ni, Nn).eval()

array([[[0.5       , 0.16666667, 0.08333333, 0.05      , 0.03333333],
        [0.        , 0.5       , 0.16666667, 0.08333333, 0.05      ],
        [0.        , 0.        , 0.5       , 0.16666667, 0.08333333],
        [0.        , 0.        , 0.        , 0.5       , 0.16666667],
        [0.        , 0.        , 0.        , 0.        , 0.5       ]],

       [[0.        , 0.33333333, 0.08333333, 0.03333333, 0.01666667],
        [0.        , 0.        , 0.33333333, 0.08333333, 0.03333333],
        [0.        , 0.        , 0.        , 0.33333333, 0.08333333],
        [0.        , 0.        , 0.        , 0.        , 0.33333333],
        [0.        , 0.        , 0.        , 0.        , 0.        ]],

       [[0.        , 0.        , 0.25      , 0.05      , 0.01666667],
        [0.        , 0.        , 0.        , 0.25      , 0.05      ],
        [0.        , 0.        , 0.        , 0.        , 0.25      ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.     

In [207]:
Ni

array([[[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]],

       [[2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2.]],

       [[3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.]],

       [[4., 4., 4., 4., 4.],
        [4., 4., 4., 4., 4.],
        [4., 4., 4., 4., 4.],
        [4., 4., 4., 4., 4.],
        [4., 4., 4., 4., 4.]],

       [[5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5.],
        [5., 5., 5., 5., 5.]]])

In [208]:
def make_survival(S):
    mask = np.zeros([S,S,S])  # arrival, departure, presence_sequence
    for s in np.arange(S):
        for q in np.arange(s,S):
            c1 = np.zeros(s)
            c2 = np.ones(q-s)
            c3 = np.zeros(S-q)
            mask[s,q]= np.concatenate((c1, c2, c3))
    return mask
survival = make_survival(5)

In [209]:
survival

array([[[0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 1., 0., 0.],
        [0., 1., 1., 1., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 1., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])

In [213]:
tt.triu(tt.ones((5,5)), 0).eval()

array([[1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 1.]])

In [None]:
tt