In [3]:
import io
import datetime
import itertools as itr
import collections as coll
import pprint
import pandas as pd

# %matplotlib inline

pp = pprint.PrettyPrinter(indent=2)
PS_DATE_FMT = "%Y-%m-%d %H:%M:%S"

In [5]:
data = '/Users/cjc73/Expire/test_update_times2.tsv'

In [6]:
def make_list(list_str):
    return [datetime.datetime.strptime(x[1:-1], PS_DATE_FMT) for x in list_str[1:-1].split(',')]

def data_parser(in_file_name):
    with open(in_file_name) as infile:
        for line in infile:
            elements = line.strip().split('\t')
            if len(elements) != 4:
                continue
            yield (elements[0], elements[1], make_list(elements[2]), make_list(elements[3]))


In [26]:
def compute_exposure_stats(line_elements):
    uid, tag, ego, nbr = line_elements
    #pp.pprint(ego)
    #print()
    #pp.pprint(nbr)
    #print()

    e_idx = 0 
    n_idx = 0 
    exposure_inc = 0
    total_exp = 0
    hist_too_short = 0
    ego_span = ego[0]-ego[-1]

    # ignore nbr uses after ego's use
    while nbr[n_idx] >= ego[e_idx]:
        n_idx += 1
        if n_idx == len(nbr):
            # No exposure prior to adoption, return zeros
            return (uid, tag, exposure_inc, total_exp, e_idx, hist_too_short, ego_span)

    # total number of exposures before adoption
    # is this correct? 
    total_exp = len(nbr) - n_idx
    
    if ego and nbr:
        hist_too_short = ego[-1] > nbr[n_idx]
    if hist_too_short:
        return (uid, tag, exposure_inc, total_exp, len(ego), hist_too_short, ego_span)
    
    # strip non-exposure intervals from head
    # possibility of crossing?
    while nbr[n_idx] < ego[e_idx]:
        e_idx += 1
        if e_idx == len(ego):
            # User history too short to overlap
            return (uid, tag, exposure_inc, total_exp, e_idx, hist_too_short, ego_span)


    # Computation can proceed
    n_inc_idx = n_idx
    len_nbr = len(nbr)
    while n_inc_idx < len_nbr: # and e_idx < len(ego):
        if nbr[n_inc_idx] >= ego[e_idx]:
            # print(n_inc_idx, e_idx)
            # print("{} greater than {}".format(nbr[n_inc_idx], ego[e_idx]))
            exposure_inc += 1
            n_inc_idx += 1            
        else:
            break

    return (uid, tag, exposure_inc, total_exp, e_idx, hist_too_short, ego_span)



In [28]:
%%time
dp = data_parser(data)
exposure_data = [compute_exposure_stats(x) for x in dp]
exp_df = pd.DataFrame.from_records(exposure_data, columns=['uid', 'tag', 'exposure_inc', 'total_exp', 'hist_len', 'err', 'span'])

CPU times: user 2min 16s, sys: 701 ms, total: 2min 16s
Wall time: 2min 17s


In [29]:
exp_df.head()

Unnamed: 0,uid,tag,exposure_inc,total_exp,hist_len,err,span
0,1497,sub,0,1,101,True,10 days 16:27:35
1,3520,yeahright,0,4,101,True,44 days 04:50:25
2,3936,yeahright,0,7,101,True,5 days 03:47:54
3,8906,yeahright,0,0,0,0,26 days 23:53:14
4,10424,flappybird,0,0,0,0,31 days 10:54:07


In [30]:
exp_df.to_csv("tag_data_computations.txt", index=None)

In [None]:
28932/53763

In [None]:
(exp_df.err).mean()

### exp_df.span[(exp_df.err == True) & (exp_df.hist_len == 101)].astype('timedelta64[h]').hist(bins=50)

In [36]:
exp_df.span[(exp_df.err == True) & (exp_df.hist_len == 101)].astype('timedelta64[h]').median()

375.0

In [37]:
375/25

15.0

In [24]:
coll.Counter((exp_df.hist_len[(exp_df.err == True) & (exp_df.hist_len == 101)]))

Counter({1: 20,
         2: 19,
         3: 12,
         4: 14,
         5: 17,
         6: 10,
         7: 21,
         8: 20,
         9: 12,
         10: 14,
         11: 28,
         12: 20,
         13: 19,
         14: 15,
         15: 15,
         16: 14,
         17: 15,
         18: 18,
         19: 12,
         20: 13,
         21: 20,
         22: 14,
         23: 12,
         24: 12,
         25: 16,
         26: 17,
         27: 15,
         28: 14,
         29: 17,
         30: 16,
         31: 16,
         32: 9,
         33: 13,
         34: 23,
         35: 15,
         36: 12,
         37: 15,
         38: 16,
         39: 10,
         40: 10,
         41: 11,
         42: 19,
         43: 7,
         44: 23,
         45: 10,
         46: 19,
         47: 18,
         48: 7,
         49: 15,
         50: 13,
         51: 23,
         52: 17,
         53: 11,
         54: 11,
         55: 19,
         56: 18,
         57: 17,
         58: 13,
         59: 16,
         

In [13]:
coll.Counter(exp_df.total_exp[exp_df.exposure_inc==1])

Counter({1: 3129,
         2: 662,
         3: 302,
         4: 166,
         5: 116,
         6: 76,
         7: 70,
         8: 60,
         9: 58,
         10: 40,
         11: 31,
         12: 28,
         13: 21,
         14: 19,
         15: 17,
         16: 22,
         17: 11,
         18: 15,
         19: 6,
         20: 5,
         21: 17,
         22: 5,
         23: 8,
         24: 9,
         25: 9,
         26: 2,
         27: 3,
         28: 7,
         29: 3,
         30: 2,
         31: 4,
         33: 4,
         34: 3,
         36: 1,
         37: 1,
         38: 3,
         39: 3,
         40: 3,
         42: 1,
         43: 3,
         44: 2,
         45: 1,
         46: 1,
         47: 2,
         48: 1,
         49: 2,
         52: 1,
         53: 4,
         54: 1,
         55: 1,
         57: 1,
         60: 1,
         62: 1,
         67: 1,
         70: 1,
         71: 1,
         74: 1})

In [30]:
first = next(dp)
exposure_inc, total_exp = compute_exposure_stats(first)
pp.pprint("Exposure Inc: {}".format(exposure_inc))
pp.pprint("Total Exp: {}".format(total_exp))


[ datetime.datetime(2013, 8, 6, 2, 26, 56),
  datetime.datetime(2013, 8, 6, 2, 24, 57),
  datetime.datetime(2013, 8, 5, 21, 11, 22),
  datetime.datetime(2013, 8, 5, 21, 8, 36),
  datetime.datetime(2013, 8, 5, 20, 51),
  datetime.datetime(2013, 8, 5, 20, 39, 18),
  datetime.datetime(2013, 8, 5, 18, 45),
  datetime.datetime(2013, 8, 5, 18, 44, 45),
  datetime.datetime(2013, 8, 5, 18, 10, 51),
  datetime.datetime(2013, 8, 5, 13, 28, 15),
  datetime.datetime(2013, 8, 5, 13, 24, 13)]

[ datetime.datetime(2014, 5, 17, 19, 49, 10),
  datetime.datetime(2013, 6, 21, 19, 17, 47)]

'Exposure Inc: 0'
'Total Exp: 1'


In [51]:
ego = first[2]
nbr = first[3]

e_idx = 0 
n_idx = 0 
# ignore nbr uses after ego's use
while nbr[n_idx] >= ego[e_idx]:
    n_idx += 1
    if n_idx >= len(nbr):
        return 0

# total number of exposures before adoption
total_exp = len(nbr) - n_idx



SyntaxError: unexpected EOF while parsing (<ipython-input-51-22ce67c616da>, line 4)

In [16]:
date_list

'{"2013-04-14 02:36:56","2013-04-14 02:36:56","2013-04-14 02:36:56","2013-04-14 02:22:10","2013-04-13 19:49:18","2013-04-13 17:03:43","2013-04-13 17:03:43","2013-04-12 17:03:37","2013-04-12 17:03:37","2013-04-12 15:01:16","2013-04-12 04:03:38"}'

In [22]:
[datetime.datetime.strptime(x[1:-1], PS_DATE_FMT) for x in date_list[1:-1].split(',')]

[datetime.datetime(2013, 4, 14, 2, 36, 56),
 datetime.datetime(2013, 4, 14, 2, 36, 56),
 datetime.datetime(2013, 4, 14, 2, 36, 56),
 datetime.datetime(2013, 4, 14, 2, 22, 10),
 datetime.datetime(2013, 4, 13, 19, 49, 18),
 datetime.datetime(2013, 4, 13, 17, 3, 43),
 datetime.datetime(2013, 4, 13, 17, 3, 43),
 datetime.datetime(2013, 4, 12, 17, 3, 37),
 datetime.datetime(2013, 4, 12, 17, 3, 37),
 datetime.datetime(2013, 4, 12, 15, 1, 16),
 datetime.datetime(2013, 4, 12, 4, 3, 38)]

In [52]:
line = "foo\tbar\n"
foo = line.strip().split('\t')

In [53]:
foo

['foo', 'bar']

In [54]:
foo.pop()

'bar'