In [1]:
import io
import datetime
import itertools as itr
import collections as coll
import pprint
import pandas as pd

# %matplotlib inline

# Pretty-printer with a 2-space indent, available for ad-hoc inspection
# (no cell visible in this notebook references it).
pp = pprint.PrettyPrinter(indent=2)
# strptime format used for every timestamp parsed from the input TSV files,
# e.g. "2014-02-03 17:45:10".
# NOTE(review): "PS" presumably stands for Postgres (the array literals parsed
# below look like Postgres array output) -- confirm.
PS_DATE_FMT = "%Y-%m-%d %H:%M:%S"

In [2]:
# TSV read by agg_parser: one row per (uid, hashtag) holding the ego's first
# usage of the hashtag and the array of alter update timestamps.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
agg_path = '/Users/g/Desktop/temp/agg_first_usages.tsv'
# TSV read by ego_updates_parser: one row per uid holding the array of all of
# that ego's update timestamps.
ego_updates_path = '/Users/g/Desktop/temp/ego_updates.tsv'

In [3]:
def make_list(list_str):
    '''
    Parse an array literal of timestamps into a list of datetimes.

    Strips {} from the entire array str, splits on commas, strips "" from
    each entry and finally parses each entry using PS_DATE_FMT.

    Example input: {"2014-01-02 03:04:05","2013-12-31 23:59:59"}

    An empty array ('{}') or an empty field yields an empty list instead of
    failing inside strptime on an empty string.

    Raises ValueError if a non-empty entry does not match PS_DATE_FMT.
    '''
    body = list_str.strip('{}')
    if not body.strip():
        # ''.split(',') would give [''], and strptime('') raises ValueError;
        # an empty array simply means "no timestamps".
        return []
    return [
        datetime.datetime.strptime(entry.strip('"'), PS_DATE_FMT)
        for entry in body.split(',')
    ]

def agg_parser(agg_path):
    '''
    Lazily read the aggregated first-usage TSV, one row at a time.

    Each row must have exactly 4 tab-separated columns:
        uid
        hashtag
        ego_first_usage   (timestamp in PS_DATE_FMT)
        alter_updates     (array literal of timestamps, see make_list)

    Yields (uid, hashtag, ego_first_usage, alter_updates) tuples where the
    timestamp columns have been converted to datetime objects.
    '''
    parse_ts = datetime.datetime.strptime
    with open(agg_path) as handle:
        for raw_row in handle:
            fields = raw_row.strip().split('\t')
            assert len(fields) == 4, 'Agg row does not have 4 cols'
            # uid and hashtag stay as raw strings; only the time columns are parsed
            first_usage = parse_ts(fields[2], PS_DATE_FMT)
            yield fields[0], fields[1], first_usage, make_list(fields[3])

def ego_updates_parser(ego_updates_path):
    '''
    Lazily read the ego-updates TSV, one row at a time.

    Each row must have exactly 2 tab-separated columns:
        uid
        ego_updates   (array literal of timestamps, see make_list)

    Yields (uid, ego_updates) tuples where ego_updates is a list of
    datetime objects.
    '''
    with open(ego_updates_path) as f:
        for line in f:
            elements = line.strip().split('\t')
            # The message previously claimed "4 cols" although 2 are required here.
            assert len(elements) == 2, 'Ego updates row does not have 2 cols'
            uid = elements[0]
            ego_updates = make_list(elements[1])
            yield uid, ego_updates

In [4]:
def compute_exposure_stats(
    uid,
    tag,
    ego_first_usage,
    alter_updates,
    ego_updates):
    '''
    Pass this function everything we need to compute one (ego, hashtag) pair

    NOTE(review): both timestamp lists are assumed to be sorted most recent
    first (the scans below move forward through the lists to go back in
    time) -- confirm against the query that produced the TSV files.

    Algorithm:
        1. Locate ego_first_usage in ego_updates (the adoption time).
        2. Skip alter_updates that are >= the adoption time; the remaining
           ones are the exposures prior to adoption (total_exp).
        3. If the last entry of ego_updates is later than the first remaining
           alter update, flag the history as too short and stop.
        4. Advance through ego_updates to the first entry that is <= that
           alter update, then count the consecutive remaining alter updates
           that are >= this ego update (exposure_inc).

    Returns a tuple
        (uid, tag, exposure_inc, total_exp, hist_too_short, span)
    where span is the adoption time minus the last entry of ego_updates.
    An empty alter_updates yields the zero-exposure result.

    Raises ValueError if ego_first_usage does not occur in ego_updates.
    '''
    # list.index raises ValueError when the adoption time is not in the history
    e_idx = ego_updates.index(ego_first_usage)
    adoption_time = ego_updates[e_idx]
    span = adoption_time - ego_updates[-1]
    exposure_inc = 0
    total_exp = 0
    hist_too_short = False

    # ignore nbr usages at or after ego's use; the bounds check also covers
    # an empty alter_updates, which previously raised IndexError
    n_alter = len(alter_updates)
    n_idx = 0
    while n_idx < n_alter and alter_updates[n_idx] >= adoption_time:
        n_idx += 1
    if n_idx == n_alter:
        # No exposure prior to adoption, return zeros
        return (uid, tag, exposure_inc, total_exp, hist_too_short, span)

    # total number of exposures before adoption
    total_exp = n_alter - n_idx
    latest_exposure = alter_updates[n_idx]

    # if last ego update is after the most recent alter update, bail out
    hist_too_short = ego_updates[-1] > latest_exposure
    if hist_too_short:
        return (uid, tag, exposure_inc, total_exp, hist_too_short, span)

    # strip non-exposure intervals from head
    while ego_updates[e_idx] > latest_exposure:
        e_idx += 1
        if e_idx == len(ego_updates):
            # User history too short to overlap
            return (uid, tag, exposure_inc, total_exp, hist_too_short, span)

    # count the run of alter updates that are >= the ego update reached above
    window_start = ego_updates[e_idx]
    for alter_time in alter_updates[n_idx:]:
        if alter_time >= window_start:
            exposure_inc += 1
        else:
            break

    return (uid, tag, exposure_inc, total_exp, hist_too_short, span)

In [5]:
# Cap on how many distinct egos are pre-loaded. The original loop stopped via
# `len(ego_updates_dict) % 100 == 0`, i.e. after the first 100 egos; the cap is
# now explicit. Use a positive int, or None to load the whole file.
# NOTE(review): the recorded outputs further down contain 10,783 rows, which
# suggests they came from a run without this cap -- confirm the intended value.
MAX_EGOS = 100

# need to pre-load the ego_updates since we will use them many times
ego_updates_dict = {}
for uid, ego_updates in ego_updates_parser(ego_updates_path):
    ego_updates_dict[uid] = ego_updates
    if MAX_EGOS is not None and len(ego_updates_dict) >= MAX_EGOS:
        break

# one result tuple per (ego, hashtag) row whose ego was pre-loaded above;
# rows for egos outside ego_updates_dict are skipped
exposure_data = []
for uid, tag, ego_first_usage, alter_updates in agg_parser(agg_path):
    if uid in ego_updates_dict:
        res = compute_exposure_stats(
            uid, tag, ego_first_usage, alter_updates, ego_updates_dict[uid]
        )
        exposure_data.append(res)

# column order mirrors the tuple returned by compute_exposure_stats;
# 'err' holds its hist_too_short flag
exp_df = pd.DataFrame.from_records(
    exposure_data,
    columns=['uid', 'tag', 'exposure_inc', 'total_exp', 'err', 'span'],
)

In [6]:
# Show the exposure table: one row per (uid, tag) with its exposure counts,
# the 'err' (history-too-short) flag and the ego's history span.
exp_df

Unnamed: 0,uid,tag,exposure_inc,total_exp,err,span
0,22167545,yeahright,1,2,False,1274 days 22:59:32
1,23270835,yeahright,0,0,False,231 days 17:06:43
2,23349470,rare,0,0,False,968 days 15:54:34
3,27107246,thetruth,1,1,False,1637 days 01:33:41
4,27578762,thetruth,0,0,False,662 days 02:23:11
5,30310944,FuckIt,0,0,False,81 days 13:19:45
6,32480116,yeahright,0,1,True,773 days 02:08:10
7,33148717,rare,0,5,True,0 days 22:48:43
8,33700456,rare,1,4,False,39 days 05:26:01
9,33846841,noexcuses,1,1,False,741 days 22:11:26


In [8]:
# All uids in row order; a uid repeats once per hashtag row it has.
list(exp_df['uid'])

['22167545',
 '23270835',
 '23349470',
 '27107246',
 '27578762',
 '30310944',
 '32480116',
 '33148717',
 '33700456',
 '33846841',
 '36398000',
 '37608007',
 '37608007',
 '42036792',
 '42036792',
 '42036792',
 '42036792',
 '45458084',
 '46056060',
 '51947272',
 '59862949',
 '83402675',
 '95092996',
 '105064622',
 '128222962',
 '162110552',
 '173259641',
 '174792347',
 '180639422',
 '186271603',
 '186483324',
 '186848962',
 '195077460',
 '200922832',
 '214906777',
 '222485208',
 '232707564',
 '234257876',
 '241377893',
 '255909606',
 '257050654',
 '260605003',
 '266957115',
 '284858962',
 '293801920',
 '296848576',
 '299220023',
 '320815422',
 '325362684',
 '325362684',
 '326623402',
 '331915191',
 '333610398',
 '333610398',
 '333610398',
 '336271445',
 '337311184',
 '344151944',
 '346766131',
 '349333357',
 '352228929',
 '359068098',
 '361402472',
 '375843511',
 '377741986',
 '392495513',
 '393606992',
 '406724804',
 '409694839',
 '412651007',
 '416100197',
 '416100197',
 '421376683',
 

In [23]:
# Histogram of total_exp restricted to rows where exposure_inc is exactly 1.
coll.Counter(exp_df.loc[exp_df['exposure_inc'] == 1, 'total_exp'])

Counter({1: 239,
         2: 92,
         3: 54,
         4: 25,
         5: 15,
         6: 20,
         7: 12,
         8: 8,
         9: 3,
         10: 5,
         11: 4,
         12: 4,
         13: 1,
         14: 4,
         15: 3,
         18: 1,
         19: 1,
         21: 1,
         51: 1})

In [29]:
# How many rows were flagged as having too short an ego history ('err').
coll.Counter(exp_df['err'])

Counter({False: 8633, True: 2150})

In [30]:
# Number of rows per hashtag.
coll.Counter(exp_df['tag'])

Counter({'FuckIt': 1071,
         'flappybird': 776,
         'focused': 1185,
         'noexcuses': 1147,
         'rare': 1230,
         'sosick': 1242,
         'sub': 1431,
         'thetruth': 935,
         'up': 736,
         'yeahright': 1030})