In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict, defaultdict, ChainMap, Counter
import glob
import sys 
import functools
sys.path.append('..')
sys.path.append('../../../../utils')
sys.path.append('../../../../third')
import dask.dataframe as dd
from multiprocessing import Pool, Manager, cpu_count
from joblib import Parallel, delayed
import pymp
import gezi
from gezi import tqdm
# Register tqdm's pandas integration so that `.progress_apply` (used by the
# groupby cells of this notebook) is available.
# NOTE(review): assumes `gezi.tqdm` is the standard tqdm class - confirm.
tqdm.pandas()

In [2]:
# Interaction columns of the user-action log that get aggregated per feed/day.
ACTIONS = [
  'read_comment', 'like', 'click_avatar', 'forward',
  'favorite', 'comment', 'follow',
]

# Every per-feed daily series has DAYS + 1 slots and is indexed directly by
# the `date_` value.
DAYS = 15

In [3]:
# Load the interaction log (timed) and cast the id/day columns to plain ints;
# `feedid` and `date_` are later used as dict keys and list positions.
timer = gezi.Timer('read user_action2.feather', True)
d = pd.read_feather('../input/user_action2.feather').astype(
  {'feedid': int, 'userid': int, 'date_': int}
)
timer.print()

read user_action2.feather start
read user_action2.feather duration: 0.28912949562072754


In [4]:
# Every feed id listed in feed_info.csv (in the recorded run this is a larger
# set than the feeds that appear in the interaction log).
all_feedids = set(pd.read_csv('../input/feed_info.csv')['feedid'])

In [5]:
# Order the log by day, earliest first (ascending is the default).
d = d.sort_values('date_')

In [6]:
# Peek at the first rows of the day-sorted log.
d.head(n=5)

Unnamed: 0,index,userid,feedid,date_,device,read_comment,comment,like,play,stay,click_avatar,forward,follow,favorite,version,finish_rate,stay_rate,is_first,actions
0,0,8,71474,1,1,0,0,1,500,5366,0,0,0,0,2,0.045455,0.487818,1,1
319000,6957070,237907,3404,1,2,0,0,0,13573,13871,0,0,0,0,2,0.226217,0.231183,1,0
318999,6957069,237907,87814,1,2,0,0,0,0,333,0,0,0,0,2,0.0,0.00555,1,0
318998,6957068,237907,58723,1,2,0,0,0,0,1973,0,0,0,0,2,0.0,0.07892,1,0
318997,2435793,82352,69580,1,2,0,0,0,13241,13496,0,0,0,0,2,1.203727,1.226909,1,0


In [7]:
# Overall mean of the `read_comment` column across all logged rows.
d['read_comment'].mean()

0.03501586934580252

In [8]:
# Start every known feed with an empty feature dict; later cells attach the
# per-day series to it.
doc_dynamic_feature = {int(feedid): {} for feedid in tqdm(all_feedids)}

  0%|          | 0/106444 [00:00<?, ?it/s]

In [9]:
# For each feed, collect the day of every logged row (one list entry per row).
dates = (
  d.groupby('feedid')['date_']
  .progress_apply(list)
  .reset_index(name='dates')
)

  0%|          | 0/96564 [00:00<?, ?it/s]

In [10]:
# Collapse each feed's list of days into a {day: number of rows} mapping.
dates['dates'] = dates['dates'].apply(lambda day_list: dict(Counter(day_list)))

In [11]:
# Peek at the per-feed {day: row count} mappings.
dates.head(n=5)

Unnamed: 0,feedid,dates
0,0,{10: 1}
1,1,"{1: 3, 3: 22, 4: 15, 5: 10, 6: 6}"
2,3,"{11: 2, 14: 1}"
3,4,"{1: 2, 2: 4, 4: 2, 5: 1}"
4,6,"{1: 1, 2: 1}"


In [12]:
days = DAYS

# Default: every known feed gets an all-zero `shows` series of days + 1 slots.
for feedid in all_feedids:
  doc_dynamic_feature[feedid]['shows'] = [0] * (days + 1)

# Feeds present in the log get their real per-day row counts. Slot 0 is always
# left at zero; slots 1..days hold the count for that day (0 when absent).
for row in tqdm(dates.itertuples(), total=len(dates), desc='shows'):
  day_counts = row.dates
  per_day = [0] + [day_counts.get(day, 0) for day in range(1, days + 1)]
  doc_dynamic_feature[int(row.feedid)]['shows'] = per_day

shows:   0%|          | 0/96564 [00:00<?, ?it/s]

In [13]:
# Inspect the features of the feed in the first row of the sorted log.
doc_dynamic_feature[d['feedid'].iloc[0]]

{'shows': [0, 64, 42, 60, 73, 40, 20, 20, 15, 18, 27, 14, 4, 4, 0, 0]}

In [14]:
def gen_doc_dynamic(d, feedids=None, days=None, base_actions=None):
  """Build per-feed daily sums of every action / rate column.

  Args:
    d: interaction log with `feedid`, `date_` and one column per aggregated
      signal.
    feedids: optional iterable of feed ids. When given, only these feeds are
      aggregated and returned; ids that never occur in `d` keep all-zero
      series. When None, every feed id present in `d` is used.
    days: largest valid `date_`. Every series has `days + 1` slots and is
      indexed directly by `date_`. Defaults to the module-level `DAYS`.
    base_actions: action columns to aggregate. `'actions'`, `'finish_rate'`
      and `'stay_rate'` are always appended. Defaults to the module-level
      `ACTIONS`.

  Returns:
    dict of the form `{feedid: {column: [total for date 0, ..., date days]}}`.
    Note that `finish_rate` / `stay_rate` hold per-day *sums* of the row
    values, not means.

  Raises:
    IndexError: if a selected row has a `date_` outside `[0, days]`.
    KeyError: if one of the aggregated columns is missing from `d`.
  """
  if days is None:
    days = DAYS
  if base_actions is None:
    base_actions = ACTIONS

  if feedids is not None:
    # Materialise once so a one-shot iterable is not exhausted by the filter.
    feedids = set(feedids)
    d = d[d.feedid.isin(feedids)]
  else:
    feedids = set(d.feedid)
  actions = list(base_actions) + ['actions', 'finish_rate', 'stay_rate']

  # Every requested feed starts with all-zero series, so feeds without any
  # logged row are still present in the result.
  doc_dynamic_feature = {
    int(feedid): {action: [0] * (days + 1) for action in actions}
    for feedid in feedids
  }
  if len(d) == 0:
    return doc_dynamic_feature

  # One vectorized aggregation for all columns instead of a Python-level
  # `apply(sum)` per column. Float totals may differ from a naive running sum
  # in the last bits.
  sums = d.groupby(['feedid', 'date_'])[actions].sum().reset_index()

  # Fail loudly on unexpected days: a negative index would otherwise wrap
  # around silently and write into the wrong slot.
  out_of_range = (sums['date_'] < 0) | (sums['date_'] > days)
  if out_of_range.any():
    bad = sorted(set(sums.loc[out_of_range, 'date_'].tolist()))
    raise IndexError(
      f'date_ values {bad} fall outside the expected range [0, {days}]')

  # `.tolist()` yields plain Python scalars, keeping the result pickle-friendly.
  feed_keys = [int(feedid) for feedid in sums['feedid'].tolist()]
  date_keys = [int(date) for date in sums['date_'].tolist()]
  for action in actions:
    series_by_feed = zip(feed_keys, date_keys, sums[action].tolist())
    for feedid, date, total in series_by_feed:
      doc_dynamic_feature[feedid][action][date] = total
  return doc_dynamic_feature

In [15]:
# gen_doc_dynamic(d)

In [16]:
# Split the feed ids into one shard per CPU core and aggregate every shard in
# its own pymp worker. `pymp` is already imported in the first cell.
nw = cpu_count()
feedids_list = np.array_split(list(all_feedids), nw)

# Use the manager as a context manager so its server process is shut down once
# the shard results have been copied out of the shared dict.
with Manager() as manager:
  res = manager.dict()
  with pymp.Parallel(nw) as p:
    for i in p.range(nw):
      res[i] = gen_doc_dynamic(d, feedids_list[i])
  # Copy the plain dicts out while the manager is still alive.
  shards = list(res.values())

# A worker that died before storing its shard would otherwise produce a
# silently incomplete merge.
if len(shards) != nw:
  raise RuntimeError(f'expected {nw} shard results, got {len(shards)}')

# Shards cover disjoint feed ids, so the merge order is irrelevant.
doc_dynamic_feature2 = dict(ChainMap(*shards))

# Alternative using multiprocessing.Pool (kept for reference):
# pfunc = functools.partial(gen_doc_dynamic, d=d)
# with Pool(nw) as p:
#   res = p.map(pfunc, feedids_list)
# doc_dynamic_feature2 = dict(ChainMap(*res))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

  0%|          | 0/5104 [00:00<?, ?it/s]

In [17]:
# Both dicts should cover the same number of feeds before they are merged.
print(f'{len(doc_dynamic_feature)} {len(doc_dynamic_feature2)}')

106444 106444


In [18]:
# Fold the per-action series into the dict that already holds `shows`.
for feedid, features in doc_dynamic_feature.items():
  features.update(doc_dynamic_feature2[feedid])

In [19]:
# Inspect the merged features of the feed in the first row of the sorted log.
doc_dynamic_feature[d['feedid'].iloc[0]]

{'shows': [0, 64, 42, 60, 73, 40, 20, 20, 15, 18, 27, 14, 4, 4, 0, 0],
 'read_comment': [0, 5, 5, 2, 1, 4, 0, 3, 1, 3, 2, 2, 0, 1, 0, 0],
 'like': [0, 2, 1, 1, 3, 3, 4, 2, 1, 1, 3, 3, 0, 1, 0, 0],
 'click_avatar': [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'forward': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'favorite': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'comment': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'follow': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'actions': [0, 8, 7, 3, 5, 7, 4, 5, 2, 4, 5, 5, 0, 2, 0, 0],
 'finish_rate': [0,
  32.64390909090909,
  27.221,
  36.043,
  32.32454545454546,
  28.948545454545453,
  11.886545454545457,
  11.411636363636365,
  9.01518181818182,
  9.865727272727273,
  11.901727272727273,
  8.07690909090909,
  3.9398181818181817,
  3.9409090909090914,
  0,
  0],
 'stay_rate': [0,
  42.20454545454545,
  37.18063636363637,
  44.575545454545455,
  50.346,
  38.80199999999999,
  15.386454545454546,
  16.

In [20]:
# Spot check: per-day row counts recorded for feed 36523.
dates[dates['feedid'] == 36523]

Unnamed: 0,feedid,dates
31261,36523,{14: 950}


In [21]:
# Spot check: the merged series for feed 36523, to compare against the
# per-day counts shown in the previous cell.
doc_dynamic_feature[36523]

{'shows': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 950, 0],
 'read_comment': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0],
 'like': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0],
 'click_avatar': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0],
 'forward': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'favorite': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'comment': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'follow': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'actions': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0],
 'finish_rate': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1326.4539736842105,
  0],
 'stay_rate': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1414.6834999999999,
  0]}

In [22]:
# Persist the merged feature dict next to the other input artifacts.
# NOTE(review): presumably `gezi.save_pickle` pickles the object to the given
# path - confirm against the gezi helper.
gezi.save_pickle(doc_dynamic_feature, '../input/doc_dynamic_feature.pkl')

In [23]:
# Number of logged rows (non-null `userid`) per day.
d.groupby('date_')['userid'].count()

date_
1     478489
2     461842
3     543566
4     504623
5     444828
6     472726
7     480460
8     500120
9     512466
10    596886
11    614175
12    532038
13    566627
14    609036
Name: userid, dtype: int64