In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict, defaultdict, ChainMap, Counter
import glob
import sys 
import functools
sys.path.append('..')
sys.path.append('../../../../utils')
sys.path.append('../../../../third')
import dask.dataframe as dd
from multiprocessing import Pool, Manager, cpu_count
from joblib import Parallel, delayed
import pymp
import gezi
from gezi import tqdm
tqdm.pandas()

In [2]:
# Action columns of the user-action log; each one is summed per
# (feedid, date_) when the per-feed daily features are built.
ACTIONS = [
    'read_comment',
    'like',
    'click_avatar',
    'forward',
    'favorite',
    'comment',
    'follow',
]
# Highest day index tracked: every per-day list has DAYS + 1 slots and is
# indexed directly by date_.
DAYS = 15

In [3]:
# Load the user-action log and cast the id / day columns to integers.
d = pd.read_csv('../input/user_action2.csv')
d = d.astype({'feedid': int, 'userid': int, 'date_': int})

In [4]:
# Order the log rows by date_, earliest day first.
d = d.sort_values(['date_'], ascending=True)

In [5]:
d.head()

Unnamed: 0,userid,feedid,date_,device,read_comment,comment,like,play,stay,click_avatar,forward,follow,favorite,finish_rate,stay_rate,is_first,actions
0,8,71474,1,1,0,0,1,500,5366,0,0,0,0,0.045455,0.487818,1,1
319000,237907,3404,1,2,0,0,0,13573,13871,0,0,0,0,0.226217,0.231183,1,0
318999,237907,87814,1,2,0,0,0,0,333,0,0,0,0,0.0,0.00555,1,0
318998,237907,58723,1,2,0,0,0,0,1973,0,0,0,0,0.0,0.07892,1,0
318997,82352,69580,1,2,0,0,0,13241,13496,0,0,0,0,1.203727,1.226909,1,0


In [6]:
d.read_comment.mean()

0.03501586934580252

In [6]:
# For every feedid, collect the date_ value of each of its log rows into a
# list (one list entry per row), stored in a new `dates` column.
dates = d.groupby(['feedid'])['date_'].progress_apply(list).reset_index(name='dates')

  0%|          | 0/96564 [00:00<?, ?it/s]

In [7]:
# Collapse each feed's list of days into a {day: number of rows on that day} dict.
dates['dates'] = dates.dates.apply(lambda x:dict(Counter(x)))

In [8]:
dates

Unnamed: 0,feedid,dates
0,0,{10: 1}
1,1,"{1: 3, 3: 22, 4: 15, 5: 10, 6: 6}"
2,3,"{11: 2, 14: 1}"
3,4,"{1: 2, 2: 4, 4: 2, 5: 1}"
4,6,"{1: 1, 2: 1}"
...,...,...
96559,112866,"{6: 4, 7: 3, 9: 1, 10: 1, 11: 3, 12: 3, 13: 4}"
96560,112868,"{11: 132, 12: 82, 13: 73, 14: 48}"
96561,112869,"{13: 7, 14: 14}"
96562,112870,"{1: 2, 3: 1, 4: 3, 6: 1}"


In [9]:
# Earliest day on which each feed has any log row (smallest key of its per-day dict).
dates['start_day'] = dates.dates.apply(lambda x: min(x.keys()))

In [10]:
dates

Unnamed: 0,feedid,dates,start_day
0,0,{10: 1},10
1,1,"{1: 3, 3: 22, 4: 15, 5: 10, 6: 6}",1
2,3,"{11: 2, 14: 1}",11
3,4,"{1: 2, 2: 4, 4: 2, 5: 1}",1
4,6,"{1: 1, 2: 1}",1
...,...,...,...
96559,112866,"{6: 4, 7: 3, 9: 1, 10: 1, 11: 3, 12: 3, 13: 4}",6
96560,112868,"{11: 132, 12: 82, 13: 73, 14: 48}",11
96561,112869,"{13: 7, 14: 14}",13
96562,112870,"{1: 2, 3: 1, 4: 3, 6: 1}",1


In [40]:
# One empty feature dict per distinct feed id; later cells fill in the per-day
# lists. Iterating the distinct ids avoids re-assigning the same key once per
# log row, and keeps the same first-appearance key order.
doc_dynamic_feature = {int(feedid): {} for feedid in d.feedid.unique()}

In [41]:
# Expand each feed's {day: row count} dict into a dense list with DAYS + 1
# slots (slot 0 is always 0, days without rows stay 0) and store it under the
# 'shows' key of that feed's feature dict.
days = DAYS
for row in tqdm(dates.itertuples(), total=len(dates), desc='shows'):
  record = row._asdict()
  day_counts = record['dates']
  shows = [0] + [day_counts.get(day, 0) for day in range(1, days + 1)]
  doc_dynamic_feature[int(record['feedid'])]['shows'] = shows

shows:   0%|          | 0/96564 [00:00<?, ?it/s]

In [42]:
doc_dynamic_feature[d.feedid.values[0]]

{'shows': [0, 64, 42, 60, 73, 40, 20, 20, 15, 18, 27, 14, 4, 4, 0, 0]}

In [27]:
def gen_doc_dynamic(d, feedids=None):
  """Build per-feed, per-day totals for every aggregated column.

  Args:
    d: DataFrame with `feedid`, `date_` and each aggregated column (the names
      in ACTIONS plus 'actions', 'finish_rate' and 'stay_rate').
    feedids: optional iterable of feed ids; when given, only rows belonging to
      those feeds are processed.

  Returns:
    dict mapping int feedid -> {column name: list of DAYS + 1 values}. Index
    `date_` of a list holds the sum of that column over the feed's rows on
    that day; slots for days without rows stay 0.
  """
  if feedids is not None:
    d = d[d.feedid.isin(set(feedids))]
  dg = d.groupby(['feedid', 'date_'])
  actions = ACTIONS + ['actions', 'finish_rate', 'stay_rate']
  days = DAYS
  # One entry per distinct feed id (no need to revisit the key for every row).
  doc_dynamic_feature = {int(feedid): {} for feedid in d.feedid.unique()}
  t = tqdm(actions)
  for action in t:
    t.set_postfix({'action': action})
    count_col = f'{action}_count'
    # Builtin sum via progress_apply is kept so the totals (and the progress
    # bars) stay exactly as before.
    da = dg[action].progress_apply(sum).reset_index(name=count_col)
    rows = zip(da['feedid'], da['date_'], da[count_col])
    for feedid, date, total in tqdm(rows, total=len(da), desc=count_col):
      # Lazily create the dense per-day list the first time a feed is seen
      # for this column, then drop the day's total into its slot.
      per_day = doc_dynamic_feature[int(feedid)].setdefault(action, [0] * (days + 1))
      per_day[date] = total
  return doc_dynamic_feature

In [17]:
# gen_doc_dynamic(d)

In [28]:
# Run gen_doc_dynamic in parallel: the distinct feed ids are split into nw
# chunks (nw = CPU count) and, inside a pymp.Parallel(nw) region, each chunk's
# result dict is stored in a Manager-backed dict under its chunk index.
# NOTE(review): how p.range distributes indices across workers is defined by
# pymp — confirm against its documentation.
import pymp
nw = cpu_count()
feedids_list = np.array_split(list(set(d.feedid)), nw)
res = Manager().dict()
with pymp.Parallel(nw) as p:
  for i in p.range(nw):
    res[i] = gen_doc_dynamic(d, feedids_list[i])

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

read_comment_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

like_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

click_avatar_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

forward_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

favorite_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

comment_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

follow_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

actions_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

finish_rate_count:   0%|          | 0/5069 [00:00<?, ?it/s]

  0%|          | 0/5069 [00:00<?, ?it/s]

stay_rate_count:   0%|          | 0/5069 [00:00<?, ?it/s]

In [30]:
# Merge the per-chunk result dicts into one plain dict keyed by feedid. The
# chunks come from a split of the distinct feed ids, so their keys are disjoint.
doc_dynamic_feature2 = dict(ChainMap(*res.values()))

In [43]:
# Fold the per-column daily totals into each feed's entry, next to 'shows'.
for feedid, features in doc_dynamic_feature.items():
  features.update(doc_dynamic_feature2[feedid])

In [44]:
doc_dynamic_feature[d.feedid.values[0]]

{'shows': [0, 64, 42, 60, 73, 40, 20, 20, 15, 18, 27, 14, 4, 4, 0, 0],
 'read_comment': [0, 5, 5, 2, 1, 4, 0, 3, 1, 3, 2, 2, 0, 1, 0, 0],
 'like': [0, 2, 1, 1, 3, 3, 4, 2, 1, 1, 3, 3, 0, 1, 0, 0],
 'click_avatar': [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'forward': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'favorite': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'comment': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'follow': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'actions': [0, 8, 7, 3, 5, 7, 4, 5, 2, 4, 5, 5, 0, 2, 0, 0],
 'finish_rate': [0,
  32.64390909090909,
  27.221,
  36.043,
  32.32454545454546,
  28.948545454545453,
  11.886545454545457,
  11.411636363636365,
  9.01518181818182,
  9.865727272727272,
  11.901727272727273,
  8.07690909090909,
  3.9398181818181817,
  3.9409090909090914,
  0,
  0],
 'stay_rate': [0,
  42.20454545454545,
  37.18063636363637,
  44.575545454545455,
  50.346000000000004,
  38.80199999999999,
  15.3864545454

In [52]:
dates[dates.feedid==36523]

Unnamed: 0,feedid,dates,start_day
31261,36523,{14: 950},14


In [49]:
doc_dynamic_feature[36523]

{'shows': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 950, 0],
 'read_comment': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0],
 'like': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0],
 'click_avatar': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0],
 'forward': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'favorite': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'comment': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'follow': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'actions': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0],
 'finish_rate': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1326.4539736842105,
  0],
 'stay_rate': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1414.6835, 0]}

In [45]:
# Persist the merged per-feed daily feature dict to disk via gezi's pickle helper.
gezi.save_pickle(doc_dynamic_feature, '../input/doc_dynamic_feature.pkl')

In [46]:
d.finish_rate.mean()

0.9686956016196298

In [47]:
d.stay_rate.mean()

1.158361276127292

In [None]:
# Write the per-feed table (feedid, per-day row counts, start_day) to CSV without the index.
# NOTE(review): the `dates` column holds dicts, so the CSV stores their string
# form — confirm that downstream readers parse it back as intended.
dates.to_csv('../input/doc_static_feature.csv', index=False)

In [None]:
# Same aggregation as for feeds, but per user: list the date_ of every log row of each userid.
dates2 = d.groupby(['userid'])['date_'].progress_apply(list).reset_index(name='dates')

In [None]:
# Collapse each user's list of days into a {day: number of rows on that day} dict.
dates2['dates'] = dates2.dates.apply(lambda x:dict(Counter(x)))

In [None]:
dates2

In [None]:
d.groupby(['date_'])['userid'].count()

In [None]:
d.groupby(['date_'])['feedid'].count()

In [None]:
import numpy as np

def wilson_ctr(clks, imps, z=1.96):
    """Lower bound of the Wilson score interval for a click-through rate.

    Args:
        clks: number of clicks (scalar).
        imps: number of impressions (scalar).
        z: z-score that sets the width of the interval (default 1.96).

    Returns:
        The Wilson lower bound for clks / imps. Returns 0.0 when there are no
        impressions, and also when the raw rate is above 0.9 (existing
        behavior; the rationale is not stated in this file).
    """
    # Without impressions the raw rate is undefined; report 0.0 instead of
    # dividing by zero.
    if imps <= 0:
        return 0.0

    origin_ctr = clks * 1.0 / imps
    if origin_ctr > 0.9:
        return 0.0

    n = imps
    z_sq = z ** 2
    # (p + z^2/2n - z * sqrt(p(1-p)/n + z^2/4n^2)) / (1 + z^2/n)
    centre = origin_ctr + z_sq / (2 * n)
    margin = z * np.sqrt(origin_ctr * (1 - origin_ctr) / n + z_sq / (4 * (n ** 2)))
    return (centre - margin) / (1 + z_sq / n)

# Same 50% raw rate at growing sample sizes: print the Wilson lower bound of each.
test_case = [(5, 10), (50, 100), (500, 1000), (5000, 10000)]
for clks, imps in test_case:
    print(wilson_ctr(clks, imps))


In [None]:
import numpy
import random
import scipy.special as special
 
class BayesianSmoothing(object):
    """Beta-prior smoothing of click-through rates.

    Holds the Beta parameters `alpha` and `beta` and refines them from observed
    (impressions, clicks) pairs with a digamma-based fixed-point iteration.
    """

    def __init__(self, alpha, beta):
        """Store the initial Beta parameters."""
        self.alpha = alpha
        self.beta = beta

    def sample(self, alpha, beta, num, imp_upperbound):
        """Draw `num` synthetic (clicks, impressions) pairs.

        Click rates are sampled from Beta(alpha, beta) using the arguments
        (not the instance attributes); every pair uses exactly
        `imp_upperbound` impressions.

        Returns:
            (clicks, exposes): two lists of length `num`.
        """
        # alpha / beta are the parameters of the prior distribution.
        clicks = []
        exposes = []
        for clk_rt in numpy.random.beta(alpha, beta, num):
            imp = imp_upperbound
            clk = int(imp * clk_rt)
            exposes.append(imp)
            clicks.append(clk)
        return clicks, exposes

    def update(self, imps, clks, iter_num=1000, epsilon=1e-5):
        """Refine self.alpha / self.beta from observed data.

        Args:
            imps: sequence of impression counts.
            clks: sequence of click counts, aligned with `imps`.
            iter_num: maximum number of fixed-point steps.
            epsilon: stop once both parameters move by less than this in one
                step; that final sub-epsilon step is not applied.
        """
        for _ in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(imps, clks, self.alpha, self.beta)
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, imps, clks, alpha, beta):
        """Perform one fixed-point step and return the new (alpha, beta)."""
        numerator_alpha = 0.0
        numerator_beta = 0.0
        denominator = 0.0

        for i in range(len(imps)):
            numerator_alpha += (special.digamma(clks[i] + alpha) - special.digamma(alpha))
            numerator_beta += (special.digamma(imps[i] - clks[i] + beta) - special.digamma(beta))
            denominator += (special.digamma(imps[i] + alpha + beta) - special.digamma(alpha + beta))

        # No observations, or only zero-impression ones: there is nothing to
        # learn from, so keep the parameters instead of dividing by zero
        # (which would raise or silently turn them into NaN/inf).
        if denominator == 0:
            return alpha, beta
        return alpha * (numerator_alpha / denominator), beta * (numerator_beta / denominator)
    
def main():
    """Demo: fit the Beta prior on four samples with a 50% raw rate and print
    each raw rate next to its smoothed value."""
    # Alternative input: synthetic data from smoother.sample(500, 500, 10, 1000).
    clk = [5, 50, 500, 5000]
    exp = [10, 100, 1000, 10000]
    smoother = BayesianSmoothing(1, 1)

    print('原始数据')
    for pair in zip(clk, exp):
        print(*pair)

    smoother.update(exp, clk)
    print('bayes光滑先验分布参数：', smoother.alpha, smoother.beta)

    for clicks, exposures in zip(clk, exp):
        origin_ctr = clicks / exposures
        new_ctr = (clicks + smoother.alpha) / (exposures + smoother.alpha + smoother.beta)
        print('修正前{}, 修正后{}'.format(round(origin_ctr, 3), round(new_ctr, 3)))
    
if __name__ == '__main__':
    main()