In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict, defaultdict
import glob
import sys, os
sys.path.append('..')
sys.path.append('../../../../utils')
sys.path.append('../../../../third')
import gezi
from gezi import tqdm
tqdm.pandas()

In [2]:
# Training impressions: the v2 feather file is required, the v1 CSV is optional.
# `version` tags which data release each row came from.
with gezi.Timer('read train user_actions'):
  d = pd.read_feather('../input/user_action2.feather')
  d['version'] = 2
  try:
    d1 = pd.read_csv('../input/v1/user_action.csv')
  except FileNotFoundError as e:
    # Only a missing v1 file is an expected condition; anything else
    # (parse errors, bad permissions, ...) should surface instead of being hidden.
    print('v1 user_action not loaded, continuing with v2 only: %s' % e)
  else:
    d1['version'] = 1
    d = pd.concat([d, d1])

# Test impressions: v2 test_a is always read. When both v1 test files exist they are
# appended; otherwise v2 test_b is appended instead.
with gezi.Timer('read test and merge'):
  dt = pd.read_csv('../input/test_a.csv')
  dt['version'] = 2
  try:
    dt1a = pd.read_csv('../input/v1/test_a.csv')
    dt1b = pd.read_csv('../input/v1/test_b.csv')
  except FileNotFoundError as e:
    print('v1 test files not loaded, falling back to v2 test_b: %s' % e)
    dtb = pd.read_csv('../input/test_b.csv')
    dtb['version'] = 2
    dt = pd.concat([dt, dtb])
  else:
    dt1a['version'] = 1
    dt1b['version'] = 1
    # NOTE(review): v2 test_b is not loaded on this path -- confirm that is intended.
    dt = pd.concat([dt, dt1a, dt1b])
  # Stamp every test row with date_ 15 and zero actions so it shares the
  # column set selected below.
  dt['date_'] = 15
  dt['actions'] = 0

# Combined train + test impressions, restricted to the shared columns and
# ordered by date_. The index is not reset, so index labels repeat across sources.
cols = ['userid', 'feedid', 'date_', 'version', 'actions']
dshow = pd.concat([d[cols], dt[cols]])
dshow = dshow.sort_values(['date_'], ascending=True)

read train user_actions start
read train user_actions duration: 0.30967187881469727
read test and merge start
read test and merge duration: 0.17609405517578125


In [3]:
# Preview the first rows of the combined train + test frame (sorted by date_).
dshow.head()

Unnamed: 0,userid,feedid,date_,version,actions
0,8,71474,1,2,1
319000,237907,3404,1,2,0
318999,237907,87814,1,2,0
318998,237907,58723,1,2,0
318997,82352,69580,1,2,0


In [4]:
# Earliest date_ on which each feed appears in the combined impressions.
# The built-in groupby min() yields the same per-feed value as applying Python's
# min() to every group, without one Python-level call per feed and without
# requiring tqdm.pandas() to have been registered.
feed_start = dshow.groupby('feedid')['date_'].min().reset_index(name='feed_start')

  0%|          | 0/99420 [00:00<?, ?it/s]

In [5]:
# Preview the per-feed earliest date_ table.
feed_start.head()

Unnamed: 0,feedid,feed_start
0,0,10
1,1,1
2,3,11
3,4,1
4,6,1


In [6]:
# Attach each feed's earliest date_ to every impression row (default inner join on feedid).
# Rebinds `d`, which until now held the training frame.
d = dshow.merge(feed_start, on='feedid')

In [7]:
# Days elapsed between the feed's earliest date_ and this impression (0 on the feed's first day).
d['fresh'] = d['date_'].sub(d['feed_start'])

In [8]:
# Total `actions` per (userid, date_). The built-in groupby sum() replaces a Python
# sum() call per group; results match for a column without missing values
# (sum() would skip NaN where the Python built-in would propagate it).
actions_today = d.groupby(['userid', 'date_'])['actions'].sum().reset_index(name='actions_today')

  0%|          | 0/190372 [00:00<?, ?it/s]

In [9]:
# Preview the per-user, per-date action totals.
actions_today.head()

Unnamed: 0,userid,date_,actions_today
0,8,1,5
1,8,2,9
2,8,3,3
3,8,5,3
4,8,6,2


In [10]:
# actions_prev = sum of the user's `actions_today` over all of that user's earlier rows
# (0 on the user's first row). Computed as the per-user running total minus the
# current row, i.e. an exclusive cumulative sum. Rows must be in date_ order within
# each user, which holds for the groupby output that built `actions_today`.
# Unlike a row-by-row scan, this does not rely on a user's rows being contiguous.
running_total = actions_today.groupby('userid')['actions_today'].cumsum()
actions_prev = running_total - actions_today['actions_today']
actions_today['actions_prev'] = actions_prev

  0%|          | 0/190372 [00:00<?, ?it/s]

In [11]:
# Spot-check one user (userid 129): daily totals next to the running prior total, up to 100 rows.
actions_today[actions_today.userid==129].head(100)

Unnamed: 0,userid,date_,actions_today,actions_prev
90,129,2,0,0
91,129,5,0,0
92,129,6,0,0
93,129,7,2,0
94,129,8,0,2
95,129,9,0,2
96,129,10,1,2
97,129,11,0,3
98,129,12,0,3
99,129,13,0,3


In [12]:
# Bring the per-(userid, date_) totals onto every impression row (default inner join).
# Not idempotent: running this cell twice merges the same columns again.
d = d.merge(actions_today, on=['userid', 'date_'])

In [13]:
# Preview the impressions with feed_start, fresh, actions_today and actions_prev attached.
d.head()

Unnamed: 0,userid,feedid,date_,version,actions,feed_start,fresh,actions_today,actions_prev
0,8,71474,1,2,1,1,0,5,0
1,8,50282,1,2,0,1,0,5,0
2,8,62705,1,2,1,1,0,5,0
3,8,84318,1,2,0,1,0,5,0
4,8,69745,1,2,0,1,0,5,0


In [15]:
# Summary statistics for the rows with date_ == 14.
d[d.date_ == 14].describe()

Unnamed: 0,userid,feedid,date_,version,actions,feed_start,fresh,actions_today,actions_prev
count,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0
mean,124279.308573,57009.594791,14.0,2.0,0.070904,9.889136,4.110864,6.165013,27.510187
std,72055.974325,32601.355963,0.0,0.0,0.278249,3.885904,3.885904,13.59474,51.8841
min,8.0,3.0,14.0,2.0,0.0,1.0,0.0,0.0,0.0
25%,60576.0,29023.0,14.0,2.0,0.0,7.0,1.0,0.0,5.0
50%,124086.0,57141.0,14.0,2.0,0.0,11.0,3.0,2.0,10.0
75%,187056.0,85132.0,14.0,2.0,0.0,13.0,7.0,6.0,25.0
max,250229.0,112871.0,14.0,2.0,4.0,14.0,13.0,183.0,628.0


In [16]:
# Summary statistics for the rows with date_ == 15 (the value stamped on the test rows at load time).
d[d.date_ == 15].describe()

Unnamed: 0,userid,feedid,date_,version,actions,feed_start,fresh,actions_today,actions_prev
count,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0
mean,124462.094549,57065.875044,15.0,2.0,0.0,10.978918,4.021082,0.0,29.66734
std,72329.318289,32425.157875,0.0,0.0,0.0,4.010435,4.010435,0.0,56.129751
min,8.0,0.0,15.0,2.0,0.0,1.0,0.0,0.0,0.0
25%,60866.0,29674.0,15.0,2.0,0.0,8.0,1.0,0.0,5.0
50%,125668.0,57114.0,15.0,2.0,0.0,13.0,2.0,0.0,11.0
75%,187052.0,84948.0,15.0,2.0,0.0,14.0,7.0,0.0,27.0
max,250236.0,112871.0,15.0,2.0,0.0,15.0,14.0,0.0,827.0


In [26]:
# 1 when the impression falls on the feed's earliest date_ (fresh == 0), else 0.
d['new_doc'] = d['fresh'].eq(0).astype(int)

In [27]:
# 1 when the user has no actions on any earlier date (actions_prev == 0), else 0.
# This flags "no prior actions", which also covers users who were shown feeds before but never acted.
d['new_user'] = d['actions_prev'].eq(0).astype(int)

In [29]:
# Summary statistics for date_ == 14 again, now including the new_doc / new_user flags.
d[d.date_ == 14].describe()

Unnamed: 0,userid,feedid,date_,version,actions,feed_start,fresh,actions_today,actions_prev,new_doc,new_user
count,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0,609036.0
mean,124279.308573,57009.594791,14.0,2.0,0.070904,9.889136,4.110864,6.165013,27.510187,0.191659,0.04127
std,72055.974325,32601.355963,0.0,0.0,0.278249,3.885904,3.885904,13.59474,51.8841,0.393606,0.198915
min,8.0,3.0,14.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,60576.0,29023.0,14.0,2.0,0.0,7.0,1.0,0.0,5.0,0.0,0.0
50%,124086.0,57141.0,14.0,2.0,0.0,11.0,3.0,2.0,10.0,0.0,0.0
75%,187056.0,85132.0,14.0,2.0,0.0,13.0,7.0,6.0,25.0,0.0,0.0
max,250229.0,112871.0,14.0,2.0,4.0,14.0,13.0,183.0,628.0,1.0,1.0


In [30]:
# Summary statistics for date_ == 15 again, now including the new_doc / new_user flags.
d[d.date_ == 15].describe()

Unnamed: 0,userid,feedid,date_,version,actions,feed_start,fresh,actions_today,actions_prev,new_doc,new_user
count,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0,841631.0
mean,124462.094549,57065.875044,15.0,2.0,0.0,10.978918,4.021082,0.0,29.66734,0.171991,0.010146
std,72329.318289,32425.157875,0.0,0.0,0.0,4.010435,4.010435,0.0,56.129751,0.377373,0.100214
min,8.0,0.0,15.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,60866.0,29674.0,15.0,2.0,0.0,8.0,1.0,0.0,5.0,0.0,0.0
50%,125668.0,57114.0,15.0,2.0,0.0,13.0,2.0,0.0,11.0,0.0,0.0
75%,187052.0,84948.0,15.0,2.0,0.0,14.0,7.0,0.0,27.0,0.0,0.0
max,250236.0,112871.0,15.0,2.0,0.0,15.0,14.0,0.0,827.0,1.0,1.0


In [14]:
# NOTE(review): `abcd` is not defined anywhere visible, so this cell raises NameError.
# Presumably a deliberate stop marker to halt "Run All" before the cells below -- confirm.
abcd

NameError: name 'abcd' is not defined

In [None]:
# Preview the current contents of `d`.
d.head()

In [None]:
# One row per (userid, date_) holding the list of feedids in that group.
feeds_by_user_day = d.groupby(['userid', 'date_'])['feedid']
d2 = feeds_by_user_day.progress_apply(list).reset_index(name='feedids')

In [None]:
# Length of each feedids list, i.e. how many rows the (userid, date_) group had.
d2['shows'] = d2['feedids'].apply(len)

In [None]:
# Summary statistics of d2 with extra upper percentiles (90th, 99th, 99.9th).
d2.describe([.25,.5,.75,.9,.99,.999])

In [None]:
# Same summary restricted to the rows with date_ == 14.
d2[d2.date_ == 14].describe([.25,.5,.75,.9,.99,.999])

In [None]:
# test_a on its own: stamp date_ = 15, then collect each user's feedid list and its length.
da = pd.read_csv('../input/test_a.csv').assign(date_=15)
feeds_by_user_a = da.groupby(['userid', 'date_'])['feedid']
da2 = feeds_by_user_a.progress_apply(list).reset_index(name='feedids')
da2['shows'] = da2['feedids'].apply(len)

In [None]:
# Summary statistics of da2 (built from test_a) with extra upper percentiles.
da2.describe([.25,.5,.75,.9,.99,.999])

In [None]:
# test_b on its own: stamp date_ = 15, then collect each user's feedid list and its length.
db = pd.read_csv('../input/test_b.csv').assign(date_=15)
feeds_by_user_b = db.groupby(['userid', 'date_'])['feedid']
db2 = feeds_by_user_b.progress_apply(list).reset_index(name='feedids')
db2['shows'] = db2['feedids'].apply(len)

In [None]:
# Summary statistics of db2 (built from test_b) with extra upper percentiles.
db2.describe([.25,.5,.75,.9,.99,.999])

In [None]:
# CSV read by statis_feature() as its history source.
USER_ACTION = '../input/user_action.csv'
# Directory into which statis_feature() writes its <dim>_feature.csv files.
ROOT_PATH = '../input'
# Behaviour columns that statis_feature() aggregates.
FEA_COLUMN_LIST = ["read_comment", "like", "click_avatar",  "forward", "comment", "follow", "favorite"]
# Largest date_ stamp statis_feature() produces: window starts stop at END_DAY - before_day.
END_DAY = 15

In [None]:
def statis_feature(start_day=1, before_day=7, agg='sum'):
    """
    Aggregate each user's / feed's behaviour columns over the past n days.

    For every key ("userid", then "feedid") and every window start in
    range(start_day, END_DAY - before_day + 1), the rows with
    start <= date_ < start + before_day are grouped by the key and each column
    of FEA_COLUMN_LIST is aggregated with `agg`. Each window's result is stamped
    with date_ = start + before_day; all windows are concatenated and written to
    <ROOT_PATH>/<key>_feature.csv. Nothing is returned.

    :param start_day: Int. First window start date.
    :param before_day: Int. Window length in days.
    :param agg: String. Aggregation method, passed (inside a list) to groupby .agg.
    """
    wanted = ["userid", "date_", "feedid"] + FEA_COLUMN_LIST
    history_data = pd.read_csv(USER_ACTION)[wanted]
    feature_dir = ROOT_PATH
    window_starts = range(start_day, END_DAY - before_day + 1)
    for dim in ("userid", "feedid"):
        print(dim)
        dim_history = history_data[[dim, "date_"] + FEA_COLUMN_LIST]
        windows = []
        for start in tqdm(window_starts):
            stop = start + before_day
            in_window = (dim_history["date_"] >= start) & (dim_history["date_"] < stop)
            stats = dim_history[in_window].drop(columns=['date_'])
            stats = stats.groupby([dim]).agg([agg]).reset_index()
            # Flatten the (column, agg) MultiIndex into plain names, e.g. "like" + "sum".
            stats.columns = [''.join(pair) for pair in stats.columns.values]
            stats["date_"] = stop
            windows.append(stats)
        dim_feature = pd.concat(windows)
        feature_path = os.path.join(feature_dir, dim+"_feature.csv")
        print('Save to: %s'%feature_path)
        dim_feature.to_csv(feature_path, index=False)

In [None]:
# Build and save the window features with the defaults (start_day=1, before_day=7, agg='sum').
statis_feature()

In [None]:
# Load the per-feed feature file from '../input'.
# NOTE: this rebinds `d`, replacing the impressions frame built earlier in the notebook.
d = pd.read_csv('../input/feedid_feature.csv')

In [None]:
# Preview the loaded per-feed features.
d.head()

In [None]:
# All feature rows for feedid 1.
d[d.feedid == 1]

In [None]:
# Distinct date_ values present in the loaded feature file.
set(d.date_)

In [None]:
# Load the raw user_action CSV (same path as USER_ACTION) to compare against the features.
his = pd.read_csv('../input/user_action.csv')

In [None]:
# All raw rows for feedid 1.
his[his.feedid==1]