In [1]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
import time
from collections import defaultdict
import gezi
import melt

In [2]:
def get_day(timestamp):
  x = time.localtime(timestamp)
  return x.tm_mday

def get_mon_day(timestamp):
  x = time.localtime(timestamp)
  return x.tm_mon, x.tm_mday

# 时间穿越或者当天的历史去掉 测试集合没有当天历史
def is_badtime(x, timestamp):
  if get_mon_day(x) >= get_mon_day(timestamp):
    return True

context_cols = ['prev', 'mod', 'mf', 'aver', 'sver', 'region']
item_cols = ['vid', 'duration_', 'title_length_', 'class_id', 'second_class', 'is_intact', 'vv_', 'ctr_']

def gen_context_feats(row):
  feats = {}
  cols = context_cols
  for i in range(len(cols) - 1):
    for j in range(i+1, len(cols)):
      feats[f'{cols[i]}_{cols[j]}'] = hash(f'{row[cols[i]]}_{row[cols[j]]}')

  return feats

def gen_item_feats(row):
  feats = {}
  cols = item_cols
  for i in range(len(cols) - 1):
    for j in range(i+1, len(cols)):
      feats[f'{cols[i]}_{cols[j]}'] = hash(f'{row[cols[i]]}_{row[cols[j]]}')
  return feats

# cross feats change to cross_
def gen_match_feats(row):
  feats = {}
  for context_col in context_cols:
    for item_col in item_cols:
      feats[f'cross_{context_col}_{item_col}'] = hash(f'{row[context_col]}_{row[item_col]}')

  # l = []
  # for context_col in context_cols:
  #   for star in row['stars']:
  #     l += hash(f'{row[context_col]_{star}}')
  # feats['match_stars'] = l
  return feats

def get_time(timeStamp):
    timeArray = time.localtime(timeStamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
  
def get_new_date(data):
    tmp_timestamp_first = data.groupby(['did'])['timestamp'].min().reset_index()
    del data['timestamp']
    data = pd.merge(data,tmp_timestamp_first,on=['did'],how='left',copy=False)
    return data

In [3]:
FLAGS = melt.get_flags()
from absl import flags
flags.DEFINE_string('odir', '../input/tfrecords', '')
flags.DEFINE_string('mark', 'train', 'train or eval')
flags.DEFINE_integer('num_records', 32, '')
flags.DEFINE_integer('day', 30, '')
flags.DEFINE_bool('toy', False, '')
flags.DEFINE_integer('seed_', 12345, '')
flags.DEFINE_bool('force', True, '')
flags.DEFINE_bool('lm', False, '')

In [4]:
melt.init_flags()

In [5]:
FLAGS.mark = 'train'
FLAGS.day = 1

In [11]:
ifile = f'../input/{FLAGS.mark}/part_{FLAGS.day}/context.parquet' if FLAGS.mark == 'train' else f'../input/{FLAGS.mark}/context.parquet'
df = gezi.read_parquet(ifile)
df = get_new_date(df)
if FLAGS.lm:
  df = df.groupby('did', as_index=False).first()
if FLAGS.mark == 'train':
  df = df.sample(frac=1, random_state=FLAGS.seed_)

In [12]:
df

Unnamed: 0,label,mod,mf,aver,sver,did,vid,prev,region,index,timestamp
395059,0,e54e08241b1b786273e2e56a55db2d00,fce8cc8ce0587de13c3fecbc13537746,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,0d309aa7cafb52b8038d02f0678ec0a4,3770000695,2954065584,3849523164,395059,1424415089
1014566,0,0cbce95d55af672caebaeb5f719822c9,932a5cc56763396c6b205a63868d6316,a4ae7310d52aeec7511d20bd29289974,e49dc1e8e576ff35a07e1182f4e98c91,38fe2a5ad0e504dba875a35ef534bf9e,1872532270,0,2601207674,1014566,1424428954
921202,0,e277978dc2149421abd2b4b7b69774f7,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,ba195dd85894d6957bc61edca5ac7026,93772a5f285e8b08bf6dd40cdb96c416,1106796770,0,4044483189,921202,1424412648
643737,0,9749719a2900e64655a8ee8819551a6b,932a5cc56763396c6b205a63868d6316,183e022fe835e248ec27774c2687e312,21032c3a66d86aa1d99127d4b97cb548,563ff26ae011f218a8be3432bc0761e0,3568116718,0,897685811,643737,1424369204
2543541,0,3209266cd5c1a05ebe46a0b792ab5cc8,932a5cc56763396c6b205a63868d6316,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,e60a926a1d77953bf2969388575c5421,1561337430,0,387038698,2543541,1424412386
...,...,...,...,...,...,...,...,...,...,...,...
1134974,0,1e88c4e3094a7ee59c35d59215bf91df,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,ae38f6fec828f8bcdf2e4f651481ccf0,2791304249,2005344201,1447042734,1134974,1424404421
1970174,0,e277978dc2149421abd2b4b7b69774f7,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,ba195dd85894d6957bc61edca5ac7026,a787ae1935c6ae9b6d05dd5ad97d85a4,2072040006,3760553654,1803814524,1970174,1424431174
2993577,0,ecff271f71d2e2b20f40798da88bff7b,fce8cc8ce0587de13c3fecbc13537746,183e022fe835e248ec27774c2687e312,21032c3a66d86aa1d99127d4b97cb548,ba502cb68f22f8f02cde45f6b350a01d,1566946944,0,2076990513,2993577,1424370587
1396132,0,1373ec7a8565dc3c28eae927edaf5de7,fce8cc8ce0587de13c3fecbc13537746,91eceab4802d90baf863430c968fbb0a,1c58433296d1b994e72747f4b3cddbf5,eeda84896d7aba7f230ccaf388d0e6a6,923905001,0,1972148823,1396132,1424417886


In [10]:
df

Unnamed: 0,label,mod,mf,aver,sver,did,vid,prev,region,index,timestamp,watch
0,0,e54e08241b1b786273e2e56a55db2d00,fce8cc8ce0587de13c3fecbc13537746,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,0d309aa7cafb52b8038d02f0678ec0a4,3770000695,2954065584,3849523164,395059,1424415089,"[[1424434053, 895703210], [1424433760, 3839826..."
1,0,0cbce95d55af672caebaeb5f719822c9,932a5cc56763396c6b205a63868d6316,a4ae7310d52aeec7511d20bd29289974,e49dc1e8e576ff35a07e1182f4e98c91,38fe2a5ad0e504dba875a35ef534bf9e,1872532270,0,2601207674,1014566,1424428954,"[[1424448002, 3823067793], [1424446697, 285752..."
2,0,e277978dc2149421abd2b4b7b69774f7,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,ba195dd85894d6957bc61edca5ac7026,93772a5f285e8b08bf6dd40cdb96c416,1106796770,0,4044483189,921202,1424412648,"[[1424415222, 1190429005], [1424415117, 384042..."
3,0,9749719a2900e64655a8ee8819551a6b,932a5cc56763396c6b205a63868d6316,183e022fe835e248ec27774c2687e312,21032c3a66d86aa1d99127d4b97cb548,563ff26ae011f218a8be3432bc0761e0,3568116718,0,897685811,643737,1424369204,"[[1424370896, 785284683], [1424369271, 5812367..."
4,0,3209266cd5c1a05ebe46a0b792ab5cc8,932a5cc56763396c6b205a63868d6316,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,e60a926a1d77953bf2969388575c5421,1561337430,0,387038698,2543541,1424412386,"[[1424413117, 940589783], [1424412687, 2495786..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3208241,0,1e88c4e3094a7ee59c35d59215bf91df,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,ae38f6fec828f8bcdf2e4f651481ccf0,2791304249,2005344201,1447042734,1134974,1424404421,"[[1424404248, 3148409116], [1424404233, 319445..."
3208242,0,e277978dc2149421abd2b4b7b69774f7,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,ba195dd85894d6957bc61edca5ac7026,a787ae1935c6ae9b6d05dd5ad97d85a4,2072040006,3760553654,1803814524,1970174,1424431174,"[[1424442049, 3009700169], [1424441915, 138542..."
3208243,0,ecff271f71d2e2b20f40798da88bff7b,fce8cc8ce0587de13c3fecbc13537746,183e022fe835e248ec27774c2687e312,21032c3a66d86aa1d99127d4b97cb548,ba502cb68f22f8f02cde45f6b350a01d,1566946944,0,2076990513,2993577,1424370587,"[[1424372347, 3853651641], [1424371758, 241963..."
3208244,0,1373ec7a8565dc3c28eae927edaf5de7,fce8cc8ce0587de13c3fecbc13537746,91eceab4802d90baf863430c968fbb0a,1c58433296d1b994e72747f4b3cddbf5,eeda84896d7aba7f230ccaf388d0e6a6,923905001,0,1972148823,1396132,1424417886,"[[1424433130, 2416800345], [1424429960, 359151..."


In [11]:
ifile3 = f'../input/{FLAGS.mark}/part_{FLAGS.day}/item.parquet' if FLAGS.mark == 'train' else f'../input/{FLAGS.mark}/item.parquet'
idf = gezi.read_parquet(ifile3)
w = pd.read_csv('../input/all/bins.csv')
cols = ['title_length', 'duration', 'vv']
for col in cols:
  idf[f'{col}_'] = pd.cut(idf[col], w[col].values, labels=range(10))
  idf[f'{col}_'] = idf[f'{col}_'].astype(int)
  idf[f'{col}_'] = idf[f'{col}_'].apply(lambda x: max(x + 1, 1))
  

def _ctr(x):
  bins = list(map(float, range(100)))
  bins = [x * 0.01 for x in bins]
  for i in range(100):
    if x <= bins[i]:
      return i
  return i
idf['ctr_'] = idf.ctr.apply(_ctr)

idf = idf.rename(columns={'timestamp':'vtimestamp'})

df = df.merge(idf, how="left", on="vid")

In [12]:
idf.ctr_.describe()

count    59117.000000
mean         2.950285
std          6.562674
min          0.000000
25%          0.000000
50%          0.000000
75%          3.000000
max         76.000000
Name: ctr_, dtype: float64

In [13]:
df.describe()

Unnamed: 0,label,vid,prev,region,index,timestamp,title_length,vtimestamp,cid,class_id,is_intact,second_class,duration,ctr,vv,title_length_,duration_,vv_,ctr_
count,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0,3208246.0
mean,0.06539523,2151460000.0,575100500.0,2187078000.0,1604122.0,1424410000.0,69.54812,1396389000.0,2214799000.0,1370040000.0,2345982000.0,151666800.0,1177.002,0.09189928,254549.4,5.749552,5.833016,8.722974,9.629673
std,0.247222,1219143000.0,1135547000.0,1215683000.0,926141.0,24492.96,25.41361,78653080.0,1226056000.0,692275500.0,589988200.0,617498600.0,1930.231,0.05814578,1615142.0,2.967125,3.254325,2.315781,5.895776
min,0.0,26537.0,0.0,12720950.0,0.0,1424362000.0,3.0,0.0,1584856.0,324498100.0,472867300.0,0.0,3.27,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,1118049000.0,0.0,1189423000.0,802061.2,1424394000.0,55.0,1390549000.0,1281370000.0,605350600.0,2234347000.0,0.0,63.22,0.0725,4104.0,3.0,3.0,9.0,8.0
50%,0.0,2167203000.0,0.0,2226260000.0,1604122.0,1424412000.0,73.0,1407479000.0,2261083000.0,1413776000.0,2234347000.0,0.0,139.52,0.095,20384.0,6.0,6.0,10.0,10.0
75%,0.0,3163015000.0,315060800.0,3339606000.0,2406184.0,1424432000.0,88.0,1421392000.0,3174894000.0,1820266000.0,2234347000.0,0.0,1679.69,0.105,68672.0,8.0,9.0,10.0,11.0
max,1.0,4294785000.0,4294374000.0,4265471000.0,3208245.0,1424448000.0,177.0,1424447000.0,4294754000.0,4242945000.0,4116347000.0,4269766000.0,23620.3,0.7575,28723890.0,10.0,10.0,10.0,76.0


In [14]:
df.columns

Index(['label', 'mod', 'mf', 'aver', 'sver', 'did', 'vid', 'prev', 'region',
       'index', 'timestamp', 'watch', 'title_length', 'vtimestamp', 'stars',
       'cid', 'class_id', 'is_intact', 'second_class', 'duration', 'ctr', 'vv',
       'title_length_', 'duration_', 'vv_', 'ctr_'],
      dtype='object')

In [15]:
idf['prev'] = idf['vid']
cols =  ['is_intact', 'duration', 'title_length', 'ctr', 'vv', 'duration_', 'title_length_', 'ctr_', 'vv_']
cols2 = [f'prev_{x}' for x in cols]
m = dict(zip(cols, cols2))
idf = idf.rename(columns=m)
idf = idf[['prev', *cols2]]
idf

Unnamed: 0,prev,prev_is_intact,prev_duration,prev_title_length,prev_ctr,prev_vv,prev_duration_,prev_title_length_,prev_ctr_,prev_vv_
0,5893106,2234347078,86.110001,25,0.0,1396.0,4,1,0,7
1,15819492,2234347078,6954.200195,98,0.0,1100.0,10,9,0,7
2,16814925,2234347078,317.190002,21,0.0,4196.0,8,1,0,9
3,43514419,2234347078,442.540009,41,0.0,48.0,8,2,0,2
4,66011667,3172394293,213.639999,64,0.0,8.0,7,5,0,1
...,...,...,...,...,...,...,...,...,...,...
59112,4183392248,3172394293,102.459999,69,0.0,28.0,5,5,0,2
59113,4209813609,2234347078,4708.799805,99,0.0,700.0,10,10,0,6
59114,4223027093,2234347078,83.930000,87,0.0,268.0,4,8,0,5
59115,4229590239,3172394293,190.750000,46,0.0,648.0,7,3,0,6


In [16]:
df = df.merge(idf, how="left", on='prev')

In [17]:
df = df.fillna(0)

In [18]:
df

Unnamed: 0,label,mod,mf,aver,sver,did,vid,prev,region,index,...,ctr_,prev_is_intact,prev_duration,prev_title_length,prev_ctr,prev_vv,prev_duration_,prev_title_length_,prev_ctr_,prev_vv_
0,0,e54e08241b1b786273e2e56a55db2d00,fce8cc8ce0587de13c3fecbc13537746,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,0d309aa7cafb52b8038d02f0678ec0a4,3770000695,2954065584,3849523164,395059,...,6,4.728673e+08,52.320000,90.0,0.0,1128.0,2.0,9.0,0.0,7.0
1,0,0cbce95d55af672caebaeb5f719822c9,932a5cc56763396c6b205a63868d6316,a4ae7310d52aeec7511d20bd29289974,e49dc1e8e576ff35a07e1182f4e98c91,38fe2a5ad0e504dba875a35ef534bf9e,1872532270,0,2601207674,1014566,...,11,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,e277978dc2149421abd2b4b7b69774f7,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,ba195dd85894d6957bc61edca5ac7026,93772a5f285e8b08bf6dd40cdb96c416,1106796770,0,4044483189,921202,...,12,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,9749719a2900e64655a8ee8819551a6b,932a5cc56763396c6b205a63868d6316,183e022fe835e248ec27774c2687e312,21032c3a66d86aa1d99127d4b97cb548,563ff26ae011f218a8be3432bc0761e0,3568116718,0,897685811,643737,...,10,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,3209266cd5c1a05ebe46a0b792ab5cc8,932a5cc56763396c6b205a63868d6316,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,e60a926a1d77953bf2969388575c5421,1561337430,0,387038698,2543541,...,5,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3208241,0,1e88c4e3094a7ee59c35d59215bf91df,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,e49dc1e8e576ff35a07e1182f4e98c91,ae38f6fec828f8bcdf2e4f651481ccf0,2791304249,2005344201,1447042734,1134974,...,11,4.728673e+08,47.959999,112.0,0.0,15824.0,2.0,10.0,0.0,10.0
3208242,0,e277978dc2149421abd2b4b7b69774f7,9de1665e6430fa58785027dbf9ab53e0,183e022fe835e248ec27774c2687e312,ba195dd85894d6957bc61edca5ac7026,a787ae1935c6ae9b6d05dd5ad97d85a4,2072040006,3760553654,1803814524,1970174,...,11,3.172394e+09,109.000000,91.0,0.0,2376.0,5.0,9.0,0.0,8.0
3208243,0,ecff271f71d2e2b20f40798da88bff7b,fce8cc8ce0587de13c3fecbc13537746,183e022fe835e248ec27774c2687e312,21032c3a66d86aa1d99127d4b97cb548,ba502cb68f22f8f02cde45f6b350a01d,1566946944,0,2076990513,2993577,...,10,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3208244,0,1373ec7a8565dc3c28eae927edaf5de7,fce8cc8ce0587de13c3fecbc13537746,91eceab4802d90baf863430c968fbb0a,1c58433296d1b994e72747f4b3cddbf5,eeda84896d7aba7f230ccaf388d0e6a6,923905001,0,1972148823,1396132,...,7,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [259]:
vinfo = pd.read_csv('../input/all/vinfo_static.csv')

In [260]:
vinfo

Unnamed: 0,vid,image_emb,title,story,title_length,timestamp,stars,cid,class_id,is_intact,second_class,duration,ctr,vv,stars2
0,5893106,"-8.26598072052,1.63619112968,17.9674415588,6.9...",19957519957640564,,25.0,1.411552e+09,[],600008821,3174145329,2234347078,0,86.11,0.0,1396.0,
1,10423101,"-3.63512468338,2.38069868088,14.9096345901,1.2...",717361318,,33.0,1.409657e+09,[],2128734282,3174145329,2234347078,0,981.00,0.0,436.0,
2,15819492,"0.107854895294,3.20008993149,20.3623485565,-0....","396,1705,31495,3424,77035,85420,28,17970,9300,...",,98.0,1.407204e+09,[2950753864 3600550670 3606342894 3338364112 1...,638675564,1413776458,2234347078,0,6954.20,0.0,1100.0,"2950753864,3600550670,3606342894,3338364112,16..."
3,16814925,"-11.0617599487,1.99434304237,14.9591712952,1.2...",85791814773227,,21.0,1.380506e+09,[],511097257,3174145329,2234347078,0,317.19,0.0,4196.0,
4,20360674,"6.6802148819,1.83605599403,15.9008598328,3.244...","22485,118989,12418,396611,61891,3029,26353,396...",,63.0,1.365757e+09,[ 792028649 1936823271],1552633930,1413776458,2234347078,0,26.16,0.0,32.0,7920286491936823271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183788,4276491220,"-4.29484510422,6.66334295273,16.4290809631,-4....","3072,34976,173488,1864,6276,492635,58,19175,24...",,81.0,1.382671e+09,3209094375,828839503,1413776458,2234347078,2164216144,456.71,0.0,0.0,3209094375
183789,4283547666,"-4.81997919083,13.6302375793,14.3650541306,6.9...",26694774505839742581674425775,,58.0,1.424685e+09,[2142910698 1316768238 1853987563 1984414365],2753664459,470292959,2234347078,0,154.78,0.0,0.0,2142910698131676823818539875631984414365
183790,4283640696,"-0.665687978268,11.5875854492,9.86795520782,3....",406009911495261166378290355096,,81.0,1.367677e+09,[2255039294 3696039767 1984414365 1936823271 1...,2553585254,1413776458,3172394293,0,78.48,0.0,3408.0,"2255039294,3696039767,1984414365,1936823271,10..."
183791,4290887658,"2.51159644127,2.58366537094,9.91071224213,4.36...","42,128095,44,188981,28,6705,87381,2679,59,3316...",,73.0,1.305795e+09,[2260640674 2569601900],201748736,1413776458,3172394293,0,16.35,0.0,0.0,22606406742569601900


In [256]:
vinfo

Unnamed: 0,vid,image_emb,title,story,title_length,timestamp,stars,cid,class_id,is_intact,second_class,duration,ctr,vv,stars2
0,5893106,"-8.26598072052,1.63619112968,17.9674415588,6.9...",19957519957640564,0,25.0,1.411552e+09,[],600008821,3174145329,2234347078,0,86.11,0.0,1396.0,0
1,10423101,"-3.63512468338,2.38069868088,14.9096345901,1.2...",717361318,0,33.0,1.409657e+09,[],2128734282,3174145329,2234347078,0,981.00,0.0,436.0,0
2,15819492,"0.107854895294,3.20008993149,20.3623485565,-0....","396,1705,31495,3424,77035,85420,28,17970,9300,...",0,98.0,1.407204e+09,[2950753864 3600550670 3606342894 3338364112 1...,638675564,1413776458,2234347078,0,6954.20,0.0,1100.0,"2950753864,3600550670,3606342894,3338364112,16..."
3,16814925,"-11.0617599487,1.99434304237,14.9591712952,1.2...",85791814773227,0,21.0,1.380506e+09,[],511097257,3174145329,2234347078,0,317.19,0.0,4196.0,0
4,20360674,"6.6802148819,1.83605599403,15.9008598328,3.244...","22485,118989,12418,396611,61891,3029,26353,396...",0,63.0,1.365757e+09,[ 792028649 1936823271],1552633930,1413776458,2234347078,0,26.16,0.0,32.0,7920286491936823271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183788,4276491220,"-4.29484510422,6.66334295273,16.4290809631,-4....","3072,34976,173488,1864,6276,492635,58,19175,24...",0,81.0,1.382671e+09,3209094375,828839503,1413776458,2234347078,2164216144,456.71,0.0,0.0,3209094375
183789,4283547666,"-4.81997919083,13.6302375793,14.3650541306,6.9...",26694774505839742581674425775,0,58.0,1.424685e+09,[2142910698 1316768238 1853987563 1984414365],2753664459,470292959,2234347078,0,154.78,0.0,0.0,2142910698131676823818539875631984414365
183790,4283640696,"-0.665687978268,11.5875854492,9.86795520782,3....",406009911495261166378290355096,0,81.0,1.367677e+09,[2255039294 3696039767 1984414365 1936823271 1...,2553585254,1413776458,3172394293,0,78.48,0.0,3408.0,"2255039294,3696039767,1984414365,1936823271,10..."
183791,4290887658,"2.51159644127,2.58366537094,9.91071224213,4.36...","42,128095,44,188981,28,6705,87381,2679,59,3316...",0,73.0,1.305795e+09,[2260640674 2569601900],201748736,1413776458,3172394293,0,16.35,0.0,0.0,22606406742569601900


In [136]:
udf

Unnamed: 0,did,watch,watch_vids,watch_times
0,b85ee45e0b0f0d32ea27b83859943bd8,"[[1424450711, 1970231413], [1424450128, 209639...","[1970231413, 2096393410, 1343904424, 145739096...","[1424450711, 1424450128, 1424449087, 142444791..."
1,ec51024cacd62b2f44a5dedeab3b3ddd,"[[1424447915, 3224674369], [1424442687, 218108...","[3224674369, 218108376, 3835874799, 4073766715...","[1424447915, 1424442687, 1424442630, 142443557..."
2,805a6dfc253c8fd72fff89a1468bf101,"[[1424433395, 3683265124], [1424177945, 271756...","[3683265124, 271756254, 385283079, 2722220318]","[1424433395, 1424177945, 1424177720, 1424177550]"
3,681af5bb683758603018eb30ced355a5,"[[1424429841, 3753149116], [1424416653, 194721...","[3753149116, 1947216289, 2542638947, 381370225...","[1424429841, 1424416653, 1424416122, 142441564..."
4,2f66caafbdd897ed371378323a2b9c41,"[[1424433612, 113020635], [1424314589, 3359125...","[113020635, 3359125398, 2536631690, 1347456801...","[1424433612, 1424314589, 1424151148, 142415092..."
...,...,...,...,...
75012,d679be3058b1c1b39e45ce22211cc2aa,"[[1424437707, 3168322038], [1424437583, 263846...","[3168322038, 2638467458, 1325108032, 125660504...","[1424437707, 1424437583, 1424437506, 142443742..."
75013,ffd7e854e2d6ae73ae31a3ce09e17992,"[[1424431182, 597364737], [1424404501, 2779388...","[597364737, 2779388063, 2683083235, 3224674369...","[1424431182, 1424404501, 1424404402, 142440435..."
75014,b36e5a2d261cd9ec0f4b5f8424e9f1b9,"[[1424348459, 1123409917]]",[1123409917],[1424348459]
75015,916739a22226633e38c3ed2ea6b6ba44,"[[1424453233, 1343904424], [1424450895, 145739...","[1343904424, 1457390961, 4073766715, 607921262...","[1424453233, 1424450895, 1424449665, 142444965..."


In [54]:
udf['watch_vids'] = udf.watch.apply(lambda l: [x[1] if len(x) == 2 else 0 for x in l])

In [55]:
udf['watch_times'] = udf.watch.apply(lambda l: [x[0] if len(x) == 2 else 0 for x in l])

In [56]:
udf

Unnamed: 0,did,watch,watch_vids,watch_times
0,b85ee45e0b0f0d32ea27b83859943bd8,"[[1424450711, 1970231413], [1424450128, 209639...","[1970231413, 2096393410, 1343904424, 145739096...","[1424450711, 1424450128, 1424449087, 142444791..."
1,ec51024cacd62b2f44a5dedeab3b3ddd,"[[1424447915, 3224674369], [1424442687, 218108...","[3224674369, 218108376, 3835874799, 4073766715...","[1424447915, 1424442687, 1424442630, 142443557..."
2,805a6dfc253c8fd72fff89a1468bf101,"[[1424433395, 3683265124], [1424177945, 271756...","[3683265124, 271756254, 385283079, 2722220318]","[1424433395, 1424177945, 1424177720, 1424177550]"
3,681af5bb683758603018eb30ced355a5,"[[1424429841, 3753149116], [1424416653, 194721...","[3753149116, 1947216289, 2542638947, 381370225...","[1424429841, 1424416653, 1424416122, 142441564..."
4,2f66caafbdd897ed371378323a2b9c41,"[[1424433612, 113020635], [1424314589, 3359125...","[113020635, 3359125398, 2536631690, 1347456801...","[1424433612, 1424314589, 1424151148, 142415092..."
...,...,...,...,...
75012,d679be3058b1c1b39e45ce22211cc2aa,"[[1424437707, 3168322038], [1424437583, 263846...","[3168322038, 2638467458, 1325108032, 125660504...","[1424437707, 1424437583, 1424437506, 142443742..."
75013,ffd7e854e2d6ae73ae31a3ce09e17992,"[[1424431182, 597364737], [1424404501, 2779388...","[597364737, 2779388063, 2683083235, 3224674369...","[1424431182, 1424404501, 1424404402, 142440435..."
75014,b36e5a2d261cd9ec0f4b5f8424e9f1b9,"[[1424348459, 1123409917]]",[1123409917],[1424348459]
75015,916739a22226633e38c3ed2ea6b6ba44,"[[1424453233, 1343904424], [1424450895, 145739...","[1343904424, 1457390961, 4073766715, 607921262...","[1424453233, 1424450895, 1424449665, 142444965..."


In [148]:
def get_dw(did):
  d = udf[udf.did==did]
  watch_vids = d.watch_vids.values[0]
  watch_times = d.watch_times.values[0]
  dw = pd.DataFrame({'vid': watch_vids, 'watch_times': watch_times})
  return dw

In [149]:
dw = get_dw('b85ee45e0b0f0d32ea27b83859943bd8')

In [150]:
dw = dw.merge(vinfo, on='vid', how='left', copy=False)

In [151]:
dw

Unnamed: 0,vid,watch_times,image_emb,title,story,title_length,timestamp,stars,cid,class_id,is_intact,second_class,duration,ctr,vv,stars2
0,1970231413,1424450711,"1.93601417542,10.786362648,14.49609375,-3.8681...",865770217,,20.0,1422532000.0,[1091459898 991727417 974380206 3677247097 ...,3926084000.0,1820266000.0,2234347000.0,0.0,2871.06,0.075,2623288.0,"1091459898,991727417,974380206,3677247097,9279..."
1,2096393410,1424450128,"2.22179055214,11.103266716,11.7319412231,-3.66...",86577022,,20.0,1422532000.0,[1091459898 991727417 974380206 3677247097 ...,3926084000.0,1820266000.0,2234347000.0,0.0,2959.35,0.0,2502672.0,"1091459898,991727417,974380206,3677247097,9279..."
2,1343904424,1424449087,"6.4067029953,11.0306529999,13.9482421875,-0.43...",86577021,,20.0,1422532000.0,[1091459898 991727417 974380206 3677247097 ...,3926084000.0,1820266000.0,2234347000.0,0.0,2845.99,0.125,2692720.0,"1091459898,991727417,974380206,3677247097,9279..."
3,1457390961,1424447912,"3.4910492897,10.4007940292,14.4470529556,-1.65...",86577020,,20.0,1422532000.0,[1091459898 991727417 974380206 3677247097 ...,3926084000.0,1820266000.0,2234347000.0,0.0,2834.0,0.0925,2683352.0,"1091459898,991727417,974380206,3677247097,9279..."
4,4073766715,1424447746,"-0.605210125446,7.49364614487,17.2392196655,0....",86577019,,20.0,1422532000.0,[1091459898 991727417 974380206 3677247097 ...,3926084000.0,1820266000.0,2234347000.0,0.0,3032.38,0.05,7831692.0,"1091459898,991727417,974380206,3677247097,9279..."
5,1877551774,1424444623,,,,,,,,,,,,,,
6,1519553229,1424444548,"1.56257164478,7.70134210587,18.0198421478,0.56...","45824,10420,1544,95846,6650,293187,18187,2709,...",,71.0,1398227000.0,[ 158318357 2995458451 2031659507],1587795000.0,1413776000.0,3172394000.0,0.0,292.12,0.0,312.0,15831835729954584512031659507
7,520446502,1424444467,"1.06891572475,10.1612348557,9.8675069809,9.912...",14884822034119130111418771,,30.0,1423819000.0,[],3851894000.0,1413776000.0,2234347000.0,0.0,74.12,0.055,5240.0,
8,2704673539,1424444157,"-4.05946111679,10.9183197021,12.1457862854,-3....","42,46379,34873,44286,44,2822,9273,213,229016,1...",,90.0,1317635000.0,[ 418568571 2161915924 1456675238 2564813357 1...,1810499000.0,1413776000.0,3445749000.0,0.0,136.25,0.1075,3300.0,"418568571,2161915924,1456675238,2564813357,126..."
9,1527603191,1424443745,"0.943881690502,5.2524561882,12.2208585739,-2.4...",159811196422034124865068,,32.0,1422439000.0,3881095978,1464895000.0,470293000.0,2234347000.0,0.0,1010.43,0.06,5068.0,3881095978


In [80]:
dw.title.values

array(['865770,217', '865770,22', '865770,21', '865770,20', '865770,19',
       nan,
       '45824,10420,1544,95846,6650,293187,18187,2709,275836,6359,8603,1859,1006,7328',
       '148848,220341,191301,1141,8771',
       '42,46379,34873,44286,44,2822,9273,213,229016,112629,2857,5255,1138,5,73,79,41842,39470',
       '15981,11964,220341,24,865068',
       '58131,373,52839,220341,73,396,24,172,49219,175,396,302,44099',
       '42,46379,44,2822,9273,213,28,68819,2679,240855,240856,4525,2078,1652',
       '854903,220341,3144,1479',
       '396,855,172,50486,175,578,8942,9964,12271,220341,13883,369,6850,31710,2537,9138,2062',
       '42,226280,44,229053,229054,24,9818,117417,220341,18350,20407',
       '42,46379,34873,44286,44,2822,9273,213,229016,44383,27589,238092,5,1225,3924',
       '42,46379,34873,44286,44,2822,9273,213,229016,1227,820,68819,73,27589,2381,238093,47913,4081',
       '42,46379,34873,44286,44,2822,9273,213,28,68819,2381,9385,220341,73,12616,1491,9069',
       '42,839904,4

In [118]:
dw.stars

'[1091459898  991727417  974380206 3677247097  927984876  662551445\n  386183300]'

In [177]:
def deal_titles(dw):
  titles = dw.title.values
  m = defaultdict(float)
  for title in titles:
    if not pd.isnull(title):
      words = title.split(',')
      for word in words:
        m[word] += 1 / (len(words))
  l = [x[0] for x in sorted(m.items(), key=lambda kv: -kv[1])]
  res = {'titles': l[:100]}
  return res

In [182]:
def deal_stars(dw):
  m = defaultdict(int)
  stars_list = dw.stars2.values
  for stars in stars_list:
    if not pd.isnull(stars):
      stars = stars.split(',')
      for x in stars[:2]:
        m[x] += 1
  l = [x[0] for x in sorted(m.items(), key=lambda kv: -kv[1])]

  res = {'stars_list': l[:100]}
  return res

In [183]:
def deal_others(dw):
  cols = ['cid', 'class_id', 'is_intact', 'second_class']
  res = {}
  for col in cols:
    out_col = col + 's' if col != 'second_class' else  'second_classes'
    res[out_col] = [int(x) if not pd.isnull(x) else 0 for x in dw[col].values]
  return res

In [219]:
def deal_his(watch_vids):
  dw = pd.DataFrame({'vid': watch_vids}, index=[0])
  dw = dw.merge(vinfo, on='vid', how='left', copy=False, left_index=True, right_index=True)
  res = deal_stars(dw)
  res.update(deal_titles(dw))
  res.update(deal_others(dw))
  return res

In [239]:
vinfos = {}
for _, row in tqdm(vinfo.iterrows(), total=len(vinfo)):
  vinfos[row['vid']] = row













  0%|          | 0/183793 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A











  0%|          | 1/183793 [00:00<17:03:30,  2.99it/s][A[A[A[A[A[A[A[A[A[A[A[A











  0%|          | 417/183793 [00:00<11:55:03,  4.27it/s][A[A[A[A[A[A[A[A[A[A[A[A











  0%|          | 843/183793 [00:00<8:19:35,  6.10it/s] [A[A[A[A[A[A[A[A[A[A[A[A











  1%|          | 1171/183793 [00:00<5:49:21,  8.71it/s][A[A[A[A[A[A[A[A[A[A[A[A











  1%|          | 1531/183793 [00:00<4:04:19, 12.43it/s][A[A[A[A[A[A[A[A[A[A[A[A











  1%|          | 1901/183793 [00:00<2:50:55, 17.74it/s][A[A[A[A[A[A[A[A[A[A[A[A











  1%|          | 2276/183793 [00:00<1:59:38, 25.28it/s][A[A[A[A[A[A[A[A[A[A[A[A











  1%|▏         | 2650/183793 [00:01<1:23:49, 36.02it/s][A[A[A[A[A[A[A[A[A[A[A[A











  2%|▏         | 3022/183793 [00:01<58:47, 51.24it/s]  [A[A[A[A[A[A[A

 41%|████      | 74541/183793 [00:16<00:23, 4680.03it/s][A[A[A[A[A[A[A[A[A[A[A[A











 41%|████      | 75020/183793 [00:16<00:23, 4711.46it/s][A[A[A[A[A[A[A[A[A[A[A[A











 41%|████      | 75494/183793 [00:16<00:23, 4600.48it/s][A[A[A[A[A[A[A[A[A[A[A[A











 41%|████▏     | 75957/183793 [00:16<00:26, 4114.84it/s][A[A[A[A[A[A[A[A[A[A[A[A











 42%|████▏     | 76380/183793 [00:17<00:26, 3990.19it/s][A[A[A[A[A[A[A[A[A[A[A[A











 42%|████▏     | 76799/183793 [00:17<00:26, 4046.51it/s][A[A[A[A[A[A[A[A[A[A[A[A











 42%|████▏     | 77265/183793 [00:17<00:25, 4212.45it/s][A[A[A[A[A[A[A[A[A[A[A[A











 42%|████▏     | 77710/183793 [00:17<00:24, 4279.38it/s][A[A[A[A[A[A[A[A[A[A[A[A











 43%|████▎     | 78143/183793 [00:17<00:25, 4169.85it/s][A[A[A[A[A[A[A[A[A[A[A[A











 43%|████▎     | 78593/183793 [00:17<00:24, 4263.30it/s][A[A[

 80%|███████▉  | 146425/183793 [00:35<00:06, 5427.36it/s][A[A[A[A[A[A[A[A[A[A[A[A











 80%|███████▉  | 146971/183793 [00:35<00:07, 5212.28it/s][A[A[A[A[A[A[A[A[A[A[A[A











 80%|████████  | 147496/183793 [00:35<00:07, 5135.74it/s][A[A[A[A[A[A[A[A[A[A[A[A











 81%|████████  | 148049/183793 [00:35<00:06, 5247.90it/s][A[A[A[A[A[A[A[A[A[A[A[A











 81%|████████  | 148605/183793 [00:35<00:06, 5336.72it/s][A[A[A[A[A[A[A[A[A[A[A[A











 81%|████████  | 149172/183793 [00:35<00:06, 5432.21it/s][A[A[A[A[A[A[A[A[A[A[A[A











 81%|████████▏ | 149718/183793 [00:35<00:06, 5162.98it/s][A[A[A[A[A[A[A[A[A[A[A[A











 82%|████████▏ | 150244/183793 [00:36<00:06, 5191.27it/s][A[A[A[A[A[A[A[A[A[A[A[A











 82%|████████▏ | 150810/183793 [00:36<00:06, 5321.79it/s][A[A[A[A[A[A[A[A[A[A[A[A











 82%|████████▏ | 151399/183793 [00:36<00:05, 5478.27it/

In [249]:

timestamp = df.timestamp.values[0]

def get_start(watch_times, timestamp):
  for i in range(len(watch_times)):
    if not is_badtime(watch_times[i], timestamp):
      return i

watch_vids_list = []
watch_times_list = []
cids_list = []
class_ids_list = []
second_classes_list = []
is_intacts_list = []
titles_list = []
stars_list = []

for _, row in tqdm(udf.iterrows(), total=len(udf)):
  watch_vids = row['watch_vids']
  watch_times = row['watch_times']
  watch_vids_ = []
  watch_times_ = []
  cids = []
  class_ids = []
  second_classes = []
  is_intacts = []
  
  titles = []
  m_title = collections.def
  stars = []
  m_star = {}
  
  i = 0
  for vid, time_ in zip(watch_vids, watch_times):
    if is_badtime(time_,timestamp):
      continue
    i += 1
    watch_vids_ += [vid]
    watch_times_ += [time_]
    vrow = vinfos[vid] if vid in vinfos else None
    cids += [vrow['cid']] if vrow is not None else [0]
    class_ids += [vrow['class_id']] if vrow is not None else [0]
    second_classes += [vrow['second_class']] if vrow is not None else [0]
    is_intacts += [vrow['is_intacts']] if vrow is not None else [0]
    
    if vrow is not None:
      title = vrow['title']
      words = title.split(',')
      for word in words:
        m_title[word] += 
    

  watch_vids_list += [','.join(map(str, watch_vids_)]
  watch_times_list += [','.join(map(str, watch_times_)]
  cids_list += [','.join(map(str, cids))]
  class_ids_list += [','.join(map(str, class_ids))]
  second_clases_list += [','.join(map(str, second_classes))]
  is_intacts_list += [','.join(map(str, is_intacts))]

udf['watch_vids'] = watch_vids_list    
udf['watch_times'] = watch_times_list
udf['cids'] = cids_list
udf['class_ids'] = class_ids_list
udf['second_clases'] = second_classes_list
udf['is_intacts'] = is_intacts_list














  0%|          | 0/75017 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 122/75017 [00:00<01:01, 1219.57it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 250/75017 [00:00<01:00, 1236.51it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 372/75017 [00:00<01:00, 1229.44it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  1%|          | 486/75017 [00:00<01:02, 1200.39it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  1%|          | 577/75017 [00:00<01:13, 1019.00it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  1%|          | 664/75017 [00:00<01:19, 930.87it/s] [A[A[A[A[A[A[A[A[A[A[A[A[A












  1%|          | 752/75017 [00:00<01:21, 914.27it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  1%|          | 837/75017 [00:00<01:23, 885.77it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












  1%|          | 930/75017 [00:00<01:22, 896.45it/s][A

 20%|█▉        | 14631/75017 [00:16<01:11, 844.33it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|█▉        | 14721/75017 [00:16<01:10, 857.87it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|█▉        | 14817/75017 [00:16<01:08, 883.53it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|█▉        | 14913/75017 [00:16<01:06, 903.59it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|██        | 15016/75017 [00:16<01:04, 936.68it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|██        | 15111/75017 [00:16<01:06, 898.94it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|██        | 15202/75017 [00:16<01:08, 878.48it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|██        | 15302/75017 [00:16<01:05, 905.01it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 21%|██        | 15394/75017 [00:17<01:10, 848.72it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 21%|██        | 15484/75017 [00:17<01:09, 861

 30%|███       | 22768/75017 [00:24<00:49, 1063.77it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 31%|███       | 22881/75017 [00:24<00:48, 1081.16it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 31%|███       | 23000/75017 [00:24<00:46, 1108.11it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 31%|███       | 23112/75017 [00:24<00:47, 1098.95it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 31%|███       | 23223/75017 [00:24<00:47, 1092.13it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 31%|███       | 23333/75017 [00:24<00:47, 1091.42it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 31%|███▏      | 23456/75017 [00:24<00:45, 1128.51it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 31%|███▏      | 23582/75017 [00:24<00:44, 1163.24it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 32%|███▏      | 23704/75017 [00:25<00:43, 1176.61it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 32%|███▏      | 23823/75017 [00:25<0

 55%|█████▌    | 41304/75017 [00:39<00:28, 1181.14it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 55%|█████▌    | 41423/75017 [00:40<00:28, 1159.65it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 55%|█████▌    | 41550/75017 [00:40<00:28, 1189.20it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 56%|█████▌    | 41674/75017 [00:40<00:27, 1202.30it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 56%|█████▌    | 41795/75017 [00:40<00:27, 1192.76it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 56%|█████▌    | 41915/75017 [00:40<00:28, 1151.49it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 56%|█████▌    | 42031/75017 [00:40<00:29, 1130.75it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 56%|█████▌    | 42145/75017 [00:40<00:29, 1112.69it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 56%|█████▋    | 42257/75017 [00:40<00:30, 1072.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 56%|█████▋    | 42365/75017 [00:40<0

 80%|████████  | 60067/75017 [00:55<00:11, 1263.49it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 80%|████████  | 60194/75017 [00:55<00:11, 1260.81it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 80%|████████  | 60321/75017 [00:55<00:11, 1234.46it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 81%|████████  | 60445/75017 [00:56<00:11, 1224.79it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 81%|████████  | 60577/75017 [00:56<00:11, 1250.10it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 81%|████████  | 60711/75017 [00:56<00:11, 1275.30it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 81%|████████  | 60839/75017 [00:56<00:12, 1134.55it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 81%|████████▏ | 60956/75017 [00:56<00:12, 1138.78it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 81%|████████▏ | 61084/75017 [00:56<00:11, 1177.62it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 82%|████████▏ | 61204/75017 [00:56<0

In [250]:
udf

Unnamed: 0,did,watch,watch_vids,watch_times,cids
0,b85ee45e0b0f0d32ea27b83859943bd8,"[[1424450711, 1970231413], [1424450128, 209639...","[1970231413, 2096393410, 1343904424, 145739096...","[1424450711, 1424450128, 1424449087, 142444791...","1464895332.0,1810498863.0,1810498863.0,1810498..."
1,ec51024cacd62b2f44a5dedeab3b3ddd,"[[1424447915, 3224674369], [1424442687, 218108...","[3224674369, 218108376, 3835874799, 4073766715...","[1424447915, 1424442687, 1424442630, 142443557...","0,0,0,0,1108246334.0,1389669965.0,0,1389669965..."
2,805a6dfc253c8fd72fff89a1468bf101,"[[1424433395, 3683265124], [1424177945, 271756...","[3683265124, 271756254, 385283079, 2722220318]","[1424433395, 1424177945, 1424177720, 1424177550]",000
3,681af5bb683758603018eb30ced355a5,"[[1424429841, 3753149116], [1424416653, 194721...","[3753149116, 1947216289, 2542638947, 381370225...","[1424429841, 1424416653, 1424416122, 142441564...","2880934939.0,2880934939.0,2880934939.0,2880934..."
4,2f66caafbdd897ed371378323a2b9c41,"[[1424433612, 113020635], [1424314589, 3359125...","[113020635, 3359125398, 2536631690, 1347456801...","[1424433612, 1424314589, 1424151148, 142415092...","2957047406.0,2722543282.0,4137826533.0,2596226..."
...,...,...,...,...,...
75012,d679be3058b1c1b39e45ce22211cc2aa,"[[1424437707, 3168322038], [1424437583, 263846...","[3168322038, 2638467458, 1325108032, 125660504...","[1424437707, 1424437583, 1424437506, 142443742...","2321913827.0,2321913827.0,2321913827.0,3926084..."
75013,ffd7e854e2d6ae73ae31a3ce09e17992,"[[1424431182, 597364737], [1424404501, 2779388...","[597364737, 2779388063, 2683083235, 3224674369...","[1424431182, 1424404501, 1424404402, 142440435...","4226625658.0,4226625658.0,4226625658.0,4226625..."
75014,b36e5a2d261cd9ec0f4b5f8424e9f1b9,"[[1424348459, 1123409917]]",[1123409917],[1424348459],2596226704.0
75015,916739a22226633e38c3ed2ea6b6ba44,"[[1424453233, 1343904424], [1424450895, 145739...","[1343904424, 1457390961, 4073766715, 607921262...","[1424453233, 1424450895, 1424449665, 142444965...","2880934939.0,2880934939.0,2880934939.0,2880934..."


In [220]:
deal_his(udf.did.values[0])

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [221]:
dw = pd.DataFrame({'vid': watch_vids}, index=[0])

ValueError: Shape of passed values is (75017, 1), indices imply (1, 1)

In [204]:
udf

Unnamed: 0,did,watch,watch_vids,watch_times
0,b85ee45e0b0f0d32ea27b83859943bd8,"[[1424450711, 1970231413], [1424450128, 209639...","[1970231413, 2096393410, 1343904424, 145739096...","[1424450711, 1424450128, 1424449087, 142444791..."
1,ec51024cacd62b2f44a5dedeab3b3ddd,"[[1424447915, 3224674369], [1424442687, 218108...","[3224674369, 218108376, 3835874799, 4073766715...","[1424447915, 1424442687, 1424442630, 142443557..."
2,805a6dfc253c8fd72fff89a1468bf101,"[[1424433395, 3683265124], [1424177945, 271756...","[3683265124, 271756254, 385283079, 2722220318]","[1424433395, 1424177945, 1424177720, 1424177550]"
3,681af5bb683758603018eb30ced355a5,"[[1424429841, 3753149116], [1424416653, 194721...","[3753149116, 1947216289, 2542638947, 381370225...","[1424429841, 1424416653, 1424416122, 142441564..."
4,2f66caafbdd897ed371378323a2b9c41,"[[1424433612, 113020635], [1424314589, 3359125...","[113020635, 3359125398, 2536631690, 1347456801...","[1424433612, 1424314589, 1424151148, 142415092..."
...,...,...,...,...
75012,d679be3058b1c1b39e45ce22211cc2aa,"[[1424437707, 3168322038], [1424437583, 263846...","[3168322038, 2638467458, 1325108032, 125660504...","[1424437707, 1424437583, 1424437506, 142443742..."
75013,ffd7e854e2d6ae73ae31a3ce09e17992,"[[1424431182, 597364737], [1424404501, 2779388...","[597364737, 2779388063, 2683083235, 3224674369...","[1424431182, 1424404501, 1424404402, 142440435..."
75014,b36e5a2d261cd9ec0f4b5f8424e9f1b9,"[[1424348459, 1123409917]]",[1123409917],[1424348459]
75015,916739a22226633e38c3ed2ea6b6ba44,"[[1424453233, 1343904424], [1424450895, 145739...","[1343904424, 1457390961, 4073766715, 607921262...","[1424453233, 1424450895, 1424449665, 142444965..."


In [205]:
udf[udf.did.isin(set(df.did))]

Unnamed: 0,did,watch,watch_vids,watch_times
0,b85ee45e0b0f0d32ea27b83859943bd8,"[[1424450711, 1970231413], [1424450128, 209639...","[1970231413, 2096393410, 1343904424, 145739096...","[1424450711, 1424450128, 1424449087, 142444791..."
1,ec51024cacd62b2f44a5dedeab3b3ddd,"[[1424447915, 3224674369], [1424442687, 218108...","[3224674369, 218108376, 3835874799, 4073766715...","[1424447915, 1424442687, 1424442630, 142443557..."
2,805a6dfc253c8fd72fff89a1468bf101,"[[1424433395, 3683265124], [1424177945, 271756...","[3683265124, 271756254, 385283079, 2722220318]","[1424433395, 1424177945, 1424177720, 1424177550]"
3,681af5bb683758603018eb30ced355a5,"[[1424429841, 3753149116], [1424416653, 194721...","[3753149116, 1947216289, 2542638947, 381370225...","[1424429841, 1424416653, 1424416122, 142441564..."
4,2f66caafbdd897ed371378323a2b9c41,"[[1424433612, 113020635], [1424314589, 3359125...","[113020635, 3359125398, 2536631690, 1347456801...","[1424433612, 1424314589, 1424151148, 142415092..."
...,...,...,...,...
75012,d679be3058b1c1b39e45ce22211cc2aa,"[[1424437707, 3168322038], [1424437583, 263846...","[3168322038, 2638467458, 1325108032, 125660504...","[1424437707, 1424437583, 1424437506, 142443742..."
75013,ffd7e854e2d6ae73ae31a3ce09e17992,"[[1424431182, 597364737], [1424404501, 2779388...","[597364737, 2779388063, 2683083235, 3224674369...","[1424431182, 1424404501, 1424404402, 142440435..."
75014,b36e5a2d261cd9ec0f4b5f8424e9f1b9,"[[1424348459, 1123409917]]",[1123409917],[1424348459]
75015,916739a22226633e38c3ed2ea6b6ba44,"[[1424453233, 1343904424], [1424450895, 145739...","[1343904424, 1457390961, 4073766715, 607921262...","[1424453233, 1424450895, 1424449665, 142444965..."


In [206]:
l = []
udf_ = udf[udf.did.isin(set(df.did))]
watch_vids = udf.watch_vids.values
for i in tqdm(range(len(watch_vids))):
  res = deal_his(watch_vids[i])
  l += [res]







  0%|          | 0/75017 [00:00<?, ?it/s][A[A[A[A[A[A





  0%|          | 1/75017 [00:00<4:21:02,  4.79it/s][A[A[A[A[A[A





  0%|          | 3/75017 [00:00<3:37:51,  5.74it/s][A[A[A[A[A[A





  0%|          | 5/75017 [00:00<3:07:28,  6.67it/s][A[A[A[A[A[A





  0%|          | 7/75017 [00:00<2:46:52,  7.49it/s][A[A[A[A[A[A





  0%|          | 9/75017 [00:00<2:32:17,  8.21it/s][A[A[A[A[A[A





  0%|          | 11/75017 [00:01<2:21:24,  8.84it/s][A[A[A[A[A[A





  0%|          | 13/75017 [00:01<2:13:52,  9.34it/s][A[A[A[A[A[A





  0%|          | 15/75017 [00:01<2:08:49,  9.70it/s][A[A[A[A[A[A





  0%|          | 17/75017 [00:01<2:04:52, 10.01it/s][A[A[A[A[A[A





  0%|          | 19/75017 [00:01<2:05:10,  9.99it/s][A[A[A[A[A[A





  0%|          | 21/75017 [00:02<2:02:42, 10.19it/s][A[A[A[A[A[A





  0%|          | 23/75017 [00:02<2:00:34, 10.37it/s][A[A[A[A[A[A





  0%|          | 25/75

KeyboardInterrupt: 

In [187]:
deal_his(udf.watch_vids.values[0])

stars_list        [2161915924, 418568571, 1091459898, 991727417,...
titles            [865770, 220341, 42, 44, 217, 22, 21, 20, 19, ...
cids              [3926084406, 3926084406, 3926084406, 392608440...
class_ids         [1820266297, 1820266297, 1820266297, 182026629...
is_intacts        [2234347078, 2234347078, 2234347078, 223434707...
second_classes    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
dtype: object

In [191]:
%timeit
udf.watch_vids.apply(deal_his)

KeyboardInterrupt: 

In [None]:
%timeit
pd.concat([df, udf.watch_vids.apply(deal_his))])

In [None]:
# df[['col1', 'col2']] = df['col3'].apply(lambda x: pd.Series('val1', 'val2'))

In [21]:
index = 0

In [27]:
vocabs = {}
vocab_names = [
                  'vid', 'words', 'stars', 'did', 'region', 'sver', 
                  'mod', 'mf', 'aver', 'is_intact', 'second_class', 'class_id', 'cid',
                ]
for vocab_name in vocab_names:
  vocabs[vocab_name] = gezi.Vocab(f'../input/all/{vocab_name}.txt')

In [31]:
ofile = f'{FLAGS.odir}/record_{index}.TMP'

if not FLAGS.force:
  if glob.glob(f'{FLAGS.odir}/record_{index}*'):
    print(f'{FLAGS.odir}/record_{index} exists')
#     return

num_records = 0

total = len(df)
start, end = gezi.get_fold(total, FLAGS.num_records, index)
df_ = df.iloc[start:end]

if FLAGS.toy:
  df_ = df_[:2000]

def _id(row, name):
  return vocabs[name].id(row[name])

def _id2(row, name):
  if not np.isnan(row[name]):
    return vocabs[name].id(int(row[name]))
  else:
    return vocabs[name].unk_id()

missing_image_emb = list(np.random.uniform(-0.05, 0.05,(128,)))
with melt.tfrecords.Writer(ofile) as writer:
  for _, row in tqdm(df_.iterrows(), total=len(df_), ascii=True):      
    feature = {}

    did = row['did']
    vid = row['vid']

    day = 0 if not FLAGS.mark == 'train' else FLAGS.day
    feature['day'] = day
    feature['index'] = row['index']

    feature['label'] = row['label'] if 'label' in row else 0

    # -------id 
    feature['id'] = f'{did}\t{vid}'
    feature['did'] = _id(row, 'did')
    feature['vid'] = _id(row, 'vid')

    feature['did_'] = did
    feature['vid_'] = vid

    # -------user
    watch_times = []
    watch_vids = []
    cids = []
    class_ids = []
    second_classes = []
    is_intacts = []
    stars_list = []
    titles = []
    durs = []
    freshes = []

    match_stars = 0
    match_cids = 0
    match_class_ids = 0
    match_second_classes = 0
    match_is_intacts = 0
    match_prev = 0
    match_first_word = 0

    match_last_stars = 0
    match_last_cids = 0
    match_last_class_ids = 0
    match_last_second_classes = 0
    match_last_is_intacts = 0
    match_last_prev = 0
    match_last_first_word = 0

    cur_stars = set(row['stars'])
    
    i = 0
    for x in row['watch']:
      # 非常重要 否则数据穿越。。。
      if not is_badtime(x[0], row['timestamp']):
        i += 1
#         wtime, wvid = x[0], x[1]
#         watch_times += [wtime]
#         watch_vids += [vocabs['vid'].id(wvid)]





  0%|          | 0/100257 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/100257 [00:00<7:31:41,  3.70it/s][A[A[A[A



  0%|          | 35/100257 [00:00<5:17:33,  5.26it/s][A[A[A[A



  0%|          | 91/100257 [00:00<3:43:03,  7.48it/s][A[A[A[A



  0%|          | 184/100257 [00:00<2:36:32, 10.65it/s][A[A[A[A



  0%|          | 241/100257 [00:00<1:50:24, 15.10it/s][A[A[A[A



  0%|          | 296/100257 [00:00<1:18:09, 21.32it/s][A[A[A[A



  0%|          | 346/100257 [00:00<55:44, 29.87it/s]  [A[A[A[A



  0%|          | 407/100257 [00:00<39:49, 41.79it/s][A[A[A[A



  0%|          | 490/100257 [00:01<28:27, 58.42it/s][A[A[A[A



  1%|          | 552/100257 [00:01<20:59, 79.18it/s][A[A[A[A



  1%|          | 609/100257 [00:01<15:39, 106.06it/s][A[A[A[A



  1%|          | 665/100257 [00:01<12:03, 137.59it/s][A[A[A[A



  1%|          | 718/100257 [00:01<09:45, 169.95it/s][A[A[A[A



  1%|          | 766/100257 [00:01<07:58,

KeyboardInterrupt: 