In [65]:
import gezi
from gezi import tqdm

In [66]:
behaviors_names = ['impression_id', 'uid', 'time', 'history', 'impressions']

In [67]:
TEST_START = 1000000
uid_vocab = gezi.WordCounter()
did_vocab = gezi.WordCounter()
train_uid_vocab = gezi.WordCounter()
train_did_vocab = gezi.WordCounter()
dev_uid_vocab = gezi.WordCounter()
dev_did_vocab = gezi.WordCounter()
test_uid_vocab = gezi.WordCounter()
test_did_vocab = gezi.WordCounter()

files = [
  '../input/train/behaviors.tsv',
  '../input/dev/behaviors.tsv',
  '../input/test/behaviors.tsv',
]

# 让train dev出现的排在test出现的前面
# history 会用户去重 一个用户一个history 对应一次history 中doc计数
# 理论上 不去重可以由uid表征

uids = set()
dids = set()
def _add(did, file, is_history=False):
  count = 1 if ('test' in file and not is_history) or did in dids else 1000000
  dids.add(did)
  did_vocab.add(did, count)
  if 'train' in file:
    train_did_vocab.add(did)
  if 'dev' in file:
    dev_did_vocab.add(did)
  if 'test' in file:
    test_did_vocab.add(did)

for file in files:
  total = len(open(file).readlines())
  for line in tqdm(open(file), total=total):
    l = line.strip().split('\t')
    uid, history, impressions = l[1], l[-2], l[-1]
    count = 1 if 'test' in file or uid in uids else TEST_START
    
    uid_vocab.add(uid, count)
    if 'train' in file:
      train_uid_vocab.add(uid)
    if 'dev' in file:
      dev_uid_vocab.add(uid)
    if 'test' in file:
      test_uid_vocab.add(uid)
    
    for did in history.split():
      if uid not in uids:
        _add(did, file, is_history=False)
      _add(did, file, is_history=False)
    for did in impressions.split():
      did = did.split('-')[0]
      _add(did, file)
      
    uids.add(uid)

100%|██████████| 2232748/2232748 [08:37<00:00, 4311.55it/s]
100%|██████████| 376471/376471 [01:22<00:00, 4545.95it/s]
100%|██████████| 2370727/2370727 [11:34<00:00, 3414.25it/s]


In [114]:
uid_vocab.save('../input/uid.txt')
did_vocab.save('../input/did.txt')

total_count 750438229512 unknown_count 0 total_word 876956
total_count 104541073881 unknown_count 0 total_word 130379


In [115]:
len(did_vocab.counter)

130379

In [116]:
train_uid_vocab.save('../input/train/uid.txt')
train_did_vocab.save('../input/train/did.txt')
dev_uid_vocab.save('../input/dev/uid.txt')
dev_did_vocab.save('../input/dev/did.txt')
test_uid_vocab.save('../input/test/uid.txt')
test_did_vocab.save('../input/test/did.txt')

total_count 2232748 unknown_count 0 total_word 711222
total_count 170487392 unknown_count 0 total_word 101527
total_count 376471 unknown_count 0 total_word 255990
total_count 26760487 unknown_count 0 total_word 72023
total_count 2370727 unknown_count 0 total_word 702005
total_count 192930153 unknown_count 0 total_word 120961


In [117]:
len(uid_vocab.counter)

876956

In [118]:
len(train_uid_vocab.counter)

711222

In [119]:
len(dev_uid_vocab.counter)

255990

In [120]:
len(test_uid_vocab.counter)

702005

In [121]:
len(set(dev_uid_vocab.counter) - set(train_uid_vocab.counter)) / len(dev_uid_vocab.counter)

0.15317785851009805

In [122]:
len(set(test_uid_vocab.counter) - set(train_uid_vocab.counter)) / len(test_uid_vocab.counter)

0.22116651590800634

In [123]:
len(set(dev_did_vocab.counter) - set(train_did_vocab.counter)) / len(dev_did_vocab.counter)

0.03643280618691251

In [124]:
len(set(test_did_vocab.counter) - set(train_did_vocab.counter)) / len(test_did_vocab.counter)

0.23380263059994544

In [125]:
# dev uid 
def uid_info(mark):
  file = f'../input/{mark}/behaviors.tsv'
  total = len(open(file).readlines())
  new_uid_impress = 0
  for line in tqdm(open(file), total=total):
    l = line.strip().split('\t')
    uid = l[1]
    if not uid in train_uid_vocab.counter:
      new_uid_impress += 1
  print('new uid impression ratio:', new_uid_impress / total, new_uid_impress, total)

In [126]:
uid_info('dev')

100%|██████████| 376471/376471 [00:01<00:00, 291533.40it/s]

new uid impression ratio: 0.12912017127481268 48610 376471





In [127]:
uid_info('test')

100%|██████████| 2370727/2370727 [00:08<00:00, 288344.12it/s]

new uid impression ratio: 0.1510481805792063 358094 2370727





In [128]:
def did_info(mark):
  file = f'../input/{mark}/behaviors.tsv'
  total = len(open(file).readlines())
  new_did_insts = 0
  total_impres = 0
  new_did_impres = 0
  for line in tqdm(open(file), total=total):
    l = line.strip().split('\t')
    impressions = l[-1]
    find = 0
    for item in impressions.split():
      did = item.split('-')[0]
      if did not in train_did_vocab.counter:
        find += 1
      total += 1
    new_did_insts += find
    if find:
      new_did_impres += 1
    total_impres += 1

  print('new_did_insts_ratio:', new_did_insts / total)
  print('new_did_impress_ratio:', new_did_impres / total_impres)

In [129]:
did_info('dev')

100%|██████████| 376471/376471 [00:11<00:00, 32611.16it/s]

new_did_insts_ratio: 0.05225947564200539
new_did_impress_ratio: 0.6119568306722164





In [130]:
did_info('test')

100%|██████████| 2370727/2370727 [01:05<00:00, 35953.38it/s]

new_did_insts_ratio: 0.8388794501310185
new_did_impress_ratio: 0.9996380857011372





In [131]:
vocab = gezi.Vocab('../input/train/uid.txt')

In [132]:
vocab.size()

711224

In [133]:
vocab.size(10)

28899

In [134]:
vocab = gezi.Vocab('../input/train/did.txt')

In [135]:
vocab.size()

101529

In [136]:
vocab.size(10)

80034

In [137]:
vocab = gezi.Vocab('../input/entity.txt')

In [138]:
vocab.size()

50927

In [139]:
vocab.size(10)

44104

In [140]:
vocab = gezi.Vocab('../input/entity_type.txt')

In [141]:
vocab.size()

24

In [142]:
vocab.size(10)

23