In [1]:
import gezi
from gezi import tqdm
import numpy as np
from transformers import AutoTokenizer
import json

In [2]:
news_names = ['did', 'cat', 'sub_cat', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

# title 

In [3]:
files = [
          '../input/news/news.tsv',
        ]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 30

vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

empty = [0] * emb_size
empty[0] = 1

emb = [empty] * emb_height
lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title = l[0], l[3]
    if title:
      tokens = tokenizer.encode(title)
    else:
      tokens = [1]
    lens += [len(tokens[1:-1])]
    tokens = gezi.pad(tokens[1:-1], emb_size)
    emb[vocab.id(did)] = np.asarray(tokens)
emb = np.asarray(emb)

130381


100%|██████████| 130379/130379 [00:43<00:00, 3011.49it/s]


In [4]:
print(emb)
print(emb.shape)

np.save(f'../input/title_lookup.npy', emb)

[[    1     0     0 ...     0     0     0]
 [    1     0     0 ...     0     0     0]
 [ 1109  3128  1130 ...     0     0     0]
 ...
 [ 9743  1192 12958 ...     0     0     0]
 [ 2289 24409   117 ...     0     0     0]
 [21166   119  3254 ...     0     0     0]]
(130381, 30)


In [5]:
print(np.mean(lens))
print(np.min(lens))
print(np.max(lens))

15.035120686613642
2
74


# cat sbucat

In [6]:
files = [
          '../input/news/news.tsv',
        ]


emb_size = 2
vocab = gezi.Vocab('../input/did.txt')
cat_vocab = gezi.Vocab('../input/cat.txt')
scat_vocab = gezi.Vocab('../input/sub_cat.txt')
emb_height = vocab.size()
print(emb_height)
empty = [1] * emb_size
emb = [empty] * emb_height

for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, cat, sub_cat = l[0], l[1], l[2]
    data = [cat_vocab.id(cat), scat_vocab.id(sub_cat)]
    emb[vocab.id(did)] = np.asarray(data)

emb = np.asarray(emb)
print(emb)
print(emb.shape)

np.save(f'../input/cat_lookup.npy', emb)

130381


100%|██████████| 130379/130379 [00:01<00:00, 113124.27it/s]


[[  1   1]
 [  1   1]
 [  3  40]
 ...
 [ 10 104]
 [  2   3]
 [  2  19]]
(130381, 2)


# abstract

In [7]:
files = [
          '../input/news/news.tsv',
        ]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 50

vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

empty = [0] * emb_size
empty[0] = 1

emb = [empty] * emb_height
lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, abstract = l[0], l[4]
    if abstract:
      tokens = tokenizer.encode(abstract)
    else:
      tokens = [1]
    lens += [len(tokens[1:-1])]
    tokens = gezi.pad(tokens[1:-1], emb_size)
    emb[vocab.id(did)] = np.asarray(tokens)

emb = np.asarray(emb)
print(emb)
print(emb.shape)

np.save(f'../input/abstract_lookup.npy', emb)

130381


100%|██████████| 130379/130379 [01:55<00:00, 1126.40it/s]


[[    1     0     0 ...     0     0     0]
 [    1     0     0 ...     0     0     0]
 [ 3128  1112  1562 ...     0     0     0]
 ...
 [ 1636  1132  1103 ...     0     0     0]
 [  123   118  1795 ...  1795 27019   117]
 [ 1109  1148  1226 ...  1509  3299  1406]]
(130381, 50)


In [8]:
print(np.mean(lens))
print(np.min(lens))
print(np.max(lens))

49.402311721979764
0
944


# body

In [9]:
files = [
          '../input/news/msn.json',
        ]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 100

vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

empty = [0] * emb_size
empty[0] = 1
emb = [empty] * emb_height
lens = []

bodys = json.load(open(files[0]))

130381


In [10]:
nid2did = {}
file = '../input/news/news.tsv'
total = len(open(file).readlines())
for line in tqdm(open(file), total=total):
  l = line.strip().split('\t')
  did, url = l[0], l[5]
  nid = url.split('/')[-1].split('.')[0]
  nid2did[nid] = did

100%|██████████| 130379/130379 [00:00<00:00, 213959.13it/s]


In [11]:
lens = []
for body_ in tqdm(bodys, total=len(bodys)):
  body = ' '.join(body_['body'])
  body = ' '.join(body.split()[:100])
  tokens = tokenizer.encode(body)
  totkens = tokens[1:-1]
  lens += [len(tokens)]
  did = vocab.id(nid2did[body_['nid']])
  tokens = gezi.pad(tokens[1:-1], emb_size)
  emb[did] = np.asarray(tokens)
#   if len(lens) > 100:
#     break

100%|██████████| 130379/130379 [04:37<00:00, 469.33it/s]


In [17]:
emb = np.asarray(emb)
emb

array([[    1,     0,     0, ...,     0,     0,     0],
       [    1,     0,     0, ...,     0,     0,     0],
       [ 9300, 14263,  2821, ...,  9707,  7308,  4317],
       ...,
       [ 2372,  1128,  1107, ...,  1170,  1105,  1256],
       [  123,   118,  1795, ...,  4254,  2237,  1348],
       [ 1109,  1148,  1226, ...,  8125, 24996,   117]])

In [18]:
np.mean(lens)

121.96120540884651

In [19]:
np.max(lens)

623

In [20]:
np.min(lens)

2

In [21]:
np.save(f'../input/body_lookup.npy', emb)

# Entity

In [38]:
files = [
          '../input/news/news.tsv',
        ]

# emb_size = 2
vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)
# emb = [empty] * emb_height

title_entity_lens = []
abstract_entity_lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title_entities, abstract_entities = l[0], l[-2], l[-1]
    title_entities = json.loads(title_entities)
    abstract_entities = json.loads(abstract_entities)
    title_entity_lens +=  [len(title_entities)]
    abstract_entity_lens += [len(abstract_entities)]
    if len(abstract_entities) > 10:
      print(line)
      print(title_entities)
      print(abstract_entities)
      print('\n'.join(l))
      break

  0%|          | 331/130379 [00:00<00:02, 54709.75it/s]

130381
N82836	health	medical	Where to Get a Cheap Flu Shot: Walmart, CVS, Costco, and More	The Centers for Disease Control and Prevention recommends that everyone over the age of 6 months get a dose of the influenza vaccine. To find out where to get cheap flu shots in 2019, we compared prices at Walgreens, CVS, Rite Aid, Walmart, Target, Kroger, Safeway, Meijer, Costco, and Sam's Club.	https://assets.msn.com/labs/mind/AAI3QbY.html	[{"Label": "Costco", "Type": "O", "WikidataId": "Q715583", "Confidence": 1.0, "OccurrenceOffsets": [45], "SurfaceForms": ["Costco"]}, {"Label": "CVS Pharmacy", "Type": "O", "WikidataId": "Q2078880", "Confidence": 0.971, "OccurrenceOffsets": [40], "SurfaceForms": ["CVS"]}, {"Label": "Walmart", "Type": "O", "WikidataId": "Q483551", "Confidence": 1.0, "OccurrenceOffsets": [31], "SurfaceForms": ["Walmart"]}]	[{"Label": "Costco", "Type": "O", "WikidataId": "Q715583", "Confidence": 1.0, "OccurrenceOffsets": [274], "SurfaceForms": ["Costco"]}, {"Label": "Meijer", "T




In [None]:
files = [
          '../input/news/news.tsv',
        ]

# emb_size = 2
vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

entity_vocab = gezi.Vocab('../input/entity.txt')
entity_type_vocab = gezi.Vocab('../input/entity_type.txt')


MAX_TITLE_ENTITIES = 9
MAX_ABSTRACT_ENTITIES = 30

emb_size = (MAX_TITLE_ENTITIES + MAX_ABSTRACT_ENTITIES) * 2

empty = [0] * emb_size
empty[0] = 1 
empty[MAX_TITLE_ENTITIES] = 1
emb = [empty] * emb_height

title_entity_lens = []
abstract_entity_lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title_entities, abstract_entities = l[0], l[-2], l[-1]
    did = vocab.id(did)
    title_entities = json.loads(title_entities)
    abstract_entities = json.loads(abstract_entities)
    for i, m in enumerate(title_entities):
      emb[did][i] = entity_vocab.id(enm['WikidataId'])
      

In [32]:
np.mean(title_entity_lens)

1.1927917839529372

In [33]:
np.max(title_entity_lens)

9

In [34]:
np.min(title_entity_lens)

0

In [35]:
np.mean(abstract_entity_lens)

1.9756095690256867

In [36]:
np.max(abstract_entity_lens)

30

In [37]:
np.min(abstract_entity_lens)

0

# merge emb

In [25]:
keys = ['cat', 'title']
embs = []
for key in keys:
  embs += [np.load(f'../input/{key}_lookup.npy')]

In [26]:
emb = np.concatenate(embs, 1)
emb

array([[    1,     1,     1, ...,     0,     0,     0],
       [    1,     1,     1, ...,     0,     0,     0],
       [    3,    40,  1109, ...,     0,     0,     0],
       ...,
       [   10,   104,  9743, ...,     0,     0,     0],
       [    2,     3,  2289, ...,     0,     0,     0],
       [    2,    19, 21166, ...,     0,     0,     0]])

In [27]:
emb.shape

(130381, 32)

In [28]:
np.save('../input/doc_lookup.npy', emb)