In [7]:
import gezi
from gezi import tqdm
import numpy as np
from transformers import AutoTokenizer
import json

In [8]:
news_names = ['did', 'cat', 'sub_cat', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

# title 

In [9]:
fnames = {}
flens = {}
fnames['title'] = ['title']
flens['title'] = [30]

In [106]:
files = [
          '../input/news/news.tsv',
        ]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 30

vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

empty = [0] * emb_size
empty[0] = 1

emb = [empty] * emb_height
lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title = l[0], l[3]
    if title:
      tokens = tokenizer.encode(title)
    else:
      tokens = [1]
    lens += [len(tokens[1:-1])]
    tokens = gezi.pad(tokens[1:-1], emb_size)
    emb[vocab.id(did)] = np.asarray(tokens)
emb = np.asarray(emb)

130381


100%|██████████| 130379/130379 [00:45<00:00, 2887.69it/s]


In [107]:
print(emb)
print(emb.shape)

np.save(f'../input/title_lookup.npy', emb)

[[    1     0     0 ...     0     0     0]
 [    1     0     0 ...     0     0     0]
 [ 1109  3128  1130 ...     0     0     0]
 ...
 [ 9743  1192 12958 ...     0     0     0]
 [ 2289 24409   117 ...     0     0     0]
 [21166   119  3254 ...     0     0     0]]
(130381, 30)


In [18]:
import torch
from transformers import *
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [22]:
def tovec(sent):
  input_ids = torch.tensor(tokenizer.encode('[CLS] ' + sent)).unsqueeze(0)  # Batch size 1
  outputs = model(input_ids)
  last_hidden_states = outputs[0]
  emb = last_hidden_states[0][0]
  return emb.detach().numpy()

In [None]:
files = [
          '../input/news/news.tsv',
        ]

emb_size = 768

vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

empty = [0.] * emb_size

emb = [empty] * emb_height
lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title = l[0], l[3]
    emb[vocab.id(did)] = tovec(title)
emb = np.asarray(emb)

130381


 26%|██▌       | 33844/130379 [1:10:31<4:12:45,  6.37it/s] 

In [109]:
print(np.mean(lens))
print(np.min(lens))
print(np.max(lens))

15.035120686613642
2
74


# cat sbucat

In [110]:
fnames['cat'] = ['cat', 'sub_cat']
flens['cat'] = [1, 1]

In [111]:
files = [
          '../input/news/news.tsv',
        ]


emb_size = 2
vocab = gezi.Vocab('../input/did.txt')
cat_vocab = gezi.Vocab('../input/cat.txt')
scat_vocab = gezi.Vocab('../input/sub_cat.txt')
emb_height = vocab.size()
print(emb_height)
empty = [1] * emb_size
emb = [empty] * emb_height

for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, cat, sub_cat = l[0], l[1], l[2]
    data = [cat_vocab.id(cat), scat_vocab.id(sub_cat)]
    emb[vocab.id(did)] = np.asarray(data)

emb = np.asarray(emb)
print(emb)
print(emb.shape)

np.save(f'../input/cat_lookup.npy', emb)

  0%|          | 0/130379 [00:00<?, ?it/s]

130381


100%|██████████| 130379/130379 [00:01<00:00, 109229.73it/s]


[[  1   1]
 [  1   1]
 [  3  40]
 ...
 [ 10 104]
 [  2   3]
 [  2  19]]
(130381, 2)


# abstract

In [112]:
fnames['abstract'] = ['abstract']
flens['abstract'] = [50]

In [113]:
files = [
          '../input/news/news.tsv',
        ]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 50

vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

empty = [0] * emb_size
empty[0] = 1

emb = [empty] * emb_height
lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, abstract = l[0], l[4]
    if abstract:
      tokens = tokenizer.encode(abstract)
    else:
      tokens = [1]
    lens += [len(tokens[1:-1])]
    tokens = gezi.pad(tokens[1:-1], emb_size)
    emb[vocab.id(did)] = np.asarray(tokens)

emb = np.asarray(emb)
print(emb)
print(emb.shape)

np.save(f'../input/abstract_lookup.npy', emb)

130381


100%|██████████| 130379/130379 [01:56<00:00, 1120.87it/s]


[[    1     0     0 ...     0     0     0]
 [    1     0     0 ...     0     0     0]
 [ 3128  1112  1562 ...     0     0     0]
 ...
 [ 1636  1132  1103 ...     0     0     0]
 [  123   118  1795 ...  1795 27019   117]
 [ 1109  1148  1226 ...  1509  3299  1406]]
(130381, 50)


In [115]:
print(np.mean(lens))
print(np.min(lens))
print(np.max(lens))

49.402311721979764
0
944


# body

In [73]:
fnames['body'] = ['body']
flens['body'] = [100]

In [9]:
files = [
          '../input/news/msn.json',
        ]

model_name = 'bert-base-cased'
model = f'/home/gezi/data/lm/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(model)

emb_size = 100

vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

empty = [0] * emb_size
empty[0] = 1
emb = [empty] * emb_height
lens = []

bodys = json.load(open(files[0]))

130381


In [10]:
nid2did = {}
file = '../input/news/news.tsv'
total = len(open(file).readlines())
for line in tqdm(open(file), total=total):
  l = line.strip().split('\t')
  did, url = l[0], l[5]
  nid = url.split('/')[-1].split('.')[0]
  nid2did[nid] = did

100%|██████████| 130379/130379 [00:00<00:00, 213959.13it/s]


In [11]:
lens = []
for body_ in tqdm(bodys, total=len(bodys)):
  body = ' '.join(body_['body'])
  body = ' '.join(body.split()[:100])
  tokens = tokenizer.encode(body)
  totkens = tokens[1:-1]
  lens += [len(tokens)]
  did = vocab.id(nid2did[body_['nid']])
  tokens = gezi.pad(tokens[1:-1], emb_size)
  emb[did] = np.asarray(tokens)
#   if len(lens) > 100:
#     break

100%|██████████| 130379/130379 [04:37<00:00, 469.33it/s]


In [17]:
emb = np.asarray(emb)
emb

array([[    1,     0,     0, ...,     0,     0,     0],
       [    1,     0,     0, ...,     0,     0,     0],
       [ 9300, 14263,  2821, ...,  9707,  7308,  4317],
       ...,
       [ 2372,  1128,  1107, ...,  1170,  1105,  1256],
       [  123,   118,  1795, ...,  4254,  2237,  1348],
       [ 1109,  1148,  1226, ...,  8125, 24996,   117]])

In [18]:
np.mean(lens)

121.96120540884651

In [19]:
np.max(lens)

623

In [20]:
np.min(lens)

2

In [21]:
np.save(f'../input/body_lookup.npy', emb)

# Entity

In [116]:
fnames['entity'] = ['title_entities', 'abstract_entities', 'title_entity_types', 'abstract_entity_types']
flens['entity'] = [5, 5, 5, 5]

In [117]:
files = [
          '../input/news/news.tsv',
        ]

# emb_size = 2
vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)
# emb = [empty] * emb_height

title_entity_lens = []
abstract_entity_lens = []
for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title_entities, abstract_entities = l[0], l[-2], l[-1]
    title_entities = json.loads(title_entities)
    abstract_entities = json.loads(abstract_entities)
    title_entity_lens +=  [len(title_entities)]
    abstract_entity_lens += [len(abstract_entities)]
    if len(abstract_entities) > 10:
      print(line)
      print(title_entities)
      print(abstract_entities)
      print('\n'.join(l))
      break

130381


  0%|          | 331/130379 [00:00<00:02, 49564.96it/s]

N82836	health	medical	Where to Get a Cheap Flu Shot: Walmart, CVS, Costco, and More	The Centers for Disease Control and Prevention recommends that everyone over the age of 6 months get a dose of the influenza vaccine. To find out where to get cheap flu shots in 2019, we compared prices at Walgreens, CVS, Rite Aid, Walmart, Target, Kroger, Safeway, Meijer, Costco, and Sam's Club.	https://assets.msn.com/labs/mind/AAI3QbY.html	[{"Label": "Costco", "Type": "O", "WikidataId": "Q715583", "Confidence": 1.0, "OccurrenceOffsets": [45], "SurfaceForms": ["Costco"]}, {"Label": "CVS Pharmacy", "Type": "O", "WikidataId": "Q2078880", "Confidence": 0.971, "OccurrenceOffsets": [40], "SurfaceForms": ["CVS"]}, {"Label": "Walmart", "Type": "O", "WikidataId": "Q483551", "Confidence": 1.0, "OccurrenceOffsets": [31], "SurfaceForms": ["Walmart"]}]	[{"Label": "Costco", "Type": "O", "WikidataId": "Q715583", "Confidence": 1.0, "OccurrenceOffsets": [274], "SurfaceForms": ["Costco"]}, {"Label": "Meijer", "Type": "




In [32]:
np.mean(title_entity_lens)

1.1927917839529372

In [33]:
np.max(title_entity_lens)

9

In [34]:
np.min(title_entity_lens)

0

In [35]:
np.mean(abstract_entity_lens)

1.9756095690256867

In [36]:
np.max(abstract_entity_lens)

30

In [37]:
np.min(abstract_entity_lens)

0

In [118]:
files = [
          '../input/news/news.tsv',
        ]

# emb_size = 2
vocab = gezi.Vocab('../input/did.txt')
emb_height = vocab.size()
print(emb_height)

entity_vocab = gezi.Vocab('../input/entity.txt')
entity_type_vocab = gezi.Vocab('../input/entity_type.txt')

# [title_entities, abstract_entities, title_etypes, abstract_entities]
# 9 and 30
MAX_TITLE_ENTITIES = 5
MAX_ABSTRACT_ENTITIES = 5

ENTITY_LEN = MAX_TITLE_ENTITIES + MAX_ABSTRACT_ENTITIES
emb_size = ENTITY_LEN * 2

empty = [0] * emb_size
empty[0] = 1 
empty[MAX_TITLE_ENTITIES] = 1
emb = [empty.copy() for _ in range(emb_height)] 

for file_ in files:
  total = len(open(file_).readlines())
  for line in tqdm(open(file_), total=total):
    l = line.strip().split('\t')
    did, title_entities, abstract_entities = l[0], l[-2], l[-1]
    did = vocab.id(did)
    title_entities = json.loads(title_entities)
    abstract_entities = json.loads(abstract_entities)
    for i, m in enumerate(title_entities):
      if i >= MAX_TITLE_ENTITIES:
        continue
      emb[did][i] = entity_vocab.id(m['WikidataId'])
      emb[did][ENTITY_LEN + i] = entity_type_vocab.id(m['Type'])
    for i, m in enumerate(abstract_entities):
      if i >= MAX_ABSTRACT_ENTITIES:
        continue
      emb[did][MAX_TITLE_ENTITIES + i] = entity_vocab.id(m['WikidataId'])
      emb[did][ENTITY_LEN + MAX_TITLE_ENTITIES + i] = entity_type_vocab.id(m['Type'])     
emb = np.asarray(emb)
print(emb, emb.shape)

130381


100%|██████████| 130379/130379 [00:03<00:00, 35555.55it/s]

[[   1    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]
 ...
 [   1    0    0 ...    0    0    0]
 [   1    0    0 ...    5    0    0]
 [1840 9673    0 ...    0    0    0]] (130381, 20)





In [84]:
np.save('../input/entity_lookup.npy', emb)

In [55]:
np.mean(emb > 1) 

0.30629923071613196

In [54]:
emb[100]

array([424,   0,   0,   0,   0, 254, 713, 120, 260, 532,   2,   0,   0,
         0,   0,   2,   2,   2,   2,   2])

# merge emb

In [119]:
keys = ['cat', 'entity', 'title', 'abstract']
feats_ = [ ]
flens_ = []

for key in keys:
  feats_ += fnames[key]
  flens_ += flens[key]
  
print(feats_)
print(flens_)
print(sum(flens_))

embs = []
for key in keys:
  emb = np.load(f'../input/{key}_lookup.npy')
  print(emb.shape)
  embs += [emb]

['cat', 'sub_cat', 'title_entities', 'abstract_entities', 'title_entity_types', 'abstract_entity_types', 'title', 'abstract']
[1, 1, 5, 5, 5, 5, 30, 50]
102
(130381, 2)
(130381, 20)
(130381, 30)
(130381, 50)


In [120]:
emb = np.concatenate(embs, 1)
emb

array([[    1,     1,     1, ...,     0,     0,     0],
       [    1,     1,     1, ...,     0,     0,     0],
       [    3,    40,     1, ...,     0,     0,     0],
       ...,
       [   10,   104,     1, ...,     0,     0,     0],
       [    2,     3,     1, ...,  1795, 27019,   117],
       [    2,    19,  1840, ...,  1509,  3299,  1406]])

In [121]:
emb.shape

(130381, 102)

In [122]:
np.save('../input/doc_lookup.npy', emb)
np.save('../input/doc_fnames.npy', np.asarray(feats_))
np.save('../input/doc_flens.npy', np.asarray(flens_))

In [123]:
emb.dtype

dtype('int64')

# entity emb from wiki

In [125]:
from sklearn.preprocessing import normalize
from tqdm import tqdm

vocab_file = '../input/entity.txt'
vocab = gezi.Vocab(vocab_file)
emb_height = vocab.size()

emb_size = len(open('../input/train/entity_embedding.vec').readline().strip().split()) - 1
print(emb_size)

emb = np.random.uniform(-0.05, 0.05,(emb_height, emb_size))
print(emb)

emb = list(emb)

files = [
  '../input/train/entity_embedding.vec', 
  '../input/dev/entity_embedding.vec', 
  '../input/test/entity_embedding.vec'
]

entities = set()
for file_ in files:
  for line in tqdm(open(file_), total=emb_height):
    l = line.strip().split()
    entity, vals = l[0], l[1:]
    if entity in entities:
      continue
    entities.add(entity)
    vals = np.asarray(list(map(float, vals)))
    #vals = normalize(np.reshape(vals, (1,-1)))
    #vals /= np.sqrt(emb_size)
    vals = np.reshape(vals, (-1,))
    emb[vocab.id(entity)] = vals

emb = np.asarray(emb)
print(emb)

#emb = normalize(emb)

np.save('../input/entity_emb.npy', emb)

  0%|          | 0/50927 [00:00<?, ?it/s]

100
[[ 0.04096048 -0.03556133 -0.04408021 ... -0.0022821  -0.02009994
   0.02849957]
 [-0.04931467 -0.02747755 -0.01376152 ... -0.04337468 -0.04510694
  -0.03300745]
 [-0.00224096 -0.0357698  -0.00430866 ... -0.03357329 -0.03051395
   0.04724857]
 ...
 [-0.04139754  0.00033464 -0.01594218 ... -0.0262714   0.04772429
   0.01051264]
 [ 0.00095603  0.01616765 -0.03987193 ...  0.03548088  0.0228728
   0.00683941]
 [-0.03737675 -0.02940915  0.03667745 ...  0.00616488  0.00748581
  -0.04651056]]


 82%|████████▏ | 42007/50927 [00:02<00:00, 18099.00it/s]
 65%|██████▌   | 33310/50927 [00:00<00:00, 75491.09it/s]
 92%|█████████▏| 46807/50927 [00:00<00:00, 56981.38it/s]


[[ 0.04096048 -0.03556133 -0.04408021 ... -0.0022821  -0.02009994
   0.02849957]
 [-0.04931467 -0.02747755 -0.01376152 ... -0.04337468 -0.04510694
  -0.03300745]
 [ 0.001677   -0.069251    0.00074    ... -0.05406     0.003242
  -0.006887  ]
 ...
 [ 0.02741     0.019489    0.005972   ... -0.041304    0.007908
   0.009783  ]
 [ 0.037157   -0.016108   -0.01267    ... -0.004157   -0.008539
   0.050533  ]
 [ 0.040347    0.054959    0.084386   ... -0.066457   -0.03859
   0.001724  ]]
[[ 0.14022623 -0.12174249 -0.15090647 ... -0.00781266 -0.06881118
   0.09756688]
 [-0.17931151 -0.09991027 -0.05003782 ... -0.15771332 -0.16401193
  -0.12001736]
 [ 0.00335602 -0.13858546  0.00148089 ... -0.10818515  0.00648791
  -0.0137823 ]
 ...
 [ 0.09532903  0.06778065  0.02076997 ... -0.14365087  0.02750317
   0.03402422]
 [ 0.09275387 -0.0402099  -0.03162773 ... -0.01037699 -0.02131564
   0.12614397]
 [ 0.12996376  0.17703121  0.27182    ... -0.214068   -0.1243042
   0.00555326]]


In [126]:
from sklearn.preprocessing import normalize
from tqdm import tqdm

vocab_file = '../input/entity.txt'
vocab = gezi.Vocab(vocab_file)
emb_height = vocab.size()

emb_size = len(open('../input/train/entity_embedding.vec').readline().strip().split()) - 1
print(emb_size)

emb = np.random.uniform(-0.05, 0.05,(emb_height, emb_size))
print(emb)

emb = list(emb)

files = [
  '../input/train/entity_embedding.vec', 
  '../input/dev/entity_embedding.vec', 
  '../input/test/entity_embedding.vec'
]

entities = set()
for file_ in files:
  for line in tqdm(open(file_), total=emb_height):
    l = line.strip().split()
    entity, vals = l[0], l[1:]
    if entity in entities:
      continue
    entities.add(entity)
    vals = np.asarray(list(map(float, vals)))
    vals = normalize(np.reshape(vals, (1,-1)))
    #vals /= np.sqrt(emb_size)
    vals = np.reshape(vals, (-1,))
    emb[vocab.id(entity)] = vals

emb = np.asarray(emb)
print(emb)

#emb = normalize(emb)

np.save('../input/entity_emb2.npy', emb)

  0%|          | 0/50927 [00:00<?, ?it/s]

100
[[-0.00154117  0.03035518 -0.00589259 ... -0.04094373  0.04299281
  -0.02494033]
 [ 0.01657999 -0.04411747  0.04258725 ... -0.0143552  -0.04014027
  -0.02701598]
 [-0.01651694  0.01509018 -0.04305561 ...  0.00110249  0.02015254
   0.0299422 ]
 ...
 [ 0.00573689 -0.02462167  0.00071024 ...  0.03420342  0.03679753
   0.01625067]
 [-0.02460548 -0.04921551 -0.02206481 ...  0.04289599 -0.01137452
  -0.02312832]
 [ 0.01571464 -0.01823292  0.04533747 ...  0.00403212  0.03232957
   0.03336391]]


 82%|████████▏ | 42007/50927 [00:05<00:01, 7064.90it/s]
 65%|██████▌   | 33310/50927 [00:00<00:00, 62116.54it/s]
 92%|█████████▏| 46807/50927 [00:01<00:00, 28778.02it/s]


[[-0.00154117  0.03035518 -0.00589259 ... -0.04094373  0.04299281
  -0.02494033]
 [ 0.01657999 -0.04411747  0.04258725 ... -0.0143552  -0.04014027
  -0.02701598]
 [ 0.00335602 -0.13858546  0.00148089 ... -0.10818515  0.00648791
  -0.0137823 ]
 ...
 [ 0.09532903  0.06778065  0.02076997 ... -0.14365087  0.02750317
   0.03402422]
 [ 0.09275387 -0.0402099  -0.03162773 ... -0.01037699 -0.02131564
   0.12614397]
 [ 0.12996376  0.17703121  0.27182    ... -0.214068   -0.1243042
   0.00555326]]
