# huBERT
A Hungarian BERT model. More on huBERT info [here](https://hlt.bme.hu/en/resources/hubert)

In [42]:
import warnings
from transformers import logging as hf_logging

hf_logging.set_verbosity_error()
warnings.filterwarnings('ignore')

In [34]:
import heapq

import numpy as np
import torch
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

## BERT word embeddings
Getting word embeddings from BERT.

In [3]:
# source: https://discuss.huggingface.co/t/generate-raw-word-embeddings-using-transformer-models-like-bert-for-downstream-process/2958/2
def get_word_idx(sent: str, word: str):
    return sent.split(" ").index(word)


def get_hidden_states(encoded, token_ids_word, model, layers):
     """Push input IDs through model. Stack and sum `layers` (last four by default).
        Select only those subword token outputs that belong to our word of interest
        and average them."""
     with torch.no_grad():
         output = model(**encoded)

     # Get all hidden states
     states = output.hidden_states
     # Stack and sum all requested layers
     output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
     # Only select the tokens that constitute the requested word
     word_tokens_output = output[token_ids_word]

     return word_tokens_output.mean(dim=0)


def get_word_vector(sent, idx, tokenizer, model, layers):
     """Get a word vector by first tokenizing the input sentence, getting all token idxs
        that make up the word of interest, and then `get_hidden_states`."""
     encoded = tokenizer.encode_plus(sent, return_tensors="pt")
     # get all token idxs that belong to the word of interest
     token_ids_word = np.where(np.array(encoded.word_ids()) == idx)

     return get_hidden_states(encoded, token_ids_word, model, layers)


def wvs_from_sent(sent, layers=None):
     # Use last four layers by default
     layers = [-4, -3, -2, -1] if layers is None else layers
     tokenizer = AutoTokenizer.from_pretrained("SZTAKI-HLT/hubert-base-cc")
     model = AutoModel.from_pretrained("SZTAKI-HLT/hubert-base-cc", output_hidden_states=True)

     indices = [get_word_idx(sent, wd.strip()) for wd in sent.split()]
     wvs = [get_word_vector(sent, wid, tokenizer, model, layers) for wid in indices]
     return sent.split(), wvs


In [4]:
wds1, wvs1 = wvs_from_sent("Ez egy szép kis példa")
wds2, wvs2 = wvs_from_sent("Itt pedig egy másik példa")

Let's see if the word embeddings are the same.

In [5]:
print(torch.equal(wvs1[1], wvs2[2])) # egy
print(torch.equal(wvs1[-1], wvs2[-1])) # példa

False
False


Word embeddings extracted from the model are context sensitive!

## Hungarian folk tales
Let's create vectors for each word in our corpus. (This might take some time)

In [None]:
# don't run this on colab, it will go through the whole corpus
wd2vc = {}
i = 0
with open("../data/processed/lemmatized.txt", "r") as infile:
     for l in infile:
          wds, wvs = wvs_from_sent(l.strip().lower())
          for k, v in dict(zip(wds, wvs)).items():
               if k not in wd2vc:
                    if len(v.numpy()) == 768:
                         wd2vc[k] = v.numpy()
               else:
                    try:
                         mean_vec = np.mean([wd2vc[k], v.numpy()], axis=1)
                         if len(mean_vec) == 768:
                              wd2vc[k] = mean_vec
                    except Exception as e:
                         continue
          i += 1
          print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


In [25]:
import pickle
with open("../models/bert_mese.pkl", "wb") as outfile:
     pickle.dump(wd2vc, outfile)

## Distance

In [28]:
def dist(wd1, wd2):
     assert wd1 in wd2vc
     assert wd2 in wd2vc
     return cosine(wd2vc[wd1], wd2vc[wd2])

In [29]:
print(dist("a", "az"))

0.019718647003173828


## Most similar

In [32]:
def similar(wd, n):
     assert wd in wd2vc
     w_vector = wd2vc[wd]
     vocabulary = list(wd2vc.keys())
     m = np.asarray([np.asarray(e) for e in wd2vc.values()])
     sims = list(m.dot(w_vector))
     most_similar_values = heapq.nlargest(n + 10, sims)
     most_similar_indices = [sims.index(e) for e in list(most_similar_values)]
     most_similar_words = [vocabulary[e] for e in most_similar_indices]
     if wd in most_similar_words:
          most_similar_words.remove(wd)
     return most_similar_words[:n]

In [35]:
similar("a", 3)

['az', 'magyar', 'király']

## Analogy


In [38]:
def similar(positive, negative, topn=3):
   """Analogy difference"""
   assert positive[0] in wd2vc
   assert positive[1] in wd2vc
   assert negative in wd2vc

   pos1_vector = wd2vc[positive[0]]
   pos2_vector = wd2vc[positive[1]]
   neg_vector = wd2vc[negative]
   target_vector = np.subtract(neg_vector, np.add(pos1_vector, pos2_vector))
   vocabulary = list(wd2vc.keys())
   m = np.asarray([np.asarray(e) for e in wd2vc.values()])
   sims = list(m.dot(target_vector))

   most_similar_values = heapq.nlargest(topn + 10, sims)
   most_similar_indices = [sims.index(e) for e in list(most_similar_values)]
   most_similar_words = [vocabulary[e] for e in most_similar_indices]

   if positive[0] in most_similar_words:
       most_similar_words.remove(positive[0])
   if positive[1] in most_similar_words:
       most_similar_words.remove(positive[1])
   if negative in most_similar_words:
       most_similar_words.remove(negative)
   if len(most_similar_words) > 0:
       return most_similar_words[:topn]
   else:
       return []

In [39]:
similar(["a", "az"], "őzike")

['gyémánt', 'kakas', 'kis']

## Kakukktojás

In [40]:
def doesnt_match(lst):
    """odd-one-out"""
    vocabulary = list(wd2vc.keys())
    word_idxs = [vocabulary.index(wd) for wd in lst]
    m = np.asarray([np.asarray(e) for e in wd2vc.values()])

    word_vectors = np.vstack(m[i] for i in word_idxs)
    mean = np.mean(word_vectors, axis=0)
    dists = [abs(cosine(e, mean)) for e in word_vectors]
    mdist = max(dists)
    midx = dists.index(mdist)
    return lst[midx]

In [44]:
print(doesnt_match(['magyar', 'népmesék', 'arany', 'lászló']))

népmesék


## Summary
+ it is designed for a different task!
+ it is a bit complicated to handle it
+ it gives very good context sensitive embeddings