# Word category formation using BERT predictions

## The idea is to generate word categories as follows:
- Get a list of sentences
- Mask a word in each sentence (repeat a sentence in the list if you want to mask different positions)
- For each sentence, obtain the logit vector for the masked word from BERT's prediction (the output of its masked-language-model head, one score per vocabulary token)
- Cluster the sentences' logit vectors. The clusters should reflect words that fit together both syntactically and semantically.
- Build each word category by finding the highest-valued words in the vectors belonging to a cluster (perhaps by most common top words, all words above some threshold, etc.)

In [1]:
import numpy as np
import torch
import re

from transformers import BertTokenizer, BertForMaskedLM
from sklearn.cluster import KMeans, OPTICS, DBSCAN, cluster_optics_dbscan

# Module-level state shared by the functions below.
tokenizer = None  # set by initialize_model()
model = None  # set by initialize_model()
MASK = '[MASK]'  # mask token that preproc_sents() substitutes for each blank
sentences = None  # list of masked sentences, set by preproc_sents()

In [2]:
def initialize_model(pretrained='bert-base-uncased'):
    """
    Load the tokenizer and masked-LM model for the given checkpoint name.

    Both objects are stored in the module-level `tokenizer` and `model`
    globals, and the model is switched to evaluation mode.
    """
    global tokenizer, model
    with torch.no_grad():
        tokenizer = BertTokenizer.from_pretrained(pretrained)
        # eval() returns the module itself, so loading and mode switch chain.
        model = BertForMaskedLM.from_pretrained(pretrained).eval()

### Process sentences with BERT

In [3]:
def preproc_sents(text_sentences):
    """
    Turn newline-separated text into a padded matrix of token ids.

    Every run of underscores delimited by word boundaries is replaced by the
    mask token. Blank lines are dropped. The resulting masked sentences are
    stored in the module-level `sentences` list, which obtain_predictions()
    reads when printing.

    Args:
        text_sentences: one sentence per line, with the word to mask written
            as one or more underscores.

    Returns:
        A 2-D numpy array with one row of token ids per sentence, right-padded
        with 0 to the length of the longest sentence.

    Raises:
        ValueError: if the text contains no non-blank line.
    """
    global sentences
    masked_text = re.sub(r'\b_+\b', MASK, text_sentences)

    # Skip blank lines (e.g. the trailing newline of a triple-quoted string):
    # they contain no mask token and would make obtain_predictions() fail.
    sentences = [s for s in masked_text.split('\n') if s.strip()]
    if not sentences:
        raise ValueError("text_sentences contains no non-blank sentence")

    # Tokenize each sentence, adding the model's special tokens
    input_ids = [tokenizer.encode(s, add_special_tokens=True) for s in sentences]

    # Pad with 0, the id that obtain_predictions() treats as padding when it
    # builds the attention mask.
    max_len = max(len(ids) for ids in input_ids)
    padded_input = np.array([ids + [0] * (max_len - len(ids)) for ids in input_ids])

    return padded_input

In [4]:
def get_top_predictions(probs, top=5, thres=0.02):
    """
    Print and return the top-k predicted tokens for a probability vector.

    Args:
        probs: 1-D torch tensor of probabilities indexed by token id.
        top: number of highest-probability tokens to report. Values outside
            the range [0, len(probs)] are clamped to it.
        thres: tokens whose probability is strictly greater than this value
            are returned as the "high" tokens.

    Returns:
        A tuple (top_tokens, high_tokens): the `top` tokens ordered by
        decreasing probability, and every token above `thres` in token-id
        order.
    """
    # .cpu() is a no-op for CPU tensors and keeps this working for tensors
    # that live on another device.
    probs = probs.detach().cpu().numpy()

    # Clamp `top`: with top=0 the slice [-0:] would select the whole
    # vocabulary, and top > len(probs) would make argpartition raise.
    top = max(0, min(top, probs.size))
    if top:
        # argpartition finds the k largest entries without a full sort; only
        # those k entries are then ordered by decreasing probability.
        top_indexes = np.argpartition(probs, -top)[-top:]
        sorted_indexes = top_indexes[np.argsort(-probs[top_indexes])]
    else:
        sorted_indexes = np.array([], dtype=np.intp)
    top_tokens = tokenizer.convert_ids_to_tokens(sorted_indexes)
    print(f"Ordered top predicted tokens: {top_tokens}")
    print(f"Ordered top predicted values: {probs[sorted_indexes]}\n")

    # Get all tokens above threshold
    high_indexes = np.where(probs > thres)
    high_tokens = tokenizer.convert_ids_to_tokens(high_indexes[0])
    return top_tokens, high_tokens

In [5]:
def obtain_predictions(padded_input, top=5, thres=0.02):
    """
    Run the masked-LM model on a padded batch and collect its predictions.

    For every sentence the vocabulary logits at the first mask position are
    extracted from the model output. They are returned raw (for clustering)
    and are also turned into probabilities with a softmax, from which the
    top predictions are printed and the tokens above `thres` are collected.
    Each sentence is printed from the module-level `sentences` list.

    Args:
        padded_input: 2-D integer array of token ids padded with 0, as
            returned by preproc_sents().
        top: number of top predictions printed per sentence.
        thres: probability threshold for the returned high tokens.

    Returns:
        A tuple (embeddings, all_high_tokens): a list with one numpy logit
        vector per sentence, and a list with one list of tokens per sentence.

    Raises:
        IndexError: if a sentence contains no mask token.
    """
    # Locate the first mask token of each sentence before running the model,
    # so a sentence without a mask fails fast with a readable message.
    id_MASK = tokenizer.convert_tokens_to_ids(MASK)
    mask_positions = []
    for row_idx, row in enumerate(padded_input):
        hits = np.where(row == id_MASK)[0]
        if hits.size == 0:
            raise IndexError(f"Sentence {row_idx} contains no {MASK} token")
        mask_positions.append(hits[0])

    # Attention mask that makes the model ignore padding (token id 0)
    attention_mask = torch.tensor(np.where(padded_input != 0, 1, 0))
    input_ids = torch.tensor(padded_input)

    # The first element of the model output holds the prediction scores
    # (logits over the vocabulary) for every position of every sentence.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)[0]

    # Logit vector at the masked position of each sentence
    mask_logits = [sent[pos] for sent, pos in zip(logits, mask_positions)]
    embeddings = [vec.numpy() for vec in mask_logits]

    # Convert logits to probabilities and find the predicted tokens
    all_high_tokens = []
    for row_idx, vec in enumerate(mask_logits):
        print("Sentence:")
        print(sentences[row_idx])
        probs = torch.softmax(vec, dim=0)
        _, high_tokens = get_top_predictions(probs, top=top, thres=thres)
        all_high_tokens.append(high_tokens)

    return embeddings, all_high_tokens

### Convert last layer logit predictions to probabilities
We can see which tokens are predicted most strongly for the blank in each sentence, together with their probabilities.

In [6]:
def cluster_embeddings(embeddings, k, random_state=None):
    """
    Cluster the embedding vectors into `k` groups with KMeans.

    Args:
        embeddings: sequence of equal-length numeric vectors, one per sentence.
        k: number of clusters.
        random_state: optional seed forwarded to KMeans to make the
            clustering reproducible; None keeps the result random.

    Returns:
        Array with the cluster label of each embedding, in input order.
    """
    # `n_jobs` is intentionally not passed: KMeans deprecated it in
    # scikit-learn 0.23 and removed it in 1.0, where it raises a TypeError.
    # `n_init=10` pins the historical default, which newer releases changed.
    estimator = KMeans(
        init="k-means++",
        n_clusters=k,
        n_init=10,
        random_state=random_state,
    )
    estimator.fit(embeddings)
    return estimator.labels_

### Form word categories
To form a word category, take every word whose probability exceeds the threshold in any of the vectors that belong to a cluster.

In [7]:
def form_categories(labels, all_high_tokens, k):
    """
    Build, print and return one word category per cluster.

    A category is the set of all high-probability tokens of the sentences
    assigned to that cluster.

    Args:
        labels: cluster label of each sentence (array-like of ints).
        all_high_tokens: list with one list of tokens per sentence, aligned
            with `labels`.
        k: number of clusters; categories are built for labels 0..k-1.

    Returns:
        Dict mapping each cluster label to its set of tokens. A cluster with
        no members maps to an empty set.
    """
    # Accept plain lists too: comparing a list with an int is not elementwise.
    labels = np.asarray(labels)
    word_categories = {}
    for cl in range(k):
        cluster_members = np.flatnonzero(labels == cl)
        # Union the token lists directly instead of concatenating them with
        # sum(..., []), which copies the growing list at every step.
        word_categories[cl] = set().union(
            *(all_high_tokens[i] for i in cluster_members)
        )
        print(f"Category {cl}:")
        # Sorted so the printed order is stable between runs.
        print(", ".join(sorted(word_categories[cl])) + "\n")
    return word_categories

## Main entry point

In [11]:
def get_word_categories(text_sentences, k, top=5, thres=0.02):
    """
    Run the whole pipeline: mask, predict, cluster and print word categories.

    Args:
        text_sentences: newline-separated sentences, with the word to mask
            written as underscores.
        k: number of clusters, i.e. word categories, to form.
        top: number of top predictions printed for each sentence.
        thres: probability above which a predicted token is added to the
            category of its sentence's cluster.

    The results are printed; nothing is returned.
    """
    padded_input = preproc_sents(text_sentences)
    embeddings, all_high_tokens = obtain_predictions(padded_input, top=top, thres=thres)
    labels = cluster_embeddings(embeddings, k)
    form_categories(labels, all_high_tokens, k)

### Choose some simple sentences with masked adjectives and nouns

In [14]:
# Load the tokenizer and masked-LM model into the module-level globals.
initialize_model(pretrained='bert-base-uncased')

In [15]:
# One sentence per line; the underscore marks the word that preproc_sents()
# replaces with the mask token.
text_sentences = """The _ cat ate the mouse.
She was wearing a lovely _ dress last night.
He was receiving quite a _ salary.
He also bought a _ sofa for his new apartment.
I was born and grew up in _.
The _ metropolitan area added more than a million people in the past decade.
Bike races are held around the _ and farmlands.
My racist _ called me last night.
A device is considered to be available if it is not being used by another _."""

In [16]:
# Number of clusters, and therefore of word categories, to form.
k = 6
get_word_categories(text_sentences, k)

Sentence:
The [MASK] cat ate the mouse.
Ordered top predicted tokens: ['black', 'cheshire', 'big', 'little', 'fat']
Ordered top predicted values: [0.13267049 0.08640933 0.06516975 0.03538685 0.03100599]

Sentence:
She was wearing a lovely [MASK] dress last night.
Ordered top predicted tokens: ['white', 'black', 'red', 'pink', 'blue']
Ordered top predicted values: [0.20945124 0.16496556 0.13129269 0.08869011 0.05542691]

Sentence:
He was receiving quite a [MASK] salary.
Ordered top predicted tokens: ['good', 'handsome', 'high', 'generous', 'decent']
Ordered top predicted values: [0.18829058 0.09613485 0.09576207 0.0917473  0.0544567 ]

Sentence:
He also bought a [MASK] sofa for his new apartment.
Ordered top predicted tokens: ['new', 'comfortable', 'luxurious', 'large', 'luxury']
Ordered top predicted values: [0.6247967  0.05839209 0.02877485 0.0248212  0.01501671]

Sentence:
I was born and grew up in [MASK].
Ordered top predicted tokens: ['chicago', 'california', 'texas', 'london', 'en