In [None]:
!pip -q install transformers

## Working with BERT Embeddings

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm



### Loading BERT embeddings

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
model_ckpt = "distilbert-base-uncased"
# Load the tokenizer once (the original cell loaded it a second time for no effect).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# output_hidden_states=True asks the model to return the hidden states of every
# layer, which the cells below rely on.
model = AutoModel.from_pretrained(model_ckpt, output_hidden_states=True).to('cpu')
# Put the model in evaluation mode for inference.
model.eval()

In [None]:
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Nile river bank."
# Tokenize the text; return_tensors="pt" yields batched PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)

# Get the token ids through the mapping interface. Integer indexing of the
# tokenizer output (`inputs[0].ids`) is only available with fast tokenizers.

#your output: [101,2044,11065,2769,...,1012,102]
tokens_ids = inputs["input_ids"][0].tolist()

# One 1 per token. The next cell passes the tensor built from this list as the
# model's second argument.
segments_ids = [1] * len(tokens_ids)

# Token strings matching `tokens_ids`, used later to locate individual words.
all_tokens = tokenizer.convert_ids_to_tokens(tokens_ids)

## Converting to Pytorch Tensors
# Wrap each list in an outer list to add the batch dimension (batch size 1).
tokens_tensor = torch.tensor([tokens_ids])
segments_tensors = torch.tensor([segments_ids])

print('tokens tensor shape: ',tokens_tensor.shape)
print('Segments tensor shape: ',segments_tensors.shape)

## Output:
#tokens tensor shape:  torch.Size([1, 22])
#Segments tensor shape:  torch.Size([1, 22])

## Extract the Embeddings

In [None]:
# Run a forward pass without gradient tracking and collect the hidden states
# of all layers.
with torch.no_grad():
  # Pass the tensors by keyword: the all-ones tensor is the attention mask,
  # which is what the second positional argument of the model receives.
  outputs = model(input_ids=tokens_tensor, attention_mask=segments_tensors)
  # Read the field by name; a positional index such as outputs[1] depends on
  # which optional outputs the particular model class returns.
  hidden_states = outputs.hidden_states
hidden_states[0].shape

In [None]:
## Inspect the shape of a single hidden state (the output of one layer).
## expected output: torch.Size([1, 22, 768])

In [None]:
# Report the size of each nesting level of `hidden_states`:
# layer -> batch -> token -> hidden unit. Index 0 is used at every level.
layer_i = batch_i = token_i = 0
n_batches, n_tokens, n_hidden = hidden_states[layer_i].shape

print("Number of layers:", len(hidden_states), "  (initial embeddings + 6 distilbert  layers)")
print("Number of batches:", n_batches)
print("Number of tokens:", n_tokens)
print("Number of hidden units:", n_hidden)

In [None]:
# Select the feature vector of the token at index 4 from the layer at index 4.
# Both indices are 0-based; `batch_i` comes from the previous cell.
layer_i, token_i = 4, 4
vec = hidden_states[layer_i][batch_i, token_i]

In [None]:
# Display the length of the selected token vector.
vec.shape

In [None]:
# Stack the per-layer tensors along a new leading dimension, so that the
# layers become dimension 0 of a single tensor.
token_embeddings = torch.stack(hidden_states)

token_embeddings.shape

In [None]:
# Drop dimension 1, the "batches" dimension.
token_embeddings = token_embeddings.squeeze(dim=1)
token_embeddings.shape

In [None]:
# Swap dimensions 0 and 1 so that tokens come first and layers second.
# The tensor is rebuilt from `hidden_states` instead of permuting the current
# value, so re-running this cell cannot swap the dimensions back again.
token_embeddings = torch.stack(hidden_states, dim=0).squeeze(dim=1).permute(1, 0, 2)
token_embeddings.size()
#output: torch.Size([22, 7, 768])

## Extracting Word Embeddings 
### 1 - Word Representation (concat) 


In [None]:
## Build one vector per token by concatenating its representations from the
## last 4 layers (last layer first).
token_vecs_cat = [
    torch.cat([layers[-depth] for depth in (1, 2, 3, 4)], dim=0)
    for layers in token_embeddings
]
print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

#output: Shape is: 22 x 3072

### 2 - Word Representation (sum)

In [None]:
## Build one vector per token by summing its representations from the
## last 4 layers.
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]
print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

#output: Shape is: 22 x 768

### 3 - Sentence Representation (mean)

In [None]:
# `hidden_states` has shape [7 x 1 x 22 x 768]

# `token_vecs` is a tensor with shape [22 x 768].
# Use the second-to-last layer: index 0 holds the initial embeddings, which
# have not passed through any transformer layer, so averaging them would give
# a sentence vector that ignores context.
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs,dim=0)

In [None]:
# Show the dimensionality of the averaged sentence vector.
print("Sentence Embedding Shape:", sentence_embedding.shape)

### Evaluating Context-Dependent Representations

In [None]:
# List every token next to its position so individual words can be looked up by index.
for position, token_text in enumerate(all_tokens):
    print(position, '==>', token_text)

In [None]:
from scipy.spatial.distance import cosine

def get_similarity(vec_1,vec_2):
  """Return the cosine similarity of two vectors.

  scipy's `cosine` returns the cosine *distance*, so the similarity is
  one minus that value.
  """
  distance = cosine(vec_1, vec_2)
  return 1 - distance


In [None]:
## Use get_similarity to compare the "bank" at index 10 with the "bank" at
## index 6, and the "bank" at index 10 with the "bank" at index 19.
## Does the model capture the difference in meaning?

get_similarity(token_vecs_sum[10],token_vecs_sum[6])

In [None]:
# Summed representation: "bank" at index 10 vs. "bank" at index 19.
get_similarity(token_vecs_sum[10],token_vecs_sum[19])

In [None]:
# Concatenated representation: "bank" at index 10 vs. "bank" at index 6.
get_similarity(token_vecs_cat[10],token_vecs_cat[6])

In [None]:
# Concatenated representation: "bank" at index 10 vs. "bank" at index 19.
get_similarity(token_vecs_cat[10],token_vecs_cat[19])

In [None]:
# If sentence-transformers is not installed yet, run: %pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load a pretrained sentence-embedding model by name.
# NOTE(review): this rebinds `model`, which earlier held the DistilBERT model;
# re-running the earlier cells after this one would call the wrong object.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [None]:
# Example sentences to encode.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

In [None]:
# Sentences are encoded by calling model.encode().
# The last line displays the shape of the result
# (expected to be one row per sentence — confirm from the displayed shape).
embeddings = model.encode(sentences)
embeddings.shape

In [None]:
# #Print the embeddings
# for sentence, embedding in zip(sentences, embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
import numpy as np

In [None]:
# Load a second SentenceTransformer checkpoint; `embedder` is used below to
# encode the clustering corpus.
embedder = SentenceTransformer('sentence-transformers/stsb-bert-base')

In [None]:
# Small corpus of short sentences used for the clustering demo.
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'A man is eating pasta.',
    'The girl is carrying a baby.',
    'The baby is carried by the woman',
    'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.',
]

In [None]:
# Encode every sentence in `corpus`, then display the shape of the result.
corpus_embeddings = embedder.encode(corpus)
corpus_embeddings.shape

In [None]:
# Scale every embedding (row) to unit length by dividing it by its own norm.
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / row_norms

In [None]:
# Perform agglomerative (hierarchical) clustering, not k-means.
# n_clusters=None leaves the number of clusters open; distance_threshold is
# supplied instead.
# Alternative settings left in the original cell:
#   affinity='cosine', linkage='average', distance_threshold=0.4
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [None]:
# Display the cluster label assigned to each corpus sentence, in corpus order.
cluster_assignment

In [None]:
# Group the corpus sentences by the cluster label each one received.
clustered_sentences = {}
for sentence, cluster_id in zip(corpus, cluster_assignment):
    clustered_sentences.setdefault(cluster_id, []).append(sentence)

# Print each cluster with a 1-based number, followed by a blank line.
for label, members in clustered_sentences.items():
    print("Cluster ", label + 1)
    print(members)
    print("")