In [29]:
from sentence_transformers import SentenceTransformer, util
import umap
import plotly.graph_objects as go
import numpy as np
from gensim.downloader import load as gensim_load
from sklearn.metrics.pairwise import cosine_similarity, paired_manhattan_distances
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# 1. Let's use embeddings to map words in a 3-dimensional space

In [None]:
# Load a standard, general-purpose sentence-embedding model.
# No device is hard-coded: when `device` is omitted, sentence-transformers
# selects one itself (an available accelerator, otherwise the CPU), so this
# cell also runs on machines that have no Apple MPS backend.
model = SentenceTransformer("all-mpnet-base-v2")

In [None]:
# Every word is paired with its marker colour so the two cannot drift apart.
# As assigned here, red marks regime/state terms and blue marks office-holders.
color_by_word = {
    "monarchy": "red",
    "republic": "red",
    "kingdom": "red",
    "monarch": "blue",
    "president": "blue",
    "prime minister": "blue",
    "senator": "blue",
    "theocracy": "red",
    "democracy": "red",
    "prince": "blue",
}
words = list(color_by_word)
colors = list(color_by_word.values())

# Embed the words, then project the embeddings onto three UMAP components.
embeddings = model.encode(words)
reducer = umap.UMAP(n_components=3, random_state=42)
embeddings_3d = reducer.fit_transform(embeddings)

In [None]:
# 3-D scatter plot of the UMAP-reduced embeddings, one labelled marker per word.
MARKER_SIZE = 10    # fixed marker size, independent of how many words are plotted
AXIS_PADDING = 0.5  # margin added beyond the min/max coordinate on every axis

fig = go.Figure()

fig.add_trace(go.Scatter3d(
    x=embeddings_3d[:, 0],
    y=embeddings_3d[:, 1],
    z=embeddings_3d[:, 2],
    mode='markers+text',
    text=words,
    textposition='top center',
    marker=dict(size=MARKER_SIZE, color=colors),
))

# Build the x/y/z axis settings in one loop rather than three copy-pasted lines.
scene = {}
for axis_index, axis_letter in enumerate("xyz"):
    coords = embeddings_3d[:, axis_index]
    scene[f"{axis_letter}axis"] = dict(
        range=[np.min(coords) - AXIS_PADDING, np.max(coords) + AXIS_PADDING],
    )
    scene[f"{axis_letter}axis_title"] = f"UMAP {axis_index + 1} ({axis_letter.upper()})"

fig.update_layout(
    scene=scene,
    margin=dict(l=0, r=0, b=0, t=40),
    height=800
)

fig.show()

# 2. Basic embedding operations

In [2]:
# Four sentences to compare. The first three all contain the word "draft" but
# use it in a different sense; the fourth does not contain the word at all.
sentences = [
    "The minister submitted a policy draft",
    "The pub served warm draft beer",
    "The NBA draft is taking place.",
    "The bill has passed.",
]
sent_a, sent_b, sent_c, sent_d = sentences

# Short display label for each sentence, in the same order as `sentences`.
labels = ["Parliamentary draft", "Beer", "NBA", "Parliamentary draft 2"]

In [3]:
# Encode the first two sentences with two different models so their output
# sizes can be compared. The device is left for sentence-transformers to
# choose, so the cell does not fail on machines without the MPS backend.
model = SentenceTransformer("all-mpnet-base-v2")
embedding_a_768 = model.encode(sentences[0])
embedding_b_768 = model.encode(sentences[1])

# NOTE: `model` is rebound here and refers to the MiniLM model after this cell.
model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_a_384 = model.encode(sentences[0])
embedding_b_384 = model.encode(sentences[1])

In [4]:
# Show the vector sizes the two models produced for the same sentence.
embedding_a_768.shape, embedding_a_384.shape

((768,), (384,))

### How to measure similarity? Cosine similarity!

Why? It measures the direction of the vectors, not their length!

In [20]:
# Rebind `model` to all-mpnet-base-v2. The device is chosen automatically by
# sentence-transformers instead of being hard-coded to "mps", which is not
# available on every machine.
model = SentenceTransformer("all-mpnet-base-v2")

In [21]:
# Two near-identical sentences: the second only adds the word "really".
sentence_a, sentence_b = (
    "I like public policy",
    "I really like public policy",
)

In [22]:
# Encode each sentence separately, asking for tensors (convert_to_tensor=True).
embedding_a, embedding_b = (
    model.encode(text, convert_to_tensor=True)
    for text in (sentence_a, sentence_b)
)

In [23]:
# Cosine similarity of the two sentence embeddings; .item() pulls the single
# value out of the result returned by util.cos_sim so it prints as a plain number.
cosine_sim = util.cos_sim(embedding_a, embedding_b).item()
print(cosine_sim)

0.9627445936203003


In [24]:
# Euclidean distance between the same two embeddings, to set beside the cosine score.
# Both tensors are moved to the CPU before being converted to NumPy arrays.
difference = embedding_a.cpu().numpy() - embedding_b.cpu().numpy()
euclidean_dist = np.linalg.norm(difference)
print(euclidean_dist)

0.27296668


# 2. Static vs dynamic (contextual) embeddings

In [None]:
# static embedding helper
def static_embed(sentence, glove):
    """Average the static word vectors of a sentence into one sentence vector.

    The sentence is lower-cased and split on whitespace. Each token is looked
    up as written first; if it is unknown, surrounding punctuation is stripped
    and the lookup is retried, so "passed." still matches "passed" instead of
    being dropped. Tokens that remain unknown are skipped.

    Args:
        sentence: Text to embed.
        glove: Word-vector store supporting ``word in glove`` and ``glove[word]``.

    Returns:
        The element-wise mean of the matched word vectors, or an all-zero
        vector when no token matches. The zero vector has ``glove.vector_size``
        entries when that attribute exists and 100 otherwise.
    """
    import string  # local import keeps this helper self-contained

    vectors = []
    for raw_token in sentence.lower().split():
        if raw_token in glove:
            token = raw_token
        else:
            # e.g. "place." -> "place"; tokens known verbatim are never altered
            token = raw_token.strip(string.punctuation)
        if token and token in glove:
            vectors.append(glove[token])

    if vectors:
        return np.mean(vectors, axis=0)
    # Match the store's dimensionality when it advertises one.
    return np.zeros(getattr(glove, "vector_size", 100))

# cosine similarity between two vectors, rounded for display
def sim(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2`` rounded to 4 decimal places."""
    pairwise = cosine_similarity([v1], [v2])  # each vector passed as a one-row batch
    score = pairwise[0][0]
    return round(score, 4)

In [None]:
# let's use a function to help us do everything in one go

def compare_embeddings(vectors, labels=labels):
    """Print the similarity of the first vector to every vector, itself included.

    One line is printed per vector, in the form
    ``<similarity> : <first label> vs <label>``.

    Args:
        vectors: Indexable sequence of embedding vectors; ``vectors[0]`` is the
            reference every vector is compared with.
        labels: Display names aligned with ``vectors`` by position. Defaults to
            the module-level ``labels`` as it was when this function was defined.

    Returns:
        None.
    """
    reference_vector, reference_label = vectors[0], labels[0]
    for position in range(len(vectors)):
        print(sim(reference_vector, vectors[position]), ":", reference_label, "vs", labels[position])
    return None

In [None]:
# Load the pretrained GloVe word vectors (this may take a few minutes).
# The resulting `glove` object is what static_embed() queries with
# `word in glove` and `glove[word]`.
glove = gensim_load("glove-wiki-gigaword-100")

In [None]:
print("STATIC EMBEDDINGS (GloVe)")
# One averaged GloVe vector per sentence.
vectors = [static_embed(s, glove) for s in sentences]
# compare_embeddings only prints and returns None, so nothing is assigned;
# binding the result to `_` would overwrite IPython's last-output variable.
compare_embeddings(vectors)

In [None]:
print("CONTEXTUAL EMBEDDINGS")
# all-MiniLM-L6-v2; the device is chosen by sentence-transformers rather than
# hard-coded to "mps", so the cell also runs where MPS is unavailable.
model = SentenceTransformer("all-MiniLM-L6-v2")
vectors = model.encode(sentences)
# compare_embeddings only prints and returns None, so nothing is assigned;
# binding the result to `_` would overwrite IPython's last-output variable.
compare_embeddings(vectors)

In [None]:
print("CONTEXTUAL EMBEDDINGS")
# all-mpnet-base-v2; the device is chosen by sentence-transformers rather than
# hard-coded to "mps", so the cell also runs where MPS is unavailable.
model = SentenceTransformer("all-mpnet-base-v2")
vectors = model.encode(sentences)
# compare_embeddings only prints and returns None, so nothing is assigned;
# binding the result to `_` would overwrite IPython's last-output variable.
compare_embeddings(vectors)

Note! Each embedding model produces vectors with a different number of dimensions (768 vs. 384 for the two models used here)!

More dimensions? More information can be captured by the vector! However, adding more dimensions makes comparison more difficult!

# 3. Dimensionality reduction

Go back to slides

Note! What happens when we add more information to the sentences?