In [14]:
from sentence_transformers import SentenceTransformer
import umap
import plotly.graph_objects as go
import numpy as np
from gensim.downloader import load as gensim_load
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# 1. Let's use embeddings to map in a 3 dimensional space 

In [2]:
# let's use a standard model.
# load it and 'load' it into mps
model = SentenceTransformer("all-mpnet-base-v2", device="mps")

In [3]:
words = ["monarchy", "republic", "kingdom", "monarch", "president", "prime minister", "senator", "theocracy", "democracy", "prince"]
colors = ["red", "red", "red", "blue", "blue", "blue", "blue", "red", "red", "blue"]
embeddings = model.encode(words)
reducer = umap.UMAP(n_components=3, random_state=42)
embeddings_3d = reducer.fit_transform(embeddings)

In [4]:
fig = go.Figure()

fig.add_trace(go.Scatter3d(
    x=embeddings_3d[:, 0],
    y=embeddings_3d[:, 1],
    z=embeddings_3d[:, 2],
    mode='markers+text',
    text=words,
    textposition='top center',
    marker=dict(size=len(words), color=colors),
))

fig.update_layout(
    scene=dict(
        xaxis=dict(range=[np.min(embeddings_3d[:, 0],)-0.5, np.max(embeddings_3d[:, 0],)+0.5]),  
        yaxis=dict(range=[np.min(embeddings_3d[:, 1],)-0.5, np.max(embeddings_3d[:, 1],)+0.5]),  
        zaxis=dict(range=[np.min(embeddings_3d[:, 2],)-0.5, np.max(embeddings_3d[:, 2],)+0.5]),
        xaxis_title='UMAP 1 (X)',
        yaxis_title='UMAP 2 (Y)',
        zaxis_title='UMAP 3 (Z)'
    ),
    margin=dict(l=0, r=0, b=0, t=40),
    height=800
)

fig.show()

# 2. Static vs dynamics embeddings

In [5]:
# load model (it may take a few minutes)
glove = gensim_load("glove-wiki-gigaword-100")

In [6]:
# let's check this sentences

# three sentences with similar meaning
sent_a = "The minister submitted a policy draft"
sent_b = "The pub served warm draft beer"
sent_c = "The NBA draft is taking place."

# compare to this
sent_d = "The bill has passed."

In [7]:
# static embedding helper
def static_embed(sentence):
    words = sentence.lower().split()
    vectors = [glove[w] for w in words if w in glove]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

# similarity function
def sim(v1, v2):
    return round(cosine_similarity([v1], [v2])[0][0], 4)

In [8]:
sentences = [sent_a, sent_b, sent_c, sent_d]
labels = ["Parliamentary draft", "Beer", "NBA", "Parliamentary draft 2"]

In [9]:
# let's use a function to help us do everything in one go

def compare_embeddings(vectors, labels=labels):
    vector = vectors[0]
    label = labels[0]
    for i, _ in enumerate(vectors):
        v = vectors[i]
        l = labels[i]        
        print(sim(vector, v), ":", label, "vs", l, )
    return

In [10]:
print("STATIC EMBEDDINGS (GloVe)")
vectors = [static_embed(s) for s in sentences]
_ = compare_embeddings(vectors)

STATIC EMBEDDINGS (GloVe)
1.0 : Parliamentary draft vs Parliamentary draft
0.7233 : Parliamentary draft vs Beer
0.8605 : Parliamentary draft vs NBA
0.8665 : Parliamentary draft vs Parliamentary draft 2


In [11]:
print("CONTEXTUAL EMBEDDINGS")
model = SentenceTransformer("all-MiniLM-L6-v2", device="mps")
vectors = model.encode(sentences)
_ = compare_embeddings(vectors)

CONTEXTUAL EMBEDDINGS
1.0 : Parliamentary draft vs Parliamentary draft
0.2879 : Parliamentary draft vs Beer
0.2683 : Parliamentary draft vs NBA
0.3892 : Parliamentary draft vs Parliamentary draft 2


In [12]:
print("CONTEXTUAL EMBEDDINGS")
model = SentenceTransformer("all-mpnet-base-v2", device="mps")
vectors = model.encode(sentences)
_ = compare_embeddings(vectors)

CONTEXTUAL EMBEDDINGS
1.0 : Parliamentary draft vs Parliamentary draft
0.1259 : Parliamentary draft vs Beer
0.178 : Parliamentary draft vs NBA
0.314 : Parliamentary draft vs Parliamentary draft 2


# 3. Dimensionality reduction

Go back to slides

Curse of dimentionality: 