# TAHLR Week 11: Word Embeddings

Code notebook for TAHLR course at ISAW (Fall 2023) based on Albrecht et al. 2022 (Blueprints) Ch. 10: Exploring Semantic Relationships with Word Embeddings

In [None]:
#!pip install gensim
#!pip install plotly
#!pip install umap-learn

# Imports

from glob import glob
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline
from natsort import natsorted

## Blueprint: Using Similarity Queries on Pretrained Models

In [None]:
# List the pretrained models catalogued by the gensim downloader API
import gensim.downloader as api

info_df = pd.DataFrame.from_dict(api.info()['models'], orient='index')
# Preview a few metadata columns for the first five catalogue rows
info_df.head(5)[['file_size', 'base_dataset', 'parameters']]

In [None]:
# Load one specific pretrained model by name through the gensim downloader API.
# NOTE: the name `model` is rebound by the "Train w2v model" cell further down,
# so the pretrained-model queries that follow rely on this cell having run last.

model = api.load("glove-wiki-gigaword-50")

In [None]:
# Report the dimensionality of the loaded vectors

print(f"Vector size: {model.vector_size}")

In [None]:
# Look up two word vectors and show their first ten components

v_king = model['king']
v_queen = model['queen']

print(f"v_king  = {v_king[:10]}")
print(f"v_queen = {v_queen[:10]}")

In [None]:
# Similarity score between two terms

print(f"similarity: {model.similarity('king', 'queen')}")

In [None]:
# Top 3 entries most similar to 'king' (same call form as the later queries)

model.most_similar(positive=['king'], topn=3)

In [None]:
# Compare 'king' against several terms in one call
# (v_king comes from the earlier "Show vectors" cell)

v_lion = model['lion']
v_nano = model['nanotechnology']

terms = ['queen', 'lion', 'nanotechnology']
sims = model.cosine_similarities(v_king, [model[word] for word in terms])

# One line per term: left-aligned label, score rounded to three decimals
print("\n".join(f"king ~ {term:<15}: {sim:.3f}" for term, sim in zip(terms, sims)))

In [None]:
# Query with positive terms 'woman' + 'king' and negative term 'man'

model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=3,
)

In [None]:
# Query with positive terms 'paris' + 'germany' and negative term 'france'
model.most_similar(
    positive=['paris', 'germany'],
    negative=['france'],
    topn=3,
)

In [None]:
# Direct "positive" approach: combine two positive terms, no negative term

model.most_similar(
    positive=['france', 'capital'],
    topn=1,
)

In [None]:
# Direct "positive" approach, 2 (doesn't work)

model.most_similar(
    positive=['greece', 'capital'],
    topn=3,
)

## Blueprint: Training Models with Gensim

In [None]:
# Get texts, preprocess into sents

import urllib.request

URL = "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt"

# Open the response as a context manager so the connection is closed even if
# reading or decoding fails; the timeout keeps a stalled download from hanging
# the kernel indefinitely.
with urllib.request.urlopen(URL, timeout=60) as response:
    raw = response.read().decode('utf-8')

# Split into sentences, keep only those longer than 20 characters, lowercase
# them, and word-tokenize each one. `sents` ends up as a list of token lists.
sents = [
    word_tokenize(sent.lower())
    for sent in sent_tokenize(raw)
    if len(sent) > 20
]

In [None]:
# Get number of sents (length of the tokenized `sents` list built above)

print(len(sents))

In [None]:
# Train a Word2Vec model on the tokenized sentences.
# NOTE: this rebinds `model`, replacing the pretrained vectors loaded earlier.

from gensim.models import Word2Vec

model = Word2Vec(
    sentences=sents,   # tokenized input sentences
    vector_size=100,   # size of word vectors (default 100)
    window=5,          # context window size (default 5)
    sg=1,              # use skip-gram (default 0 = CBOW)
    negative=5,        # number of negative samples (default 5)
    min_count=3,       # ignore infrequent words (default 5)
    workers=4,         # number of threads (default 3)
    epochs=5,          # number of epochs (default 5)
)

In [None]:
# # Save model

# import os

# # Create the directory if it doesn't exist
# if not os.path.exists('../data/models'):
#     os.makedirs('../data/models')

# # Save the model
# model.save('../data/models/shakespeare_w2v_100_5_full.bin')

In [None]:
# Pull the vocabulary mappings off the trained model and peek at 25 entries

index_to_key = model.wv.index_to_key
key_to_index = model.wv.key_to_index
full_vocab = [*key_to_index]  # keys of the mapping, in iteration order
print(full_vocab[:25])

In [None]:
# Top 3 entries most similar to 'romeo' in the trained model

model.wv.most_similar(
    positive=['romeo'],
    topn=3,
)

In [None]:
# Top 3 entries most similar to 'king' in the trained model

model.wv.most_similar(
    positive=['king'],
    topn=3,
)

In [None]:
# Top 3 entries most similar to 'fool' in the trained model
model.wv.most_similar(
    positive=['fool'],
    topn=3,
)

In [None]:
# Trained-model query with positive terms 'woman' + 'king', negative 'man'

model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=3,
)

## Blueprint: Applying Dimensionality Reduction

In [None]:
# Project the trained word vectors down with UMAP (fixed random_state)

from umap import umap_ as umap

vocab = model.wv.index_to_key
df = pd.DataFrame(model.wv.vectors, index=vocab)

embedding = umap.UMAP(random_state=42).fit_transform(df.to_numpy())
# Both names are read by later cells, so keep them pointing at the same array
reduced_wv = embedding

In [None]:
# Plot reduced vectors

import matplotlib.pyplot as plt

plt.scatter(embedding[:, 0], embedding[:, 1])
# The title states what is plotted (one point per row of `embedding`), with
# the count taken from the data rather than hard-coded.
plt.title(f'UMAP projection of {len(embedding)} word vectors', fontsize=20)
plt.xlabel('UMAP_1')
plt.ylabel('UMAP_2')
plt.show()

In [None]:
# Interactive scatter of the reduced vectors; hovering shows only the word

import plotly.express as px

plot_df = pd.DataFrame.from_records(reduced_wv, columns=['x', 'y'])
plot_df['word'] = vocab

# Hide every column from the hover box and use the word as the hover title
params = dict(
    hover_data=dict.fromkeys(plot_df.columns, False),
    hover_name='word',
)

fig = px.scatter(
    plot_df,
    x="x",
    y="y",
    opacity=0.3,
    size_max=3,
    **params,
)
fig.show()

In [None]:
# Plot reduced vectors for selected search terms.
# plot_embeddings comes from the local bp_embeddings script; its exact
# behaviour is not visible here - check that script for parameter meanings.

from bp_embeddings import plot_embeddings

search = ["goneril", "regan", "cordelia"]

plot_embeddings(
    model,
    search,
    topn=10,
    show_all=False,
    labels=True,
    algo='umap',
    n_neighbors=15,
    min_dist=10,
    spread=25,
)

In [None]:
# Plot reduced vectors for selected search terms, 2 (adds 'lear').
# plot_embeddings comes from the local bp_embeddings script; its exact
# behaviour is not visible here - check that script for parameter meanings.

from bp_embeddings import plot_embeddings

search = ["goneril", "regan", "cordelia", "lear"]

plot_embeddings(
    model,
    search,
    topn=10,
    show_all=False,
    labels=True,
    algo='umap',
    n_neighbors=15,
    min_dist=10,
    spread=25,
)

In [None]:
# Plot reduced vectors for selected search terms, 3 (adds 'beatrice').
# plot_embeddings comes from the local bp_embeddings script; its exact
# behaviour is not visible here - check that script for parameter meanings.

from bp_embeddings import plot_embeddings

search = ["goneril", "regan", "cordelia", "lear", "beatrice"]

plot_embeddings(
    model,
    search,
    topn=10,
    show_all=False,
    labels=True,
    algo='umap',
    n_neighbors=15,
    min_dist=10,
    spread=25,
)

## Blueprint: Constructing a Similarity Tree

In [None]:
import networkx as nx
from collections import deque

def sim_tree(model, word, top_n, max_dist):
    """Build a breadth-first tree of most-similar terms around ``word``.

    Starting from ``word`` (distance 0), every visited node contributes its
    ``top_n`` most similar terms as children, as long as those children lie
    within ``max_dist`` hops of the root. A term that is already in the graph
    is skipped, so each node is attached only to the node that found it first.

    Args:
        model: Either an object exposing its vectors on a ``wv`` attribute
            (like the trained model in this notebook) or an object that
            provides ``most_similar`` itself.
        word: Root term of the tree.
        top_n: Number of similar terms requested for each visited node.
        max_dist: Largest allowed distance (in hops) from the root.

    Returns:
        A ``networkx.Graph`` whose nodes carry a ``dist`` attribute and whose
        edges carry ``sim`` and ``dist`` attributes.
    """
    # Accept a full model (vectors under `.wv`) as well as the vectors object.
    vectors = getattr(model, 'wv', model)

    graph = nx.Graph()
    graph.add_node(word, dist=0)

    to_visit = deque([word])
    while to_visit:
        source = to_visit.popleft()  # visit next node (FIFO => breadth-first)
        dist = graph.nodes[source]['dist'] + 1

        if dist > max_dist:
            # Children of this node would be too far from the root.
            continue

        for target, sim in vectors.most_similar(source, topn=top_n):
            if target in graph:
                continue  # already discovered via an earlier node
            to_visit.append(target)
            graph.add_node(target, dist=dist)
            graph.add_edge(source, target, sim=sim, dist=dist)
    return graph

In [None]:
from networkx.drawing.nx_agraph import graphviz_layout

def plot_tree(graph, node_size=1000, font_size=12):
    """Draw a similarity tree using the Graphviz 'twopi' layout.

    Args:
        graph: Graph whose nodes have a ``dist`` attribute and whose edges
            have a ``sim`` attribute (the attributes ``sim_tree`` sets).
        node_size: Marker size passed to the node drawing call.
        font_size: Font size passed to the label drawing call.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # The first node in the graph's node listing is used as the layout root.
    pos = graphviz_layout(graph, prog='twopi', root=list(graph.nodes)[0])

    colors = [graph.nodes[n]['dist'] for n in graph]  # colorize by distance
    nx.draw_networkx_nodes(graph, pos, node_size=node_size, node_color=colors,
                           cmap='Set1', alpha=0.4)
    nx.draw_networkx_labels(graph, pos, font_size=font_size)

    # Draw all edges in one call: `width` takes one value per edge, so every
    # edge still gets a line width equal to its `sim` attribute.
    widths = [sim for _, _, sim in graph.edges(data='sim')]
    nx.draw_networkx_edges(graph, pos, width=widths, alpha=0.2)

    plt.show()

In [None]:
# Build the similarity tree rooted at 'caesar' and draw it
graph = sim_tree(model, word='caesar', top_n=5, max_dist=3)
plot_tree(graph, font_size=8, node_size=250)