# Text representation

Code notebook for TAHLR Working Group (Spring 2024) based on:  

- Vajjala, S., Majumder, B., Gupta, A., and Surana, H. 2020. *Practical Natural Language Processing: A Comprehensive Guide to Building Real-World NLP Systems*. Sebastopol, CA: O’Reilly Media.

More info on book here: https://www.oreilly.com/library/view/practical-natural-language/9781492054047/

In [None]:
# Installs
!pip install -U git+https://github.com/diyclassics/cltk_readers.git#egg=cltk-readers
from cltk.data.fetch import FetchCorpus
corpus_downloader = FetchCorpus(language="lat")
corpus_downloader.import_corpus('lat_text_tesserae')
corpus_downloader.import_corpus('lat_models_cltk')

# Imports

from pprint import pprint

## Pretrained word embeddings

In [None]:
# Download "slim" version of model

model_url = "https://github.com/eyaler/word2vec-slim/raw/master/GoogleNews-vectors-negative300-SLIM.bin.gz"
!curl -L $model_url -o models/GoogleNews-vectors-negative300-SLIM.bin.gz


In [None]:
# Load model

from gensim.models import KeyedVectors, Word2Vec

# Same models/ path that the download cell writes to
pretrainedpath = "models/GoogleNews-vectors-negative300-SLIM.bin.gz"
# binary=True: the file is read as binary-format word2vec vectors
w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True)
print('done loading Word2Vec')

In [None]:
# Show number of words in the vocabulary.

# The count is the number of entries in the model's key_to_index mapping
print(f'There are {len(w2v_model.key_to_index)} words in the vocabulary')

In [None]:
# Show "similar" words
# most_similar is called with only the query word, so every other setting
# (including how many neighbours are returned) is the library default
print("Using this w2v model, the most similar words to 'beautiful' are:")
pprint(w2v_model.most_similar('beautiful')) # Note the parentheses, not square brackets

In [None]:
# Show the vector representation of a word

# Square-bracket lookup by word; as the cell's last expression the result is displayed
w2v_model['beautiful']

In [None]:
# Show shape of the vector

# Same lookup as for the vector itself, reduced to its .shape
w2v_model['beautiful'].shape

## Training our own embeddings

In [None]:
#Import a test data set provided in gensim to train a model

from gensim.test.utils import common_texts

In [None]:
#Build the model, by selecting the parameters; save

# Trains on gensim's bundled common_texts sample with 10-dimensional vectors.
# min_count=1 presumably keeps words that occur only once — confirm in the gensim docs.
our_model = Word2Vec(common_texts, vector_size=10, window=5, min_count=1, workers=4)
# NOTE(review): saved under models/ — presumably that directory must already exist
our_model.save("models/tempmodel.w2v")


In [None]:
# Query our_model's word vectors (.wv) for 'computer', limited to topn=5 results
pprint(our_model.wv.most_similar('computer', topn=5))

In [None]:
# Print the vector our_model holds for 'computer'
print(our_model.wv['computer'])

## Going Beyond Words

In [None]:
# Load spaCy English model

import spacy
# The model is imported as a Python package, so it must be installed in the kernel's environment
import en_core_web_sm
# nlp is the object the following cells call on raw text
nlp = en_core_web_sm.load()

In [None]:
# Process a sentence using the model
# doc is indexed by position below (doc[0], doc[2], doc[6]) to reach individual tokens
doc = nlp("Canada is a large country and a neighbor to the north.")

#Get a vector for individual words
print(doc[0].vector) #vector for 'Canada', the first word in the text

In [None]:
print(doc.vector) #Averaged vector for the entire sentence
# Shape of that sentence-level vector
print(doc.vector.shape)

In [None]:
# Positions 2 and 6 are both "a" in the sentence processed above
# (assuming one token per word — confirm by inspecting doc)
doc[2].text == doc[6].text

In [None]:
# Compare the vectors of the same two tokens.
# NOTE(review): `==` on these vectors presumably gives an element-wise result, not a
# single True/False — append .all() if one overall answer is wanted; confirm.
doc[2].vector == doc[6].vector

## Visualize the vectors with t-SNE

In [None]:
# Get vectors for specific set of words

import numpy as np
from sklearn.manifold import TSNE

# Four themed groups of ten words each, kept in group order: the plotting cell
# colours the points in consecutive runs of ten.
words = [
    # numbers
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    # fruit
    'apple', 'banana', 'cherry', 'date', 'elderberry', 'fig', 'grape', 'honeydew', 'kiwi', 'lemon',
    # colours
    'red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet', 'black', 'white', 'gray',
    # animals
    'dog', 'cat', 'bird', 'fish', 'hamster', 'rabbit', 'turtle', 'lizard', 'snake', 'frog',
]

# Look each word up in w2v_model and stack the results into one array, one row per word
vectors = np.array([w2v_model[word] for word in words])

In [None]:
# Reduce the dimensionality of the vectors to 2D

# random_state=0 pins the seed, intended to make the layout repeatable between runs
tsne = TSNE(n_components=2, random_state=0, perplexity=10)
vectors_2d = tsne.fit_transform(vectors)
# Unzip the two-column rows into parallel x and y sequences for plotting
x, y = zip(*vectors_2d)

In [None]:
# Plot vectors

from matplotlib import pyplot as plt

# Palette: one entry per group of ten words
colors = ['red', 'blue', 'green', 'orange']

# Square canvas
plt.figure(figsize=(10, 10))

# Draw the points ten at a time so that each group gets its own colour
for start in range(0, len(x), 10):
    stop = start + 10
    plt.scatter(x[start:stop], y[start:stop], c=colors[start // 10])

# Write each word next to its point
for word, (x_pos, y_pos) in zip(words, vectors_2d):
    plt.text(x_pos, y_pos, word)

plt.title('t-SNE visualization of Word2Vec vectors for four categorical lists')

plt.show()

## Word2Vec with Latin, pretrained

In [None]:
## LiLa lemma embeddings
# cf. https://embeddings.lila-erc.eu/samples/download/word2vec/

pretrained_latin_url = "https://embeddings.lila-erc.eu/samples/download/word2vec/allLASLAlemmi-vector-100-nocase-w10-SKIP.vec"
!curl -L $pretrained_latin_url -o models/allLASLAlemmi-vector-100-nocase-w10-SKIP.vec

In [None]:
# Load the LiLa embeddings

# KeyedVectors is also imported where the English model is loaded; repeated here
# so this cell does not depend on that one
from gensim.models import KeyedVectors

pretrained_lila_path = "models/allLASLAlemmi-vector-100-nocase-w10-SKIP.vec"
# binary=False: the .vec file is read as text-format word2vec vectors
lila_model = KeyedVectors.load_word2vec_format(pretrained_lila_path, binary=False)
print('done loading LiLa')

In [None]:
# Nearest neighbours of 'oratio' in the LiLa vectors, with library-default settings
lila_model.most_similar('oratio')

## Training our own Latin embeddings

In [None]:
# Training Latin w2v vectors

from cltkreaders.lat import LatinTesseraeCorpusReader
# Reader created with no arguments, so it uses whatever corpus location is its default
CR = LatinTesseraeCorpusReader()
# Keep only the file ids selected by match="cicero"
# (presumably a match against the file name — confirm in cltk_readers)
files = CR.fileids(match="cicero")


In [None]:
# Make helper function

def preprocess(text):
    """Normalise a string before it is split into training tokens.

    The text is lowercased, every ``v`` becomes ``u`` and every ``j`` becomes
    ``i``, all characters listed in ``string.punctuation`` are deleted, and
    leading and trailing whitespace is trimmed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    from string import punctuation

    normalized = text.lower().replace('v', 'u').replace('j', 'i')
    # Map every punctuation character to None so translate() drops it
    without_punct = normalized.translate(str.maketrans('', '', punctuation))
    return without_punct.strip()

In [None]:
# Get list of sentences; NB: this takes several minutes (~7)

# For every sentence from the reader: join its lemmas with spaces, normalise the
# resulting string with preprocess(), then split it back into a list of tokens.
sents = [
    preprocess(" ".join(token.lemma_ for token in sent)).split()
    for sent in CR.sents(files)
]

In [None]:
import pickle
from pathlib import Path

# Cache the preprocessed sentences on disk so the slow extraction step above
# does not have to be repeated.
sents_path = Path("data/cicero_lemma_sents.pkl")
sents_path.parent.mkdir(parents=True, exist_ok=True)  # data/ may not exist on a fresh checkout

# Context managers close each handle deterministically, so the dump is fully
# written before the file is read back.
with sents_path.open("wb") as f:
    pickle.dump(sents, f)

# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with sents_path.open("rb") as f:
    sents = pickle.load(f)

In [None]:
# Build the model as above for the Latin data

# NB: this rebinds our_model, replacing the model trained on common_texts earlier.
# Here the vectors are 100-dimensional, min_count is 2 and training runs for 10 epochs.
our_model = Word2Vec(sents, vector_size=100, window=5, min_count=2, epochs=10, workers=4)
our_model.save("models/temp-latmodel.w2v")

In [None]:
# Five nearest neighbours (topn=5) of 'oratio' in whatever our_model currently holds
our_model.wv.most_similar('oratio', topn=5)