# Demo notebook for LatinCy model `la_core_web_lg`

Written by [Patrick J. Burns](https://diyclassics.github.io), May 2023

In [None]:
# Imports 

import spacy
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from spacy import displacy
from pprint import pprint

In [None]:
# Set up spaCy NLP: load the pipeline named by `model`;
# later cells call `nlp(...)` on raw text to obtain Doc objects

model = 'la_core_web_lg'
nlp = spacy.load(model)

In [None]:
# Sample passage: the first story from Ritchie's fables

text = """Haec narrantur a poetis de Perseo. Perseus filius erat Iovis, maximi deorum; avus eius Acrisius appellabatur. Acrisius volebat Perseum nepotem suum necare; nam propter oraculum puerum timebat. Comprehendit igitur Perseum adhuc infantem, et cum matre in arca lignea inclusit. Tum arcam ipsam in mare coniecit. Danae, Persei mater, magnopere territa est; tempestas enim magna mare turbabat. Perseus autem in sinu matris dormiebat."""

# Fold v -> u and V -> U in a single pass over the string
text = text.translate(str.maketrans("vV", "uU"))

In [None]:
# Create spaCy Doc object by running the loaded pipeline over the sample text

doc = nlp(text)

In [None]:
# Helper function
def enumerate_print(l):
    """Print every element of ``l`` on its own line as ``<position>: <element>``.

    Positions start at 1. ``l`` may be any iterable; it is consumed lazily,
    one element per printed line. Nothing is printed for an empty iterable.
    """
    numbered_lines = (
        f"{position}: {element}" for position, element in enumerate(l, start=1)
    )
    for line in numbered_lines:
        print(line)

In [None]:
# Get sentences from text

# `doc.sents` is a one-shot generator: once printed it would be exhausted and
# any later use of `sents` would silently see nothing. Materialise it as a list
# so the name stays usable after this cell runs.
sents = list(doc.sents)

enumerate_print(sents)

In [None]:
# Inspect the first token only: its text, its Python type,
# and the names of its public (non-underscore) attributes

for token in doc[:1]:
    public_names = [name for name in dir(token) if name[:1] != "_"]
    print(token, type(token), public_names, sep="\n")

In [None]:
# Tabulate a selection of attributes for the first 25 tokens of the Doc

# Column labels, in the same order as the values collected per token below
columns = [
    "text",
    "norm",
    "lower",
    "lemma",
    "pos",
    "tag",
    "dep",
    "has_vector",
    "morph",
    "ent_type",
    "in_vocab",
    "is_oov",
]

# One row per token
data = [
    [
        tok.text,
        tok.norm_,
        tok.lower_,
        tok.lemma_,
        tok.pos_,
        tok.tag_,
        tok.dep_,
        tok.has_vector,
        tok.morph,
        tok.ent_type_,
        tok.text in nlp.vocab,  # membership test against the pipeline's vocab
        tok.is_oov,
    ]
    for tok in doc[:25]
]

df = pd.DataFrame(data, columns=columns)

df

In [None]:
# Show dependency parse for sample sentence

text = """Iason et Medea e Thessalia expulsi ad urbem Corinthum venerunt."""
# Same v -> u normalisation as applied to the other samples in this notebook
text = text.replace("v", "u").replace("V", "U")

sents = nlp(text).sents

# Only the first sentence is rendered; the loop exits after one iteration
for sent in sents:
    print(f'spaCy dependency parse for "{sent}"')
    displacy.render(sent, style="dep", jupyter=True)
    break

In [None]:
# Noun chunks: highlight multi-token noun phrases in a short passage

text = "Turpe est hanc ignavam vitam agere; iam dudum tu adulescens es. Quo usque hic manebis? Tempus est arma capere et virtutem praestare. Hinc abi, et caput Medusae mihi refer."
# Apply the same v -> u normalisation that every other sample in this notebook
# receives before being passed to the pipeline
text = text.replace("v", "u").replace("V", "U")
selection = nlp(text)

# Keep only chunks longer than one token, stored under the 'NP' span key
selection.spans['NP'] = [chunk for chunk in selection.noun_chunks if len(chunk) > 1]

# Render the 'NP' span group with a single highlight colour
colors = {'NP': '#85C1E9'}
options = {'spans_key': 'NP', 'colors': colors}
displacy.render(selection, style="span", jupyter=True, options=options)

In [None]:
# Named entities

text = """Iason et Medea e Thessalia expulsi ad urbem Corinthum venerunt."""
# Same v -> u normalisation as applied to the other samples in this notebook
text = text.replace("v", "u").replace("V", "U")

doc = nlp(text)

# Label the output with this cell's own Doc rather than a variable left over
# from an earlier cell, and describe what is actually rendered (entities)
print(f'spaCy named entities for "{doc}"')
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Load Ritchie's fables from disk and parse the full text with the pipeline

# Explicit encoding so decoding does not depend on the platform default
# (assumes ritchies.txt is UTF-8 or plain ASCII -- TODO confirm)
with open('ritchies.txt', 'r', encoding='utf-8') as f:
    contents = f.readlines()

# The file is closed before the parse starts.
# Blank lines and lines starting with '#' are dropped; the rest are stripped
# and joined with single spaces into one string.
text = " ".join([line.strip() for line in contents if line.strip() and not line.startswith('#')])
doc = nlp(text)

In [None]:
# Restrict the plot to tokens tagged "proper_noun" to keep it readable.
# Vectors are keyed by the token's norm_, so a repeated form keeps one entry
# (holding the vector of its last occurrence).

vector_dict = {
    tok.norm_: tok.vector
    for tok in doc
    if tok.tag_ == "proper_noun"
}

# Parallel lists: words[i] labels vecs[i]
words = [*vector_dict.keys()]
vecs = [*vector_dict.values()]

In [None]:
# Project the collected vectors down to two dimensions with t-SNE
# and store the coordinates in a dataframe indexed by word

tsne = TSNE(n_components=2, perplexity=3, init='pca', random_state=42)

vec_matrix = np.asarray(vecs)
reduced_vecs = tsne.fit_transform(vec_matrix)

# Columns 'x' and 'y' hold the coordinates; 'word' repeats the index as a column
df = pd.DataFrame(reduced_vecs, index=words, columns=['x', 'y']).assign(
    word=lambda frame: frame.index
)

In [None]:
# Plot TSNE

# The title takes the model name from `model` so it always matches the
# pipeline that was actually loaded
ax = df.plot(
    kind='scatter',
    x='x',
    y='y',
    figsize=(15, 15),
    title=f"TSNE {model} vectors for proper nouns in Ritchie's Fables",
)

# Label every point with its word
for row in df.itertuples():
    ax.annotate(row.word, (row.x, row.y))