# Demo notebook for LatinCy model `la_core_web_trf`

Written by [Patrick J. Burns](https://diyclassics.github.io), May 2023

In [1]:
# Imports 

import spacy
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from spacy import displacy

In [2]:
# Set up spaCy NLP: load the pipeline that later cells call as `nlp`

# Name of the pipeline package handed to spacy.load
model = 'la_core_web_trf'
nlp = spacy.load(model)

In [3]:
# Sample passage: the first story from Ritchie's fables

text = """Haec narrantur a poetis de Perseo. Perseus filius erat Iovis, maximi deorum; avus eius Acrisius appellabatur. Acrisius volebat Perseum nepotem suum necare; nam propter oraculum puerum timebat. Comprehendit igitur Perseum adhuc infantem, et cum matre in arca lignea inclusit. Tum arcam ipsam in mare coniecit. Danae, Persei mater, magnopere territa est; tempestas enim magna mare turbabat. Perseus autem in sinu matris dormiebat."""

# Map v -> u and V -> U in one pass over the string
text = text.translate(str.maketrans("vV", "uU"))

In [4]:
# Create spacy Doc object by running the loaded pipeline over the sample text

# `doc` is reused by the sentence, token and dataframe cells that follow
doc = nlp(text)

In [5]:
# Helper function
def enumerate_print(l):
    """Print each item of the iterable `l` on its own line as "<n>: <item>".

    Numbering starts at 1. Nothing is printed for an empty iterable, and
    nothing is returned.
    """
    position = 0
    for item in l:
        position += 1
        print(f"{position}: {item}")

In [6]:
# Get sentences from text

# doc.sents is a one-shot generator: once enumerate_print has iterated it,
# nothing is left. Materialize it as a list so `sents` stays usable afterwards.
sents = list(doc.sents)

enumerate_print(sents)

1: Haec narrantur a poetis de Perseo.
2: Perseus filius erat Iouis, maximi deorum;
3: auus eius Acrisius appellabatur.
4: Acrisius uolebat Perseum nepotem suum necare;
5: nam propter oraculum puerum timebat.
6: Comprehendit igitur Perseum adhuc infantem, et cum matre in arca lignea inclusit.
7: Tum arcam ipsam in mare coniecit.
8: Danae, Persei mater, magnopere territa est;
9: tempestas enim magna mare turbabat.
10: Perseus autem in sinu matris dormiebat.


In [7]:
# Inspect the first token: its text, its type, and its public attribute names

for token in doc[:1]:
    public_names = [name for name in dir(token) if not name.startswith("_")]
    # One print call, one value per line: token, type, attribute list
    print(token, type(token), public_names, sep="\n")

Haec
<class 'spacy.tokens.token.Token'>
['ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_dep', 'has_extension', 'has_head', 'has_morph', 'has_vector', 'head', 'i', 'idx', 'iob_strings', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_end', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma', 'lemma_', 'lex', 'lex_id', 'like_email', 'like_num', 'like_url', 'lower', 'lower_', 'morph', 'n_lefts', 'n_rights', 'nbor', 'norm', 'norm_', 'orth', 'orth_', 'pos', 'pos_', 'prefix', 'prefix_', 'prob', 'rank', 'remove_extension', 'right_edge', 'rights', 'sent', 'sent_start', 'sentiment', 'set_extension', 'set_morph', 'shape', 'shape_', 'similarity', 'subtree', 'suffix'

In [8]:
# Tabulate selected attributes for the first 25 tokens of the Doc

# Column labels, in the same order as the values collected per token below
token_columns = [
    "text",
    "norm",
    "lower",
    "lemma",
    "pos",
    "tag",
    "dep",
    "has_vector",
    "morph",
    "ent_type",
    "in_vocab",
    "is_oov",
]

# One row (a list of attribute values) per token
data = [
    [
        token.text,
        token.norm_,
        token.lower_,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.has_vector,
        token.morph,
        token.ent_type_,
        token.text in nlp.vocab,  # membership test against the pipeline vocab
        token.is_oov,
    ]
    for token in doc[:25]
]

df = pd.DataFrame(data, columns=token_columns)

df

Unnamed: 0,text,norm,lower,lemma,pos,tag,dep,has_vector,morph,ent_type,in_vocab,is_oov
0,Haec,haec,haec,Hic,DET,pronoun,nsubj,False,"(Case=Nom, Gender=Neut, Number=Plur)",,True,True
1,narrantur,narrantur,narrantur,narro,VERB,verb,ROOT,False,"(Mood=Ind, Number=Plur, Person=3, Tense=Pres, ...",,True,True
2,a,a,a,ab,ADP,preposition,case,False,(),,True,True
3,poetis,poetis,poetis,poetus,NOUN,noun,obl:agent,False,"(Case=Abl, Gender=Masc, Number=Plur)",,True,True
4,de,de,de,de,ADP,preposition,case,False,(),,True,True
5,Perseo,perseo,perseo,Perseus,NOUN,proper_noun,obl,False,"(Case=Abl, Gender=Masc, Number=Sing)",PERSON,True,True
6,.,.,.,.,PUNCT,punc,punct,False,(),,True,True
7,Perseus,perseus,perseus,Perseus,NOUN,proper_noun,nsubj,False,"(Case=Nom, Gender=Masc, Number=Sing)",PERSON,True,True
8,filius,filius,filius,filius,NOUN,noun,ROOT,False,"(Case=Nom, Gender=Masc, Number=Sing)",,True,True
9,erat,erat,erat,sum,AUX,verb,cop,False,(),,True,True


In [9]:
# Show dependency parse for sample sentence

text = """Iason et Medea e Thessalia expulsi ad urbem Corinthum venerunt."""
# Replace v/V with u/U, the same substitution applied to the fable text earlier
text =  text.replace("v","u").replace("V","U")

sents = nlp(text).sents

# Render only the first sentence; the loop stops after one iteration.
for sent in sents:
    print(f'spaCy dependency parse for "{sent}"')
    displacy.render(sent, style="dep")
    break

spaCy dependecy parse for "Iason et Medea e Thessalia expulsi ad urbem Corinthum uenerunt."


In [10]:
# Noun chunks
# NB: Noun chunks will be implemented in spaCy 3.6

# for chunk in doc.noun_chunks:
#     if len(chunk.text.split()) > 1:
#         print(chunk)

In [11]:
# Named entities

text = """Iason et Medea e Thessalia expulsi ad urbem Corinthum venerunt."""
# Replace v/V with u/U, the same substitution applied to the fable text earlier
text =  text.replace("v","u").replace("V","U")

doc = nlp(text)

# Label the output as entities, and quote the Doc built in this cell so the
# message does not depend on a variable left over from another cell.
print(f'spaCy named entities for "{doc}"')
displacy.render(doc, style="ent")

spaCy dependecy parse for "Iason et Medea e Thessalia expulsi ad urbem Corinthum uenerunt."
