In [73]:
# One-time setup: uncomment and run these lines to download the Latin Library corpus through CLTK.
# from cltk.corpus.utils.importer import CorpusImporter
# corpus_importer = CorpusImporter('latin')
# corpus_importer.import_corpus('latin_text_latin_library')

Downloaded 100% 35.47 MiB | 3.25 MiB/s 

In [1]:
import pandas as pd

from collections import Counter


from nltk.tokenize import WordPunctTokenizer

from cltk.corpus.latin import latinlibrary
from cltk.stop.latin import CorpusStoplist

# import pickle

from pprint import pprint

In [2]:
def truncate_text(text, margin=500):
    """Drop `margin` characters from each end of `text`, then cut at spaces.

    After the margins are removed, the remainder is sliced from its first
    space up to (but not including) its last space, so partial words at the
    cut edges are discarded. The result therefore begins with that first
    space character.

    Args:
        text: The document text to trim.
        margin: Number of characters to discard from each end. Defaults to
            500, the value previously hard-coded here. Must be >= 0.

    Returns:
        The trimmed string. Returns '' when nothing remains after removing
        the margins or when the remainder contains fewer than two spaces.

    Raises:
        ValueError: If `margin` is negative.
    """
    if margin < 0:
        raise ValueError('margin must be non-negative')
    # An explicit stop index (instead of -margin) keeps margin=0 meaningful:
    # text[0:-0] would be empty, whereas text[0:len(text)] is the whole text.
    stop = max(len(text) - margin, 0)
    temp = text[margin:stop]
    start = temp.find(' ')
    end = temp.rfind(' ')
    # With no space both indices are -1, and with one space they coincide;
    # either way the slice is empty.
    return temp[start:end]

In [3]:
# Preprocess texts

import html
from cltk.stem.latin.j_v import JVReplacer

# Shared JVReplacer instance; preprocess() calls its replace() method on every text
replacer = JVReplacer()

def preprocess(text):
    """Unescape HTML entities in `text`, then pass it through the module-level `replacer`.

    The replacer step normalizes u/v and i/j spellings.
    """
    unescaped = html.unescape(text)
    return replacer.replace(unescaped)

In [4]:
# Read every file id in the corpus; ll_size counts files before any filtering.
ll_files = latinlibrary.fileids()
ll_size = len(ll_files)

# Clean and trim each raw text, keeping only results longer than 100 characters.
ll_docs = [
    doc
    for doc in (truncate_text(preprocess(latinlibrary.raw(fileid))) for fileid in ll_files)
    if len(doc) > 100
]

In [5]:
# CITED IN ARTICLE

# ll_size is the number of file ids in the corpus, not the number of documents kept after filtering.
file_count_message = f'There are {ll_size} files in the CLTK Latin Library corpus.'
print(file_count_message)

There are 2141 files in the CLTK Latin Library corpus.


In [6]:
# Tokenize each document and flatten the results into one corpus-wide token list.
ll_tokens = [
    token
    for doc in ll_docs
    for token in WordPunctTokenizer().tokenize(doc)
]

In [7]:
# CITED IN ARTICLE

# Count each distinct token once; the unique and once-only figures both come from this Counter.
ll_counter = Counter(ll_tokens)
ll_total_count = len(ll_tokens)
ll_unique_count = len(ll_counter)
ll_once_count = len([token for token, count in ll_counter.items() if count == 1])

print(f'There are {ll_total_count} tokens in the CLTK Latin Library corpus.')
print(f'There are {ll_unique_count} unique tokens in the CLTK Latin Library corpus.')
print(f'Of the tokens appearing in the CLTK Latin Library corpus, {ll_once_count} tokens appear once.')

There are 16287272 tokens in the CLTK Latin Library corpus.
There are 482167 unique tokens in the CLTK Latin Library corpus.
Of the tokens appearing in the CLTK Latin Library corpus, 225613 tokens appear once.


In [8]:
def _build_collection(all_files, is_member):
    """Return (files, docs, tokens) for the file ids in `all_files` accepted by `is_member`.

    Args:
        all_files: Iterable of Latin Library file ids to select from.
        is_member: Callable taking a file id and returning True to include it.

    Returns:
        A tuple of (selected file ids, preprocessed and truncated texts,
        flat list of tokens across those texts).
    """
    files = [fileid for fileid in all_files if is_member(fileid)]
    docs = [truncate_text(preprocess(latinlibrary.raw(fileid))) for fileid in files]
    # One tokenizer instance is enough; it is reused for every document.
    tokenizer = WordPunctTokenizer()
    tokens = [token for doc in docs for token in tokenizer.tokenize(doc)]
    return files, docs, tokens


# Query the corpus file ids once and share them across the three selections.
_all_fileids = latinlibrary.fileids()

# Cicero: file ids containing 'cicero/'
cic_files, cic_docs, cic_tokens = _build_collection(
    _all_fileids, lambda fileid: 'cicero/' in fileid
)

# Bible: file ids containing 'bible/'
bib_files, bib_docs, bib_tokens = _build_collection(
    _all_fileids, lambda fileid: 'bible/' in fileid
)

# Roman law: file ids containing 'justinian', 'gaius' or 'theod'
ius_files, ius_docs, ius_tokens = _build_collection(
    _all_fileids,
    lambda fileid: 'justinian' in fileid or 'gaius' in fileid or 'theod' in fileid,
)

In [9]:
# Parallel lists: position i in each list describes the same collection.
colls = ['Latin Library', 'Cicero', 'Biblia Sacra', 'Ius Romanum']
colls_abbrev = ['LL', 'LL-Cic', 'LL-Bib', 'LL-Ius']
colls_docs = [ll_docs, cic_docs, bib_docs, ius_docs]
colls_tokens = [ll_tokens, cic_tokens, bib_tokens, ius_tokens]

# NOTE(review): ll_docs was filtered to len > 100 in an earlier cell, while the
# subcorpus doc lists were not - confirm the file counts are meant to be compared.
colls_file_counts = list(map(len, colls_docs))
colls_token_counts = list(map(len, colls_tokens))
colls_unique_counts = [len(set(tokens)) for tokens in colls_tokens]
# Number of distinct tokens that occur exactly once in each collection.
colls_single_counts = [
    sum(1 for count in Counter(tokens).values() if count == 1)
    for tokens in colls_tokens
]

In [10]:
# Make table

# One column per statistic, one row per collection.
data = {
    'Collections': colls_abbrev,
    'Description': colls,
    'Files': colls_file_counts,
    'Tokens': colls_token_counts,
    'Unique Tokens': colls_unique_counts,
    'Single Tokens': colls_single_counts,
}
df = pd.DataFrame(data)

# CSS rule that hides the index cells in the rendered table.
hide_index_rule = {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}
df.style.set_table_styles([hide_index_rule])

Unnamed: 0,Collections,Description,Files,Tokens,Unique Tokens,Single Tokens
0,LL,Latin Library,2008,16287272,482167,225613
1,LL-Cic,Cicero,138,1361839,84659,42124
2,LL-Bib,Biblia Sacra,77,684833,50854,24544
3,LL-Ius,Ius Romanum,88,2360411,71231,29197


In [11]:
# Stoplist builder constructed for 'latin'; every build_stoplist call below goes through it
c = CorpusStoplist('latin')

In [17]:
# Size-25 stoplist for the whole corpus on the 'frequency' basis, with values included.
ll_freq_stops = c.build_stoplist(ll_docs, size=25, basis='frequency', inc_values=True, sort_words=False)
# Print the full (word, value) pairs. The following cell prints the words alone, sorted.
print(ll_freq_stops)

[('et', 438825), ('in', 268232), ('est', 164894), ('non', 163872), ('ad', 131026), ('ut', 117019), ('quod', 102721), ('cum', 99165), ('si', 92656), ('qui', 91811), ('de', 78294), ('sed', 73410), ('quae', 63084), ('ex', 58918), ('quam', 55256), ('per', 49955), ('esse', 48823), ('nec', 44657), ('sunt', 43471), ('hoc', 43026), ('enim', 42090), ('uel', 41193), ('se', 41124), ('aut', 40233), ('autem', 40152)]


In [20]:
# zip(*pairs) transposes the (word, value) pairs; element 0 is the tuple of words.
ll_freq_words = list(zip(*ll_freq_stops))[0]
print(sorted(ll_freq_words))

['ad', 'aut', 'autem', 'cum', 'de', 'enim', 'esse', 'est', 'et', 'ex', 'hoc', 'in', 'nec', 'non', 'per', 'quae', 'quam', 'qui', 'quod', 'se', 'sed', 'si', 'sunt', 'uel', 'ut']


In [18]:
# Split the (word, value) pairs into two parallel columns for display.
words, values = zip(*ll_freq_stops)
data = dict(Word=words, Frequency=values)
df = pd.DataFrame(data)

# Render with the index cells hidden.
df.style.set_table_styles([
    dict(selector='.row_heading, .blank', props=[('display', 'none;')])
])

Unnamed: 0,Word,Frequency
0,et,438825
1,in,268232
2,est,164894
3,non,163872
4,ad,131026
5,ut,117019
6,quod,102721
7,cum,99165
8,si,92656
9,qui,91811


In [21]:
# Cicero documents: size-25 stoplist on the 'frequency' basis, words only.
cic_freq_stops = c.build_stoplist(
    cic_docs,
    size=25,
    basis='frequency',
    inc_values=False,
)
print(cic_freq_stops)

['ad', 'atque', 'aut', 'cum', 'de', 'enim', 'esse', 'est', 'et', 'etiam', 'ex', 'hoc', 'in', 'me', 'mihi', 'non', 'quae', 'quam', 'qui', 'quid', 'quod', 'sed', 'si', 'te', 'ut']


In [22]:
# Bible documents: size-25 stoplist on the 'frequency' basis, words only.
bib_freq_stops = c.build_stoplist(
    bib_docs,
    size=25,
    basis='frequency',
    inc_values=False,
)
print(bib_freq_stops)

['ad', 'autem', 'cum', 'de', 'deus', 'domini', 'dominus', 'eius', 'enim', 'eos', 'est', 'et', 'eum', 'in', 'israhel', 'me', 'non', 'quae', 'qui', 'quia', 'quod', 'sunt', 'super', 'te', 'ut']


In [23]:
# Roman-law documents: size-25 stoplist on the 'frequency' basis, words only.
ius_freq_stops = c.build_stoplist(
    ius_docs,
    size=25,
    basis='frequency',
    inc_values=False,
)
print(ius_freq_stops)

['ad', 'aut', 'ci', 'conss', 'cum', 'de', 'dig', 'ed', 'esse', 'est', 'et', 'ex', 'hoc', 'id', 'in', 'non', 'quae', 'qui', 'quod', 'sed', 'si', 'sit', 'uel', 'ulpianus', 'ut']


In [24]:
# Cicero documents: size-25 stoplist on the 'mean' basis, returned with values.
cic_mean_stops = c.build_stoplist(
    cic_docs,
    size=25,
    basis='mean',
    inc_values=True,
)
print(cic_mean_stops)

[('ad', 0.008868650848423294), ('atque', 0.00436167152162217), ('aut', 0.00556467118374423), ('cum', 0.010126709007477934), ('de', 0.00718002786254848), ('enim', 0.005821603018043596), ('esse', 0.008112803360538887), ('est', 0.012972884528270714), ('et', 0.023066765999706636), ('etiam', 0.004821056704311927), ('ex', 0.004414692768747299), ('hoc', 0.004349842163502997), ('in', 0.019762320675769322), ('me', 0.005838325477431875), ('mihi', 0.00405195693089243), ('non', 0.01550952166175899), ('quae', 0.006651064673980416), ('quam', 0.007083653585908187), ('qui', 0.008573038153051608), ('quid', 0.00533808081931531), ('quod', 0.009131624568647143), ('sed', 0.007811200529349928), ('si', 0.00823708787137525), ('te', 0.005729445842959163), ('ut', 0.012302617380402503)]


In [25]:
# Cicero documents: size-25 stoplist on the 'variance' basis, returned with values.
cic_variance_stops = c.build_stoplist(
    cic_docs,
    size=25,
    basis='variance',
    inc_values=True,
)
print(cic_variance_stops)

[('ad', 8.671632756845375e-05), ('atque', 2.641496237121225e-05), ('aut', 4.90936891891087e-05), ('cum', 0.00010912207383475996), ('de', 5.886823178032977e-05), ('enim', 4.020005598993664e-05), ('esse', 7.171833606117633e-05), ('est', 0.0001838499615104435), ('et', 0.0005641948080530672), ('etiam', 2.5831891081588447e-05), ('ex', 2.409054806686975e-05), ('hoc', 2.20723116231951e-05), ('in', 0.00039979283807088965), ('me', 6.115689201714936e-05), ('mihi', 2.6028589738983625e-05), ('non', 0.0002523842764074881), ('quae', 5.237976921478452e-05), ('quam', 5.767995478841798e-05), ('qui', 8.189376959070727e-05), ('quid', 3.2333137998599984e-05), ('quod', 8.88225949408833e-05), ('sed', 6.751202612126696e-05), ('si', 7.488524747399057e-05), ('te', 6.554681435550538e-05), ('ut', 0.00016076030985712696)]


In [26]:
# Cicero documents: size-25 stoplist on the 'entropy' basis, returned with values.
cic_entropy_stops = c.build_stoplist(
    cic_docs,
    size=25,
    basis='entropy',
    inc_values=True,
)
print(cic_entropy_stops)

[('ad', 2.4792060765261255), ('atque', 1.3706427272074302), ('aut', 1.6616399896744882), ('cum', 2.757862793786304), ('de', 2.089676434710912), ('enim', 1.7570592740896032), ('esse', 2.3138531143504157), ('est', 3.3345397959458056), ('et', 5.161713066585444), ('etiam', 1.5208735992904556), ('ex', 1.403614100452883), ('hoc', 1.3930920633606787), ('in', 4.623517566572637), ('me', 1.6572491006667622), ('mihi', 1.2674271105554247), ('non', 3.837931138878116), ('quae', 1.9597062651709616), ('quam', 2.0754723536848), ('qui', 2.4093778369827463), ('quid', 1.6473232771340665), ('quod', 2.5457005142781735), ('sed', 2.2397454511212103), ('si', 2.3376757959213985), ('te', 1.6026890003209369), ('ut', 3.2105839980402955)]


In [28]:
# Cicero documents: size-25 stoplist on the 'zou' basis, words only.
cic_stops = c.build_stoplist(
    cic_docs,
    size=25,
    basis='zou',
    inc_values=False,
)
print(cic_stops)

['ad', 'atque', 'aut', 'cum', 'de', 'enim', 'esse', 'est', 'et', 'etiam', 'ex', 'hoc', 'in', 'me', 'mihi', 'non', 'quae', 'quam', 'qui', 'quid', 'quod', 'sed', 'si', 'te', 'ut']


In [29]:
# Cicero documents: size-100 stoplist on the 'zou' basis, words only.
# This rebinds cic_stops, replacing any value it held before.
cic_stops = c.build_stoplist(
    cic_docs,
    size=100,
    basis='zou',
    inc_values=False,
)
print(cic_stops)

['ab', 'ac', 'ad', 'an', 'ante', 'apud', 'atque', 'aut', 'autem', 'causa', 'cum', 'de', 'ea', 'ego', 'eius', 'enim', 'eo', 'erat', 'esse', 'esset', 'est', 'et', 'etiam', 'eum', 'ex', 'fuit', 'haec', 'hic', 'hoc', 'iam', 'id', 'igitur', 'iis', 'illa', 'ille', 'illud', 'in', 'ipse', 'is', 'ita', 'itaque', 'me', 'mihi', 'modo', 'nam', 'ne', 'nec', 'neque', 'nihil', 'nisi', 'nobis', 'non', 'nos', 'nunc', 'omnes', 'omnia', 'omnibus', 'omnium', 'per', 'potest', 'pro', 'publica', 'publicae', 'qua', 'quae', 'quam', 'quasi', 'quem', 'qui', 'quibus', 'quid', 'quidem', 'quis', 'quo', 'quod', 'quos', 're', 'rebus', 'rei', 'rem', 'rerum', 'res', 'se', 'sed', 'senatus', 'si', 'sic', 'sine', 'sit', 'sunt', 'tam', 'tamen', 'te', 'tibi', 'tu', 'tum', 'uel', 'uero', 'uos', 'ut']


## Top value lists for LL

In [30]:
# Whole corpus: size-25 stoplist on the 'frequency' basis, returned with values.
ll_freq_stops = c.build_stoplist(
    ll_docs,
    size=25,
    basis='frequency',
    inc_values=True,
    sort_words=False,
)

# Transpose the (word, value) pairs into two columns.
words, values = zip(*ll_freq_stops)
data = dict(Word=words, Frequency=values)
df = pd.DataFrame(data)

# Render with the index cells hidden.
hidden_index = dict(selector='.row_heading, .blank', props=[('display', 'none;')])
df.style.set_table_styles([hidden_index])

Unnamed: 0,Word,Frequency
0,et,438825
1,in,268232
2,est,164894
3,non,163872
4,ad,131026
5,ut,117019
6,quod,102721
7,cum,99165
8,si,92656
9,qui,91811


In [31]:
# Whole corpus: size-25 stoplist on the 'mean' basis, returned with values.
ll_mean_stops = c.build_stoplist(ll_docs, size=25, basis='mean', inc_values=True, sort_words=False)

words, values = zip(*ll_mean_stops)
# Label the value column after the basis used: these are means, not frequencies.
data = {'Word': words, 'Mean': values}
df = pd.DataFrame.from_dict(data)

# Render with the index cells hidden.
df.style.set_table_styles([
    {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}
])

Unnamed: 0,Word,Frequency
0,et,0.0324513
1,in,0.0201319
2,est,0.0130033
3,non,0.0117544
4,ad,0.00871411
5,ut,0.00864474
6,cum,0.00806065
7,quod,0.00771674
8,qui,0.00659552
9,si,0.00560924


In [32]:
# Whole corpus: size-25 stoplist on the 'variance' basis, returned with values.
ll_variance_stops = c.build_stoplist(ll_docs, size=25, basis='variance', inc_values=True, sort_words=False)

words, values = zip(*ll_variance_stops)
# Label the value column after the basis used: these are variances, not frequencies.
data = {'Word': words, 'Variance': values}
df = pd.DataFrame.from_dict(data)

# Render with the index cells hidden.
df.style.set_table_styles([
    {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}
])

Unnamed: 0,Word,Frequency
0,et,0.00136682
1,in,0.000504233
2,est,0.000261346
3,non,0.000196141
4,ad,0.000111013
5,quod,0.000110762
6,cum,0.000107114
7,ut,0.000104016
8,qui,6.78587e-05
9,de,6.11444e-05


In [33]:
# Whole corpus: size-25 stoplist on the 'entropy' basis, returned with values.
ll_entropy_stops = c.build_stoplist(ll_docs, size=25, basis='entropy', inc_values=True, sort_words=False)

words, values = zip(*ll_entropy_stops)
# Label the value column after the basis used: these are entropy values, not frequencies.
data = {'Word': words, 'Entropy': values}
df = pd.DataFrame.from_dict(data)

# Render with the index cells hidden.
df.style.set_table_styles([
    {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}
])

Unnamed: 0,Word,Frequency
0,quod,30.3664
1,ad,34.305
2,qui,27.5049
3,non,43.304
4,est,46.3114
5,et,93.1383
6,cum,32.12
7,in,66.4521
8,ut,34.365
9,sed,24.0617


In [39]:
# Whole corpus: stoplist on the 'zou' basis.
# NOTE(review): size=92 presumably mirrors len(PERSEUS_STOPS), which a later cell prints as 92 - confirm.
ll_stops = c.build_stoplist(
    ll_docs,
    size=92,
    basis='zou',
)
print(ll_stops)

['ab', 'ac', 'ad', 'ante', 'atque', 'aut', 'autem', 'cum', 'de', 'dei', 'deus', 'dum', 'ea', 'ego', 'ei', 'eius', 'enim', 'eo', 'erat', 'ergo', 'esse', 'esset', 'est', 'et', 'etiam', 'eum', 'ex', 'fuit', 'haec', 'hic', 'his', 'hoc', 'iam', 'id', 'illa', 'ille', 'in', 'inter', 'ipse', 'ita', 'me', 'mihi', 'modo', 'nam', 'ne', 'nec', 'neque', 'nihil', 'nisi', 'nobis', 'non', 'nos', 'nunc', 'omnes', 'omnia', 'omnibus', 'per', 'post', 'potest', 'pro', 'qua', 'quae', 'quam', 'quem', 'qui', 'quia', 'quibus', 'quid', 'quidem', 'quis', 'quo', 'quod', 'quoque', 'res', 'se', 'secundum', 'sed', 'si', 'sibi', 'sic', 'sicut', 'sine', 'sit', 'sub', 'sunt', 'tamen', 'te', 'tibi', 'tu', 'uel', 'uero', 'ut']


In [40]:
from cltk.stop.latin import PERSEUS_STOPS

In [41]:
# Number of entries in the PERSEUS_STOPS list imported from cltk.stop.latin.
perseus_stop_count = len(PERSEUS_STOPS)
print(perseus_stop_count)

92
