# Constructing Stoplists for Historical Languages
Code repository associated with the Constructing Stoplists for Historical Languages for Digital Classics Online

## Setup

In [1]:
# from cltk.corpus.utils.importer import CorpusImporter
# corpus_importer = CorpusImporter('latin')
# corpus_importer.import_corpus('latin_text_latin_library')

In [26]:
# Imports

import pandas as pd

from collections import Counter


from nltk.tokenize import WordPunctTokenizer

from cltk.corpus.latin import latinlibrary
from cltk.stop.latin import CorpusStoplist
from cltk.stop.latin import PERSEUS_STOPS

from pprint import pprint

import pickle

In [3]:
# Function for preprocessing texts

def truncate_text(text):
    temp = text[500:-500]
    start = temp.find(' ')
    end = temp.rfind(' ')
    return temp[start:end]

In [4]:
# Preprocess texts

import html
from cltk.stem.latin.j_v import JVReplacer

replacer = JVReplacer()

def preprocess(text):    
    text = html.unescape(text) # Handle html entities
    text = replacer.replace(text) #Normalize u/v & i/j    
    return text

In [5]:
# Load CLTK Latin Library corpus; get size

ll_files = latinlibrary.fileids()
ll_docs = [truncate_text(preprocess(latinlibrary.raw(file))) for file in ll_files]
ll_docs = [doc for doc in ll_docs if len(doc) > 100]
ll_size = len(ll_files)

# CITED IN ARTICLE
print(f'There are {ll_size} files in the CLTK Latin Library corpus.')

There are 2152 files in the CLTK Latin Library corpus.


In [6]:
# Get tokens for Latin Library; get stats

ll_tokens = [WordPunctTokenizer().tokenize(doc) for doc in ll_docs]
ll_tokens = [item for sublist in ll_tokens for item in sublist]

# CITED IN ARTICLE

print(f'There are {len(ll_tokens)} tokens in the CLTK Latin Library corpus.')
print(f'There are {len(set(ll_tokens))} unique tokens in the CLTK Latin Library corpus.')
print(f'Of the tokens appearing in the CLTK Latin Library corpus, {len([k for k, v in Counter(ll_tokens).items() if v == 1 ])} tokens appear once.')

There are 16287634 tokens in the CLTK Latin Library corpus.
There are 482172 unique tokens in the CLTK Latin Library corpus.
Of the tokens appearing in the CLTK Latin Library corpus, 225612 tokens appear once.


In [7]:
# Create subcorpora of LL corpus

# Cicero files/tokens
cic_files = [file for file in latinlibrary.fileids() if 'cicero/' in file]
cic_docs = [truncate_text(preprocess(latinlibrary.raw(file))) for file in cic_files]
cic_tokens = [WordPunctTokenizer().tokenize(doc) for doc in cic_docs]
cic_tokens = [item for sublist in cic_tokens for item in sublist] #flatten

# Biblia Sacra files/tokens
bib_files = [file for file in latinlibrary.fileids() if 'bible/' in file]
bib_docs = [truncate_text(preprocess(latinlibrary.raw(file))) for file in bib_files]
bib_tokens = [WordPunctTokenizer().tokenize(doc) for doc in bib_docs]
bib_tokens = [item for sublist in bib_tokens for item in sublist] #flatten

# Roman Legal Texts files/tokens
ius_files = [file for file in latinlibrary.fileids() if 'justinian' in file 
                     or 'gaius' in file 
                     or 'theod' in file]
ius_docs = [truncate_text(preprocess(latinlibrary.raw(file))) for file in ius_files]
ius_tokens = [WordPunctTokenizer().tokenize(doc) for doc in ius_docs]
ius_tokens = [item for sublist in ius_tokens for item in sublist] #flatten

# Build a table to store the data produced above that can be imported to Pandas
colls = ['Latin Library', 'Cicero', 'Biblia Sacra', 'Ius Romanum']
colls_abbrev = ['LL', 'LL-Cic', 'LL-Bib', 'LL-Ius']
colls_docs = [ll_docs, cic_docs, bib_docs, ius_docs]
colls_tokens = [ll_tokens, cic_tokens, bib_tokens, ius_tokens]
colls_file_counts = [len(docs) for docs in colls_docs]
colls_token_counts = [len(tokens) for tokens in colls_tokens]
colls_unique_counts = [len(set(tokens)) for tokens in colls_tokens]
colls_single_counts = [len([k for k, v in Counter(tokens).items() if v == 1 ]) for tokens in colls_tokens]

## Table 1

In [8]:
# Make Pandas table

data = {'Collections': colls_abbrev, 'Description': colls, 'Files': colls_file_counts, 'Tokens': colls_token_counts, 'Unique Tokens': colls_unique_counts, 'Single Tokens': colls_single_counts}
df = pd.DataFrame.from_dict(data)

df.style.set_table_styles([
    {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}
])

Unnamed: 0,Collections,Description,Files,Tokens,Unique Tokens,Single Tokens
0,LL,Latin Library,2008,16287634,482172,225612
1,LL-Cic,Cicero,138,1361919,84663,42124
2,LL-Bib,Biblia Sacra,77,684833,50854,24544
3,LL-Ius,Ius Romanum,88,2360411,71231,29197


## Table 2

In [9]:
# Create stoplist instance

c = CorpusStoplist('latin')

In [36]:
# Get frequency stops for corpora

ll_freq_stops = c.build_stoplist(ll_docs, size=25, basis='frequency', inc_values=True, sort_words=False)
cic_freq_stops = c.build_stoplist(cic_docs, size=25, basis='frequency', inc_values=True, sort_words=False)
bib_freq_stops = c.build_stoplist(bib_docs, size=25, basis='frequency', inc_values=True, sort_words=False)
ius_freq_stops = c.build_stoplist(ius_docs, size=25, basis='frequency', inc_values=True, sort_words=False)

In [37]:
# Print table

data = {
    'LL Words': list(zip(*ll_freq_stops))[0],
    'LL Freq.': list(zip(*ll_freq_stops))[1],
    'LL-Cic Words': list(zip(*cic_freq_stops))[0],
    'LL-Cic Freq.': list(zip(*cic_freq_stops))[1],
    'LL-Bib Words': list(zip(*bib_freq_stops))[0],
    'LL-Bib Freq.': list(zip(*bib_freq_stops))[1],
    'LL-Ius Words': list(zip(*ius_freq_stops))[0],
    'LL-Ius Freq.': list(zip(*ius_freq_stops))[1],
}

df = pd.DataFrame.from_dict(data)
df.index += 1
df

Unnamed: 0,LL Words,LL Freq.,LL-Cic Words,LL-Cic Freq.,LL-Bib Words,LL-Bib Freq.,LL-Ius Words,LL-Ius Freq.
1,et,438829,et,25851,et,52695,et,46861
2,in,268237,in,22593,in,22756,si,31844
3,est,164895,non,17168,est,8822,non,27746
4,non,163873,est,14688,ad,7729,in,27310
5,ad,131032,ut,13974,non,7642,ad,25548
6,ut,117020,cum,11329,qui,6988,dig,24494
7,quod,102722,quod,10453,eius,5244,est,20956
8,cum,99167,ad,9735,autem,5174,uel,16173
9,si,92659,qui,9317,de,4969,qui,14174
10,qui,91813,esse,9113,ut,4714,ut,13553


## Table 3

In [38]:
# Get stoplists for different bases

ll_mean_stops = c.build_stoplist(ll_docs, size=100, basis='mean', inc_values=True, sort_words=False)
ll_variance_stops = c.build_stoplist(ll_docs, size=100, basis='variance', inc_values=True, sort_words=False)
ll_entropy_stops = c.build_stoplist(ll_docs, size=100, basis='entropy', inc_values=True, sort_words=False)
ll_zou_stops = c.build_stoplist(ll_docs, size=25, basis='zou', inc_values=True, sort_words=False)

In [39]:
# Get relevant figures for 'zou' derived list

ll_zou_words = list(zip(*ll_zou_stops))[0]
ll_zou_mean = [round(dict(ll_mean_stops)[word], 4) for word in ll_zou_words]
ll_zou_variance = [round(dict(ll_variance_stops)[word], 6) for word in ll_zou_words]
ll_zou_entropy = [round(dict(ll_entropy_stops)[word], 4) for word in ll_zou_words]

In [40]:
# Print table

data = {
    'LL \'Zou\' Stopwords': ll_zou_words,
    'Mean Prob.': ll_zou_mean,
    'Var. Prob.': ll_zou_variance,
    'Entropy': ll_zou_entropy,
}

df = pd.DataFrame.from_dict(data)
df.index += 1
df

Unnamed: 0,LL 'Zou' Stopwords,Mean Prob.,Var. Prob.,Entropy
1,et,0.0324,0.001366,93.1182
2,in,0.0201,0.000503,66.4403
3,est,0.013,0.000261,46.306
4,non,0.0118,0.000196,43.3043
5,ad,0.0087,0.000112,34.3664
6,ut,0.0086,0.000104,34.3602
7,cum,0.0081,0.000107,32.1202
8,quod,0.0077,0.000111,30.3615
9,qui,0.0066,6.8e-05,27.5285
10,si,0.0056,6e-05,23.4608


## Appendix A: Comparison of Different Latin Stoplists

In [15]:
# Show Perseus stoplist

print(PERSEUS_STOPS)
print(f'The Perseus stoplist has {len(PERSEUS_STOPS)} words.')

['ab', 'ac', 'ad', 'adhic', 'aliqui', 'aliquis', 'an', 'ante', 'apud', 'at', 'atque', 'aut', 'autem', 'cum', 'cur', 'de', 'deinde', 'dum', 'ego', 'enim', 'ergo', 'es', 'est', 'et', 'etiam', 'etsi', 'ex', 'fio', 'haud', 'hic', 'iam', 'idem', 'igitur', 'ille', 'in', 'infra', 'inter', 'interim', 'ipse', 'is', 'ita', 'magis', 'modo', 'mox', 'nam', 'ne', 'nec', 'necque', 'neque', 'nisi', 'non', 'nos', 'o', 'ob', 'per', 'possum', 'post', 'pro', 'quae', 'quam', 'quare', 'qui', 'quia', 'quicumque', 'quidem', 'quilibet', 'quis', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quo', 'quoniam', 'sed', 'si', 'sic', 'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero', 'unus', 'ut']
The Perseus stoplist has 92 words.


In [16]:
# Show 100-word LL 'Zou' stoplist

ll_stops = c.build_stoplist(ll_docs, size=100, basis='zou')
print(ll_stops)

['ab', 'ac', 'ad', 'ante', 'apud', 'atque', 'aut', 'autem', 'causa', 'cui', 'cuius', 'cum', 'de', 'dei', 'deus', 'dum', 'ea', 'ego', 'ei', 'eius', 'enim', 'eo', 'erat', 'ergo', 'esse', 'esset', 'est', 'et', 'etiam', 'eum', 'ex', 'fuit', 'haec', 'hic', 'his', 'hoc', 'iam', 'id', 'igitur', 'illa', 'ille', 'in', 'inter', 'ipse', 'ita', 'me', 'mihi', 'modo', 'nam', 'ne', 'nec', 'neque', 'nihil', 'nisi', 'nobis', 'non', 'nos', 'nunc', 'omnes', 'omnia', 'omnibus', 'per', 'post', 'potest', 'pro', 'qua', 'quae', 'quam', 'quem', 'qui', 'quia', 'quibus', 'quid', 'quidem', 'quis', 'quo', 'quod', 'quoque', 'res', 'se', 'secundum', 'sed', 'si', 'sibi', 'sic', 'sicut', 'sine', 'sit', 'sub', 'sunt', 'tamen', 'te', 'tibi', 'tu', 'tunc', 'ubi', 'uel', 'uero', 'uos', 'ut']


In [17]:
# Show intersection of Perseus & LL

perseus_intersection = set(PERSEUS_STOPS).intersection(set(ll_stops))
print(f'There are {len(perseus_intersection)} words shared by the two lists. This amounts to {(len(perseus_intersection)/len(PERSEUS_STOPS))*100}% of the Perseus list.')
print(sorted(perseus_intersection))

There are 54 words shared by the two lists. This amounts to 58.69565217391305% of the Perseus list.
['ab', 'ac', 'ad', 'ante', 'apud', 'atque', 'aut', 'autem', 'cum', 'de', 'dum', 'ego', 'enim', 'ergo', 'est', 'et', 'etiam', 'ex', 'hic', 'iam', 'igitur', 'ille', 'in', 'inter', 'ipse', 'ita', 'modo', 'nam', 'ne', 'nec', 'neque', 'nisi', 'non', 'nos', 'per', 'post', 'pro', 'quae', 'quam', 'qui', 'quia', 'quidem', 'quis', 'quo', 'sed', 'si', 'sic', 'sub', 'tamen', 'tu', 'ubi', 'uel', 'uero', 'ut']


In [18]:
# Show difference of Perseus & LL

perseus_difference = set(ll_stops).difference(set(PERSEUS_STOPS))
print(f'There are {len(perseus_intersection)} words from the LL list that are not found on the Perseus list.')
print(sorted(perseus_difference))

There are 54 words from the LL list that are not found on the Perseus list.
['causa', 'cui', 'cuius', 'dei', 'deus', 'ea', 'ei', 'eius', 'eo', 'erat', 'esse', 'esset', 'eum', 'fuit', 'haec', 'his', 'hoc', 'id', 'illa', 'me', 'mihi', 'nihil', 'nobis', 'nunc', 'omnes', 'omnia', 'omnibus', 'potest', 'qua', 'quem', 'quibus', 'quid', 'quod', 'quoque', 'res', 'se', 'secundum', 'sibi', 'sicut', 'sine', 'sit', 'sunt', 'te', 'tibi', 'tunc', 'uos']


In [19]:
# Show difference of Perseus & LL

perseus_difference = set(PERSEUS_STOPS).difference(set(ll_stops))
print(f'There are {len(perseus_intersection)} words from the Perseus list that are not found on the LL list.')
print(sorted(perseus_difference))

There are 54 words from the Perseus list that are not found on the LL list.
['adhic', 'aliqui', 'aliquis', 'an', 'at', 'cur', 'deinde', 'es', 'etsi', 'fio', 'haud', 'idem', 'infra', 'interim', 'is', 'magis', 'mox', 'necque', 'o', 'ob', 'possum', 'quare', 'quicumque', 'quilibet', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quoniam', 'sive', 'sui', 'sum', 'super', 'suus', 'tam', 'trans', 'tum', 'unus']


In [21]:
# Show stopwords-json list

json_stops = ["a","ab","ac","ad","at","atque","aut","autem","cum","de","dum","e","erant","erat","est","et","etiam","ex","haec","hic","hoc","in","ita","me","nec","neque","non","per","qua","quae","quam","qui","quibus","quidem","quo","quod","re","rebus","rem","res","sed","si","sic","sunt","tamen","tandem","te","ut","vel"]

print(json_stops)
print(f'The stopwords-json stoplist has {len(json_stops)} words.')

['a', 'ab', 'ac', 'ad', 'at', 'atque', 'aut', 'autem', 'cum', 'de', 'dum', 'e', 'erant', 'erat', 'est', 'et', 'etiam', 'ex', 'haec', 'hic', 'hoc', 'in', 'ita', 'me', 'nec', 'neque', 'non', 'per', 'qua', 'quae', 'quam', 'qui', 'quibus', 'quidem', 'quo', 'quod', 're', 'rebus', 'rem', 'res', 'sed', 'si', 'sic', 'sunt', 'tamen', 'tandem', 'te', 'ut', 'vel']
The stopwords-json stoplist has 49 words.


In [22]:
# Show intersection of stopwords-json & LL

json_intersection = set(json_stops).intersection(set(ll_stops))
print(f'There are {len(json_intersection)} words shared by the two lists. This amounts to {(len(json_intersection)/len(json_stops))*100}% of the stopwords-json list.')
print(json_intersection)

There are 40 words shared by the two lists. This amounts to 81.63265306122449% of the stopwords-json list.
{'autem', 'quidem', 'etiam', 'cum', 'haec', 'nec', 'sic', 'sunt', 'ad', 'te', 'quae', 'quod', 'de', 'tamen', 'sed', 'ita', 'me', 'ex', 'qua', 'hic', 'dum', 'et', 'quibus', 'est', 'quo', 'in', 'aut', 'neque', 'quam', 'erat', 'qui', 'atque', 'ab', 'ut', 'non', 'si', 'ac', 'per', 'res', 'hoc'}


In [23]:
# Show difference of stopwords-json & LL

json_difference = set(ll_stops).difference(set(json_stops))
print(f'There are {len(json_intersection)} words from the LL list that are not found on the stopwords-json list.')
print(json_difference)

There are 40 words from the LL list that are not found on the stopwords-json list.
{'eius', 'pro', 'ea', 'igitur', 'quem', 'fuit', 'tibi', 'deus', 'ergo', 'eo', 'apud', 'omnia', 'secundum', 'modo', 'sub', 'ille', 'sicut', 'ne', 'tunc', 'illa', 'nos', 'cui', 'omnes', 'ipse', 'nihil', 'causa', 'esset', 'uero', 'ego', 'uel', 'uos', 'se', 'quoque', 'cuius', 'nobis', 'nunc', 'sit', 'potest', 'enim', 'ante', 'eum', 'quia', 'esse', 'omnibus', 'inter', 'sibi', 'nam', 'id', 'tu', 'nisi', 'mihi', 'ei', 'his', 'post', 'quis', 'dei', 'ubi', 'sine', 'quid', 'iam'}


In [24]:
# Show difference of stopwords-json & LL

json_difference = set(json_stops).difference(set(ll_stops))
print(f'There are {len(json_intersection)} words from the stopwords-json list that are not found on the LL list.')
print(json_difference)

There are 40 words from the stopwords-json list that are not found on the LL list.
{'rebus', 're', 'tandem', 'a', 'erant', 'vel', 'e', 'rem', 'at'}


In [27]:
with open('../data/serial/voyant.p', 'rb') as f:
    voyant_stops = pickle.load(f)

In [28]:
print(f'The Voyant Tools stoplist has {len(voyant_stops)} words.')

The Voyant Tools stoplist has 4015 words.


In [29]:
# Show intersection of Voyant Tools & LL

voyant_intersection = set(voyant_stops).intersection(set(ll_stops))
print(len(voyant_intersection))
print(voyant_intersection)

98
{'eius', 'quidem', 'quem', 'fuit', 'tibi', 'haec', 'sunt', 'te', 'omnia', 'quod', 'tamen', 'ne', 'sed', 'tunc', 'ita', 'nos', 'omnes', 'me', 'et', 'uero', 'se', 'potest', 'omnibus', 'sibi', 'nam', 'nisi', 'ab', 'ut', 'ubi', 'apud', 'atque', 'iam', 'autem', 'cui', 'quibus', 'in', 'quid', 'aut', 'erat', 'tu', 'ei', 'his', 'post', 'sine', 'ac', 'per', 'pro', 'eo', 'sic', 'secundum', 'modo', 'illa', 'qua', 'nihil', 'causa', 'esset', 'dum', 'ego', 'quoque', 'ante', 'id', 'quam', 'qui', 'mihi', 'ea', 'etiam', 'igitur', 'cum', 'ergo', 'nec', 'ad', 'quae', 'de', 'sub', 'ille', 'sicut', 'ex', 'ipse', 'hic', 'uel', 'uos', 'est', 'cuius', 'nobis', 'quo', 'nunc', 'sit', 'enim', 'eum', 'quia', 'esse', 'inter', 'neque', 'quis', 'non', 'si', 'res', 'hoc'}


In [30]:
# Show difference of Voyant Tools & LL

voyant_difference = set(ll_stops).difference(set(voyant_stops))
print(len(voyant_difference))
print(voyant_difference)

2
{'deus', 'dei'}


In [31]:
# Show difference of Voyant Tools & LL (selection)

voyant_difference = set(voyant_stops).difference(set(ll_stops))
print(len(voyant_difference))
print(sorted(voyant_difference)[:100])
print(sorted(voyant_difference)[-100:])

3848
['', '!', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '0', '1', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86']
['visust', 'visuve', 'vit', 'vix', 'vobis', 'vobiscum', 'vobismet', 'vobisne', 'vobisque', 'vol', 'von', 'vop', 'vos', 'vosmet', 'vosne', 'vosque', 'voster', 'vostra', 'vostrae', 'vostraeque', 'vostram', 'vostraque', 'vostrarum', 'vostras', 'vostrast', 'vostri', 'vostris', 'vostrist', 'vostro', 'vostrorum', 'vostros', 'vostrosque', 'vostrost', 'vostrum', 'vostrumque', 'vostrumst', 'vulg', 'vv', 'w', 'x', 'xc', 'x

# Appendix B: 100-Word Stoplists for various Latin corpora

In [32]:
# LL 'zou' stoplist

ll_stops = c.build_stoplist(ll_docs, size=100, basis='zou')
print(ll_stops)

['ab', 'ac', 'ad', 'ante', 'apud', 'atque', 'aut', 'autem', 'causa', 'cui', 'cuius', 'cum', 'de', 'dei', 'deus', 'dum', 'ea', 'ego', 'ei', 'eius', 'enim', 'eo', 'erat', 'ergo', 'esse', 'esset', 'est', 'et', 'etiam', 'eum', 'ex', 'fuit', 'haec', 'hic', 'his', 'hoc', 'iam', 'id', 'igitur', 'illa', 'ille', 'in', 'inter', 'ipse', 'ita', 'me', 'mihi', 'modo', 'nam', 'ne', 'nec', 'neque', 'nihil', 'nisi', 'nobis', 'non', 'nos', 'nunc', 'omnes', 'omnia', 'omnibus', 'per', 'post', 'potest', 'pro', 'qua', 'quae', 'quam', 'quem', 'qui', 'quia', 'quibus', 'quid', 'quidem', 'quis', 'quo', 'quod', 'quoque', 'res', 'se', 'secundum', 'sed', 'si', 'sibi', 'sic', 'sicut', 'sine', 'sit', 'sub', 'sunt', 'tamen', 'te', 'tibi', 'tu', 'tunc', 'ubi', 'uel', 'uero', 'uos', 'ut']


In [33]:
# LL-Cic 'zou' stoplist

ll_cic_stops = c.build_stoplist(cic_docs, size=100, basis='zou')
print(ll_cic_stops)

['ab', 'ac', 'ad', 'an', 'ante', 'apud', 'atque', 'aut', 'autem', 'causa', 'cum', 'de', 'ea', 'ego', 'eius', 'enim', 'eo', 'erat', 'esse', 'esset', 'est', 'et', 'etiam', 'eum', 'ex', 'fuit', 'haec', 'hic', 'hoc', 'iam', 'id', 'igitur', 'iis', 'illa', 'ille', 'illud', 'in', 'ipse', 'is', 'ita', 'itaque', 'me', 'mihi', 'modo', 'nam', 'ne', 'nec', 'neque', 'nihil', 'nisi', 'nobis', 'non', 'nos', 'nunc', 'omnes', 'omnia', 'omnibus', 'omnium', 'per', 'potest', 'pro', 'publica', 'publicae', 'qua', 'quae', 'quam', 'quasi', 'quem', 'qui', 'quibus', 'quid', 'quidem', 'quis', 'quo', 'quod', 'quos', 're', 'rebus', 'rei', 'rem', 'rerum', 'res', 'se', 'sed', 'senatus', 'si', 'sic', 'sine', 'sit', 'sunt', 'tam', 'tamen', 'te', 'tibi', 'tu', 'tum', 'uel', 'uero', 'uos', 'ut']


In [34]:
# LL-Bib 'zou' stoplist

ll_bib_stops = c.build_stoplist(bib_docs, size=100, basis='zou')
print(ll_bib_stops)

['ab', 'ad', 'ait', 'aut', 'autem', 'caput', 'christi', 'cum', 'de', 'dei', 'deo', 'deus', 'dicit', 'die', 'dixit', 'domini', 'domino', 'dominum', 'dominus', 'domus', 'ecce', 'ego', 'ei', 'eis', 'eius', 'enim', 'eo', 'eorum', 'eos', 'erat', 'ergo', 'erit', 'est', 'estis', 'et', 'eum', 'ex', 'filii', 'filius', 'fratres', 'haec', 'hierusalem', 'his', 'hoc', 'iesu', 'in', 'ipse', 'israhel', 'me', 'mea', 'meum', 'mihi', 'ne', 'nec', 'neque', 'nobis', 'non', 'nos', 'nunc', 'omnes', 'omnia', 'omnibus', 'omnis', 'per', 'pro', 'propter', 'quae', 'quam', 'quasi', 'quem', 'qui', 'quia', 'quid', 'quis', 'quod', 'quoniam', 'rex', 'se', 'secundum', 'sed', 'si', 'sicut', 'suis', 'sum', 'sunt', 'super', 'suum', 'te', 'terra', 'terram', 'tibi', 'tu', 'tua', 'tuam', 'tui', 'tuum', 'uobis', 'uos', 'usque', 'ut']


In [35]:
# LL-Ius 'zou' stoplist

ll_ius_stops = c.build_stoplist(ius_docs, size=100, basis='zou')
print(ll_ius_stops)

['aa', 'ab', 'ac', 'actio', 'actionem', 'ad', 'ait', 'an', 'aut', 'autem', 'causa', 'ci', 'conss', 'constantinopoli', 'contra', 'cth', 'cum', 'dat', 'de', 'debet', 'dig', 'ea', 'eam', 'ed', 'ei', 'eius', 'enim', 'eo', 'eorum', 'eos', 'erit', 'esse', 'esset', 'est', 'et', 'etiam', 'eum', 'ex', 'fuerit', 'hereditatem', 'heres', 'his', 'hoc', 'id', 'idem', 'imperator', 'imperatores', 'in', 'is', 'ita', 'iulianus', 'iure', 'ius', 'kal', 'mihi', 'nam', 'ne', 'nec', 'neque', 'nihil', 'nisi', 'nomine', 'non', 'paulus', 'per', 'posse', 'possit', 'post', 'potest', 'pp', 'pr', 'pro', 'quae', 'quam', 'qui', 'quia', 'quibus', 'quid', 'quidem', 'quis', 'quo', 'quod', 'quoque', 'rei', 'rem', 'res', 'sab', 'se', 'sed', 'si', 'sibi', 'sine', 'sit', 'siue', 'sunt', 'tamen', 'uel', 'uero', 'ulpianus', 'ut']
