# Constructing Stoplists for Historical Languages
Code repository associated with the article "Constructing Stoplists for Historical Languages", written for *Digital Classics Online*.

## Setup

In [1]:
# Optional setup, left commented out: uncomment these lines to import the
# 'latin_text_latin_library' corpus through CLTK's CorpusImporter.
# NOTE(review): presumably this must have been run once before `latinlibrary`
# can be read in the cells below — confirm.
# from cltk.corpus.utils.importer import CorpusImporter
# corpus_importer = CorpusImporter('latin')
# corpus_importer.import_corpus('latin_text_latin_library')

In [2]:
# Imports

import pandas as pd

from collections import Counter


from nltk.tokenize import WordPunctTokenizer

from cltk.corpus.latin import latinlibrary
from cltk.stop.latin import CorpusStoplist
from cltk.stop.latin import PERSEUS_STOPS

from pprint import pprint

import pickle

In [3]:
# Function for preprocessing texts

def truncate_text(text, margin=500):
    """Drop ``margin`` characters from each end of ``text`` and cut at spaces.

    After the fixed-width trim, the result runs from the first space in the
    remaining text (inclusive, so the returned string starts with that space)
    up to the last space (exclusive). This removes the partial words left at
    both edges by the character-based trim.

    Parameters
    ----------
    text : str
        The text to truncate.
    margin : int, optional
        Number of characters removed from each end before snapping to spaces.
        Defaults to 500.

    Returns
    -------
    str
        The truncated text. The empty string is returned when ``text`` is not
        longer than ``2 * margin`` or when the trimmed text contains fewer
        than two spaces.

    Raises
    ------
    ValueError
        If ``margin`` is negative.
    """
    if margin < 0:
        raise ValueError('margin must be non-negative')

    # Use an explicit, non-negative end index: ``text[margin:-margin]`` would
    # be empty for margin == 0 because ``-0`` is just ``0``.
    temp = text[margin:max(margin, len(text) - margin)]

    # When no space exists both calls return -1 and the slice below is empty.
    start = temp.find(' ')
    end = temp.rfind(' ')
    return temp[start:end]

In [4]:
# Preprocess texts

import html
from cltk.stem.latin.j_v import JVReplacer

# Single JVReplacer instance, created once and reused by every preprocess() call.
replacer = JVReplacer()


def preprocess(text):
    """Return ``text`` with HTML entities decoded and then passed through ``replacer``.

    The two steps run in this order: ``html.unescape`` first, then
    ``replacer.replace`` on the unescaped text.
    """
    unescaped = html.unescape(text)  # Handle html entities
    return replacer.replace(unescaped)  # Normalize u/v & i/j

In [5]:
# Load CLTK Latin Library corpus; get size

ll_files = latinlibrary.fileids()

# Preprocess and truncate each raw file lazily; keep only results longer than 100 characters.
ll_docs_unfiltered = (truncate_text(preprocess(latinlibrary.raw(fileid))) for fileid in ll_files)
ll_docs = [doc for doc in ll_docs_unfiltered if len(doc) > 100]

# The reported size counts every file id, including files filtered out of ll_docs above.
ll_size = len(ll_files)

# CITED IN ARTICLE
print(f'There are {ll_size} files in the CLTK Latin Library corpus.')

There are 2152 files in the CLTK Latin Library corpus.


In [6]:
# Get tokens for Latin Library; get stats

# Build the tokenizer once instead of once per document, and flatten while
# tokenizing so no intermediate list-of-lists is kept in memory.
tokenizer = WordPunctTokenizer()
ll_tokens = [token for doc in ll_docs for token in tokenizer.tokenize(doc)]

# A single Counter gives both the number of distinct tokens (its length) and
# the number of tokens that occur exactly once.
ll_token_counts = Counter(ll_tokens)
ll_once_count = sum(1 for count in ll_token_counts.values() if count == 1)

# CITED IN ARTICLE

print(f'There are {len(ll_tokens)} tokens in the CLTK Latin Library corpus.')
print(f'There are {len(ll_token_counts)} unique tokens in the CLTK Latin Library corpus.')
print(f'Of the tokens appearing in the CLTK Latin Library corpus, {ll_once_count} tokens appear once.')

There are 16287634 tokens in the CLTK Latin Library corpus.
There are 482172 unique tokens in the CLTK Latin Library corpus.
Of the tokens appearing in the CLTK Latin Library corpus, 225612 tokens appear once.


In [7]:
# Create stoplist instance

# `c` is the object every later `c.build_stoplist(...)` call in this notebook uses.
c = CorpusStoplist('latin')

In [8]:
# Get frequency stops for corpora

# 25-entry list on the 'frequency' basis, with values included and word sorting disabled.
ll_freq_stops = c.build_stoplist(
    ll_docs,
    size=25,
    basis='frequency',
    inc_values=True,
    sort_words=False,
)

## Appendix A: Results for Different Stoplist Construction "Bases"

In [9]:
# Get stoplists for different bases

# Keyword arguments common to all four calls below.
shared_options = {'inc_values': True, 'sort_words': False}

# The three single-measure lists hold 500 entries each; the 'zou' list holds 100.
ll_mean_stops = c.build_stoplist(ll_docs, size=500, basis='mean', **shared_options)
ll_variance_stops = c.build_stoplist(ll_docs, size=500, basis='variance', **shared_options)
ll_entropy_stops = c.build_stoplist(ll_docs, size=500, basis='entropy', **shared_options)
ll_zou_stops = c.build_stoplist(ll_docs, size=100, basis='zou', **shared_options)

KeyboardInterrupt: 

In [None]:
# Get relevant figures for 'zou' derived list

# First element of every (word, value) pair in the 'zou' list.
ll_zou_words = list(zip(*ll_zou_stops))[0]

# Build each word -> value lookup once, rather than once per word inside the
# comprehensions below.
ll_mean_lookup = dict(ll_mean_stops)
ll_variance_lookup = dict(ll_variance_stops)
ll_entropy_lookup = dict(ll_entropy_stops)

# NOTE(review): each lookup raises KeyError if a 'zou' word is missing from the
# corresponding 500-entry list.
ll_zou_mean = [round(ll_mean_lookup[word], 4) for word in ll_zou_words]
ll_zou_variance = [round(ll_variance_lookup[word], 6) for word in ll_zou_words]
ll_zou_entropy = [round(ll_entropy_lookup[word], 4) for word in ll_zou_words]

In [None]:
# Print table

# Column labels and their value sequences, paired in display order.
column_labels = ['LL \'Zou\' Stopwords', 'Mean Prob.', 'Var. Prob.', 'Entropy']
column_values = [ll_zou_words, ll_zou_mean, ll_zou_variance, ll_zou_entropy]
data = dict(zip(column_labels, column_values))

df = pd.DataFrame(data)
df.index = df.index + 1  # number the rows from 1 rather than 0
df

## Appendix B: Comparison of Different Latin Stoplists

In [None]:
# Show Perseus stoplist

# Print the imported PERSEUS_STOPS collection itself, then its length.
print(PERSEUS_STOPS)
print(f'The Perseus stoplist has {len(PERSEUS_STOPS)} words.')

In [None]:
# Show 100-word LL 'Zou' stoplist

# Built without inc_values, unlike ll_zou_stops; the comparison cells below wrap this in set().
ll_stops = c.build_stoplist(
    ll_docs,
    size=100,
    basis='zou',
)
print(ll_stops)

In [None]:
# Show intersection of Perseus & LL

# Words present in both lists; the percentage is relative to the Perseus list's length.
perseus_intersection = set(PERSEUS_STOPS) & set(ll_stops)
print(f'There are {len(perseus_intersection)} words shared by the two lists. This amounts to {(len(perseus_intersection)/len(PERSEUS_STOPS))*100}% of the Perseus list.')
print(sorted(perseus_intersection))

In [None]:
# Show difference of LL & Perseus (LL words missing from Perseus)

perseus_difference = set(ll_stops).difference(set(PERSEUS_STOPS))
# Report the size of the difference itself; the previous version printed the
# size of the intersection here.
print(f'There are {len(perseus_difference)} words from the LL list that are not found on the Perseus list.')
print(sorted(perseus_difference))

In [None]:
# Show difference of Perseus & LL (Perseus words missing from LL)

# NOTE: this rebinds `perseus_difference` to the opposite direction of the
# difference computed in the preceding cell.
perseus_difference = set(PERSEUS_STOPS).difference(set(ll_stops))
# Report the size of the difference itself; the previous version printed the
# size of the intersection here.
print(f'There are {len(perseus_difference)} words from the Perseus list that are not found on the LL list.')
print(sorted(perseus_difference))

In [None]:
# Show stopwords-json list

# Hard-coded copy of the list, kept inline rather than loaded from a file.
# NOTE(review): the source and version of this list are not recorded in this cell — confirm.
json_stops = ["a","ab","ac","ad","at","atque","aut","autem","cum","de","dum","e","erant","erat","est","et","etiam","ex","haec","hic","hoc","in","ita","me","nec","neque","non","per","qua","quae","quam","qui","quibus","quidem","quo","quod","re","rebus","rem","res","sed","si","sic","sunt","tamen","tandem","te","ut","vel"]

print(json_stops)
print(f'The stopwords-json stoplist has {len(json_stops)} words.')

In [None]:
# Show intersection of stopwords-json & LL

json_intersection = set(json_stops).intersection(set(ll_stops))
print(f'There are {len(json_intersection)} words shared by the two lists. This amounts to {(len(json_intersection)/len(json_stops))*100}% of the stopwords-json list.')
# Sort before printing, as the Perseus comparison cells do: the iteration
# order of a set of strings is not stable between kernel sessions.
print(sorted(json_intersection))

In [None]:
# Show difference of LL & stopwords-json (LL words missing from stopwords-json)

json_difference = set(ll_stops).difference(set(json_stops))
# Report the size of the difference itself; the previous version printed the
# size of the intersection here.
print(f'There are {len(json_difference)} words from the LL list that are not found on the stopwords-json list.')
# Sorted so the printed order is stable between kernel sessions.
print(sorted(json_difference))

In [None]:
# Show difference of stopwords-json & LL (stopwords-json words missing from LL)

# NOTE: this rebinds `json_difference` to the opposite direction of the
# difference computed in the preceding cell.
json_difference = set(json_stops).difference(set(ll_stops))
# Report the size of the difference itself; the previous version printed the
# size of the intersection here.
print(f'There are {len(json_difference)} words from the stopwords-json list that are not found on the LL list.')
# Sorted so the printed order is stable between kernel sessions.
print(sorted(json_difference))

In [None]:
# Load the pickled Voyant Tools stoplist from a path relative to the notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if its origin is trusted.
with open('../data/serial/voyant.p', 'rb') as f:
    voyant_stops = pickle.load(f)

In [None]:
# Show size of the Voyant Tools stoplist (only its length is printed, not its contents)
print(f'The Voyant Tools stoplist has {len(voyant_stops)} words.')

In [None]:
# Show intersection of Voyant Tools & LL

voyant_intersection = set(voyant_stops).intersection(set(ll_stops))
print(len(voyant_intersection))
# Sorted, like the Voyant selection cell further down, so the printed order is
# stable between kernel sessions.
print(sorted(voyant_intersection))

In [None]:
# Show difference of LL & Voyant Tools (LL words missing from Voyant Tools)

voyant_difference = set(ll_stops).difference(set(voyant_stops))
print(len(voyant_difference))
# Sorted, like the Voyant selection cell further down, so the printed order is
# stable between kernel sessions.
print(sorted(voyant_difference))

In [None]:
# Show difference of Voyant Tools & LL (selection)

# Voyant Tools words that are absent from the LL list.
voyant_difference = set(voyant_stops) - set(ll_stops)
voyant_difference_sorted = sorted(voyant_difference)

print(len(voyant_difference))
# Only the first and last 100 entries of the sorted difference are shown.
print(voyant_difference_sorted[:100])
print(voyant_difference_sorted[-100:])

## Appendix C: 100-Word Stoplists for Various Latin Corpora

In [None]:
# Create subcorpora of LL corpus

def _build_subcorpus(fileids):
    """Return ``(docs, tokens)`` for the given Latin Library file ids.

    ``docs`` holds one preprocessed, truncated text per file id, in order.
    ``tokens`` is the flat list of WordPunctTokenizer tokens over all docs.
    Unlike ``ll_docs``, the docs are not filtered by length.
    """
    docs = [truncate_text(preprocess(latinlibrary.raw(fileid))) for fileid in fileids]
    tokenizer = WordPunctTokenizer()  # one instance shared by every document
    tokens = [token for doc in docs for token in tokenizer.tokenize(doc)]
    return docs, tokens


# Fetch the file id list once and filter it three ways.
all_ll_files = latinlibrary.fileids()

# Cicero files/tokens
cic_files = [fileid for fileid in all_ll_files if 'cicero/' in fileid]
cic_docs, cic_tokens = _build_subcorpus(cic_files)

# Biblia Sacra files/tokens
bib_files = [fileid for fileid in all_ll_files if 'bible/' in fileid]
bib_docs, bib_tokens = _build_subcorpus(bib_files)

# Roman Legal Texts files/tokens
ius_markers = ('justinian', 'gaius', 'theod')
ius_files = [fileid for fileid in all_ll_files
             if any(marker in fileid for marker in ius_markers)]
ius_docs, ius_tokens = _build_subcorpus(ius_files)

In [None]:
# LL 'zou' stoplist

# NOTE(review): this repeats the identical build_stoplist call already made for
# `ll_stops` in Appendix B, so the list is computed a second time here.
ll_stops = c.build_stoplist(ll_docs, size=100, basis='zou')
print(ll_stops)

In [None]:
# LL-Cic 'zou' stoplist

# Same size and basis as the full-corpus list, built from the Cicero documents only.
ll_cic_stops = c.build_stoplist(
    cic_docs,
    size=100,
    basis='zou',
)
print(ll_cic_stops)

In [None]:
# LL-Bib 'zou' stoplist

# Same size and basis as the full-corpus list, built from the Biblia Sacra documents only.
ll_bib_stops = c.build_stoplist(
    bib_docs,
    size=100,
    basis='zou',
)
print(ll_bib_stops)

In [None]:
# LL-Ius 'zou' stoplist

# Same size and basis as the full-corpus list, built from the Roman legal documents only.
ll_ius_stops = c.build_stoplist(
    ius_docs,
    size=100,
    basis='zou',
)
print(ll_ius_stops)