# Lemma Comparisons for Gratian Texts

### Imports etc.

In [1]:
# Imports

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.tokenize.latin.word import WordTokenizer

In [2]:
# Set up tools

# Latin word tokenizer from CLTK; later cells call tokenizer.tokenize() on the preprocessed texts
tokenizer = WordTokenizer()
# CLTK backoff lemmatizer for Latin; later cells unpack its lemmatize() output as pairs whose second item is the lemma
lemmatizer = BackoffLatinLemmatizer()

In [3]:
# Helper function for preprocessing

import html, re
from cltk.stem.latin.j_v import JVReplacer

# Shared JVReplacer instance; preprocess() calls its replace() method to normalize u/v and i/j spellings
replacer = JVReplacer()

# Characters that preprocess() blanks out: ASCII punctuation (including the
# backslash), guillemets, and digits. Built once instead of on every call.
_STRIP_CHARS = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»" + "0123456789"
_STRIP_TABLE = str.maketrans(_STRIP_CHARS, " " * len(_STRIP_CHARS))


def preprocess(text, lower=True, remove_list=None):
    """Normalize raw text into punctuation-free, space-separated words.

    Steps, in order: delete every match of the regexes in ``remove_list``;
    decode HTML entities; turn NUL/SUB control characters and non-breaking
    spaces into plain spaces; optionally lowercase; normalize u/v and i/j
    through the module-level ``replacer``; blank out punctuation and digits;
    collapse runs of spaces; and squeeze every whitespace run that contains a
    line break down to a single newline.

    Args:
        text (str): Raw text to clean.
        lower (bool): If true, lowercase the text before u/v and i/j
            normalization.
        remove_list (iterable of str or None): Regex patterns whose matches
            are deleted before any other step. ``None`` (the default) means
            no patterns.

    Returns:
        str: The cleaned text with leading and trailing whitespace stripped.
    """
    for pattern in (remove_list or ()):
        text = re.sub(pattern, '', text)

    text = html.unescape(text)  # Handle html entities

    # Only NUL (0x00) and SUB (0x1a) are treated as stray control characters;
    # newlines and tabs are deliberately left for the whitespace steps below.
    text = re.sub(r'[\x00\x1a]', ' ', text)

    # html.unescape() has already decoded "&nbsp;" into U+00A0, so map that
    # character to a space too; the literal form still shows up when the
    # corpus was double-escaped ("&amp;nbsp;").
    text = re.sub(r'&nbsp;?|\xa0', ' ', text)

    if lower:
        text = text.lower()

    text = replacer.replace(text)  # Normalize u/v & i/j

    # Punctuation and digits become spaces (not deleted) so adjacent words stay apart
    text = text.translate(_STRIP_TABLE)

    text = re.sub(r' +', ' ', text)  # Collapse runs of spaces
    # Collapse blank lines and trim whitespace on either side of a line break
    text = re.sub(r'\s*\n\s*', '\n', text)

    return text.strip()

In [4]:
# Load texts into memory; show sample

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (assumes the corpus files are UTF-8 — confirm).
with open('./Gratian1.txt', 'r', encoding='utf-8') as f:
    text_1 = preprocess(f.read())
with open('./Gratian2.txt', 'r', encoding='utf-8') as f:
    text_2 = preprocess(f.read())

# First 100 characters of each cleaned text as a sanity check
print(text_1[:100],'\n')
print(text_2[:100])

humanum genus duobus regitur naturali uidelicet iure et moribus ius naturae est quod in lege et euan 

quia uero de naturae superfluitate sermo cepit haberi queritur an post illusionem que per somnium ac


### Analyze texts

In [5]:
# Tokenize both preprocessed texts and report total vs. distinct token counts

tokens_1 = tokenizer.tokenize(text_1)
tokens_2 = tokenizer.tokenize(text_2)

# One print call; the empty string yields the blank separator line
print(
    f'Number of tokens in text 1: {len(tokens_1)}',
    f'Number of tokens in text 2: {len(tokens_2)}',
    '',
    f'Number of unique tokens in text 1: {len(set(tokens_1))}',
    f'Number of unique tokens in text 2: {len(set(tokens_2))}',
    sep='\n',
)

Number of tokens in text 1: 57219
Number of tokens in text 2: 14389

Number of unique tokens in text 1: 10993
Number of unique tokens in text 2: 4860


In [6]:
# Lemmatize both token lists and compare the resulting lemma vocabularies

lemmas_pairs_1 = lemmatizer.lemmatize(tokens_1)
lemmas_pairs_2 = lemmatizer.lemmatize(tokens_2)

# The lemmatizer output is unpacked as 2-item pairs; keep only the second item (the lemma)
lemmas_1 = [lem for (_tok, lem) in lemmas_pairs_1]
lemmas_2 = [lem for (_tok, lem) in lemmas_pairs_2]
lemmas_set_1, lemmas_set_2 = set(lemmas_1), set(lemmas_2)

# Vocabulary comparisons expressed with set operators
full_1_2 = lemmas_set_1 | lemmas_set_2    # lemmas in either text
shared_1_2 = lemmas_set_1 & lemmas_set_2  # lemmas in both texts
diff_1_2 = lemmas_set_1 - lemmas_set_2    # lemmas only in text 1
diff_2_1 = lemmas_set_2 - lemmas_set_1    # lemmas only in text 2

# One print call; each empty string yields a blank separator line
print(
    f'Number of unique lemmas in text 1: {len(lemmas_set_1)}',
    f'Number of unique lemmas in text 2: {len(lemmas_set_2)}',
    '',
    f'Number of unique lemmas in either group 1 or group 2: {len(full_1_2)}',
    '',
    f'Number of lemmas in both group 1 and group 2: {len(shared_1_2)}',
    '',
    f'Number of lemmas in group 1 not in group 2: {len(diff_1_2)}',
    f'Number of lemmas in group 2 not in group 1: {len(diff_2_1)}',
    sep='\n',
)

Number of unique lemmas in text 1: 5595
Number of unique lemmas in text 2: 2967

Number of unique lemmas in either group 1 or group 2: 6595

Number of lemmas in both group 1 and group 2: 1967

Number of lemmas in group 1 not in group 2: 3628
Number of lemmas in group 2 not in group 1: 1000


In [7]:
# Possible ways forward?
#
# Idea: postprocess the lemmas (e.g. strip Morpheus artifacts) by running them
# through the same preprocess() used on the raw texts, then redo the comparison.

# Join the lemmas into one string, clean it, and split back on whitespace
joined_1 = " ".join(lem for (_tok, lem) in lemmas_pairs_1)
joined_2 = " ".join(lem for (_tok, lem) in lemmas_pairs_2)
lemmas_1 = preprocess(joined_1).split()
lemmas_2 = preprocess(joined_2).split()
lemmas_set_1, lemmas_set_2 = set(lemmas_1), set(lemmas_2)

# Vocabulary comparisons expressed with set operators
full_1_2 = lemmas_set_1 | lemmas_set_2    # lemmas in either text
shared_1_2 = lemmas_set_1 & lemmas_set_2  # lemmas in both texts
diff_1_2 = lemmas_set_1 - lemmas_set_2    # lemmas only in text 1
diff_2_1 = lemmas_set_2 - lemmas_set_1    # lemmas only in text 2

# One print call; each empty string yields a blank separator line
print(
    'After postprocessing...',
    f'Number of unique lemmas in text 1: {len(lemmas_set_1)}',
    f'Number of unique lemmas in text 2: {len(lemmas_set_2)}',
    '',
    f'Number of unique lemmas in either group 1 or group 2: {len(full_1_2)}',
    '',
    f'Number of lemmas in both group 1 and group 2: {len(shared_1_2)}',
    '',
    f'Number of lemmas in group 1 not in group 2: {len(diff_1_2)}',
    f'Number of lemmas in group 2 not in group 1: {len(diff_2_1)}',
    sep='\n',
)

After postprocessing...
Number of unique lemmas in text 1: 5420
Number of unique lemmas in text 2: 2882

Number of unique lemmas in either group 1 or group 2: 6379

Number of lemmas in both group 1 and group 2: 1923

Number of lemmas in group 1 not in group 2: 3497
Number of lemmas in group 2 not in group 1: 959


In [8]:
# Review lemmas and set up custom backoff chain

# Dump the combined lemma vocabulary, one lemma per line in sorted order, for
# manual review. The encoding is explicit so the file is written the same way
# on every platform; sorted() accepts the set directly, so no list() is needed.
with open('lemmas.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{lemma}\n' for lemma in sorted(full_1_2))