# Building a Text Analysis Pipeline for Ancient Languages with CLTK

This workshop will introduce the Classical Language Toolkit, an open-source Python framework dedicated to text analysis and natural language processing for historical languages. Participants will be taken through the basic stages of a text analysis pipeline, namely corpus loading, preprocessing, sentence and word tokenization, lemmatization, part-of-speech and morphological tagging, prosody identification, and more. Examples will be given primarily using Latin texts (and English translations), though some attention will be given to other languages supported by the project, including Ancient Greek and Akkadian. Participants are encouraged to bring their laptops; the demonstration can be followed along with interactively in a web browser without any installation or setup.

## Setup

In [None]:
from pprint import pprint #RUN CELLS BY PRESSING SHIFT & RETURN TOGETHER

## Working with CLTK Corpora

### Installing Corpora

In [None]:
## Set up corpora

## You will need the models/datasets that the new lemmatizer uses
## Note that this will generate an error if this or an older
## version of this corpus is already installed. If that happens,
## back up the old version, move or delete it, and reimport the corpus.

#from cltk.corpus.utils.importer import CorpusImporter

#corpus_importer = CorpusImporter('latin')
#corpus_importer.list_corpora

#corpus_importer.import_corpus('latin_models_cltk')

## We will be using the Latin Library corpus for today's workshop,
## so we will also need to import that as well.

#corpus_importer.import_corpus('latin_text_latin_library')

### Loading a CLTK corpus

In [None]:
# Load the Latin Library corpus through CLTK's corpus-reader factory

from cltk.corpus.readers import get_corpus_reader
ll = get_corpus_reader(corpus_name='latin_text_latin_library',
                       language='latin')

# The CLTK Latin Library corpus is a web-scraped collection of plaintext
# files from thelatinlibrary.com.

# Every file in the corpus is addressed by a file id; collect them all...

files = ll.fileids()

# ...and show only the first 50 by slicing the list with [:50].
print(files[:50])

In [None]:
# Stats

file_count = len(files)
print(f'There are {file_count} files in this corpus.')

In [None]:
# Keep only the file ids that contain "vergil"
virgil_files = [fileid for fileid in files if "vergil" in fileid]
print(virgil_files)

In [None]:
print(ll.raw(virgil_files[0])[101:616])

In [None]:
print(ll.raw(virgil_files[0])[:200])

In [None]:
print(ll.raw(virgil_files[0])[-200:])

In [None]:
# Count the words of the first Virgil file without building a full list
word_count = sum(1 for _ in ll.words(virgil_files[0]))
print(word_count)

In [None]:
# Count the sentences of the first Virgil file without building a full list
sent_count = sum(1 for _ in ll.sents(virgil_files[0]))
print(sent_count)

## Preprocessing

In [None]:
# Imports for preprocessing

import re # Regex module, useful for pattern matching
import html # Useful for handling entities

# Import/load a CLTK tool for normalizing i/j and u/v in Latin texts
from cltk.stem.latin.j_v import JVReplacer
replacer = JVReplacer()

In [None]:
def preprocess(text):
    """Clean a raw Latin Library text of Virgil ahead of tokenization.

    The steps run in this order: strip Latin Library paratexts, decode html
    entities, lowercase, normalize u/v and i/j with the module-level
    ``replacer``, blank out punctuation and digits, and normalize
    whitespace.

    Args:
        text: Raw text as a single string.

    Returns:
        The cleaned, lowercased text without leading or trailing whitespace.
        Runs of spaces/tabs are reduced to one space and runs of two or more
        newlines are reduced to exactly one blank line.
    """
    # Remove Latin Library-specific paratexts with regex. This has to happen
    # before lowercasing because the patterns are case-sensitive.
    remove_list = [
        r'\bP. VERGILI MARONIS AENEIDOS LIBER .+\b',
        r'Vergil: Aeneid .+',
        r'\bThe Latin Library\b',
        r'\bThe Classics Page\b',
        r'\bVergil\b',
    ]
    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    # Remove html entities and related html artifacts
    text = html.unescape(text)  # Handle html entities
    text = re.sub(r'&nbsp;?', ' ', text)  # &nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00', ' ', text)  # Another space problem?
    text = re.sub(r' \xa0 ', '    ', text)

    # Lowercase text
    text = text.lower()

    # Normalize text
    text = replacer.replace(text)  # Normalize u/v & i/j

    # Replace punctuation and digits with spaces in a single translate pass.
    # The backslash is written as an explicit '\\' so the literal contains no
    # invalid escape sequence; the resulting character set is unchanged.
    punctuation = '"#$%&\'()+,-/:;<=>@[\\]^_`{|}~.?!«»—'
    digits = '0123456789'
    translator = str.maketrans({key: " " for key in punctuation + digits})
    text = text.translate(translator)

    # Handle spacing. Tabs are folded in with spaces so that a tab next to a
    # space cannot leave a double space behind.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' \n', '\n', text)  # Drop trailing space on each line
    # Collapse any run of blank lines (even or odd length) to one blank line
    text = re.sub(r'\n{2,}', '\n\n', text)

    # strip() also removes the leading whitespace of the whole text
    return text.strip()

In [None]:
virgil_raw = ll.raw(virgil_files[0])
print(virgil_raw[:500])

In [None]:
virgil_pp = preprocess(ll.raw(virgil_files[0]))
print(virgil_pp[:500])

## Tokenization

In [None]:
# Set up CLTK Latin word tokenizer

from cltk.tokenize.word import WordTokenizer
word_tokenizer = WordTokenizer('latin')

In [None]:
# Here is what the same poem looks like as a list of tokens:
# run the Latin word tokenizer over the preprocessed text

tokens = word_tokenizer.tokenize(virgil_pp)

print(tokens[:125])  # Preview the first 125 tokens
print('\n')  # Blank lines between the preview and the count
print(f'There are {len(tokens)} tokens in Virgil 1.')

## Lemmatization

In [None]:
# Set up CLTK Latin backoff lemmatizer

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
lemmatizer = BackoffLatinLemmatizer()

In [None]:
lemmas = lemmatizer.lemmatize(tokens)

In [None]:
print(lemmas[:100])

In [None]:
from pprint import pprint

pprint(lemmas[:25])

## POS & Morphological Tagging

In [None]:
from cltk.tag.pos import POSTag

tagger = POSTag('latin')

In [None]:
# Re-join the tokens into one space-separated string for the tagger,
# then show the first ten items of its output
pos_tags = tagger.tag_ngram_123_backoff(' '.join(tokens))
pprint(pos_tags[:10])

In [None]:
# pos_tags_2 = tagger.tag_tnt(' '.join(tokens))
# pprint(pos_tags_2[:10])

## Prosody Tagging

In [None]:
# # import Levenshtein

# from cltk.prosody.latin.hexameter_scanner import HexameterScanner
# scanner = HexameterScanner()

In [None]:
# scansion = scanner.scan(virgil_pp[:44])
# print(scansion.syllables)
# print(scansion.scansion)

In [None]:
# from cltk.tokenize.line import LineTokenizer
# line_tokenizer = LineTokenizer('latin')

In [None]:
# lines = line_tokenizer.tokenize(virgil_pp)

In [None]:
# for line in lines[:10]:
#     scansion = scanner.scan(line)
#     print(scansion.scansion.replace(' ',''))

## Named Entity Recognition

In [None]:
from cltk.tag import ner

# A short sample sentence containing capitalized proper names
text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse magnitudinis."""

# Tag the sentence; output_type=list asks for the result as a list
pprint(ner.tag_ner('latin', input_text=text_str, output_type=list))

In [None]:
def preprocess_2(text):
    """Clean a raw Latin Library text of Virgil while keeping its case.

    The steps run in this order: strip Latin Library paratexts, decode html
    entities, normalize u/v and i/j with the module-level ``replacer``,
    blank out punctuation and digits, and normalize whitespace. No
    lowercasing is applied, so capitalized words keep their capitals.

    Args:
        text: Raw text as a single string.

    Returns:
        The cleaned text without leading or trailing whitespace. Runs of
        spaces/tabs are reduced to one space and runs of two or more
        newlines are reduced to exactly one blank line.
    """
    # Remove Latin Library-specific paratexts with regex
    remove_list = [
        r'\bP. VERGILI MARONIS AENEIDOS LIBER .+\b',
        r'Vergil: Aeneid .+',
        r'\bThe Latin Library\b',
        r'\bThe Classics Page\b',
        r'\bVergil\b',
    ]
    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    # Remove html entities and related html artifacts
    text = html.unescape(text)  # Handle html entities
    text = re.sub(r'&nbsp;?', ' ', text)  # &nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00', ' ', text)  # Another space problem?
    text = re.sub(r' \xa0 ', '    ', text)

    # Case is left untouched in this function (no lowercasing step).

    # Normalize text
    text = replacer.replace(text)  # Normalize u/v & i/j

    # Replace punctuation and digits with spaces in a single translate pass.
    # The backslash is written as an explicit '\\' so the literal contains no
    # invalid escape sequence; the resulting character set is unchanged.
    punctuation = '"#$%&\'()+,-/:;<=>@[\\]^_`{|}~.?!«»—'
    digits = '0123456789'
    translator = str.maketrans({key: " " for key in punctuation + digits})
    text = text.translate(translator)

    # Handle spacing. Tabs are folded in with spaces so that a tab next to a
    # space cannot leave a double space behind.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' \n', '\n', text)  # Drop trailing space on each line
    # Collapse any run of blank lines (even or odd length) to one blank line
    text = re.sub(r'\n{2,}', '\n\n', text)

    # strip() also removes the leading whitespace of the whole text
    return text.strip()

In [None]:
virgil_pp_2 = preprocess_2(ll.raw(virgil_files[0]))
print(virgil_pp_2[:100])

In [None]:
pprint(ner.tag_ner('latin', input_text=virgil_pp_2, output_type=list)[:25])

## Sample Text Analyses & Visualizations

### Counting Tokens

In [None]:
from collections import Counter

In [None]:
word_count = Counter(tokens)
print(word_count.most_common(25))

In [None]:
# Print a table of the 25 most common tokens: rank, token, raw count, the
# token's share of all tokens, and the cumulative share up to that row.
header_template = "{number:>5}  {word:<12}{count:<12}{percent:<12}{running:<12}"
row_template = "{number:>5}. {word:<12}{count:<12}{percent:<12}{running:<12}"
total_tokens = len(tokens)

print('Top 25 words in Virgil 1:\n')
print(header_template.format(number="", word="TOKEN", count="COUNT",
                             percent="TOKEN %", running="RUNNING %"))

running = 0  # Cumulative count of the tokens listed so far
for rank, (token, count) in enumerate(word_count.most_common(25), start=1):
    running += count
    token_share = str(round(count / total_tokens * 100, 2)) + "%"
    running_share = str(round(running / total_tokens * 100, 2)) + "%"
    print(row_template.format(number=rank, word=token, count=count,
                              percent=token_share, running=running_share))

### KWIC

In [None]:
from nltk import Text

In [None]:
virgil_Text = Text(tokens)
virgil_Text.concordance('Aeneas')

### Dispersion Plot

In [None]:
%matplotlib inline  
import matplotlib.pyplot as plt

In [None]:
virgil_Text

In [None]:
plt.figure(figsize=(20, 5))
virgil_Text.dispersion_plot(['aeneas', 'uenus', 'dido'])

In [None]:
# Show books sorting problem: list the file ids containing "vergil/aen"
# in the order the corpus returns them

aeneid_files = [fileid for fileid in files if "vergil/aen" in fileid]
print(aeneid_files)

In [None]:
# Fix books sorting problem: order the file ids by the integer value of
# the digits found in each one

aeneid_order = []
for item in aeneid_files:
    digits = " ".join(re.findall(r'\d+', item))
    aeneid_order.append(int(digits))

# Sort (number, file id) pairs, then keep only the file ids
ordered_pairs = sorted(zip(aeneid_order, aeneid_files))
aeneid_files = [fileid for _, fileid in ordered_pairs]
print(aeneid_files)

In [None]:
# Preprocess and tokenize all of the Aeneid files as one text
aeneid_raw = ll.raw(aeneid_files)
aeneid_pp = preprocess(aeneid_raw)
aeneid_tokens = word_tokenizer.tokenize(aeneid_pp)

# Keep the second element of each pair returned by the lemmatizer.
# Error in lemmatizer; need to fix: 'dis-do' is replaced with 'dido'.
aeneid_lemmas = [
    'dido' if lemma == 'dis-do' else lemma
    for _, lemma in lemmatizer.lemmatize(aeneid_tokens)
]

In [None]:
# Dispersion plot of entire Aeneid

aeneid_Text = Text(aeneid_tokens)
plt.figure(figsize=(20, 5))
aeneid_Text.dispersion_plot(['aeneas', 'uenus', 'dido'])

In [None]:
# Lemmatized dispersion plot

aeneid_Text = Text(aeneid_lemmas)
plt.figure(figsize=(20, 5))
aeneid_Text.dispersion_plot(['aeneas', 'uenus', 'dido'])

### Graphed Frequency Distribution

In [None]:
from nltk.probability import FreqDist

fdist = FreqDist(virgil_Text)

plt.figure(figsize=(20, 10))
fdist.plot(50, cumulative=True)

## Working with other CLTK Languages

### Working with Ancient Greek in CLTK

In [None]:
import re
import requests

# Download the Iliad sample text from the Tesserae repository on GitHub.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection.
response = requests.get(
    'https://raw.githubusercontent.com/tesserae/tesserae/master/texts/grc/homer.iliad/homer.iliad.part.1.tess',
    timeout=30,
)
# Fail here on an HTTP error status instead of treating an error page
# as the text to analyse
response.raise_for_status()
iliad = response.text

In [None]:
print(iliad[:100])

In [None]:
# Remove every '<...>' tag together with the tab that follows it, then
# preview the cleaned text
iliad = re.sub(r'<.+?>\t', '', iliad)
print(iliad[:100])

In [None]:
import unicodedata

# Normalize the text to Unicode NFC (canonical composition)
iliad = unicodedata.normalize('NFC', iliad)

In [None]:
from cltk.tokenize.word import WordTokenizer

# Tokenize the Iliad text with a Greek word tokenizer.
# NOTE: this rebinds `tokens`, which earlier cells used for the Latin tokens.
word_tokenizer_greek = WordTokenizer('greek')
tokens = word_tokenizer_greek.tokenize(iliad)
print(tokens[:100])

In [None]:
from cltk.tokenize.line import LineTokenizer

line_tokenizer = LineTokenizer('greek')
lines = line_tokenizer.tokenize(iliad)

In [None]:
from cltk.tag.pos import POSTag
# NOTE: this rebinds `tagger`, which earlier cells used for the Latin tagger.
tagger = POSTag('greek')

# Tag the first line of the Iliad text
tagger.tag_ngram_123_backoff(lines[0])

### Working with Akkadian in CLTK

In [None]:
# Set up CLTK Akkadian word tokenizer

from cltk.tokenize.word import WordTokenizer

word_tokenizer_akkadian = WordTokenizer('akkadian')

In [None]:
line = 'u2-wa-a-ru at-ta e2-kal2-la-ka _e2_-ka wu-e-er'
tokens = word_tokenizer_akkadian.tokenize(line)
pprint(tokens)

In [None]:
from cltk.stem.akkadian.syllabifier import Syllabifier

word = "epištašu"
syll = Syllabifier()
syll.syllabify(word)

In [None]:
from cltk.stem.akkadian.declension import NaiveDecliner

word = 'ilum'
decliner = NaiveDecliner()
decliner.decline_noun(word, 'm')