# Basic collocations using CLTK Readers & NLTK
[Patrick J. Burns](https://diyclassics.github.io), Institute for the Study of the Ancient World / NYU  
11.9.2023

In [None]:
# Imports

import pandas as pd
from collections import Counter
import nltk
from cltkreaders.lat import LatinTesseraeCorpusReader
# from latintools import preprocess

In [None]:
# Helper function

# Helper function for preprocessing
def preprocess(
    text,
    lower=True,
    normalize=True,
    punctuation=False,
    numbers=False,
    unhyphenate=False,
    remove_lines=False,
    remove_spaces=False,
    entities=False,
    diacriticals=True,
    fill=" ",
):
    """Clean a raw text string for tokenization.

    The steps run in a fixed order: HTML-entity unescaping, unhyphenation,
    lowercasing, j/v normalization, punctuation removal, digit removal,
    newline removal, whitespace collapsing, diacritic stripping, and a final
    space squeeze + NFC normalization + strip.

    Args:
        text: The string to clean.
        lower: Lowercase the text.
        normalize: Run the text through CLTK's ``JVReplacer``.
        punctuation: Keep punctuation. When false, every character in the
            function's punctuation/symbol sets is replaced by ``fill``.
        numbers: Keep ASCII digits. When false, each digit is replaced by
            ``fill``.
        unhyphenate: Delete a ``-``, ``»`` or ``—`` (plus at most one
            whitespace character) that sits directly before a newline,
            together with that newline.
        remove_lines: Replace every ``"\\n"`` with a single space.
        remove_spaces: Split on any whitespace and re-join with ``fill``.
        entities: Keep HTML entities. When false, they are unescaped.
        diacriticals: Keep diacritics. When false, the text is decomposed
            (NFD) and all combining marks are dropped.
        fill: Replacement string used for removed characters.

    Returns:
        The cleaned text, with runs of spaces squeezed to one, NFC-normalized
        and stripped of leading/trailing whitespace.
    """
    import html
    import re
    import unicodedata

    if not entities:
        text = html.unescape(text)

    if unhyphenate:
        text = re.sub(r"[-»—]\s?\n", "", text, flags=re.MULTILINE)

    if lower:
        text = text.lower()

    if normalize:
        # Imported lazily so CLTK is only needed when this step is requested.
        from cltk.alphabet.lat import JVReplacer

        text = JVReplacer().replace(text)

    if not punctuation:
        # Kept under separate names so the boolean `punctuation` argument is
        # not overwritten. The backslash is escaped explicitly; it is part of
        # the set of characters to remove.
        punctuation_chars = "\"#$%&'()*+,/:;<=>@[\\]^_`{|}~.?!«»—“-”"
        misc_chars = "¡£¤¥¦§¨©¯°±²³´µ¶·¸¹º¼½¾¿÷·–‘’†•ↄ∞⏑〈〉（）"
        removal_table = str.maketrans(
            {char: fill for char in misc_chars + punctuation_chars}
        )
        text = text.translate(removal_table)

    if not numbers:
        digit_table = str.maketrans({digit: fill for digit in "0123456789"})
        text = text.translate(digit_table)

    if remove_lines:
        text = " ".join(text.split("\n"))

    if remove_spaces:
        text = fill.join(text.split())

    if not diacriticals:
        # Decompose, then drop combining marks. Only the characters actually
        # present are inspected, instead of building a table for the whole
        # Unicode range on every call.
        decomposed = unicodedata.normalize("NFD", text)
        text = "".join(
            char for char in decomposed if not unicodedata.combining(char)
        )

    # Fix spacing
    text = re.sub(" +", " ", text)

    text = unicodedata.normalize("NFC", text)

    return text.strip()


In [None]:
# Get corpus reader

# Instantiate the reader once; the cells below call T.sents(...) and
# T.words(...) on it to pull text from individual corpus files.
T = LatinTesseraeCorpusReader()

## Bigrams

In [None]:
# Show bigram example from the first sentence of Cicero's In Catilinam

cat = 'cicero.in_catilinam.tess'
cat_sents = T.sents(cat)
# Take only the first sentence the reader yields and keep its text
cat_sent = next(cat_sents).text
print(cat_sent)
# Clean the sentence with the helper's default options
cat_sent = preprocess(cat_sent)
# Split on whitespace, pair adjacent tokens, and materialize so we can slice
cat_bigrams = list(nltk.bigrams(cat_sent.split()))
print(cat_bigrams[:5])

### Bigram frequency

The simplest way for us to get a sense of which words tend to co-occur in Latin is to observe the phenomenon directly; that is, we can create a list of all word pairs and report the highest-frequency pairs. Using Cicero's *Brutus* as a test case, let's build this list.

In [None]:
# Set up exploratory test
# Get the preprocessed words of Cicero's Brutus

file = 'cicero.brutus.tess'
# Hand the notebook's preprocess helper to the reader and materialize the
# result as a list so later cells can reuse it.
words = list(T.words(file, preprocess=preprocess, plaintext=True))
# Preview the first ten words
print(" ".join(words[:10]))

In [None]:
# Get bigrams for Cicero's Brutus

# Pair each word with the word that follows it, across the whole word list
bigrams = list(nltk.bigrams(words))
print(bigrams[:5])

In [None]:
# Count every bigram and tabulate the ten most frequent ones

bigrams_counter = Counter(bigrams)
bigrams_top = bigrams_counter.most_common(10)
bigrams_top_display = list(bigrams_top)

# Build the table step by step: rows -> sort by count -> fresh 0..n index
df_freq = pd.DataFrame(bigrams_top_display, columns=['bigram', 'count'])
df_freq = df_freq.sort_values(by='count', ascending=False)
df_freq = df_freq.reset_index(drop=True)
df_freq.head(10)

In [None]:
# Make counter of bigrams, with NLTK collocations approach

# Create bigrams and Finder

# NOTE(review): this rebinds `bigrams`, which an earlier cell set to the list
# of word pairs. From here on it is the association-measures object, so
# re-running the Counter cell afterwards would no longer see the pair list.
bigrams = nltk.collocations.BigramAssocMeasures()
Finder = nltk.collocations.BigramCollocationFinder.from_words(words)
# Restrict the finder to bigrams that meet a minimum frequency of 5
Finder.apply_freq_filter(5)

In [None]:
# Bigram frequency table built from the finder's own bigram counts

df_freq = pd.DataFrame(list(Finder.ngram_fd.items()), columns=['bigram', 'count'])
# Most frequent first, then renumber the rows from zero
df_freq = df_freq.sort_values(by='count', ascending=False)
df_freq = df_freq.reset_index(drop=True)
df_freq.head(10)

### Pointwise Mutual Information

In [None]:
# identify "meaningful" bigrams using PMI

# Score the finder's bigrams with pointwise mutual information
df_pmi = pd.DataFrame(list(Finder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI'])
# Attach the counts from df_freq so frequency and PMI sit side by side
df_pmi = pd.merge(df_freq, df_pmi, on='bigram')
# Display only: rank by PMI, highest first; df_pmi itself is not reassigned
df_pmi.sort_values(by='PMI', ascending=False).head(10).reset_index(drop=True)

### Chi-squared

In [None]:
# identify "meaningful" bigrams using Chi-squared

# Score the finder's bigrams with the chi-squared association measure
df_chisq = pd.DataFrame(list(Finder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq'])
# Attach the counts from df_freq so frequency and score sit side by side
df_chisq = pd.merge(df_freq, df_chisq, on='bigram')
# The merge keeps df_freq's row order (sorted by count), so rank by the score
# itself for display; otherwise the table only shows the most frequent bigrams.
df_chisq.sort_values(by='chi-sq', ascending=False).head(10).reset_index(drop=True)

### Log-likelihood ratio

In [None]:
# identify "meaningful" bigrams using log likelihood Ratio

# Score the finder's bigrams with the log-likelihood ratio
df_loglike = pd.DataFrame(list(Finder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio'])
# Attach the counts from df_freq so frequency and score sit side by side
df_loglike = pd.merge(df_freq, df_loglike, on='bigram')
# The merge keeps df_freq's row order (sorted by count), so rank by the score
# itself for display; otherwise the table only shows the most frequent bigrams.
df_loglike.sort_values(by='likelihood ratio', ascending=False).head(10).reset_index(drop=True)

In [None]:
### Search collocation space

# Rebuild the measures object and a finder over the same words; no frequency
# filter is applied to this finder.
bigrams = nltk.collocations.BigramAssocMeasures()
Finder = nltk.collocations.BigramCollocationFinder.from_words(words)

df_pmi = pd.DataFrame(list(Finder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI'])
# NOTE(review): merging on df_freq keeps only bigrams that are present in
# df_freq, whatever filter was used when df_freq was built. Confirm that
# restriction is intended for the search.
df_pmi = pd.merge(df_freq, df_pmi, on='bigram')

term = 'publica'

# Substring test against the space-joined pair, so any bigram whose text
# contains `term` matches, not only exact-token hits.
brutus = df_pmi[df_pmi['bigram'].apply(lambda x: term in " ".join(x))].sort_values(by='PMI', ascending=False).reset_index(drop=True)
# `brutus` is already sorted by PMI and re-indexed; display it as is.
brutus