In [20]:
# Imports

import html
import re

import pandas

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

In [14]:
# Setup CLTK tools
# Both objects are built once here and shared by the cells that follow.

replacer = JVReplacer()  # used by preprocess() to normalize u/v and i/j
word_tokenizer = WordTokenizer('latin')

In [3]:
# Setup files
# `files` holds every file id exposed by the Latin Library corpus reader.

files = latinlibrary.fileids()
print("There are %d files in the Latin Library corpus." % (len(files),))

There are 2164 files in the Latin Library corpus.


In [5]:
# Filter for classical texts
# A file is dropped as soon as its raw text contains any of the collection
# labels below (case-sensitive substring match); everything else is kept.

remove = [
    "The Bible",
    "Ius Romanum",
    "Papal Bulls",
    "Medieval Latin",
    "Christian Latin",
    "Christina Latin",  # NOTE(review): presumably matches a misspelling present in the corpus - confirm
    "Neo-Latin",
    "The Miscellany",
    "Contemporary Latin",
]

classical = []

for fileid in files:
    contents = latinlibrary.raw(fileid)  # read each file only once
    if any(label in contents for label in remove):
        continue
    classical.append(fileid)

# From here on `files` refers to the filtered subcorpus only.
files = classical
print("There are %d files in the Latin Library Classical subcorpus." % len(files))

There are 970 files in the Latin Library Classical subcorpus.


In [6]:
# Preprocess texts

def preprocess(text):
    """Clean one raw corpus text so it can be split into word tokens.

    Steps, in order:

    1. Unescape HTML entities; turn non-breaking spaces and NUL characters
       into ordinary spaces.
    2. Lowercase, then normalize u/v and i/j with the module-level
       ``replacer``.
    3. Replace punctuation and digits with spaces.
    4. Delete site boilerplate phrases (collection names, site title).
    5. Collapse runs of spaces and blank lines.

    Parameters
    ----------
    text : str
        Raw contents of one corpus file.

    Returns
    -------
    str
        The cleaned text.
    """
    text = html.unescape(text)  # Handle html entities
    # html.unescape() converts "&nbsp;" to U+00A0 rather than to a plain
    # space, so normalize it here; otherwise it survives the space-collapsing
    # step at the end of this function.
    text = text.replace('\xa0', ' ')
    # Still needed for double-escaped input ("&amp;nbsp;"), which only
    # becomes "&nbsp;" after the unescape above.
    text = re.sub(r'&nbsp;?', ' ', text)
    text = text.replace('\x00', ' ')  # Another space problem?

    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j

    # Punctuation and digits are replaced by spaces (not deleted) so that
    # words on either side of them stay separate tokens.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    translator = str.maketrans(dict.fromkeys(punctuation + '0123456789', ' '))
    text = text.translate(translator)

    # These patterns run on text that is already lowercased, j/v-normalized
    # and stripped of punctuation, so they have to match that form:
    #   * the hyphen in "neo-latin" is a space by now;
    #   * the "v" in "medieval" may have been rewritten as "u" by the replacer.
    # Both spellings are accepted so the patterns work either way.
    remove_list = [r'\bthe latin library\b',
                   r'\bthe classics page\b',
                   r'\bneo[ -]latin\b',
                   r'\bmedie[uv]al latin\b',
                   r'\bchristian latin\b',
                   r'\bchristina latin\b',
                   r'\bpapal bulls\b',
                   r'\bthe miscellany\b',
                  ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = re.sub(r'[ ]+', ' ', text)  # Remove double spaces
    # Remove double lines and trim spaces around new lines
    text = re.sub(r'\s+\n+\s+', '\n', text)

    return text

In [16]:
# Make list of texts

MIN_CHARS = 1000  # preprocessed texts shorter than this are skipped
TRIM_TOKENS = 50  # tokens cut from each end of a text
                  # NOTE(review): presumably removes header/footer boilerplate - confirm

raw_files = []

for file in files:
    raw = preprocess(latinlibrary.raw(file))
    if len(raw) < MIN_CHARS:
        continue

    raw_tokens = raw.split()
    raw_tokens = raw_tokens[TRIM_TOKENS:len(raw_tokens) - TRIM_TOKENS]
    if not raw_tokens:
        # A text can pass the character threshold and still have no more than
        # 2 * TRIM_TOKENS tokens. Keeping it would add an empty document whose
        # length of 0 later causes a division by zero when per-document
        # probabilities are computed, so skip it.
        continue

    raw = " ".join(raw_tokens)
    raw_files.append(raw)

### Following [Zou et al. 2006; Alajmi 2012]

In [21]:
# Make document-term matrix and vocabulary

# min_df=2 keeps only terms that appear in at least two documents.
vectorizer = CountVectorizer(input='content', min_df=2)
dtm = vectorizer.fit_transform(raw_files)
# Convert the sparse result to a dense array (documents x terms).
# Note: this can be memory-hungry for a large vocabulary.
dtm = dtm.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()
# Going through list() keeps `vocab` a unicode-dtype array on every version
# (get_feature_names_out() returns an object-dtype array).
vocab = np.array(list(vocab))

In [22]:
# Corpus dimensions: M is the vocabulary size, N the number of documents.
M, N = len(vocab), len(raw_files)

In [23]:
# Make array of probabilities per book
# Each row of the document-term matrix is divided by that document's length,
# measured as its number of whitespace-separated tokens.

raw_lengths = [len(doc.split()) for doc in raw_files]
l = np.array(raw_lengths)
ll = l[:, np.newaxis]  # column vector, so the division broadcasts per row

probs = dtm / ll

# Short alias for the probability matrix.
P = probs

In [24]:
# Calculate mean probability
# For every word: add up its probability over all documents (column sum),
# then divide by the number of documents N.

probsum = np.ravel(np.sum(probs, axis=0))
MP = probsum / N

In [39]:
# Pair each word with its mean probability, highest first, and print the
# top ten with the value formatted to four decimal places.
MP_vocab = sorted(zip(vocab, MP), key=lambda pair: pair[1], reverse=True)
mp = [(word, '{:.4f}'.format(round(prob, 4))) for word, prob in MP_vocab[:10]]
print(mp)

[('et', '0.0270'), ('in', '0.0183'), ('est', '0.0110'), ('non', '0.0105'), ('cum', '0.0094'), ('ut', '0.0086'), ('ad', '0.0076'), ('quod', '0.0063'), ('qui', '0.0062'), ('sed', '0.0052')]
