# Metonym check

- Last updated 5/19/2020 7:08am

In [1]:
# Imports

from cltk.corpus.readers import get_corpus_reader
from cltk.tokenize.latin.sentence import SentenceTokenizer

from tqdm import tqdm_notebook as tqdm

from pprint import pprint

In [2]:
# Setup CLTK tools

# Latin sentence tokenizer from CLTK; used later in the notebook to split
# each corpus text into sentences before they are cleaned.
sent_tokenizer = SentenceTokenizer()

In [3]:
# Set up corpus

# Open the Tesserae Latin corpus through CLTK, list its file ids, and
# materialise the text of every file into a list.
tess = get_corpus_reader(language='latin', corpus_name='latin_text_tesserae')
files = tess.fileids()
texts = [*tess.texts(files)]

In [4]:
# Preprocess texts

def preprocess(text, replacer=None):
    """Normalize a raw Latin passage into one line of space-separated words.

    Steps, in order: decode HTML entities, blank out NUL characters and
    partial-line markers (digits followed by ``a`` or ``b``), normalize
    u/v and i/j, blank out punctuation and digits, and finally collapse
    every run of whitespace into a single space.

    Case is preserved; the capitalised/lowercase comparison later in the
    notebook relies on it.

    Parameters
    ----------
    text : str
        Raw text to clean (in this notebook: one tokenized sentence).
    replacer : object, optional
        Object exposing ``replace(str) -> str`` that performs the u/v and
        i/j normalization. When omitted, a new CLTK ``JVReplacer`` is
        created for the call; pass an instance to reuse it across calls.

    Returns
    -------
    str
        Cleaned text without line breaks, repeated whitespace, or
        leading/trailing whitespace.
    """
    import re
    import html

    if replacer is None:
        # Imported lazily so callers that supply their own replacer do not
        # need CLTK to be importable.
        from cltk.stem.latin.j_v import JVReplacer
        replacer = JVReplacer()

    text = html.unescape(text)  # Handle html entities
    text = re.sub(r'&nbsp;?', ' ', text)  # &nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00', ' ', text)  # Another space problem?

    # Fix partial lines
    text = re.sub(r'\d+(a|b)', ' ', text)

    text = replacer.replace(text)  # Normalize u/v & i/j

    # Every punctuation mark and digit becomes a space. The backslash is
    # written as an explicit "\\" (the character set is unchanged) to avoid
    # an invalid-escape warning on recent Python versions.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation + '0123456789'})
    text = text.translate(translator)

    # Collapse whitespace in a single pass, *after* all substitutions, so
    # that line breaks, tabs and the non-breaking spaces produced by
    # html.unescape can never leave doubled or non-standard spaces behind.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [5]:
# Split every corpus text into sentences and clean each sentence,
# accumulating the results in one flat list.
sents = []
for text in texts:
    sents += [preprocess(raw_sent) for raw_sent in sent_tokenizer.tokenize(text)]

In [6]:
# Capitalised word forms to look for; each is also searched in lowercase.
# 'Uenus' is spelled with U because preprocessing normalizes u/v.
metonyms = [
    'Amor',
    'Bacchus',
    'Ceres',
    'Cupido',
    'Mars',
    'Uenus',
]

In [7]:
from collections import defaultdict

# Map each word form (capitalised or lowercase) to the tokenized sentences
# that contain that form and not its other-case counterpart.
examples = defaultdict(list)
for sent in sents:
    words = sent.split()
    for metonym in metonyms:
        lowered = metonym.lower()
        has_lower = lowered in words
        has_upper = metonym in words

        # Nothing to record when neither form occurs; sentences holding
        # both forms are skipped as well.
        if has_lower == has_upper:
            continue

        if has_lower:
            examples[lowered].append(words)
        elif words[0] != metonym:
            # A capitalised form in sentence-initial position is not
            # recorded.
            examples[metonym].append(words)

In [8]:
# Report how many sentences were collected for each form, capitalised
# form first, then its lowercase counterpart.
for metonym in metonyms:
    for form in (metonym, metonym.lower()):
        print(f'{form}: {len(examples[form])}')

Amor: 146
amor: 888
Bacchus: 58
bacchus: 2
Ceres: 113
ceres: 6
Cupido: 64
cupido: 219
Mars: 102
mars: 2
Uenus: 391
uenus: 34


In [9]:
# Write the collected sentences as tab-separated "<label>\t<sentence>" lines:
# label 1 for sentences stored under the capitalised form, label 0 for those
# stored under the lowercase form. A sentence stored under several forms is
# written once per form.
# The encoding is fixed to UTF-8 so the output does not depend on the
# platform's default locale encoding, which may be unable to represent
# non-ASCII characters in the corpus.
with open('metonyms.txt', 'w', encoding='utf-8') as f:
    for metonym in metonyms:
        for label, form in ((1, metonym), (0, metonym.lower())):
            for v in examples[form]:
                f.write(f'{label}\t{" ".join(v)}\n')