# Name recognition

In [1]:
import os
import codecs
from functools import lru_cache
from typing import List
from cltk.tokenize.latin.sentence import SentenceTokenizer
from cltk.tokenize.latin.word import WordTokenizer

# Shared CLTK Latin tokenizer instances, created once at module level and
# used by extract_parsed_dlh_books to split lines into sentences and tokens.
latin_sentence_tokenizer = SentenceTokenizer()
latin_word_tokenizer = WordTokenizer()

def remove_punctuation(text):
    """
    Strip common punctuation marks from a string.

    :param text: input string
    :return: ``text`` with every ``. , ; ? ! : ' "`` character removed
    """
    for mark in (".", ",", ";", "?", "!", ":", "'", '"'):
        text = text.replace(mark, "")
    return text


def _dlh_book_number(filename: str):
    """Return the integer before the first "." of ``filename``, or None if it is not a number."""
    try:
        return int(filename.split(".")[0])
    except ValueError:
        return None


@lru_cache()
def extract_parsed_dlh_books(directory: str) -> List:
    """
    Read every numbered TXT file of ``directory`` and tokenize its content.

    Files are processed in ascending order of the integer found before the
    first "." of their name (``1.txt``, ``2.txt``, ..., ``10.txt``). Entries
    without such a numeric prefix (hidden files, editor backups, ...) and
    entries that are not regular files are skipped instead of aborting the
    whole load.

    The result is memoized per ``directory`` by ``lru_cache``: every call with
    the same argument returns the very same list object, so callers should
    treat it as read-only.

    :param directory: path of the folder containing the book files
    :return: one item per book; each book is a list of paragraphs (the
        non-blank lines of the file), each paragraph is a list of sentences,
        and each sentence is the output of ``latin_word_tokenizer.tokenize``
        on the sentence with punctuation removed
    """
    numbered_files = []
    for filename in os.listdir(directory):
        number = _dlh_book_number(filename)
        if number is None:
            # Not a "<number>.<ext>" entry: it cannot be ordered, ignore it.
            continue
        if not os.path.isfile(os.path.join(directory, filename)):
            continue
        numbered_files.append((number, filename))
    # Sort on the number only so that "10.txt" comes after "9.txt".
    numbered_files.sort(key=lambda item: item[0])

    retrieved_texts = []
    for _, filename in numbered_files:
        print(filename)
        with codecs.open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
            text = f.read()
        # Punctuation is stripped per sentence, after sentence splitting and
        # before word tokenization.
        lines = [[latin_word_tokenizer.tokenize(remove_punctuation(sentence))
                  for sentence in latin_sentence_tokenizer.tokenize(line.strip())]
                 for line in text.split("\n") if line.strip()]
        retrieved_texts.append(lines)
    return retrieved_texts

In [3]:
from cltk.corpus.latin.corpora import LATIN_CORPORA
from cltk.corpus.utils.importer import CorpusImporter

#print(LATIN_CORPORA)
# Import the CLTK Latin models and the two Latin text corpora used in this notebook.
ci = CorpusImporter("latin")
for corpus_name in ("latin_models_cltk", "latin_text_perseus", "latin_text_latin_library"):
    ci.import_corpus(corpus_name)

In [4]:
from cltk.tag.pos import TAGGERS
from cltk.tag.pos import POSTag
# Latin POS tagger; later cells reuse this instance.
latin_pos_tagger = POSTag("latin")
# Sample sentence containing several capitalized place names.
example = "Hunc ferunt instituisse ecclesias per vicos, id est Calatonno, Bricca, Rotomago, Briotreide, Cainone."
# In the recorded output the TnT tagger labels every one of those place names
# (and a few common words) as 'Unk'.
latin_pos_tagger.tag_tnt(example)

[('Hunc', 'Unk'),
 ('ferunt', 'V3PPIA---'),
 ('instituisse', 'V--RNA---'),
 ('ecclesias', 'Unk'),
 ('per', 'R--------'),
 ('vicos', 'Unk'),
 (',', 'U--------'),
 ('id', 'P-S---NN-'),
 ('est', 'V3SPIA---'),
 ('Calatonno', 'Unk'),
 (',', 'U--------'),
 ('Bricca', 'Unk'),
 (',', 'U--------'),
 ('Rotomago', 'Unk'),
 (',', 'U--------'),
 ('Briotreide', 'Unk'),
 (',', 'U--------'),
 ('Cainone', 'Unk'),
 ('.', 'U--------')]

In [5]:
books = extract_parsed_dlh_books("gregory_of_tours_txt")

1.txt
2.txt
3.txt
4.txt
5.txt
6.txt
7.txt
8.txt
9.txt
10.txt


In [None]:
books[0]


### Statistics


In [7]:
# Corpus size at each nesting level: books > paragraphs > sentences > tokens.
print(f"Number of books {len(books)}")
print(f"Number of paragraphs {sum(len(book) for book in books)}")
# print(f"Number of sentences {sum(len(paragraph) for book in books for paragraph in book)}")
print(f"Number of tokens {sum(len(sentence) for book in books for paragraph in book for sentence in paragraph)}")

Number of books 10
Number of paragraphs 1022
Number of tokens 123422


### Proper nouns

In [6]:
# Candidate proper nouns: capitalized tokens that are not the first token of
# their sentence (sentence[1:] drops the first token of every sentence).
# NOTE(review): only books[4] is scanned here, not the whole corpus — confirm this is intended.
tokens_set = {
    token
    for paragraph in books[4]
    for sentence in paragraph
    for token in sentence[1:]
}
proper_nouns = {word for word in tokens_set if word and word[0].isupper()}

In [6]:
print("Briotreide" in proper_nouns)
print([word for word in proper_nouns if word.startswith("Sig")])


False
['Sigyberthi', 'Sigiberthus', 'Sigibertho', 'Sigybertho', 'Siggonis', 'Sigyberthus', 'Sigiberthi', 'Sigivaldi', 'Sigymundum', 'Sigiberti']


In [None]:
from cltk.stem.lemma import LemmaReplacer


# NOTE: `example` is rebound to an empty string here, so both calls below
# lemmatize empty input rather than the sample sentence used for POS tagging.
example = ""
# This lemmatizer instance is reused by later cells.
lemmatizer = LemmaReplacer('latin')
lemmata = lemmatizer.lemmatize(example)
print(lemmata)

# Same input with return_raw=True; later cells split results obtained this way on "/".
lemmata_orig = lemmatizer.lemmatize(example, return_raw=True)
print(lemmata_orig)

In [None]:
# lemmatized_books = [lemmatizer.lemmatize(" ".join(sentence), return_raw=True) for book in books for paragraph in book for sentence in paragraph]

In [10]:
# pos_tagged_books = [" ".join([" ".join(sentence) for paragraph in book for sentence in paragraph]) for book in books]
# pos_tagged_books[0]

In [13]:
# pos_tagged_books = [latin_pos_tagger.tag_unigram(" ".join([" ".join(sentence) for paragraph in book for sentence in paragraph])) for book in books]

In [None]:
%timeit latin_pos_tagger.tag_unigram(" ".join(books[0][0][0]))

In [8]:
sentences = [sentence for book in books for paragraph in book for sentence in paragraph]

In [9]:
# One bigram-tagger result per sentence (despite the name, items are sentences, not books).
# NOTE(review): the recorded run of this cell ended with a MemoryError, yet later
# cells read pos_tagged_books — confirm the cell can complete on the full corpus.
pos_tagged_books = [latin_pos_tagger.tag_bigram(" ".join(sentence)) for sentence in sentences]

MemoryError: 

In [None]:
lemmatized_sentences = [lemmatizer.lemmatize(" ".join(sentence), return_raw=True) for sentence in sentences]

In [None]:
len(sentences), len(pos_tagged_books), len(lemmatized_sentences)

In [None]:
pos_tagged_books[100]

Words with a capitalized first character that have an unknown POS tag and whose lemma is identical to the word itself are likely proper nouns.


In [None]:
# Keep the capitalized tokens whose lemma equals the token itself (ignoring case)
# and whose POS tag is missing (None).
# NOTE(review): positional indexing assumes the tagger and the lemmatizer each
# return exactly one item per token of the sentence — confirm.
real_proper_nouns = []
for i, sentence in enumerate(sentences):
    for j, proper_noun in enumerate(sentence):
        pos_tag = pos_tagged_books[i][j]
        res_lemma = lemmatized_sentences[i][j]
        if not proper_noun or not proper_noun[0].isupper():
            continue
        # Raw lemmatizer output is either a bare string or "<left>/<right>";
        # in the latter case the right-hand part is taken as the lemma.
        lemma_parts = res_lemma.split("/")
        if len(lemma_parts) == 1:
            lemma = res_lemma
        elif len(lemma_parts) == 2:
            lemma = lemma_parts[1]
        else:
            continue
        print(proper_noun, lemma, pos_tag)
        if proper_noun.lower() == lemma.lower() and pos_tag[1] is None:
            real_proper_nouns.append(proper_noun)

In [None]:
real_proper_nouns

In [None]:
# NOTE(review): neither pos_tags nor res_lemmata is assigned in any visible cell;
# this looks like a leftover that raises NameError on a fresh kernel — define them or remove the cell.
list(zip(pos_tags, res_lemmata))

In [None]:
lemmatizer.lemmatize("Childeberthi")

In [None]:
from cltk.corpus.readers import get_corpus_reader

In [None]:
reader = get_corpus_reader(language="latin", corpus_name="latin_text_latin_library")

In [None]:
docs = list(reader.docs())

In [None]:
len(docs)

In [None]:
# Drop every file whose id starts with "grego" so that the vocabulary built from
# this reader excludes those texts (presumably Gregory of Tours' own works —
# verify against the corpus listing).
# NOTE: this overwrites the reader's private _fileids attribute in place.
reader._fileids = [fileid for fileid in reader._fileids if not fileid.startswith("grego")]

In [None]:
len(list(reader.docs()))

In [None]:
all_tokens = set(reader.words())

In [None]:
len(all_tokens)

All the proper nouns that only occur in DLH.

In [None]:
# Proper-noun candidates that never occur in the reference vocabulary.
for_real_proper_nouns = {noun for noun in real_proper_nouns if noun not in all_tokens}

In [None]:
len(for_real_proper_nouns)

In [None]:
[i for i in all_tokens if i.startswith("Martin")]

In [None]:
# Proper-noun candidates that also occur in the reference vocabulary.
already_known_proper_nouns = {noun for noun in real_proper_nouns if noun in all_tokens}

In [None]:
len(already_known_proper_nouns)

In [None]:
sorted(already_known_proper_nouns)

In [None]:
"Sigiberthus" in for_real_proper_nouns

We keep them in a file

In [None]:
# Persist the DLH-only proper nouns, one per line.
# The explicit encoding keeps the file independent of the platform default, and
# sorting makes the content reproducible (set iteration order is arbitrary).
with open("dlh_proper_nouns.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(for_real_proper_nouns)))

We try to group together the different forms of the same lemma.

In [None]:
from cltk.text_reuse.levenshtein import Levenshtein

In [None]:
l = Levenshtein()

In [None]:
import numpy as np

In [None]:
# Turn the nouns into a list so they can be addressed by index, then allocate a
# square zero matrix (one row and one column per noun) for the pairwise distances.
for_real_proper_nouns = list(for_real_proper_nouns)
mat = np.zeros((len(for_real_proper_nouns),) * 2)

In [None]:
# Fill the strict lower triangle only (j < i); the diagonal and the upper triangle stay 0.
for i, noun in enumerate(for_real_proper_nouns):
    for j, other_noun in enumerate(for_real_proper_nouns[:i]):
        mat[i, j] = l.Levenshtein_Distance(noun, other_noun)

In [None]:
for_real_proper_nouns[:10]

In [None]:
mat

In [None]:
# Pairs of nouns whose stored distance is strictly between 0 and 3.
# NOTE(review): this rebinds `l`, which held the Levenshtein instance until now;
# re-running the distance-matrix cell after this one will fail.
l = [
    (for_real_proper_nouns[i], for_real_proper_nouns[j])
    for i in range(len(for_real_proper_nouns))
    for j in range(i)
    if 0 < mat[i, j] < 3
]

Lemmata which have several forms in the text

In [None]:
# Every noun that takes part in at least one close pair.
t = {noun for pair in l for noun in pair}

In [None]:
len(t)

In [None]:
sorted(t)

Nouns which appear in only one form

In [None]:
{noun for noun in for_real_proper_nouns if noun not in t}