## Tokenize la texte
Une texte pour l'ordinateur est un 'string', une liste de characters. On aura besoin d'une liste de mots et on appelle ce processus 'tokenisation'

In [1]:
file = open("/Users/danjane/cltk_data/latin/text/latin_text_latin_library/falcone.txt", "r") 
text = file.read()

# Replace j/v and tokenize
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer

jv_replacer = JVReplacer()
text = jv_replacer.replace(text.lower())

word_tokenizer = WordTokenizer('latin')
text_word_tokens = word_tokenizer.tokenize(text)
text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]

# Keep words more than 3 characters long
text_word_tokens = [token for token in text_word_tokens if len(token) > 3]

# Quick check, printing the first 10 words
print(text_word_tokens[:9])

['chronicon', 'beneuentanum', 'falcone', 'beneuento', 'chronicon', 'beneuentanum', '1102.1.1', 'apud', 'urbem']


## Créer une dictionnaire pour les noms des lieux
On a besoin d'une grande liste des noms latins des lieux, et aussi un moyen de les traduires dans leurs noms moderns.

In [2]:
file = open("LatinGeographicalNames.txt")
places = file.read().lower()
lines = places.split('\n')
latinname_modernname = [line.split('\t') for line in lines]

latin_names = []
modern_names = []
for pair in latinname_modernname:
    if len(pair) > 1:
        latins = pair[0].split(',')
        moderns = pair[1].split(',')
        for latin in latins:
            for modern in moderns:
                latin_names.append(latin.lstrip())
                modern_names.append(modern.lstrip())

for i in range(10):
        print(latin_names[i] + ', ' + modern_names[i])

canonical latin name (source(s): variant(s)), english name (native language(s)) - older name(s)
canonical latin name (source(s): variant(s)), (other language(s))
canonical latin name (source(s): variant(s)), location(s)
brigantium, bregenz
brigantia, bregenz
carnuntum, bad deutsch-altenburg / petronell
chremisa, krems
idunum, judenburg
iuenna, jaunstein - (podjuna)
iuenna, völkermarkt


In [3]:
places_in_text = list(set(text_word_tokens) & set(latin_names))
print(places_in_text[:5])

['luna', 'curia', 'regalis', 'solium']


Je voudrais trouver des textes avec les plus de lieux possible, car il serait plus jolie sur un plan

In [15]:
import os

#files = [file in os.listdir(
#    "/Users/danjane/cltk_data/latin/text/latin_text_latin_library/")]
dir_texts = "/Users/danjane/cltk_data/latin/text/latin_text_latin_library/"
files = os.listdir(dir_texts)
files = [file for file in files if (file.endswith('.txt')) ]


In [11]:
def tokenize(text):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())

    word_tokenizer = WordTokenizer('latin')
    text_word_tokens = word_tokenizer.tokenize(text)
    text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]

    # Keep words more than 3 characters long
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]
    return text_word_tokens

In [19]:
num_of_places = {}
for file in files:
    file = open(os.path.join(dir_texts, file), "r") 
    text = file.read()
    tokens = tokenize(text)
    num_of_places[file] = len(set(tokens) & set(latin_names))
    

In [26]:
import numpy as np
biggest = np.max(list(num_of_places.values()))
for (key, value) in num_of_places.items():
    if value==biggest:
        print(key)

<_io.TextIOWrapper name='/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pliny.nh3.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pomponius2.txt' mode='r' encoding='UTF-8'>


In [28]:
# file = '/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pliny.nh3.txt'
file = '/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pomponius2.txt'

file = open(file, "r")
text = file.read()
tokens = tokenize(text)

places = set(tokens) & set(latin_names)
print(places)

{'colonia', 'nemausus', 'luna', 'quadrata', 'altinum', 'massilia', 'theodosia', 'nicaea', 'columna', 'arelate', 'tolosa', 'bononia'}
