## Tokenize le texte
Une texte pour l'ordinateur est un 'string', une liste de characters. On aura besoin d'une liste de mots et on appelle ce processus 'tokenisation'

In [21]:
file = open("/Users/danjane/cltk_data/latin/text/latin_text_latin_library/falcone.txt", "r") 
text = file.read()

# Remplace j/v et tokenize
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer

jv_replacer = JVReplacer()
text = jv_replacer.replace(text.lower())

word_tokenizer = WordTokenizer('latin')
text_word_tokens = word_tokenizer.tokenize(text)
text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]

# Garde les mots de plus de trois characters
text_word_tokens = [token for token in text_word_tokens if len(token) > 3]

# Montre les premiers mots ('tokens') trouvé
print(text_word_tokens[:9])

['chronicon', 'beneuentanum', 'falcone', 'beneuento', 'chronicon', 'beneuentanum', '1102.1.1', 'apud', 'urbem']


## Créer une dictionnaire pour les noms des lieux
On a besoin d'une grande liste des noms latins des lieux, et aussi un moyen de les traduires dans leurs noms moderns.

In [32]:
file = open("LatinGeographicalNames.txt")
file_text = file.read().lower()
file_text_lines = file_text.split('\n')
latinname_modernname = [line.split('\t') for line in file_text_lines]

latin_names = []
modern_names = []
for pair in latinname_modernname:
    if len(pair) > 1:
        latins = pair[0].split(',')
        moderns = pair[1].split(',')
        for latin in latins:
            for modern in moderns:
                latin_names.append(latin.lstrip())
                modern_names.append(modern.lstrip())

for i in range(10):
        print(latin_names[i] + ', ' + modern_names[i])

canonical latin name (source(s): variant(s)), english name (native language(s)) - older name(s)
canonical latin name (source(s): variant(s)), (other language(s))
canonical latin name (source(s): variant(s)), location(s)
brigantium, bregenz
brigantia, bregenz
carnuntum, bad deutsch-altenburg / petronell
chremisa, krems
idunum, judenburg
iuenna, jaunstein - (podjuna)
iuenna, völkermarkt


In [23]:
places_in_text = list(set(text_word_tokens) & set(latin_names))
print(places_in_text[:5])

['solium', 'regalis', 'curia', 'luna']


Je voudrais trouver des textes avec les plus de lieux possible, car il serait plus jolie sur un plan

In [24]:
import os

#files = [file in os.listdir(
#    "/Users/danjane/cltk_data/latin/text/latin_text_latin_library/")]
dir_texts = "/Users/danjane/cltk_data/latin/text/latin_text_latin_library/"
files = os.listdir(dir_texts)
files = [file for file in files if (file.endswith('.txt')) ]


In [25]:
def tokenize(text):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())

    word_tokenizer = WordTokenizer('latin')
    text_word_tokens = word_tokenizer.tokenize(text)
    text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]

    # Garde les mots de plus de trois characters
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]
    return text_word_tokens

In [7]:
num_of_places = {}
for file in files:
    file = open(os.path.join(dir_texts, file), "r") 
    text = file.read()
    tokens = tokenize(text)
    num_of_places[file] = len(set(tokens) & set(latin_names))
    

In [8]:
import numpy as np
biggest = np.max(list(num_of_places.values()))
for (key, value) in num_of_places.items():
    if value==biggest:
        print(key)

<_io.TextIOWrapper name='/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pliny.nh3.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pomponius2.txt' mode='r' encoding='UTF-8'>


In [38]:
# file = '/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pliny.nh3.txt'
file = '/Users/danjane/cltk_data/latin/text/latin_text_latin_library/pomponius2.txt'

file = open(file, "r")
text = file.read()
word_tokens = tokenize(text)

places_in_text = set(word_tokens) & set(latin_names)

places_lookup = {}
for place in places_in_text:
    lookedup = [modern_name for 
           (latin_name, modern_name) in zip(latin_names, modern_names) 
           if latin_name==place]
    if len(lookedup):
        places_lookup[place] = lookedup[0]
    else:
        print('no place?   ' + place)
            
print(places_lookup)

{'quadrata': 'lébény', 'columna': 'kolomna', 'tolosa': 'toulouse', 'nemausus': 'nîmes', 'massilia': 'marseille', 'nicaea': 'nice', 'bononia': 'boulogne-sur-mer', 'arelate': 'arles - (arle)', 'theodosia': 'feodosiya', 'altinum': 'kölked', 'colonia': 'kolín', 'luna': 'louny'}


In [44]:
from geopy.geocoders import GoogleV3

# Lire le passmot
file = open("google_api_key.txt", "r") 
api_key = file.read()
geolocator = GoogleV3(api_key = api_key)

places_latlng = {}
for (latin, modern) in places_lookup.items():
    place, (lat, lng) = geolocator.geocode(modern, timeout=15)
    places_latlng[latin] = (lat, lng)
    
print(places_latlng)

{'quadrata': (47.737851, 17.3927637), 'columna': (55.0937517, 38.7688618), 'tolosa': (43.604652, 1.444209), 'nemausus': (43.836699, 4.360054), 'massilia': (43.296482, 5.36978), 'nicaea': (43.7101728, 7.261953200000001), 'bononia': (50.725231, 1.613334), 'arelate': (43.676647, 4.6277769), 'theodosia': (45.031933, 35.382433), 'altinum': (45.9489796, 18.7058024), 'colonia': (50.02732899999999, 15.2027277), 'luna': (50.35398120000001, 13.8033551)}


In [41]:
lat

46.2043907

In [49]:
### Create an interactive .html mapping with Folium
import folium

latitude = 47
longitude = 7
screen_name = 'pomponius2'

tmap=folium.Map(location=[latitude,longitude],
               zoom_start=6,tiles='OpenStreetMap')

fg=folium.FeatureGroup(name="Locations")

for latin, (lat, lng) in places_latlng.items():
    fg.add_child(folium.Marker(location=[lat,lng],popup=(folium.Popup(latin)),
                               icon=folium.Icon(color='green',icon_color='green')))

tmap.add_child(fg)

tmap.add_child(folium.LayerControl())
tmap.save(outfile=screen_name+'_map.html')