In [None]:
# Mount Google Drive (comment out on local)
# Shared folder link (set a shortcut to MyDrive to run): https://drive.google.com/drive/folders/1zwnMrxiQ6o_haHlEzJLAZNv223TxjRcP?usp=sharing
# from google.colab import drive
# drive._mount('/content/drive')

In [None]:
# comment out on local
# !cp '/content/drive/MyDrive/Voynich/corruptions.py' corruptions.py
# !cp '/content/drive/MyDrive/Voynich/uncertainties.py' uncertainties.py
# !cp '/content/drive/MyDrive/Voynich/validation.py' validation.py
# !cp '/content/drive/MyDrive/Voynich/baseline.py' baseline.py

In [None]:
# !pip install cltk==1.0.21
# !pip install gensim==4.1.2

In [None]:
# Colab
# LATIN_AIX = '/content/drive/MyDrive/Voynich/texts/historia_hierosolymitana.txt'
# BENCH = '/content/drive/MyDrive/Voynich/benchmarks/syn-selection-benchmark-Latin.tsv'

# Deepnote
# LATIN_AIX = "ml-project-2-scikit-learn2/texts/Latin_Aix.txt"
# BENCH = "ml-project-2-scikit-learn2/benchmarks/latin_benchmark.tsv"

# local
# Both paths are relative, so they resolve against the notebook's working directory.
# LATIN_AIX is the source text that gets tokenized and corrupted;
# BENCH is the tab-separated benchmark used to score the embeddings.
LATIN_AIX = 'texts/historia_hierosolymitana.txt'
BENCH = 'benchmarks/syn-selection-benchmark-Latin.tsv'

In [None]:
import math
import numpy as np
import numpy.linalg as npl
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scipy
import string
import random
import nltk
import gensim
from gensim.models import Word2Vec, FastText
from gensim.corpora.dictionary import Dictionary
from scipy.spatial.distance import cosine

from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer
from cltk.tokenizers.lat.lat import LatinWordTokenizer
from cltk.data.fetch import FetchCorpus
# Fetch the "lat_models_cltk" corpus through CLTK's corpus downloader
# (presumably required by the Latin tokenizers imported above — TODO confirm).
corpus_downloader = FetchCorpus(language="lat")
# NOTE(review): bare attribute access; it is not the last expression of the
# cell, so its value is neither displayed nor stored.
corpus_downloader.list_corpora
corpus_downloader.import_corpus("lat_models_cltk")


import corruptions as corr
import uncertainties as unc
import validation as valid
import baseline as base

## Preprocess text

In [None]:
# Read the whole source text into memory. The encoding is stated explicitly so
# that decoding does not depend on the platform's default locale encoding
# (which differs between Colab, Deepnote and a local Windows machine).
# NOTE(review): assumes the text file is UTF-8 encoded — confirm against the file.
with open(LATIN_AIX, "r", encoding="utf-8") as doc:
    latin = doc.read()

In [None]:
# Remove the chapter separators (double newlines), then cut the text into
# sentences on the '. ' delimiter. From here on `latin` is a list of strings.
latin = latin.replace('\n\n', '')
latin = latin.split('. ')
latin[:10]

['Prooemium sequentis operis',
 'Incipit liber primus Expeditionis Hierosolymitanae urbis, ubi clarissimi ducis Godefridi inclita gesta narrantur, cujus labore et studio civitas sancta ab infidelibus liberata, sanctae Ecclesiae filiis est restituta',
 'Diu multumque his usque diebus, ob inaudita et plurimum admiranda, saepius accensus sum desiderio ejusdem expeditionis et faciendae orationis illic, dum ferverem',
 'Sed cum minime, ob diversa impedimenta, intentioni meae effectus daretur, temerario ausu decrevi saltem ex his aliqua memoriae commendare, quae auditu et revelatione nota fierent ab his qui praesentes adfuissent, ut vel sic non in otio, sed quasi in via, si non corpore, at tota mente et animo consocius essem, elaborare',
 'Quapropter  de labore et miseriis, de firmata fide, de robustorum principum caeterorumque hominum conspiratione bona in amore Christi quomodo scilicet relinquerint patriam, cognatos, uxores, filios, filiasque, urbes, castella, agros, regna et omnem hujus m

In [None]:
def tokenize_latin_to_words(txt):
    '''
    Split each sentence into lowercase, purely alphabetic words.

    Args:
        txt (iterable of str): the sentences to tokenize.

    Returns:
        list of list of str: one word list per sentence. Sentences for which
        the tokenizer yields no token at all are dropped; a sentence whose
        tokens are all non-alphabetic is kept as an empty list.
    '''
    tokenizer = LatinWordTokenizer()
    sentences = []
    for raw_sentence in txt:
        raw_tokens = tokenizer.tokenize(raw_sentence)
        if len(raw_tokens) == 0:
            continue
        # isalpha() discards punctuation and any token containing digits
        sentences.append([token.lower() for token in raw_tokens if token.isalpha()])
    return sentences

In [None]:
# One list of lowercase words per sentence
latin_tokenized = tokenize_latin_to_words(latin)

In [None]:
# Preview the first ten tokenized sentences, one per line
print('\n'.join(str(sentence) for sentence in latin_tokenized[:10]))

['prooemium', 'sequentis', 'operis']
['incipit', 'liber', 'primus', 'expeditionis', 'hierosolymitanae', 'urbis', 'ubi', 'clarissimi', 'ducis', 'godefridi', 'inclita', 'gesta', 'narrantur', 'cujus', 'labore', 'et', 'studio', 'civitas', 'sancta', 'ab', 'infidelibus', 'liberata', 'sanctae', 'ecclesiae', 'filiis', 'est', 'restituta']
['diu', 'multum', 'his', 'usque', 'diebus', 'ob', 'inaudita', 'et', 'plurimum', 'admiranda', 'saepius', 'accensus', 'sum', 'desiderio', 'ejusdem', 'expeditionis', 'et', 'faciendae', 'orationis', 'illic', 'dum', 'ferverem']
['sed', 'cum', 'minime', 'ob', 'diversa', 'impedimenta', 'intentioni', 'meae', 'effectus', 'daretur', 'temerario', 'ausu', 'decrevi', 'saltem', 'ex', 'his', 'aliqua', 'memoriae', 'commendare', 'quae', 'auditu', 'et', 'revelatio', 'nota', 'fierent', 'ab', 'his', 'qui', 'praesentes', 'adfuissent', 'ut', 'vel', 'sic', 'non', 'in', 'otio', 'sed', 'quasi', 'in', 'via', 'si', 'non', 'corpore', 'at', 'tota', 'mente', 'et', 'animo', 'consocius', 'es

In [None]:
# Flatten the per-sentence word lists into one list of words
latin_flat = [
    token
    for words in latin_tokenized
    for token in words
]

In [None]:
# Total number of words in the tokenized text
len(latin_flat)

123661

Since the text is about 4 times the reference size, we reduce it to roughly 1/4 of its original length by keeping only the first quarter of its sentences.

In [None]:
# Keep only the first quarter of the sentences (the cut is by sentence count,
# not by word count).
# NOTE(review): this rebinds `latin_tokenized` in place, so re-running the cell
# without re-running the tokenization cell shrinks the text by another factor of 4.
latin_tokenized = latin_tokenized[:len(latin_tokenized) // 4]

## Corrupt text

To corrupt the text we use the same uncertainty ratios as those of the Voynich manuscript (cf. `embeddings_italian.ipynb`), except that multiple uncertainties are disabled (their ratio is set to 0).

In [None]:
# Ratio of each kind of uncertainty to inject when corrupting the text.
# Multiple uncertainties are switched off (ratio 0).
voynich_uncertainty_ratios = {
    'ALTERNATE_READINGS_RATIO': 0.004270487753229336,
    'MULTIPLE_UNCERTAINTY_RATIO': 0,
    'SINGLE_UNCERTAINTY_RATIO': 0.001194929867196402,
    'SPACE_UNCERTAINTY_RATIO': 0.06889174154899444,
}

In [None]:
# Rebuild a single string: words joined by spaces, sentences joined by newlines
latin_clean = [' '.join(words) for words in latin_tokenized]
latin_merged = '\n'.join(latin_clean)

In [None]:
# Show only the beginning of the merged text; displaying the full string would
# write the entire corpus into the notebook output.
latin_merged[:500]

'prooemium sequentis operis\nincipit liber primus expeditionis hierosolymitanae urbis ubi clarissimi ducis godefridi inclita gesta narrantur cujus labore et studio civitas sancta ab infidelibus liberata sanctae ecclesiae filiis est restituta\ndiu multum his usque diebus ob inaudita et plurimum admiranda saepius accensus sum desiderio ejusdem expeditionis et faciendae orationis illic dum ferverem\nsed cum minime ob diversa impedimenta intentioni meae effectus daretur temerario ausu decrevi saltem ex his aliqua memoriae commendare quae auditu et revelatio nota fierent ab his qui praesentes adfuissent ut vel sic non in otio sed quasi in via si non corpore at tota mente et animo consocius essem elaborare\nquapropter de labore et miseriis de firmata fide de robustorum principum caeterorum hominum conspiratio bona in amore christi quomodo scilicet relinquerint patriam cognatos uxores filios filias urbes castella agros regna et omnem hujus mundi dulcedinem certa pro incertis et in nomine jesu

In [None]:
# Inventory of the distinct characters present in the merged text
np.unique(list(latin_merged))

array(['\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
       'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
       'y', 'z', 'ť'], dtype='<U1')

In [None]:
# Alphabet passed to the corruption step: the 26 lowercase ASCII letters.
# NOTE(review): the character inventory of the merged text also lists 'ť',
# which is not part of this alphabet — confirm whether that is intended.
latin_alphabet = string.ascii_lowercase
latin_alphabet

'abcdefghijklmnopqrstuvwxyz'

In [None]:
# Marker characters for each kind of uncertainty (the same ones as for Italian)
latin_uncertainty_chars = {
    'ALTERNATE_CHAR': 'ž',
    'SINGLE_UNCERTAINTY': '?',
    'DOUBLE_UNCERTAINTY': '??',
    'MULTIPLE_UNCERTAINTY': '???',
    'UNCERTAIN_SPACE': 'ř',
}

# Arbitrary assumption: in the resulting document half of the uncertain spaces
# are actual spaces and the other half are not.
actual_space_ratio = 0.5

In [None]:
# Seed Python's `random` module before corrupting the text.
# NOTE(review): this makes the corruption reproducible only if
# corr.corrupt_text draws from `random` (and not e.g. numpy's RNG) — confirm
# in corruptions.py.
random.seed(42)
# Inject the uncertainties into the merged text using the ratios, marker
# characters, alphabet and actual-space ratio defined in the cells above.
latin_corrupted = corr.corrupt_text(latin_merged, voynich_uncertainty_ratios,
                                    latin_uncertainty_chars, latin_alphabet,
                                    actual_space_ratio)

Number of alternate readings: 803, 0.437% of chars
Number of single uncertainty: 198, 0.108% of chars
Number of multiple uncertainties: 0, 0.000% of chars
Number of uncertain spaces: 2124, 6.918% of spaces


In [None]:
# Split sentences (one sentence per line of the corrupted text)
# NOTE(review): `latin_corrupted` is rebound from a str to a list here, so
# re-running this cell alone raises AttributeError (lists have no splitlines).
latin_corrupted = latin_corrupted.splitlines()
latin_corrupted[:10]

['prooemiumřsequentisřoperis',
 'incipit liber primus expeditionis hierosolymitanae urbis ubi clarissimi du?is godefridi inclita geřsta narrantur cujus labore et studio civitas sancta ab infidelibus liberata sanctae ecclesiae filiis est restituta',
 'diu multum his usque diebus ob inaudita et plurimum [y:a]dmira[c:n]da saepius accensus sumřdesiderio ejusdemřexpeditionis et faciendaeřorationis illic dum ferveřrem',
 'sed cum minime ob diversa impeřdimenta intentioni meae effectus daretur temerario ausřu decrevi saltem ex hisřaliqua memoriae commendare quae audituřet revelatio nota fierent ab his qui praesentes adfuis[g:s]ent ut vel sic non in otio sed quasi in via siřnon corpore at tota mente et animo consocius essem elaborare',
 'quapropter de labore et miseriis de firmata fidře de robustorum principum caeterorum hominum [c:x]onspiratio bona in amore christiřquomodo scilicet relinquerintřpatriam cognatos uxores filios filiasřur[v:b]es castella agros regna et omnem hujus mundi dulcedine

In [None]:
# Words are separated by single spaces in the corrupted text; uncertain-space
# markers stay inside the tokens.
latin_corrupted_tokenized = [line.split(' ') for line in latin_corrupted]
print('\n'.join(str(tokens) for tokens in latin_corrupted_tokenized[:10]))

['prooemiumřsequentisřoperis']
['incipit', 'liber', 'primus', 'expeditionis', 'hierosolymitanae', 'urbis', 'ubi', 'clarissimi', 'du?is', 'godefridi', 'inclita', 'geřsta', 'narrantur', 'cujus', 'labore', 'et', 'studio', 'civitas', 'sancta', 'ab', 'infidelibus', 'liberata', 'sanctae', 'ecclesiae', 'filiis', 'est', 'restituta']
['diu', 'multum', 'his', 'usque', 'diebus', 'ob', 'inaudita', 'et', 'plurimum', '[y:a]dmira[c:n]da', 'saepius', 'accensus', 'sumřdesiderio', 'ejusdemřexpeditionis', 'et', 'faciendaeřorationis', 'illic', 'dum', 'ferveřrem']
['sed', 'cum', 'minime', 'ob', 'diversa', 'impeřdimenta', 'intentioni', 'meae', 'effectus', 'daretur', 'temerario', 'ausřu', 'decrevi', 'saltem', 'ex', 'hisřaliqua', 'memoriae', 'commendare', 'quae', 'audituřet', 'revelatio', 'nota', 'fierent', 'ab', 'his', 'qui', 'praesentes', 'adfuis[g:s]ent', 'ut', 'vel', 'sic', 'non', 'in', 'otio', 'sed', 'quasi', 'in', 'via', 'siřnon', 'corpore', 'at', 'tota', 'mente', 'et', 'animo', 'consocius', 'essem', 'e

## Build embeddings

In [None]:
# Word2Vec parameters
min_count = 1                # minimum number of word occurrences
vector_size = 100            # embeddings size
window = 5                   # context window size
# NOTE(review): alpha == min_alpha, so the learning rate never decays, and 0.5
# is ten times the 0.05 used for FastText further down — confirm this is intended.
alpha = 0.5                  # initial learning rate
min_alpha = 0.5              # final learning rate
epochs = 20                  # number of epochs
negative = 20                # number of draws for negative sampling

In [None]:
# Word2Vec, CBOW variant (sg=0), trained on the corrupted sentences
embeddings_W2V_CBOW = Word2Vec(
    latin_corrupted_tokenized,
    sg=0,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    negative=negative,
    alpha=alpha,
    min_alpha=min_alpha,
    epochs=epochs,
)

In [None]:
# Word2Vec, skip-gram variant (sg=1), trained on the corrupted sentences
embeddings_W2V_SkipGram = Word2Vec(
    latin_corrupted_tokenized,
    sg=1,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    negative=negative,
    alpha=alpha,
    min_alpha=min_alpha,
    epochs=epochs,
)

In [None]:
# FastText parameters
# NOTE: these assignments rebind the same global names used by the Word2Vec
# cells, so re-running a Word2Vec cell after this one trains it with the
# FastText values.
min_count = 1                # minimum number of word occurrences
vector_size = 100            # embeddings size
window = 5                   # context window size
alpha = 0.05                 # initial learning rate
min_alpha = 0.0001           # final learning rate
epochs = 20                  # number of epochs
negative = 20                # number of draws for negative sampling
min_n = 3                    # min length of char ngram
max_n = 6                    # max length of char ngram

In [None]:
# FastText, CBOW variant (sg=0), trained on the corrupted sentences
embeddings_FT_CBOW = FastText(
    latin_corrupted_tokenized,
    sg=0,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    negative=negative,
    alpha=alpha,
    min_alpha=min_alpha,
    epochs=epochs,
    min_n=min_n,
    max_n=max_n,
)

In [None]:
# FastText, skip-gram variant (sg=1), trained on the corrupted sentences
embeddings_FT_SkipGram = FastText(
    latin_corrupted_tokenized,
    sg=1,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    negative=negative,
    alpha=alpha,
    min_alpha=min_alpha,
    epochs=epochs,
    min_n=min_n,
    max_n=max_n,
)

## Evaluate the embeddings with the benchmark

In [None]:
# The benchmark is a tab-separated file without a header row, so the columns
# are labelled 0, 1, 2, ...
latin_benchmark = pd.read_csv(
    BENCH,
    sep='\t',
    header=None,
)

In [None]:
# Inspect the benchmark table: column 0 is used as the target word, column 1
# as the expected answer and the remaining columns as the other options.
latin_benchmark

Unnamed: 0,0,1,2,3,4
0,a,ab,aqua,manlius,erebus
1,abdico,exheredo,o,nutus,exhortatio
2,abdo,condo,metus,ploro,expensum
3,abduco,abigo,timeo,posteritas,expergiscor
4,abedo,adedo,adeo,pridie,expilo
...,...,...,...,...,...
2754,utrobique,utrinque,legitimus,hister,circensis
2755,uulgaris,uilis,inferi,elegi,hortalus
2756,uulgo,uulgariter,inflammo,eloquens,hortator
2757,uulnus,plaga,insisto,equuleus,hyacinthus


In [None]:
def pick_most_similar(model, target, options):
    '''
    Pick the word among options that is most similar to the target
    word, in terms of cosine similarity.

    Args:
        model (Word2Vec): a trained embeddings model; only ``model.wv`` is used.
        target (str): target word
        options (list of str): options word

    Returns:
        The element of ``options`` with the highest cosine similarity to
        ``target``, or None when no choice can be made: ``options`` is empty,
        a word has no embedding (KeyError), the target embedding has zero
        norm, or every option embedding has zero norm.
    '''
    if len(options) == 0:
        return None

    # FastText is able to compute embeddings also for out-of-vocabulary
    # queries, while Word2Vec is not.
    try:
        target_emb = model.wv[target]
        options_emb = model.wv[options]
    except KeyError:
        return None

    # Cosine similarity is undefined for a zero vector; dividing by a zero
    # norm would produce NaN, which np.argmax treats as the maximum.
    target_norm = npl.norm(target_emb, 2)
    if target_norm == 0:
        return None
    target_emb = target_emb / target_norm

    similarities = []
    for option_emb in options_emb:
        option_norm = npl.norm(option_emb, 2)
        if option_norm == 0:
            # A zero-norm option can never be the best match
            similarities.append(-np.inf)
        else:
            similarities.append((target_emb @ option_emb) / option_norm)

    idx = int(np.argmax(similarities))
    if similarities[idx] == -np.inf:
        return None
    return options[idx]

In [None]:
def benchmark_model(model, benchmark):
    '''
    Evaluate model with the latin benchmark.

    For every row, column 0 is the target word and the remaining columns are
    the options handed to ``pick_most_similar``; column 1 is the expected
    answer. Rows for which no prediction could be made (None) are excluded
    from the score, and the share of answered rows is printed.

    Args:
        model: a trained embeddings model, passed on to ``pick_most_similar``.
        benchmark (pd.DataFrame): benchmark table with integer column labels.

    Returns:
        float: fraction of answered rows whose prediction equals column 1,
        or NaN when no row could be answered.
    '''
    predictions = benchmark.apply(lambda x: pick_most_similar(model, x[0], x[1:].values), axis=1)

    # Remove invalid entries. `predictions != None` is evaluated element-wise
    # by pandas and is True for every row, so it filtered nothing and
    # unanswered rows were counted as wrong; notna() is the correct mask.
    # (The mask is not called `valid`, to avoid shadowing the `valid` module.)
    answered = predictions.notna()
    print('Valid entries:', answered.sum() / predictions.size * 100,'%')

    n_answered = int(answered.sum())
    if n_answered == 0:
        # Nothing to score: accuracy is undefined rather than 0
        return float('nan')

    predictions = predictions[answered]
    expected = benchmark.loc[answered, 1]

    accuracy = (predictions == expected).sum() / n_answered
    return accuracy

In [None]:
# Score the Word2Vec CBOW embeddings on the benchmark
benchmark_model(embeddings_W2V_CBOW, latin_benchmark)

Valid entries: 0.0 %


0.0

In [None]:
# Score the Word2Vec skip-gram embeddings on the benchmark
benchmark_model(embeddings_W2V_SkipGram, latin_benchmark)

Valid entries: 0.0 %


0.0

In [None]:
# Score the FastText CBOW embeddings on the benchmark
benchmark_model(embeddings_FT_CBOW, latin_benchmark)

Valid entries: 100.0 %


0.45668720550924247

In [None]:
# Score the FastText skip-gram embeddings on the benchmark
benchmark_model(embeddings_FT_SkipGram, latin_benchmark)

Valid entries: 100.0 %


0.5744835085175788

Unfortunately, because the selected text is small, no benchmark entry has all 5 of its words in the vocabulary. Hence, the Word2Vec models cannot be evaluated with this setup.
The FastText results show that the models perform quite poorly, picking the right synonym only about half of the time. It is nevertheless evident that SkipGram performs much better than CBOW on this task.