In [1]:
INFERNO_LAT = "ml-project-2-scikit-learn2/texts/DivineComedyLatinOCR.txt"
LATIN_AIX = "ml-project-2-scikit-learn2/texts/Latin_Aix.txt"
BENCHMARK_LAT = "ml-project-2-scikit-learn2/benchmarks/latin_benchmark.tsv"

In [None]:
!pip install cltk==1.0.21
!pip install gensim==4.1.2

Collecting gensim<4.0.0,>=3.8.1
  Using cached gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scipy
import nltk
import gensim
from gensim.models import Word2Vec, FastText
from gensim.corpora.dictionary import Dictionary
import math

from cltk.tokenizers.lat.lat import LatinPunktSentenceTokenizer
from cltk.tokenizers.lat.lat import LatinWordTokenizer
from cltk.data.fetch import FetchCorpus
corpus_downloader = FetchCorpus(language="lat")
corpus_downloader.list_corpora
corpus_downloader.import_corpus("lat_models_cltk")
gensim.__version__


'4.1.2'

In [None]:
with open(LATIN_AIX, "r") as doc:
    text = doc.read().replace("\n", " ")

TypeError: object of type '_io.TextIOWrapper' has no len()

In [None]:
def tokenize_latin_to_words(txt):
    '''
    tokenize text, casefold to lowercase and remove punctuation.
    '''
    word_tokenizer = LatinWordTokenizer()
    #tokens = [word_tokenizer.tokenize(sentence) for sentence in txt]
    tokenized_text = word_tokenizer.tokenize(txt)
    words = [[word.lower() for word in tokenized_text if word.isalpha()]]
    return words

In [None]:
print(len(text))
tokenized = tokenize_latin_to_words(text)
print(len(tokenized[0]))
tokenized[:20]

900782
126091


[['albert',
  'of',
  'aixhistoria',
  'hierosolymitanae',
  'expeditionisliber',
  'icap',
  'i',
  'prooemium',
  'sequentis',
  'operis',
  'incipit',
  'liber',
  'primus',
  'expeditionis',
  'hierosolymitanae',
  'urbis',
  'ubi',
  'clarissimi',
  'ducis',
  'godefridi',
  'inclita',
  'gesta',
  'narrantur',
  'cujus',
  'labore',
  'et',
  'studio',
  'civitas',
  'sancta',
  'ab',
  'infidelibus',
  'liberata',
  'sanctae',
  'ecclesiae',
  'filiis',
  'est',
  'restituta',
  'diu',
  'multum',
  'his',
  'usque',
  'diebus',
  'ob',
  'inaudita',
  'et',
  'plurimum',
  'admiranda',
  'saepius',
  'accensus',
  'sum',
  'desiderio',
  'ejusdem',
  'expeditionis',
  'et',
  'faciendae',
  'orationis',
  'illic',
  'dum',
  'ferverem',
  'sed',
  'cum',
  'minime',
  'ob',
  'diversa',
  'impedimenta',
  'intentioni',
  'meae',
  'effectus',
  'daretur',
  'temerario',
  'ausu',
  'decrevi',
  'saltem',
  'ex',
  'his',
  'aliqua',
  'memoriae',
  'commendare',
  'quae',
  'au

# Embedings

In [None]:
# Word2Vec parameters
min_count = 1                # minimum number of  word occurrencies
vector_size = 100            # embeddings size
window = 5                   # context window size
alpha = 0.5                  # initial learning rate
min_alpha = 0.5              # limit learning rate
epochs = 20                  # epochs number
negative = 20                # number of draws for negative sampling

In [None]:
embeddings_W2V_CBOW = Word2Vec(tokenized, min_count=min_count, vector_size=vector_size, 
                               window=window, sg=0, negative=negative, alpha=alpha,
                               min_alpha=min_alpha, epochs=epochs)

In [None]:
embeddings_W2V_SkipGram = Word2Vec(tokenized, min_count=min_count, vector_size=vector_size, 
                               window=window, sg=1, negative=negative, alpha=alpha,
                               min_alpha=min_alpha, epochs=epochs)

In [None]:
# FastText parameters
min_count = 1                # minimum number of  word occurrencies
vector_size = 100            # embeddings size
window = 5                   # context window size
alpha = 0.05                 # initial learning rate
min_alpha = 0.0001           # limit learning rate
epochs = 20                  # epochs number
negative = 20                # number of draws for negative sampling
min_n = 3                    # min length of char ngram
max_n = 6                    # max length of char ngram

In [None]:
embeddings_FT_CBOW = FastText(tokenized, min_count=min_count, vector_size=vector_size,
                              window=window, sg=0, negative=negative, alpha=alpha,
                              min_alpha=min_alpha, epochs=epochs,
                              min_n=min_n, max_n=max_n)

In [None]:
embeddings_FT_SkipGram = FastText(tokenized, min_count=min_count, vector_size=vector_size,
                              window=window, sg=1, negative=negative, alpha=alpha,
                              min_alpha=min_alpha, epochs=epochs,
                              min_n=min_n, max_n=max_n)

# Evaluate the Embeddings with benchmark

In [None]:
# Only use first 2 columns, for synonyms
latin_benchmark = pd.read_csv(BENCHMARK_LAT, sep='\t', header = None, usecols=[0,2])

#Build 2 way dictionary
dictionary1 = dict(latin_benchmark.values)
dictionary = dict(latin_benchmark.values)
# example: inclino only appears as synonym for word acclino and never again... So we need to map inclino to acclino (Reverse map)
for k,v in dictionary1.items():
    if v not in dictionary1:
        dictionary[v] = k

Now we have synonym dictionary and can evaluate our latin embeddings

In [None]:
#For now see how many simmilar words specific model can match
#So far no words can be matched at all
def calculate_matches(model, word_set):
    score = 0
    triedWords = 0
    for word in word_set:
        if word in dictionary:
            pos_synonyms = model.wv.similar_by_word(word)
            true_synonym = dictionary[word]
            if true_synonym not in model.wv.key_to_index:
                continue;
            triedWords += 1
            sim = 0
            for syn, val in pos_synonyms:
                # Not sure how to fully deal with cosine similarity the values are between [-1;1] and -1 means oposite word.
                # For now I will assume man and woman are synonyms
                sim_ = model.wv.similarity(syn, true_synonym)
                sim += sim_
            #avg similarity
            
            score += sim/len(pos_synonyms)
    #print("Tried words = " + str(triedWords) + " Score = " + str(score/triedWords)) 
    return score/triedWords

In [None]:
word_list = [word for sentence in tokenized for word in sentence]

# Text still contains many artifacts...
print(len(word_list))
print(len(set(word_list)))
print("#"*80)
word_set = set(word_list)

print("Word 2 Vec CBOW score: ")
print(calculate_matches(embeddings_W2V_CBOW,word_set))
print("Word 2 Vec Skipgram score: ")
print(calculate_matches(embeddings_W2V_SkipGram,word_set))
print("Fast Text CBOW score: ")
print(calculate_matches(embeddings_FT_CBOW,word_set))
print("Fast Text Skipgram score: ")
print(calculate_matches(embeddings_FT_SkipGram,word_set))

126091
19610
################################################################################
Word 2 Vec CBOW score: 
0.03081980943570389
Word 2 Vec Skipgram score: 
0.1299283536143155
Fast Text CBOW score: 
0.99974467206833
Fast Text Skipgram score: 
0.9330448905857963


Baseline...

In [None]:
dsds

# Report
1. Abstract [Jirka]
2. intro [Jirka]
- Voynich (we have to explain our problem => transliteration => types of uncertainties & their frequencies {table/bar graph})
- Embeddings
3. Methods [Choosing Fasttext vs Word2vec] [Francesco]
- describe what they are
- describe cbow vs skipgram
- how we choose one over the other
- eval: nearest neighbours (italian), benchmark comparison (latin)
4. Methods [Liudvikas]
- baseline
- noising text
- our model
5. Results on Latin/Italian
6. Results on Voynich
7. Conclusion


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=232308e3-5ead-45f3-9639-338eb193dfaf' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>