In [None]:
# Mount Google Drive (comment out on local)
# Shared folder link (set a shortcut to MyDrive to run): https://drive.google.com/drive/folders/1zwnMrxiQ6o_haHlEzJLAZNv223TxjRcP?usp=sharing
# from google.colab import drive
# drive._mount('/content/drive')

In [2]:
# comment out on local
# !cp '/content/drive/MyDrive/Voynich/corruptions.py' corruptions.py
# !cp '/content/drive/MyDrive/Voynich/uncertainties.py' uncertainties.py
# !cp '/content/drive/MyDrive/Voynich/validation.py' validation.py

In [None]:
# !pip install numpy==1.19.5
# !pip install scipy==1.7.3
# !pip install nltk==3.6.5
# !pip install gensim==4.1.2
# !pip install smart-open==5.2.1

In [47]:
# Paths to the input texts and to the directory where predictions are written.
# Keep exactly one group active, matching the environment the notebook runs in.

# Colab
# INFERNO_IT = '/content/drive/MyDrive/Voynich/texts/Inferno_IT.txt'
# ZL = '/content/drive/MyDrive/Voynich/texts/ZL_raw.txt'
# RESULTS_PATH = '/content/drive/MyDrive/Voynich/predictions/'

# DeepNote
# INFERNO_IT = "ml-project-2-scikit-learn2/texts/Inferno_IT.txt"
# ZL = 'ml-project-2-scikit-learn2/texts/ZL_raw.txt'
# RESULTS_PATH = 'ml-project-2-scikit-learn2/predictions/'

# Local
INFERNO_IT = 'texts/Inferno_IT.txt'
ZL = 'texts/ZL_raw.txt'
# RESULTS_PATH used to be defined only in the Colab group, so the "Save to file"
# cell raised NameError with the local configuration active.
RESULTS_PATH = 'predictions/'

In [42]:
import numpy as np
import numpy.linalg as npl
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import string
import nltk
import re
import gensim
from gensim.models import Word2Vec, FastText
from gensim.corpora.dictionary import Dictionary
nltk.download('punkt')

import corruptions as corr
import uncertainties as unc
import validation as valid

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocess text

In [6]:
# Load the whole ZL transliteration into memory; the file is decoded as latin-1.
with open(ZL, encoding='latin-1') as zl_file:
    voynich = zl_file.read()

In [7]:
# Remove every word that contains the multiple-uncertainty marker '???'.
# A "word" here is a maximal run of characters other than space and newline.
# The pattern is a raw string: '\?' is not a valid Python string escape, so the
# non-raw literal triggers an invalid-escape warning on recent Python versions.
# The regex engine itself interprets '\n' and '\?', so the match is unchanged.
voynich = re.sub(r'[^ \n]*\?\?\?[^ \n]*', '', voynich)

In [8]:
# Keep only the non-empty lines of the transliteration, then preview the first ten.
voynich_lines = [row for row in voynich.splitlines() if row]
voynich_lines[:10]

['fachys ykal ar ataiin shol shory [cth:oto]res y kor sholdy',
 'sory ckhar or,y kair chtaiin shar ase cthar cthar,dan',
 'syaiir sheky or ykaiin shod cthoary cthes daraiin sy',
 'soiin oteey oteo[s:r],roloty cthiar,daiin okaiin or okan',
 'sair,y chear cthaiin cphar cfhaiin',
 'ydaraishy',
 "odar c'y shol cphoy oydar sh s cfhoaiin shodary",
 'yshey shody okchoy otchol chocthy os,chy dain chor kos',
 'daiin shos cfhol shody',
 'dain os teody']

In [9]:
# Paragraphs are separated by a blank line; flatten each one onto a single line
# by turning its internal newlines into spaces.
voynich_paragraphs = [paragraph.replace('\n', ' ')
                      for paragraph in voynich.split('\n\n')]
voynich_paragraphs[:10]

['fachys ykal ar ataiin shol shory [cth:oto]res y kor sholdy sory ckhar or,y kair chtaiin shar ase cthar cthar,dan syaiir sheky or ykaiin shod cthoary cthes daraiin sy soiin oteey oteo[s:r],roloty cthiar,daiin okaiin or okan sair,y chear cthaiin cphar cfhaiin ydaraishy',
 "odar c'y shol cphoy oydar sh s cfhoaiin shodary yshey shody okchoy otchol chocthy os,chy dain chor kos daiin shos cfhol shody dain os teody",
 'ydain cphesaiin ols cphey ytain shoshy cphodal,es oksho kshoy otairin oteol okan shodain sckhey daiin shoy ckhey kodaiin cphy cphodaiils cthey sho oldain d dain oiin chol odaiin chodain chdy ok[a:o]in d?n cthy kod daiin shckhey ckeo r char shey kol chol chol kor chal sho chol shodan kshy kchy d or chodaiin sho koeam ycho tchey chekain sheo,pshol dydyd cthy dai[cto:Â]y yto shol she kodshey cphealy dar,ain dain ckhyds dchar shcthaiin okaiir chey Àchy \x82tol cthols dlocto shok chor chey dain ckhey otol daiiin',
 'cpho shaiin shokcheey chol tshodeesy shey pydeey chy r,o,d ??doin

## Build vocabulary

In [10]:
# Marker strings of the transliteration, handed to unc.contextualize_sentence.
voynich_uncertainty_chars = dict(
    ALTERNATE_CHAR='ž',
    SINGLE_UNCERTAINTY='?',
    DOUBLE_UNCERTAINTY='??',
    MULTIPLE_UNCERTAINTY='???',
    UNCERTAIN_SPACE=',',
)

In [11]:
# Sorted set of distinct characters occurring in the text.
np.unique(list(voynich))

array(['\n', ' ', "'", '*', ',', ':', '?', 'I', '[', ']', 'a', 'b', 'c',
       'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
       'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '\x82', '\x83',
       '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b',
       '\x8c', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
       '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9f', '¡', '¢', '£', '¤',
       '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²',
       '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿',
       'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Ç', 'È', 'É', 'Ë', 'Ì', 'Î', 'Ï',
       'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'ð', 'ñ', 'ò', 'ó', 'ô',
       'õ', 'ö'], dtype='<U1')

In [12]:
# For practical purposes, we generate alternatives only for the basic EVA
# characters (lowercase ASCII letters) and not for the rare characters.
# The commented-out continuation below would append the rare characters.
voynich_alphabet = string.ascii_lowercase #+ ''.join(['\x82', '\x83',
    #    '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b',
    #    '\x8c', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    #    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9f', '¡', '¢', '£', '¤',
    #    '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²',
    #    '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿',
    #    'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Ç', 'È', 'É', 'Ë', 'Ì', 'Î', 'Ï',
    #    'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'ð', 'ñ', 'ò', 'ó', 'ô',
    #    'õ', 'ö'])

In [13]:
# Model 1: uncertain words are removed.
# Every call to unc.contextualize_sentence is unpacked below as a pair:
# (uncertainties found in the sentence, processed sentence).
contextualized_lines = [
    unc.contextualize_sentence(line, line,
                               voynich_uncertainty_chars, voynich_alphabet,
                               is_voynich=True)
    for line in voynich_lines
]
contextualized_pars = [
    unc.contextualize_sentence(paragraph, paragraph,
                               voynich_uncertainty_chars, voynich_alphabet,
                               is_voynich=True)
    for paragraph in voynich_paragraphs
]

# Transpose the list of pairs into (all uncertainties, all sentences).
uncertainty_list_nounc, lines_nounc = zip(*contextualized_lines)
_, pars_nounc = zip(*contextualized_pars)

# Flatten the per-sentence uncertainties into one list.
uncertainty_list_nounc = [entry
                          for per_sentence in uncertainty_list_nounc
                          for entry in per_sentence]

# Tokenize on single spaces.
lines_nounc_tokenized = [text.split(' ') for text in lines_nounc]
pars_nounc_tokenized = [text.split(' ') for text in pars_nounc]

In [14]:
# One single-word "sentence" for each word of every alternative reading.
vocab_nounc = [
    [token]
    for entry in uncertainty_list_nounc
    for reading in entry.alternatives_list
    for token in reading.split(' ')
]

In [15]:
# Model 2: corrupted letters are replaced (convert_uncertainties='£').
# Every call to unc.contextualize_sentence is unpacked below as a pair:
# (uncertainties found in the sentence, processed sentence).
lines_repunc = [unc.contextualize_sentence(sentence, sentence, 
                                           voynich_uncertainty_chars, voynich_alphabet,
                                           convert_uncertainties='£', is_voynich=True)
                for sentence in voynich_lines]

pars_repunc = [unc.contextualize_sentence(sentence, sentence, 
                                          voynich_uncertainty_chars, voynich_alphabet,
                                          convert_uncertainties='£', is_voynich=True)
                for sentence in voynich_paragraphs]

uncertainty_list_repunc, lines_repunc = list(zip(*lines_repunc))
_, pars_repunc = list(zip(*pars_repunc))

# Merge list of different sentences
uncertainty_list_repunc = [item for sentence in uncertainty_list_repunc for item in sentence]

# Split words.
# Fix: these used to tokenize `lines_nounc` (Model 1 text) and `lines_repunc`
# (lines instead of paragraphs), so Model 2 never saw its own corpora.
lines_repunc_tokenized = [sentence.split(' ') for sentence in lines_repunc]
pars_repunc_tokenized = [sentence.split(' ') for sentence in pars_repunc]

In [16]:
# One single-word "sentence" for each word of every alternative reading.
vocab_repunc = [
    [token]
    for entry in uncertainty_list_repunc
    for reading in entry.alternatives_list
    for token in reading.split(' ')
]

## Creating embeddings

In [17]:
# Model 1B
# build_vocab: when truthy, the words of the uncertainty alternatives
# (uncert_vocab) are added to the paragraph model's vocabulary, and `how` is
# set to 'softmax' instead of 'cosine'.
build_vocab = 1
# convert_uncertainties: when truthy, the "repunc" (Model 2) corpora and
# uncertainty list are used; otherwise the "nounc" (Model 1) ones.
convert_uncertainties = 0

In [18]:
# Pick the corpora, uncertainty list and alternative vocabulary for the chosen variant.
if convert_uncertainties:
    text_lines, text_pars = lines_repunc_tokenized, pars_repunc_tokenized
    uncert_list, uncert_vocab = uncertainty_list_repunc, vocab_repunc
else:
    text_lines, text_pars = lines_nounc_tokenized, pars_nounc_tokenized
    uncert_list, uncert_vocab = uncertainty_list_nounc, vocab_nounc

# NOTE(review): `how` is not passed to any call in the visible cells — confirm
# whether valid.predict_uncertainty is supposed to receive it.
how = 'cosine' if not build_vocab else 'softmax'

In [19]:
# Hyper-parameters forwarded as keyword arguments to both FastText models.
params = dict(
    alpha=0.05,
    epochs=20,
    max_n=6,
    min_alpha=0.005,
    min_count=2,
    min_n=2,
    negative=20,
    sg=0,
    vector_size=300,
    window=5,
)

In [20]:
# Reuse the models' context-window size when calling valid.predict_uncertainty.
window = params['window']

In [21]:
# FastText model trained on the line-level corpus.
model_lines = FastText(**params, seed=42)
# The line model's vocabulary comes from the lines only; extending it with the
# uncertainty alternatives is left disabled:
# vocab += uncert_vocab if build_vocab else []
vocab = text_lines
model_lines.build_vocab(corpus_iterable=vocab)
model_lines.train(corpus_iterable=text_lines,
                  epochs=model_lines.epochs,
                  total_examples=model_lines.corpus_count)

(451439, 663200)

In [22]:
# FastText model trained on the paragraph-level corpus.
model_pars = FastText(seed=42, **params)
# Build a NEW list instead of `vocab = text_pars; vocab += ...`: the in-place
# `+=` extended text_pars (and the tokenized corpus it aliases) through the
# shared reference, so every re-run of this cell appended uncert_vocab again.
vocab = text_pars + (uncert_vocab if build_vocab else [])
model_pars.build_vocab(vocab)
# Train on the same corpus the vocabulary was built from, so that
# `corpus_count` matches `total_examples`. On a fresh kernel this is exactly
# what the original cell trained on, because text_pars had been extended in place.
model_pars.train(vocab, total_examples=model_pars.corpus_count,
                 epochs=model_pars.epochs)

(654008, 1204880)

## Disambiguating uncertainties

In [84]:
# Score the alternatives of every uncertainty with the line model, then the
# paragraph model (same window and uncertainty list for both).
res_lines, res_pars = (
    valid.predict_uncertainty(model, window, uncert_list)
    for model in (model_lines, model_pars)
)

In [85]:
# Each result is indexed as [0] -> ambiguous word, [1] -> scored alternatives;
# the prediction is the first field of the first scored alternative.
ambiguous_words = [result[0] for result in res_lines]

score_lines = [result[1] for result in res_lines]
prediction_lines = [ranking[0][0] for ranking in score_lines]

score_pars = [result[1] for result in res_pars]
prediction_pars = [ranking[0][0] for ranking in score_pars]

In [86]:
# Gather words, predictions and scores of both models in one table.
prediction_columns = {
    'Ambiguous word': ambiguous_words,
    'Prediction_lines': prediction_lines,
    'Prediction_paragraphs': prediction_pars,
    'Scores_lines': score_lines,
    'Scores_paragraphs': score_pars,
}
df = pd.DataFrame(prediction_columns)

In [87]:
# Display the predictions table.
df

Unnamed: 0,Ambiguous word,Prediction_lines,Prediction_paragraphs,Scores_lines,Scores_paragraphs
0,[cth:oto]res,otores,otores,"[(otores, 0.41193393), (cthres, 0.31876117)]","[(otores, 0.34352824), (cthres, 0.29783356)]"
1,"or,y",ory,ory,"[(ory, 0.41524306), (or y, 0.3787379)]","[(ory, 0.26372424), (or y, 0.20780554)]"
2,"cthar,dan",cthardan,cthardan,"[(cthardan, 0.6236899), (cthar dan, 0.5733931)]","[(cthardan, 0.639566), (cthar dan, 0.5516057)]"
3,"oteo[s:r],roloty",oteorroloty,oteorroloty,"[(oteorroloty, 0.5720313), (oteosroloty, 0.546...","[(oteorroloty, 0.5448401), (oteosroloty, 0.488..."
4,"cthiar,daiin",cthiardaiin,cthiardaiin,"[(cthiardaiin, 0.38909966), (cthiar daiin, 0.1...","[(cthiardaiin, 0.32427838), (cthiar daiin, 0.1..."
...,...,...,...,...,...
3253,sh??y,shyky,shchy,"[(shyky, 0.41128594), (shchy, 0.40910923), (sh...","[(shchy, 0.53584903), (shkhy, 0.4851367), (shy..."
3254,qoka[r:s],qokas,qokas,"[(qokas, 0.024066824), (qokar, -0.018024012)]","[(qokas, 0.010914943), (qokar, -0.025959097)]"
3255,"sysor,shey",sysorshey,sysorshey,"[(sysorshey, 0.46524483), (sysor shey, 0.28443...","[(sysorshey, 0.41452068), (sysor shey, 0.27122..."
3256,"ch,al",chal,chal,"[(chal, 0.68826514), (ch al, 0.26655424)]","[(chal, 0.66760236), (ch al, 0.15376556)]"


In [89]:
# Save to file.
# RESULTS_PATH must be defined in the path-configuration cell. The directory is
# created first because to_csv does not create missing parent directories.
os.makedirs(RESULTS_PATH, exist_ok=True)
df.to_csv(os.path.join(RESULTS_PATH, 'predictions.csv'))

In [92]:
# Share (as a fraction in [0, 1]) of ambiguous words for which the line-level
# and paragraph-level predictions coincide.
df['Prediction_lines'].eq(df['Prediction_paragraphs']).mean()

0.879987722529159