This notebook evaluates the syllabifier: for every verse whose predicted syllabification differs from the correct one, it displays both syllabifications, and it then computes the percentage of correctly syllabified verses.

# Setup

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive/ so that the project files stored there are reachable.
drive.mount('/content/drive/')
# Move into the project's `src` directory.
# NOTE(review): presumably needed so the project-local modules imported in the next cell
# (`transformer`, `syllabification`) can be found — confirm.
%cd '/content/drive/My Drive/Deep Comedy/src'

Mounted at /content/drive/
/content/drive/My Drive/Deep Comedy/src


In [2]:
import logging
from IPython.utils import io

from transformer import Transformer
from syllabification import syllabify, get_tokenizers

In [3]:
logging.getLogger('tensorflow').setLevel(logging.ERROR)  # only let TensorFlow log records of level ERROR or above through (hides warnings)

In [4]:
%%capture
def cd(path):
    """Change the current working directory to ``path``.

    Implemented with ``os.chdir`` instead of the ``%cd`` line magic: a line
    magic receives the rest of its line as literal text (only ``$name`` and
    ``{name}`` are interpolated), so ``%cd path`` would target a directory
    literally called "path" and never look at the argument. ``os.chdir`` uses
    the argument as given, handles paths containing spaces, and prints nothing.

    Args:
        path: Directory to switch to.

    Raises:
        OSError: If ``path`` does not exist or is not a directory.
    """
    import os  # local import so this helper does not depend on the notebook's import cell

    os.chdir(path)

In [5]:
# Containers for the verses of each split: plain text (`*_nosyll`) and
# ground-truth syllabified text (`*_syll`). They start empty and are filled
# in place through `verse_info` when the dataset files are read.
train_verses_nosyll, val_verses_nosyll, test_verses_nosyll = [], [], []
train_verses_syll, val_verses_syll, test_verses_syll = [], [], []

# Maps a dataset key to the file holding it ('fname') and to the list that
# receives its verses ('verses'). The 'verses' entries are the very same list
# objects as the variables above, so appending through the dict fills them.
verse_info = {
    key: {'fname': fname, 'verses': verses}
    for key, fname, verses in (
        ('train_nosyll', 'training_set_nosyll.txt', train_verses_nosyll),
        ('val_nosyll', 'validation_set_nosyll.txt', val_verses_nosyll),
        ('test_nosyll', 'test_set_nosyll.txt', test_verses_nosyll),
        ('train_ground', 'training_set_ground_truth.txt', train_verses_syll),
        ('val_ground', 'validation_set_ground_truth.txt', val_verses_syll),
        ('test_ground', 'test_set_ground_truth.txt', test_verses_syll),
    )
}

%cd '/content/drive/My Drive/Deep Comedy/syllabifications'

for k in verse_info.keys():
    with open(verse_info[k]['fname']) as f:
        for verse in f:
            verse = verse.rstrip('\n')
            verse_info[k]['verses'].append(f'<{verse}>')

/content/drive/My Drive/Deep Comedy/syllabifications


In [6]:
# Instantiate the syllabification model.
# NOTE(review): these hyperparameters presumably have to match the ones used when the
# checkpoint restored later in this notebook was trained — confirm before changing any.
syllabifier = Transformer(
    num_layers=2,
    d_model=128,
    num_heads=2,
    dff=32,
    input_vocab_size=80+1,  # NOTE(review): looks like "vocabulary size + 1" — confirm against the unsyllabified-text tokenizer
    target_vocab_size=81+1,  # NOTE(review): looks like "vocabulary size + 1" — confirm against the syllabified-text tokenizer
    pe_input=1000,
    pe_target=1000,
    rate=0.1)

In [7]:
# Switch to the project root so that the relative checkpoint path below resolves.
%cd '/content/drive/My Drive/Deep Comedy'

# Restore the saved weights into `syllabifier`. The call is the last expression of the
# cell, so the load-status object it returns is displayed as the cell output.
syllabifier.load_weights('./syllabification_weights/')

/content/drive/My Drive/Deep Comedy


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5720398750>

In [8]:
# Obtain the two tokenizers later passed to `syllabify`: one for plain verses and one for
# syllabified verses. As the cell output shows, this call downloads the source texts.
tokenizer_nosyll, tokenizer_syll = get_tokenizers()

Downloading data from https://raw.githubusercontent.com/asperti/Dante/main/inferno.txt
Downloading data from https://raw.githubusercontent.com/asperti/Dante/main/purgatorio.txt
Downloading data from https://raw.githubusercontent.com/asperti/Dante/main/paradiso.txt
Downloading data from https://raw.githubusercontent.com/asperti/Dante/main/inferno_syllnew.txt
Downloading data from https://raw.githubusercontent.com/asperti/Dante/main/purgatorio_syllnew.txt
Downloading data from https://raw.githubusercontent.com/asperti/Dante/main/paradiso_syllnew.txt


# Dataset Syllabification

So that syllabifications that have already been computed can be recovered after a crash or a disconnection, we first write them to a file.

In [10]:
def write_predictions(verses_nosyll, verses_syll, fname):
    """Syllabify every verse with the model and write one prediction per line to ``fname``.

    The file is opened once, in write mode, before the loop starts, so a
    relative ``fname`` is resolved against the working directory at call time
    and any existing file of that name is overwritten. Each prediction is
    flushed as soon as it is written, so that the lines produced so far are
    handed to the operating system even if the runtime is killed or
    disconnected midway.

    Args:
        verses_nosyll: Verses to syllabify, each wrapped in '<' and '>'.
        verses_syll: Matching ground-truth verses. Only their number is used:
            it bounds the iteration (through ``zip``) and appears in the
            progress message.
        fname: Path of the output file.
    """
    with open(fname, 'w', encoding='utf-8') as f:
        for i, (verse, _) in enumerate(zip(verses_nosyll, verses_syll)):
            # NOTE(review): presumably `syllabify` expects the project root as
            # working directory — confirm. The directory change is wrapped so
            # that anything it prints is swallowed.
            with io.capture_output():
                cd('/content/drive/My Drive/Deep Comedy')
            print(f'{i}/{len(verses_syll)}')

            prediction = syllabify(verse, syllabifier, tokenizer_nosyll, tokenizer_syll)

            with io.capture_output():
                cd('/content/drive/My Drive/Deep Comedy/syllabifications')
            # Only the first element of the result is kept, stripped of the
            # '<' and '>' markers.
            f.write(f"{prediction[0].replace('<','').replace('>','')}\n")
            # Push the line out immediately: partial results must survive an
            # abrupt stop of the kernel.
            f.flush()

In [11]:
def store_predictions(fname):
    """Load the predictions previously saved in ``fname``.

    Despite its name, this function only reads: it switches to the
    syllabifications directory and returns the content of ``fname``, one list
    item per line, without the trailing newline. The file is decoded as UTF-8
    explicitly because the verses contain non-ASCII characters.

    Args:
        fname: Name of the predictions file inside the syllabifications
            directory.

    Returns:
        list[str]: The saved predictions, in file order.
    """
    # Anything printed by the directory change is swallowed.
    with io.capture_output():
        cd('/content/drive/My Drive/Deep Comedy/syllabifications')
    with open(fname, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

In [12]:
def print_mistakes(verses_nosyll, predictions, ground_truths):
    """Print every wrongly syllabified verse and return the mismatches.

    The three sequences are walked in parallel (stopping at the shortest).
    A prediction counts as wrong when it differs from the ground truth with
    its '<' and '>' characters removed. For each wrong prediction the input
    verse, the model output and the correct syllabification are printed,
    followed by an empty line.

    Args:
        verses_nosyll: Input verses, wrapped in '<' and '>'.
        predictions: Predicted syllabifications.
        ground_truths: Correct syllabifications, wrapped in '<' and '>'.

    Returns:
        list[tuple[str, str]]: ``(ground_truth, prediction)`` pairs, with the
        ground truth already stripped of its markers.
    """
    mistakes = []
    for verse, predicted, raw_truth in zip(verses_nosyll, predictions, ground_truths):
        expected = raw_truth.replace('<', '').replace('>', '')
        if predicted == expected:
            continue

        report = (
            f'Syllabifier input:       {verse}',
            f'Syllabifier output:      {predicted}',
            f'Correct syllabification: {expected}',
        )
        for line in report:
            # Drop every '<' and any trailing '>' from the displayed line.
            print(line.replace('<', '').rstrip('>'))
        print()
        mistakes.append((expected, predicted))
    return mistakes

In [13]:
def print_ratio(verses_nosyll, mistakes):
    """Print how many verses were syllabified correctly, with the percentage.

    Args:
        verses_nosyll: All verses that were evaluated; only their number is used.
        mistakes: The wrongly syllabified verses; only their number is used.

    With an empty ``verses_nosyll`` no percentage can be computed, so "n/a"
    is printed in its place instead of dividing by zero.
    """
    total = len(verses_nosyll)
    correct = total - len(mistakes)
    if total == 0:
        print(f'Number of correctly syllabified verses: {correct}/{total} (n/a)')
        return
    acc = correct / total
    print(f'Number of correctly syllabified verses: {correct}/{total} ({acc*100:.1f}%)')

In [19]:
%cd '/content/drive/My Drive/Deep Comedy/syllabifications'

/content/drive/My Drive/Deep Comedy/syllabifications


## Test Set

In [None]:
#write_predictions(test_verses_nosyll, test_verses_syll, 'test_set_prediction.txt')

In [54]:
# Read the test-set predictions back from the file they were saved to.
test_set_predictions = store_predictions('test_set_prediction.txt')

In [55]:
# Print each test verse whose prediction differs from the ground truth and keep the
# (ground truth, prediction) pairs for the accuracy computation.
test_set_mistakes = print_mistakes(test_verses_nosyll, test_set_predictions, test_verses_syll)

Syllabifier input:       io dico d’Aristotile e di Plato
Syllabifier output:      |io |di|co |d’ A|ri|sto|ti|le |di |Pla|to
Correct syllabification: |io |di|co |d’ A|ri|sto|ti|le e |di |Pla|to

Syllabifier input:       E non er’ anco del mio petto essausto
Syllabifier output:      |E |non |e|r’ an|co |del |mio |pet|to es|sa|u|sto
Correct syllabification: |E |non |e|r’ an|co |del |mio |pet|to es|sau|sto

Syllabifier input:       ficcava ïo sì come far suole
Syllabifier output:      |fic|ca|va ï|o |sì |co|me |far |suo|le
Correct syllabification: |fic|ca|va |ï|o |sì |co|me |far |suo|le

Syllabifier input:       vedi oggimai quant’ esser dee quel tutto
Syllabifier output:      |ve|di og|gi|mai |quan|t’ es|ser |de |quel |tut|to
Correct syllabification: |ve|di og|gi|mai |quan|t’ es|ser |dee |quel |tut|to

Syllabifier input:       Io avea già il mio viso nel suo fitto;
Syllabifier output:      |Io |a|vea |già |il |mio |vi|so |nel |suo |fit|to;
Correct syllabification: |Io |a|vea |già il |mio 

In [56]:
# Share of test verses syllabified correctly.
print_ratio(test_verses_nosyll, test_set_mistakes)

Number of correctly syllabified verses: 1384/1424 (97.2%)


## Validation Set

In [None]:
#write_predictions(val_verses_nosyll, val_verses_syll, 'validation_set_prediction.txt')

In [61]:
# Read the validation-set predictions back from the file they were saved to.
val_set_predictions = store_predictions('validation_set_prediction.txt')

In [63]:
# Print each validation verse whose prediction differs from the ground truth and keep the
# (ground truth, prediction) pairs for the accuracy computation.
val_set_mistakes = print_mistakes(val_verses_nosyll, val_set_predictions, val_verses_syll)

Syllabifier input:       e che s’incontran con sì aspre lingue,
Syllabifier output:      |e |che |s’ in|con|tran |con |sì |a|spre |lin|gue,
Correct syllabification: |e |che |s’ in|con|tran |con |sì |as|pre |lin|gue,

Syllabifier input:       volgendom’ io con li etterni Gemelli,
Syllabifier output:      |vol|gen|dom’ |io |con |li et|ter|ni |Ge|mel|li,
Correct syllabification: |vol|gen|do|m’ io |con |li et|ter|ni |Ge|mel|li,

Syllabifier input:       «Osanna, sanctus Deus sabaòth,
Syllabifier output:      « |O|san|na, |sanc|cus |De|us |sa|ba|òth,
Correct syllabification: « |O|san|na, |sanc|tus |De|us |sa|ba|òth,

Syllabifier input:       a le curule Sizii e Arrigucci.
Syllabifier output:      |a |le |cu|ru|le |Si|ziii e |Ar|ri|guc|ci.
Correct syllabification: |a |le |cu|ru|le |Si|zii |e Ar|ri|guc|ci.

Syllabifier input:       l’onor d’Agobbio e l’onor di quell’ arte
Syllabifier output:      |l’ o|nor |d’ A|gob|bio e |l’ o|n |di |quel|l’ ar|te
Correct syllabification: |l’ o|nor |d’ A|gob

In [64]:
# Share of validation verses syllabified correctly.
print_ratio(val_verses_nosyll, val_set_mistakes)

Number of correctly syllabified verses: 2819/2846 (99.1%)


## Training Set

In [None]:
#write_predictions(train_verses_nosyll, train_verses_syll, 'training_set_prediction.txt')

In [32]:
# Read the training-set predictions back from the file they were saved to.
train_set_predictions = store_predictions('training_set_prediction.txt')

In [38]:
# Print each training verse whose prediction differs from the ground truth and keep the
# (ground truth, prediction) pairs for the accuracy computation.
train_set_mistakes = print_mistakes(train_verses_nosyll, train_set_predictions, train_verses_syll)

Syllabifier input:       legge, moneta, officio e costume
Syllabifier output:      |leg|ge, |mo|ne|ta, of|fi|cio e |co|stu|me
Correct syllabification: |leg|ge, |mo|ne|ta, of|fi|cio |e |co|stu|me

Syllabifier input:       di Nostra Donna in sul lito adriano.
Syllabifier output:      |di |No|stra |Don|na in |sul |li|to a|dria|no.
Correct syllabification: |di |No|stra |Don|na in |sul |li|to a|dri|a|no.

Syllabifier input:       nel prossimo si danno, e nel suo avere
Syllabifier output:      |nel |pros|si|mo |si |dan|no, e |nel |suo |a|ve|re
Correct syllabification: |nel |pros|si|mo |si |dan|no, e |nel |suo a|ve|re

Syllabifier input:       Già eran sovra noi tanto levati
Syllabifier output:      |Già |e|ran |sov|ra |noi |tan|to |le|va|ti
Correct syllabification: |Già |e|ran |so|vra |noi |tan|to |le|va|ti

Syllabifier input:       «O Virgilio, Virgilio, chi è questa?»,
Syllabifier output:      « |O |Vir|gi|lio, |Vir|gi|lio, |chi è |que|sta?»,
Correct syllabification: « |O |Vir|gi|lio, |Vir

In [39]:
# Share of training verses syllabified correctly.
print_ratio(train_verses_nosyll, train_set_mistakes)

Number of correctly syllabified verses: 9870/9963 (99.1%)
