# Setup notebook, imports and predefined functions

## Notebook magics

In [1]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell is executed, so edits to the nhpylm sources are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

## Some imports

In [2]:
from nhpylm.lexicon import build_fst_for_lexicon
import os
from nhpylm.c_core import nhpylm
from tqdm import tqdm
from nhpylm import fst
from nhpylm.kaldi_data_preparation import convert_transcription, word_to_grapheme
import json
from nhpylm import json_utils as ju

## Output directory

In [3]:
# Every symbol and FST file produced by this notebook is written into this directory.
# The trailing slash matters: later cells build file paths by plain string concatenation.
output_directory = 'lattice_playground/c-l-e-p/'
os.makedirs(output_directory, exist_ok=True)  # no error if the directory already exists

## Some predefined functions

### Combined write and print/display function

In [4]:
def write_and_print_fst(fst_graph, fst_filename, sym_filename, **kwargs):
    """Write ``fst_graph`` to ``fst_filename`` and return its printed form.

    Args:
        fst_graph: Object providing ``write_fst(filename, **options)``.
        fst_filename: Path the FST is written to and afterwards printed from.
        sym_filename: Symbol file, passed to ``fst.print`` for both of its
            symbol-file arguments.
        **kwargs: Options forwarded to ``fst_graph.write_fst``. ``determinize``
            defaults to ``False`` unless given explicitly.

    Returns:
        Whatever ``fst.print`` returns for the written file.
    """
    # Caller-supplied options take precedence over the default.
    write_options = dict({'determinize': False}, **kwargs)
    fst_graph.write_fst(fst_filename, **write_options)
    return fst.print(fst_filename, sym_filename, sym_filename)

### Convert a list of sentences to a list of lists of lists of units (sentences, words, units)

In [5]:
def text_to_splitted_words(text):
    """Split every sentence in ``text`` into words and every word into units.

    Each line is passed through ``convert_transcription`` with a
    ``word_to_grapheme(join=False)`` unit converter; the element at index 1 of
    each result is collected.

    Args:
        text: Iterable of sentence strings.

    Returns:
        List with one converted entry per input line.
    """
    splitted_lines = []
    for sentence in text:
        converted = convert_transcription(
            sentence, word_to_units=word_to_grapheme(join=False))
        # Only the second element of the conversion result is kept.
        splitted_lines.append(converted[1])
    return splitted_lines

### Return all unique units from the converted sentences

In [6]:
def find_symbols(text):
    """Collect the distinct symbols of a sentence/word/symbol nested structure.

    Args:
        text: Iterable of lines, each an iterable of words, each an iterable
            of symbols.

    Returns:
        Set containing every symbol that occurs at least once.
    """
    unique_symbols = set()
    for sentence in text:
        for word in sentence:
            unique_symbols.update(word)
    return unique_symbols

### Write a symbol list to a symbol file, mapping each symbol to its index in the list (0 to N_symbols - 1)

In [7]:
def write_symbols(symbols, sym_file):
    """Write one ``<symbol> <index>`` line per symbol to ``sym_file``.

    Indices are the positions of the symbols in ``symbols``, starting at 0.
    An existing file is overwritten.

    Args:
        symbols: Iterable of symbols, in the order that defines their indices.
        sym_file: Path of the symbol file to create.
    """
    with open(sym_file, 'w') as out:
        out.writelines(
            '{} {}\n'.format(symbol, index)
            for index, symbol in enumerate(symbols)
        )

### Convert a list of sentences of word ids to a list of lists of lists of character ids (one character-id list per word)

In [8]:
def word_ids_to_char_ids(word_ids, wlm_order, clm_order, lm_order, int_eow, lm):
    """Expand sentences of word ids into per-word character-id sequences.

    Args:
        word_ids: List of sentences, each a sequence of word ids.
        wlm_order: The first ``wlm_order - 1`` entries of every sentence are
            skipped.
        clm_order: The first ``clm_order - 1`` entries of every looked-up word
            are skipped.
        lm_order: Every produced word is prefixed with ``lm_order - 1`` copies
            of ``int_eow``.
        int_eow: Id used for the prefix padding.
        lm: Object whose ``id2word(word_id)`` returns the character-id
            sequence of a word.

    Returns:
        One list per sentence, holding one character-id list per kept word.
    """
    first_word = wlm_order - 1
    first_char = clm_order - 1
    padding_length = lm_order - 1
    return [
        [
            [int_eow] * padding_length + list(lm.id2word(word_id)[first_char:])
            for word_id in sentence[first_word:]
        ]
        for sentence in word_ids
    ]

# Get train data, split into characters and get symbols

In [9]:
# Two toy training sentences.
train_data = ['Martian Marsman', 'man on Mars']
# Per sentence a list of words, each word split into its units.
train_data_splitted = text_to_splitted_words(train_data)
# Set of all distinct units that occur in the training data.
symbols = find_symbols(train_data_splitted)

# Instantiate dictionary and transform training sentences

In [10]:
# Wrapper built over the training symbols; the following cells use it for
# symbol <-> id lookups.
# NOTE(review): the two integer arguments are presumably model orders —
# confirm against NHPYLM_wrapper's signature.
wc_lm = nhpylm.NHPYLM_wrapper(list(symbols), 1, 1)
# Presumably the training sentences with every word replaced by its id.
train_data_word_ids = wc_lm.word_lists_to_id_lists(train_data_splitted)

# Write symbol file

In [11]:
sym_filename = output_directory + 'symbols.txt'
# NOTE(review): string_ids is presumably the list of symbol strings ordered by
# id, so that the index written by write_symbols equals the symbol's id — confirm.
word_list = wc_lm.string_ids

write_symbols(word_list, sym_filename)

# Get int versions of lexicon and labels

In [12]:
# Integer views of the dictionary. int_lexicon, int_eos_word and int_labels are
# not used by the cells shown in this part of the notebook.
int_lexicon = wc_lm.get_word_id_to_char_id()
int_eos_word = wc_lm.sentence_boundary_id
int_labels = wc_lm.get_char_ids()
# Ids of the special symbols used when building and composing the FSTs below.
int_eps = wc_lm.sym2id('EPS')
int_eow = wc_lm.sym2id('EOW')
int_eoc = wc_lm.sym2id('EOC')
int_phi = wc_lm.sym2id('PHI')
int_eos_label = wc_lm.sym2id('EOS')

## Instantiate character LM, add training sentences and resample hyper parameters

In [13]:
# Single source for the order used both by the character LM and by the history
# padding of the training words, so the two values cannot drift apart.
# NOTE(review): the second positional argument of NHPYLM_wrapper is presumably
# the character n-gram order — confirm against the wrapper's signature.
char_lm_order = 8
c_lm = nhpylm.NHPYLM_wrapper([], char_lm_order, 0, 1/(len(symbols) + 2), 'EOW')
# Every word becomes its own sequence, prefixed with (char_lm_order - 1) EOW ids.
train_data_char_ids = word_ids_to_char_ids(train_data_word_ids, 1, 1, char_lm_order, int_eow, wc_lm)

In [14]:
# Add every word's character-id sequence to the character LM as a separate
# sentence; the progress bar advances once per training sentence.
for line in tqdm(train_data_char_ids):
    for word in line:
        c_lm.add_id_sentence_to_lm(word)



# Get FST for language model

In [15]:
G_fst_filename = output_directory + 'G.fst'
# Only the arc list is needed; the first return value is discarded.
# NOTE(review): int_eoc (not int_eow) is passed as `eow` here — presumably
# intentional; confirm.
_, arc_list = c_lm.to_fst_text_format(eow=int_eoc, return_to_start=True)
G_fst = fst.build_fst_from_arc_list(arc_list)
# Write G with minimization, determinization and epsilon removal all switched off.
G_fst.write_fst(G_fst_filename, minimize=False, determinize=False, rmepsilon=False)
# Last expression of the cell, so its value becomes the cell output. The same
# symbol file is passed twice, presumably for input and output labels.
fst.print(G_fst_filename, sym_filename, sym_filename)



# Character sequence

## Some character sequence

In [16]:
# The first training sentence ('Martian Marsman') with the space removed.
character_sequence = 'MartianMarsman'

## Build and print FST for character sequence

In [17]:
# Map every character of the sequence to its symbol id.
int_sequence = [wc_lm.sym2id(character) for character in character_sequence]

I_fst_filename = output_directory + 'I.fst'
# Build the FST from the id sequence with the EOS label id appended.
character_sequence_fst = fst.build_fst_for_sequence(int_sequence + [int_eos_label])
# determinize=False is already the helper's default; it is passed explicitly here.
write_and_print_fst(character_sequence_fst, I_fst_filename, sym_filename, minimize=False, determinize=False)

## Add loops for word end/disambiguation symbols (eow and eoc)

In [18]:
I_loop_fst_filename = output_directory + 'I_loop.fst'
# Both calls are made for their effect on character_sequence_fst (return values
# are discarded): one self loop for EOW and one for EOC, each with EPS as the
# first argument.
# NOTE(review): re-running this cell calls add_self_loops again on the same
# object — rebuild character_sequence_fst first if that is not idempotent.
character_sequence_fst.add_self_loops(int_eps, int_eow, int_eow, mode='after')
character_sequence_fst.add_self_loops(int_eps, int_eoc, int_eoc, mode='after')
write_and_print_fst(character_sequence_fst, I_loop_fst_filename, sym_filename, minimize=False, determinize=False)

# Do the final compositions

## Compose input sequence FST with language model FST

In [19]:
I_loop_G_fst_filename = output_directory + 'I_loop_G.fst'
# Compose the looped input FST file with the language model FST file and write
# the result to I_loop_G.fst, with all optimizations switched off.
# NOTE(review): phi=int_phi presumably makes PHI act as a failure/back-off
# label during composition — confirm against fst.compose.
fst.compose(I_loop_fst_filename, G_fst_filename, I_loop_G_fst_filename,
            determinize=False, minimize=False, rmepsilon=False, phi=int_phi)
# Last expression of the cell, so its value becomes the cell output.
fst.print(I_loop_G_fst_filename, sym_filename, sym_filename)

## Get shortest path(s)

In [20]:
I_loop_G_shortestpath_fst_filename = output_directory + 'I_loop_G_shortestpath.fst'
# Request only the single best path (nshortest=1) of the composed FST, with
# epsilon removal and projection enabled.
# NOTE(review): project_output=True presumably projects onto the output
# labels — confirm against fst.shortestpath.
fst.shortestpath(I_loop_G_fst_filename, I_loop_G_shortestpath_fst_filename, nshortest=1,
            determinize=False, minimize=False, rmepsilon=True, project=True, project_output=True)
# Last expression of the cell, so its value becomes the cell output.
fst.print(I_loop_G_shortestpath_fst_filename, sym_filename, sym_filename)