# Setup notebook, imports and predefined functions

## Notebook magics

In [1]:
%load_ext autoreload
%autoreload 2

## Some imports

In [2]:
from nhpylm.lexicon import build_fst_for_lexicon
from nhpylm import fst
import os
from nhpylm.c_core import nhpylm
from tqdm import tqdm
from nhpylm.kaldi_data_preparation import convert_transcription, word_to_grapheme
import json
from nhpylm import json_utils as ju

## Output directory

In [3]:
# Directory that receives every file this notebook writes (symbol table and FSTs).
# Later cells build file paths by plain string concatenation, so the trailing
# '/' is required.
output_directory = 'lattice_playground/wcl-l-wsjcam0-p/'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(output_directory, exist_ok=True)

## Some predefined functions

### Combined write and print/display function

In [4]:
def write_and_print_fst(fst_graph, fst_filename, sym_filename, **kwargs):
    """Write ``fst_graph`` to disk and return the printed form of the file.

    Args:
        fst_graph: Object providing ``write_fst(filename, **options)``.
        fst_filename: Path the FST is written to and printed from.
        sym_filename: Symbol file, passed twice to ``fst.print`` (second and
            third positional argument).
        **kwargs: Extra options forwarded to ``fst_graph.write_fst``. They
            override the default ``determinize=False``.

    Returns:
        Whatever ``fst.print`` returns for the written file.
    """
    # Caller-supplied options win over the default on key collisions.
    write_options = {'determinize': False, **kwargs}
    fst_graph.write_fst(fst_filename, **write_options)
    printed = fst.print(fst_filename, sym_filename, sym_filename)
    return printed

### Convert a list of sentences to a nested list: sentences → words → units

In [5]:
def text_to_splitted_words(text):
    """Convert every sentence in ``text`` with ``convert_transcription``.

    Each line is converted using ``word_to_grapheme(join=False)`` as the
    ``word_to_units`` argument, and only the element at index 1 of the
    conversion result is kept.

    Args:
        text: Iterable of transcription lines.

    Returns:
        List with one converted entry per input line, in input order.
        (Presumably a list of words, each a list of units - TODO confirm
        against ``convert_transcription``.)
    """
    splitted_sentences = []
    for sentence in text:
        converted = convert_transcription(
            sentence, word_to_units=word_to_grapheme(join=False))
        splitted_sentences.append(converted[1])
    return splitted_sentences

### Return all unique units from the converted sentences

In [6]:
def find_symbols(text):
    """Collect the distinct units of a sentences -> words -> units structure.

    Args:
        text: Iterable of sentences; each sentence is an iterable of words and
            each word an iterable of (hashable) units.

    Returns:
        A ``set`` containing every unit that occurs anywhere in ``text``.
    """
    unique_units = set()
    for sentence in text:
        for units in sentence:
            unique_units.update(units)
    return unique_units

### Write a symbol list to a symbol file, mapping the symbols in list order to the integers 0 to N_symbols - 1

In [7]:
def write_symbols(symbols, sym_file):
    """Write one ``"<symbol> <index>"`` line per symbol to ``sym_file``.

    Indices are the 0-based positions in ``symbols``, so the iteration order
    of ``symbols`` defines the mapping. An existing file is overwritten.

    Args:
        symbols: Iterable of symbols; each is formatted with ``str.format``.
        sym_file: Path of the symbol file to create.
    """
    with open(sym_file, 'w') as fid:
        fid.writelines(
            '{} {}\n'.format(symbol, index)
            for index, symbol in enumerate(symbols)
        )

### Return all unique words from list of sentences

In [8]:
def find_words(text):
    """Return the set of distinct whitespace-separated words in ``text``.

    Args:
        text: Iterable of sentence strings.

    Returns:
        A ``set`` with every token produced by ``str.split()`` on any line.
    """
    vocabulary = set()
    for sentence in text:
        vocabulary.update(sentence.split())
    return vocabulary

# Database setup

## Database file and paths

In [9]:
# Location of the database JSON and the key paths that are looked up inside it.
# NOTE(review): absolute, site-specific path - the notebook only runs where
# this location is available.
database_file = '/net/storage/database_jsons/reverb.json'
train_flist = 'train/flists/wav/si_tr'  # key path of the training file list
test_flist = 'dev/flists/wav/si_dt5b'  # key path of the file list used as test data
tlist = 'orth'  # key path of the transcriptions
channels = ['observed/CH1']  # the cells below only read channels[0]

## Load database (filelist and transcription)

In [10]:
# Parse the complete database JSON into memory.
with open(database_file) as fid:
    database = json.load(fid)

# Look up the train/test file lists and the transcriptions in the database and
# derive one file list per configured channel.
train_files = ju.traverse_to_dict(database, train_flist)
train_files_for_channel = {channel: ju.get_flist_for_channel(train_files, channel) for channel in channels}
test_files = ju.traverse_to_dict(database, test_flist)
test_files_for_channel = {channel: ju.get_flist_for_channel(test_files, channel) for channel in channels}
transcriptions = ju.traverse_to_dict(database, tlist)

# Get train data, split into characters and get symbols

In [11]:
# One transcription per training file of the first channel; the keys of the
# file list are used directly as transcription keys (a missing key raises).
train_data = [transcriptions[key] for key in train_files_for_channel[channels[0]].keys()]
# Convert the sentences, then gather the set of distinct units they contain.
train_data_splitted = text_to_splitted_words(train_data)
symbols = find_symbols(train_data_splitted)

# Instantiate LM, add training sentences and resample hyperparameters

In [12]:
# `symbols` is a set of strings, and its iteration order can differ between
# kernel starts (string hash randomisation). Sorting gives the LM the same
# symbol order on every run, so anything it derives from that order (such as
# the symbol ids written to the symbol file) is reproducible.
# NOTE(review): 2 and 8 are presumably the word- and character-level model
# orders - TODO confirm against NHPYLM_wrapper and name them in a config cell.
lm = nhpylm.NHPYLM_wrapper(sorted(symbols), 2, 8)

# Map the unit strings to the LM's integer ids, then add the training
# sentences one at a time (tqdm only reports progress).
train_data_ids = lm.word_lists_to_id_lists(train_data_splitted)
for line in tqdm(train_data_ids):
    lm.add_id_sentence_to_lm(line)

# Resample once, after all training sentences have been added.
lm.resample_hyperparameters()



# Write symbol file

In [13]:
# Symbol file shared by all FST print calls below.
sym_filename = output_directory + 'symbols.txt'
# Despite the name, `word_list` is simply whatever `lm.string_ids` provides.
# Presumably these are the LM's symbol strings ordered by id, so that the
# 0-based index written by write_symbols equals the LM id - TODO confirm.
word_list = lm.string_ids

write_symbols(word_list, sym_filename)

# Get int versions of lexicon and labels

In [14]:
# Integer-id views of the LM's lexicon and labels, as used by the FST builders.
int_lexicon = lm.get_word_id_to_char_id()
int_eos_word = lm.sentence_boundary_id
int_labels = lm.get_char_ids()
# Ids of the special symbols, looked up by their string names.
# (Presumably epsilon, end-of-word, end-of-character-model, the phi/failure
# label and end-of-sentence - TODO confirm against the nhpylm package.)
int_eps = lm.sym2id('EPS')
int_eow = lm.sym2id('EOW')
int_eoc = lm.sym2id('EOC')
int_phi = lm.sym2id('PHI')
int_eos_label = lm.sym2id('EOS')

# Build and write FST for lexicon

In [15]:
# Build the lexicon FST from the integer lexicon and write it, unoptimised,
# to L.fst.
L_fst_filename = output_directory + 'L.fst'
mode = 'trie'  # lexicon construction mode handed to build_fst_for_lexicon
build_character_model = True
fst_lexicon = build_fst_for_lexicon(int_lexicon, int_eps, int_eow, build_character_model,
                                    mode, int_labels, eoc=int_eoc)
# Add the end-of-sentence entry (EOS label id together with the LM's sentence boundary id).
fst_lexicon.add_eos(int_eos_label, int_eos_word)
fst_lexicon.write_fst(L_fst_filename, minimize=False, determinize=False)



# Get FST for language model

In [16]:
# Export the language model as an arc list, build an FST from it and write it,
# unoptimised, to G.fst.
G_fst_filename = output_directory + 'G.fst'
# Only the second element of the returned pair (the arc list) is used.
# NOTE(review): the EOC id is passed as the `eow` argument - confirm this is intentional.
_, arc_list = lm.to_fst_text_format(eow=int_eoc)
G_fst = fst.build_fst_from_arc_list(arc_list)
G_fst.write_fst(G_fst_filename, minimize=False, determinize=False, rmepsilon=False)



# Character sequence

## Get test data

In [17]:
# One transcription per test file of the first channel (same lookup as for the training data).
test_data = [transcriptions[key] for key in test_files_for_channel[channels[0]].keys()]

## Some character sequence

In [18]:
# Take one test transcription and remove all whitespace, so the word
# boundaries are no longer present in the character string.
# NOTE(review): 121 is a hard-coded sentence index; it raises IndexError if
# the test list has fewer than 122 entries.
character_sequence = ''.join(test_data[121].split())

## Build and print FST for character sequence

In [19]:
# Map every character of the sequence to its LM symbol id.
# NOTE(review): behaviour of sym2id for a character the LM was not created
# with is not visible here - confirm how unseen test characters are handled.
int_sequence = [lm.sym2id(character) for character in character_sequence]

I_fst_filename = output_directory + 'I.fst'
# Build an FST for the id sequence with the EOS label id appended, then write
# it to I.fst; the printed form is the cell output.
character_sequence_fst = fst.build_fst_for_sequence(int_sequence + [int_eos_label])
write_and_print_fst(character_sequence_fst, I_fst_filename, sym_filename, minimize=False, determinize=False)

## Add loops for word end/disambiguation symbols (eow and eoc)

In [20]:
I_loop_fst_filename = output_directory + 'I_loop.fst'
# Rebuild the linear sequence FST before adding the loops instead of mutating
# the object left behind by the previous cell. Re-running this cell then
# always starts from a loop-free FST and cannot add the self loops twice.
# The name `character_sequence_fst` is rebound on purpose, so that it still
# refers to the FST with loops afterwards.
character_sequence_fst = fst.build_fst_for_sequence(int_sequence + [int_eos_label])
# One add_self_loops call each for the EOW and EOC ids (epsilon id first, mode='after').
character_sequence_fst.add_self_loops(int_eps, int_eow, int_eow, mode='after')
character_sequence_fst.add_self_loops(int_eps, int_eoc, int_eoc, mode='after')
write_and_print_fst(character_sequence_fst, I_loop_fst_filename, sym_filename, minimize=False, determinize=False)

# Do the final compositions

## Compose lexicon and input sequence FST

In [21]:
# Compose the input sequence FST (with loops) with the lexicon FST. Both are
# read from the files written above and the result goes to I_loop_L.fst,
# without determinisation, minimisation or epsilon removal.
I_loop_L_fst_filename = output_directory + 'I_loop_L.fst'
fst.compose(I_loop_fst_filename, L_fst_filename, I_loop_L_fst_filename,
            determinize=False, minimize=False, rmepsilon=False, sort_type="olabel")
# The printed form of the composed FST is the cell output.
fst.print(I_loop_L_fst_filename, sym_filename, sym_filename)

## Compose lexicon and input sequence FST with language model FST

In [22]:
# Compose the previous result with the language model FST and write the
# result to I_loop_L_G.fst. The PHI id is handed to compose via `phi`
# (presumably so PHI arcs are treated as failure transitions - TODO confirm
# against fst.compose).
I_loop_L_G_fst_filename = output_directory + 'I_loop_L_G.fst'
fst.compose(I_loop_L_fst_filename, G_fst_filename, I_loop_L_G_fst_filename,
            determinize=False, minimize=False, rmepsilon=False, phi=int_phi)
# The printed form of the composed FST is the cell output.
fst.print(I_loop_L_G_fst_filename, sym_filename, sym_filename)

## Get shortest path(s)

In [23]:
# Extract the single shortest path (nshortest=1) from the fully composed FST,
# with epsilon removal and projection enabled, and write it to
# I_loop_L_G_shortestpath.fst.
I_loop_L_G_shortestpath_fst_filename = output_directory + 'I_loop_L_G_shortestpath.fst'
fst.shortestpath(I_loop_L_G_fst_filename, I_loop_L_G_shortestpath_fst_filename, nshortest=1,
            determinize=False, minimize=False, rmepsilon=True, project=True, project_output=True)
# The printed form of the shortest-path FST is the cell output.
fst.print(I_loop_L_G_shortestpath_fst_filename, sym_filename, sym_filename)

# Get a list of training words and print out the test sentences with new words

In [24]:
# Set of all whitespace-separated words that occur in the training transcriptions.
train_words = find_words(train_data)

In [25]:
# Report every test word that never occurs in the training transcriptions,
# as "<sentence index> --> <word>: <full sentence>".
# A sentence with several unseen words is printed once per such word.
for idx, line in enumerate(test_data):
    for word in line.split():
        if word not in train_words:
            print('{} --> {}: {}'.format(idx, word, line))

9 --> CLINTON: GOVERNOR CLINTON INSISTS THAT VOTERS WILL ACCEPT TAX INCREASES IF YOU CAN SHOW THEM WHERE THE MONEY WILL GO
19 --> ALEXANDER'S: THE STORE ACCOUNTED FOR ABOUT THREE PERCENT OF ALEXANDER'S REVENUE IN THE YEAR ENDED JULY TWENTY FIFTH
20 --> AVERAGED: BUT THIS WAS PURCHASED AT A PRICE OF INFLATION THAT AVERAGED MORE THAN EIGHT PERCENT FOR THE DECADE
21 --> DIALS: THEN THOSE FRIENDLY PEOPLE IN WASHINGTON COULD JUST TURN A FEW DIALS AND WE'D GET FASTER GROWTH AND STABLE PRICES
23 --> COLECO: COLECO IS IN A SEVERE CASH SQUEEZE
24 --> INTERVENTION: HE SAID HOWEVER THAT THE SUCCESS OF ANY INTERVENTION IS A MATTER OF JUDGMENT BECAUSE OF THE QUESTION OF WHAT WOULD'VE HAPPENED HADN'T WE BEEN THERE
24 --> WOULD'VE: HE SAID HOWEVER THAT THE SUCCESS OF ANY INTERVENTION IS A MATTER OF JUDGMENT BECAUSE OF THE QUESTION OF WHAT WOULD'VE HAPPENED HADN'T WE BEEN THERE
34 --> STONE: MR. STONE WILL CONTINUE AS PRESIDENT OF THE FEDERATED DEPARTMENT STORES FOUNDATION A CHARITABLE ORGANIZATION
34