# Setup notebook, imports and predefined functions

## Notebook magics

In [1]:
# Enable IPython's autoreload extension; mode 2 lets edits to imported modules
# be picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Some imports

In [2]:
from nhpylm.lexicon import build_fst_for_lexicon
import os
from nhpylm.c_core import nhpylm
from tqdm import tqdm
from nhpylm import fst
from nhpylm.kaldi_data_preparation import convert_transcription, word_to_grapheme
import json
from nhpylm import json_utils as ju
import subprocess

## Output directory

In [3]:
# Directory that receives the files generated by this notebook
# (the symbol table and the FSTs written by the cells further down).
output_directory = 'lattice_playground/wsj_hpylm/'
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs(output_directory, exist_ok=True)

## Some predefined functions

### Convert a list of sentences to a list of lists of lists of units (sentence → words → units):

In [4]:
def text_to_splitted_words(text):
    """Convert every line of ``text`` with ``convert_transcription``.

    Each line is passed to ``convert_transcription`` together with a
    ``word_to_grapheme(join=False)`` converter, and the element at index 1 of
    the result is collected.

    NOTE(review): index 1 is presumably the sentence as a list of words, each
    word being a list of units -- confirm against ``convert_transcription``.

    :param text: iterable of transcription lines
    :return: list with one converted entry per input line
    """
    splitted = []
    for sentence in text:
        # A fresh converter is requested for every line, as before.
        to_units = word_to_grapheme(join=False)
        converted = convert_transcription(sentence, word_to_units=to_units)
        splitted.append(converted[1])
    return splitted

### Return all unique units from the converted sentences

In [5]:
def find_symbols(text):
    """Collect the distinct units that occur anywhere in ``text``.

    :param text: iterable of sentences; each sentence is an iterable of words
        and each word an iterable of units
    :return: set containing every unit seen at least once
    """
    unique_units = set()
    for sentence in text:
        for word in sentence:
            # set.update iterates the word and adds each unit individually.
            unique_units.update(word)
    return unique_units

### Write a symbol list to a symbol file, mapping each symbol to its position in the list (integers from 0 to N_symbols - 1)

In [6]:
def write_symbols(symbols, sym_file):
    """Write a symbol table with one ``<symbol> <index>`` pair per line.

    The index is the zero-based position of the symbol in ``symbols``.
    An existing file at ``sym_file`` is overwritten.

    :param symbols: iterable of symbols, in the order that defines their ids
    :param sym_file: path of the symbol file to write
    """
    with open(sym_file, 'w') as table:
        table.writelines(
            '{} {}\n'.format(symbol, index)
            for index, symbol in enumerate(symbols)
        )

### Return all files ending with '.z' in specified path

In [7]:
def get_z_files(path):
    """Return the paths of all files below ``path`` whose name ends in '.z'.

    The directory tree is searched recursively. The result is sorted so that
    the order in which the files are read later is the same on every run.

    :param path: root directory of the search
    :return: sorted list of file paths (each joined with its directory)
    """
    z_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
        if name.endswith('.z')
    ]
    # os.walk reports entries in file-system order, which is not stable
    # across machines; sort to make the result deterministic.
    z_files.sort()
    return z_files

### Read a '.z' file and return uncompressed output

In [8]:
def read_z_file(filepath):
    """Decompress ``filepath`` with ``gunzip -c`` and return the output as text.

    :param filepath: path of the compressed file
    :return: decompressed content as a string
    :raises subprocess.CalledProcessError: if ``gunzip`` reports an error, so
        that an unreadable file is not silently treated as empty data
    """
    command = ['gunzip', '-c', filepath]
    p = subprocess.Popen(command, stdout=subprocess.PIPE, universal_newlines=True)
    text, _ = p.communicate()
    # gzip exits with 0 on success, 1 on error and 2 when only a warning
    # occurred; a warning still produces usable output, so only 1 (or any
    # other unexpected status) is treated as a failure.
    if p.returncode not in (0, 2):
        raise subprocess.CalledProcessError(p.returncode, command, output=text)
    return text

### Convert the text of a WSJ training file and return only the training sentences (in upper case)

In [9]:
def wsj_lm_text_to_lm_lines(text):
    """Extract the upper-cased sentences from the text of a WSJ LM file.

    All '</p>' tags are removed and the text is split at every '</s>' tag.
    From each resulting chunk only the last line is kept (after stripping
    surrounding whitespace) and converted to upper case.

    Chunks that yield an empty string are skipped. This is always the case
    for the text following the final '</s>' when it holds only whitespace,
    which would otherwise add an empty "sentence" for every file.

    :param text: content of one training file as a single string
    :return: list of non-empty upper-case sentences
    """
    lines = []
    for chunk in text.replace('</p>', '').split('</s>'):
        # The sentence is the last line of the chunk; any lines before it
        # within the chunk are discarded.
        sentence = chunk.strip().split('\n')[-1].upper()
        if sentence:
            lines.append(sentence)
    return lines

### Read all LM files ('.z' files) from the specified path (LM training data in the WSJ database)

In [10]:
def read_lm_files(path):
    """Read the sentences of every '.z' file found below ``path``.

    Each file returned by ``get_z_files`` is decompressed with
    ``read_z_file`` and converted with ``wsj_lm_text_to_lm_lines``; the
    per-file results are concatenated in that order.

    :param path: root directory containing the '.z' files
    :return: flat list of sentences from all files
    """
    return [
        sentence
        for z_file in get_z_files(path)
        for sentence in wsj_lm_text_to_lm_lines(read_z_file(z_file))
    ]

# Database setup

## Database file and paths

In [11]:
# Root directory of the WSJ language-model training data ('.z' files).
# The default is a site-specific absolute path; set the environment variable
# WSJ_LM_TRAIN_PATH to run the notebook against a copy stored elsewhere.
database_path = os.environ.get(
    'WSJ_LM_TRAIN_PATH',
    '/net/speechdb/wsj/11_13_1/wsj0/doc/lng_modl/lm_train/np_data')

## Get train data, split into characters and get symbols

In [12]:
# Read every sentence from the '.z' files below database_path.
train_data = read_lm_files(database_path)
# Convert each sentence with text_to_splitted_words (sentence -> words -> units).
train_data_splitted = text_to_splitted_words(train_data)
# Set of all distinct units; it is handed to the NHPYLM wrapper in the next section.
symbols = find_symbols(train_data_splitted)

# Instantiate LM, add training sentences and resample hyperparameters

In [13]:
# `symbols` is a set, and the iteration order of a set of strings can differ
# from one interpreter run to the next. Sorting gives the model the same
# symbol order on every run.
# NOTE(review): assumes all symbols are strings (mutually comparable) -- confirm.
# NOTE(review): 2 and 8 are presumably the orders of the two HPYLM levels;
# confirm their meaning and order against the NHPYLM_wrapper signature.
lm = nhpylm.NHPYLM_wrapper(sorted(symbols), 2, 8)

# Map the unit-level training data to the model's integer ids, then add the
# sentences to the model one by one (tqdm only displays progress).
train_data_ids = lm.word_lists_to_id_lists(train_data_splitted)
for line in tqdm(train_data_ids):
    lm.add_id_sentence_to_lm(line)

lm.resample_hyperparameters()



# Write symbol file

In [14]:
# Build the path with os.path.join so it stays valid even if
# output_directory is changed to a value without a trailing slash.
sym_filename = os.path.join(output_directory, 'symbols.txt')

# write_symbols numbers the entries by their position, so the order of
# lm.string_ids defines the integer id written for each symbol.
word_list = lm.string_ids
write_symbols(word_list, sym_filename)

# Get int lexicon and labels

In [15]:
# Lexicon and label inventory as provided by the language model.
# NOTE(review): presumably int_lexicon maps each word id to its sequence of
# character ids and int_labels holds all character ids -- confirm against nhpylm.
int_lexicon = lm.get_word_id_to_char_id()
int_labels = lm.get_char_ids()
# Integer ids of the special symbols that the FST cells below pass around.
# NOTE(review): EPS/EOW/EOC/PHI presumably denote epsilon, end-of-word,
# end-of-characters and the phi (failure) label -- confirm against nhpylm.
int_eps = lm.sym2id('EPS')
int_eow = lm.sym2id('EOW')
int_eoc = lm.sym2id('EOC')
int_phi = lm.sym2id('PHI')

# Build and write FST for lexicon

In [None]:
# Build the path with os.path.join so it stays valid even if
# output_directory is changed to a value without a trailing slash.
L_fst_filename = os.path.join(output_directory, 'L.fst')
# Options forwarded to build_fst_for_lexicon.
mode = 'trie'
build_character_model = True
# Build the lexicon FST from the integer lexicon, labels and special symbol ids.
fst_lexicon = build_fst_for_lexicon(int_lexicon, int_eps, int_eow, build_character_model,
                                    mode, int_labels, eoc=int_eoc)
# Add self loops for the PHI symbol before writing the FST.
fst_lexicon.add_self_loops(int_phi)
# Write with minimization and epsilon removal enabled, determinization disabled.
fst_lexicon.write_fst(L_fst_filename, minimize=True, determinize=False, rmepsilon=True)

# Get FST for language model

In [None]:
# Build the path with os.path.join so it stays valid even if
# output_directory is changed to a value without a trailing slash.
G_fst_filename = os.path.join(output_directory, 'G.fst')
# Export the language model as a list of arcs; the first returned value is unused.
# NOTE(review): eow is set to the EOC id and sow/eos_word to the EPS id --
# presumably intentional to match the lexicon built with eoc=int_eoc; confirm.
_, arc_list = lm.to_fst_text_format(sow=int_eps, eow=int_eoc, eos_word=int_eps)
G_fst = fst.build_fst_from_arc_list(arc_list)
# Write with minimization and epsilon removal enabled.
G_fst.write_fst(G_fst_filename, minimize=True, rmepsilon=True)

# Compose lexicon and language model FST

In [None]:
# Build the path with os.path.join so it stays valid even if
# output_directory is changed to a value without a trailing slash.
L_G_fst_filename = os.path.join(output_directory, 'L_G.fst')
# Compose the lexicon FST file with the language-model FST file and store the
# result in L_G.fst (minimization and epsilon removal on, determinization off).
fst.compose(L_fst_filename, G_fst_filename, L_G_fst_filename,
            determinize=False, minimize=True, rmepsilon=True)