In [2]:
#to use the language model, make sure you've unzipped the languageModel.tar.gz file
#and have compiled the code in the LanguageModelDecoder folder
#baseDir is the root folder that the later cells build their paths from
#(baseDir+'/languageModel', baseDir+'/derived/rnns/baselineRelease', baseDir+'/derived/tfRecords')
baseDir = '/oak/stanford/groups/shenoy/fwillett/speechPaperRelease_final'

In [3]:
import os
from glob import glob
from pathlib import Path

import numpy as np
from omegaconf import OmegaConf
import tensorflow as tf
from neuralDecoder.neuralSequenceDecoder import NeuralSequenceDecoder
import neuralDecoder.utils.lmDecoderUtils as lmDecoderUtils

In [7]:
#build the n-gram language model decoder from the 'languageModel' folder under baseDir
#(loading could take a while and requires ~60 GB of memory)
lmDir = f'{baseDir}/languageModel'
lmDecoderOptions = dict(acoustic_scale=0.8, nbest=1, beam=18)
ngramDecoder = lmDecoderUtils.build_lm_decoder(lmDir, **lmDecoderOptions)

I0707 14:32:25.127691 37509 brain_speech_decoder.h:52] Reading fst /oak/stanford/groups/henderj/stfan/code/nptlrig2/LanguageModelDecoder/examples/speech/s0/lm_order_exp/3gram/data/lang_test/TLG.fst
I0707 14:33:34.774243 37509 brain_speech_decoder.h:58] Reading lm fst /oak/stanford/groups/henderj/stfan/code/nptlrig2/LanguageModelDecoder/examples/speech/s0/lm_order_exp/3gram/data/lang_test/G.fst
I0707 14:33:51.572348 37509 brain_speech_decoder.h:70] Reading rescore fst /oak/stanford/groups/henderj/stfan/code/nptlrig2/LanguageModelDecoder/examples/speech/s0/lm_order_exp/3gram/data/lang_test/G_no_prune.fst
I0707 14:36:48.752995 37509 brain_speech_decoder.h:81] Reading symbol table /oak/stanford/groups/henderj/stfan/code/nptlrig2/LanguageModelDecoder/examples/speech/s0/lm_order_exp/3gram/data/lang_test/words.txt


In [None]:
#evaluate the RNN on the test partition and competitionHoldOut partition
def _ascii_to_text(text):
    """Decode one zero-terminated row of character codes into a string.

    Characters are read up to (not including) the first entry equal to 0.
    If the row contains no 0 at all, the whole row is decoded instead of
    raising IndexError.
    """
    endIdx = np.argwhere(text==0)
    end = endIdx[0,0] if endIdx.size > 0 else len(text)
    return ''.join([chr(char) for char in text[0:end]])

testDirs = ['test','competitionHoldOut']
#one result list per partition, in the same order as testDirs
trueTranscriptions = [[] for _ in testDirs]
decodedTranscriptions = [[] for _ in testDirs]

#the checkpoint folder is the same for every partition
ckptDir = baseDir + '/derived/rnns/baselineRelease'

for dirIdx, testDir in enumerate(testDirs):
    args = OmegaConf.load(os.path.join(ckptDir, 'args.yaml'))
    args['loadDir'] = ckptDir
    args['mode'] = 'infer'
    args['loadCheckpointIdx'] = None

    #zero every entry of datasetProbabilityVal, then switch entries 4..18 back on
    for x in range(len(args['dataset']['datasetProbabilityVal'])):
        args['dataset']['datasetProbabilityVal'][x] = 0.0

    for sessIdx in range(4,19):
        args['dataset']['datasetProbabilityVal'][sessIdx] = 1.0
        args['dataset']['dataDir'][sessIdx] = baseDir+'/derived/tfRecords'
    args['testDir'] = testDir

    with tf.device('/CPU:0'):  # Change to GPU:0 to run on GPU
        # Initialize model
        tf.compat.v1.reset_default_graph()
        nsd = NeuralSequenceDecoder(args)

        # Inference
        out = nsd.inference()
    decoder_out = lmDecoderUtils.cer_with_lm_decoder(ngramDecoder, out, outputType='speech_sil', blankPenalty=np.log(2))

    #collect the ground-truth sentence of every row in out['transcriptions']
    for x in range(out['transcriptions'].shape[0]):
        trueTranscriptions[dirIdx].append(_ascii_to_text(out['transcriptions'][x,:]))
    decodedTranscriptions[dirIdx] = decoder_out['decoded_transcripts']

#NOTE: once the loop finishes, `out` holds the inference result of the LAST entry of testDirs


In [5]:
from neuralDecoder.utils.lmDecoderUtils import _cer_and_wer as cer_and_wer

#score the test set (index 0 of the transcription lists): decoded sentences vs. true sentences
#note that the value named `cer` is actually the phoneme error rate here
testDecoded = decodedTranscriptions[0]
testTruth = trueTranscriptions[0]
cer, wer = cer_and_wer(
    testDecoded,
    testTruth,
    outputType='speech_sil',
    returnCI=True,
)

#show the word error rate
print(wer)

(0.1854970124932102, 0.16925995754567183, 0.20179986165548894)


In [None]:
#print the sentence predictions for the test set
#(index 0 of decodedTranscriptions corresponds to testDirs[0], i.e. 'test')
print(decodedTranscriptions[0])

In [None]:
#print the predictions for the competition hold-out set (labels are unreleased)
#(index 1 of decodedTranscriptions corresponds to testDirs[1], i.e. 'competitionHoldOut')
print(decodedTranscriptions[1])

In [8]:
#write the competition hold-out predictions, one sentence per line, to a .txt file
#that can be submitted to the competition
with open('baselineCompetitionSubmission.txt', 'w') as f:
    f.writelines(sentence+'\n' for sentence in decodedTranscriptions[1])

Optionally, if you have access to a high-end machine with at least **330GB of RAM** and a **GPU with 12GB of RAM**, you can run the following rescoring step to get better decoding accuracy.  

In [5]:
# Drop the reference to the previous 3gram model before loading its replacement
del ngramDecoder

# Locations of the 5gram language model files and of the LLM download cache
lmDir = '/scratch/users/stfan/lm_models/speech_5gram'
MODEL_CACHE_DIR = '/scratch/users/stfan/huggingface'

## Load a 5gram model with an unpruned LM, keeping the 100 best hypotheses
ngramDecoder = lmDecoderUtils.build_lm_decoder(lmDir, acoustic_scale=0.5, nbest=100, beam=18)

# Load OPT 6B model (8-bit weights, automatic device placement)
llm, llm_tokenizer = lmDecoderUtils.build_opt(
    cacheDir=MODEL_CACHE_DIR,
    device='auto',
    load_in_8bit=True,
)

I0711 12:18:06.942003 14533 brain_speech_decoder.h:52] Reading fst /scratch/users/stfan/lm_models/speech_5gram/TLG.fst
I0711 12:22:48.091068 14533 brain_speech_decoder.h:58] Reading lm fst /scratch/users/stfan/lm_models/speech_5gram/G.fst
I0711 12:23:47.952838 14533 brain_speech_decoder.h:70] Reading rescore fst /scratch/users/stfan/lm_models/speech_5gram/G_no_prune.fst
I0711 12:39:09.893329 14533 brain_speech_decoder.h:81] Reading symbol table /scratch/users/stfan/lm_models/speech_5gram/words.txt



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/groups/henderj/stfan/.conda/env/py3.9/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /share/software/user/open/cuda/11.7.1/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/groups/henderj/stfan/.conda/env/py3.9/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Rescoring hyper-parameters
acoustic_scale = 0.5
blank_penalty = np.log(7)
llm_weight = 0.5

# NOTE(review): `out` is whatever the RNN evaluation cell left behind. That cell
# overwrites `out` once per partition, so it holds the last partition it ran;
# confirm this is the partition you intend to score here.
logits = lmDecoderUtils.rearrange_speech_logits(out['logits'], has_sil=True)
logitLengths = out['logitLengths']

# Decode every sentence with the n-gram decoder, passing only the first
# logitLengths[i] steps of sentence i
nbest_outputs = [
    lmDecoderUtils.lm_decode(
        ngramDecoder,
        logits[i, :logitLengths[i]],
        blankPenalty=blank_penalty,
        returnNBest=True,
        rescore=True,
    )
    for i in range(len(logits))
]

# Hand the n-best lists to the LLM-based decoder; alpha carries llm_weight
llm_out = lmDecoderUtils.cer_with_gpt2_decoder(
    llm,
    llm_tokenizer,
    nbest_outputs,
    acoustic_scale,
    out,
    outputType='speech_sil',
    returnCI=True,
    lengthPenalty=0,
    alpha=llm_weight,
)

print(f"wer={llm_out['wer']}")

  0%|          | 0/600 [00:00<?, ?it/s]

llm_weight=0.5 wer=(0.14584464964693103, 0.12977665854605186, 0.16272724464691138)
