In [2]:
# Root directory for this notebook's inputs. Later cells read the RNN checkpoint
# from baseDir + '/derived/rnns/baselineRelease' and point the dataset dataDir
# entries at baseDir + '/derived/tfRecords', so edit this path for your machine.
# To use the language model, make sure you have unzipped the languageModel.tar.gz
# file and have compiled the code in the LanguageModelDecoder folder.
baseDir = '/scratch/users/stfan/tmp'

In [3]:
import os
import time
from glob import glob
from pathlib import Path

import numpy as np
from omegaconf import OmegaConf
import tensorflow as tf
from neuralDecoder.neuralSequenceDecoder import NeuralSequenceDecoder
import neuralDecoder.utils.lmDecoderUtils as lmDecoderUtils

In [4]:
# Run the RNN over the 'test' and 'competitionHoldOut' partitions and collect the
# raw inference output of each one, in that order, in rnn_outputs.
testDirs = ['test', 'competitionHoldOut']
trueTranscriptions = [[], []]
decodedTranscriptions = [[], []]
rnn_outputs = []
for dirIdx, testDir in enumerate(testDirs):
    # The saved config is reloaded on every pass, so each decoder is built from
    # a freshly loaded set of arguments.
    ckptDir = baseDir + '/derived/rnns/baselineRelease'
    args = OmegaConf.load(os.path.join(ckptDir, 'args.yaml'))
    args['loadDir'] = ckptDir
    args['mode'] = 'infer'
    args['loadCheckpointIdx'] = None

    # Zero every validation-dataset probability, then set entries 4..18 to 1.0
    # and point their dataDir at the tfRecords folder under baseDir.
    datasetArgs = args['dataset']
    valProbabilities = datasetArgs['datasetProbabilityVal']
    tfRecordsDir = baseDir + '/derived/tfRecords'
    for entryIdx in range(len(valProbabilities)):
        valProbabilities[entryIdx] = 0.0
    for sessIdx in range(4, 19):
        valProbabilities[sessIdx] = 1.0
        datasetArgs['dataDir'][sessIdx] = tfRecordsDir

    args['testDir'] = testDir

    with tf.device('/CPU:0'):  # Change to GPU:0 to run on GPU
        # Initialize the model, then run inference on the selected partition.
        tf.compat.v1.reset_default_graph()
        nsd = NeuralSequenceDecoder(args)
        out = nsd.inference()
        rnn_outputs.append(out)

2023-07-11 15:44:38.148374: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-11 15:44:38.622820: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38344 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:41:00.0, compute capability: 8.0


Model: "gru"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_1 (GRU)                 multiple                  28317696  
                                                                 
 gru_2 (GRU)                 multiple                  6297600   
                                                                 
 gru_3 (GRU)                 multiple                  6297600   
                                                                 
 gru_4 (GRU)                 multiple                  6297600   
                                                                 
 gru_5 (GRU)                 multiple                  6297600   
                                                                 
 dense (Dense)               multiple                  42025     
                                                                 
Total params: 53,551,145
Trainable params: 53,551,145
Non-train

In [5]:
## Build the 5gram decoder (a 5gram model with an unpruned LM) from lmDir
lmDir = '/scratch/users/stfan/lm_models/speech_5gram'
ngramDecoderOptions = {'acoustic_scale': 0.5, 'nbest': 100, 'beam': 18}
ngramDecoder = lmDecoderUtils.build_lm_decoder(lmDir, **ngramDecoderOptions)

# Load the OPT 6B model and its tokenizer, using MODEL_CACHE_DIR as the cache
# directory and requesting 8-bit loading.
MODEL_CACHE_DIR = '/scratch/users/stfan/huggingface'
optOptions = {
    'cacheDir': MODEL_CACHE_DIR,
    'device': 'auto',
    'load_in_8bit': True,
}
llm, llm_tokenizer = lmDecoderUtils.build_opt(**optOptions)

I0711 15:47:34.039937 20933 brain_speech_decoder.h:52] Reading fst /scratch/users/stfan/lm_models/speech_5gram/TLG.fst
I0711 15:52:20.065663 20933 brain_speech_decoder.h:58] Reading lm fst /scratch/users/stfan/lm_models/speech_5gram/G.fst
I0711 15:53:19.112915 20933 brain_speech_decoder.h:70] Reading rescore fst /scratch/users/stfan/lm_models/speech_5gram/G_no_prune.fst
I0711 16:12:02.043052 20933 brain_speech_decoder.h:81] Reading symbol table /scratch/users/stfan/lm_models/speech_5gram/words.txt



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/groups/henderj/stfan/.conda/env/py3.9/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /share/software/user/open/cuda/11.7.1/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/groups/henderj/stfan/.conda/env/py3.9/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Hyperparameters shared by the 5gram decoding and the LLM rescoring below
acoustic_scale = 0.5
blank_penalty = np.log(7)
llm_weight = 0.5

llm_outputs = []
for partitionIdx, out in enumerate(rnn_outputs):
    # Stage 1: decode every sample with the 5gram LM, keeping its n-best list.
    # The timer starts before the logits are rearranged, so that step is
    # included in the reported per-sample time.
    start_t = time.time()
    logits = lmDecoderUtils.rearrange_speech_logits(out['logits'], has_sil=True)
    logitLengths = out['logitLengths']
    nbest_outputs = [
        lmDecoderUtils.lm_decode(
            ngramDecoder,
            # Only the first logitLengths[sampleIdx] rows of this sample are decoded.
            logits[sampleIdx, :logitLengths[sampleIdx]],
            blankPenalty=blank_penalty,
            returnNBest=True,
            rescore=True,
        )
        for sampleIdx in range(len(logits))
    ]
    time_per_sample = (time.time() - start_t) / len(logits)
    print(f'5gram decoding took {time_per_sample} seconds per sample')

    # Stage 2: rescore the n-best lists with the LLM.
    start_t = time.time()
    llm_out = lmDecoderUtils.cer_with_gpt2_decoder(
        llm,
        llm_tokenizer,
        nbest_outputs,
        acoustic_scale,
        out,
        outputType='speech_sil',
        returnCI=True,
        lengthPenalty=0,
        alpha=llm_weight,
    )
    time_per_sample = (time.time() - start_t) / len(logits)
    print(f'LLM decoding took {time_per_sample} seconds per sample')

    # WER is reported for the first partition only.
    # NOTE(review): presumably the second partition has no reference
    # transcriptions to score against -- confirm.
    if partitionIdx == 0:
        print(f"Final WER={llm_out['wer']}")

    llm_outputs.append(llm_out)

5gram decoding took 0.40535478353500365 seconds per sample


  0%|          | 0/600 [00:00<?, ?it/s]

LLM decoding took 0.31027826547622683 seconds per sample
Final WER=(0.14584464964693103, 0.1300810165444312, 0.1624660861384548)
5gram decoding took 0.32532530069351195 seconds per sample


  0%|          | 0/1200 [00:00<?, ?it/s]

LLM decoding took 0.3052226936817169 seconds per sample


In [8]:
# Format the predictions for competition submission. This generates a .txt file
# that can be submitted, with one decoded transcript per line.
#
# The hold-out partition is looked up by name in testDirs. Previously the store
# used `dirIdx`, a loop variable left over from the RNN inference cell, while the
# read used a hard-coded 1; the two only agreed when the cells had been run in
# order and the inference loop happened to finish on the hold-out partition.
holdOutIdx = testDirs.index('competitionHoldOut')
decodedTranscriptions[holdOutIdx] = llm_outputs[holdOutIdx]['decoded_transcripts']
with open('5gramLLMCompetitionSubmission.txt', 'w') as f:
    for transcript in decodedTranscriptions[holdOutIdx]:
        f.write(transcript + '\n')