In [1]:
#Root directory of the data release. The cells below build all of their paths from it:
#  <baseDir>/languageModel                 (files read by the language model decoder)
#  <baseDir>/derived/rnns/baselineRelease  (RNN checkpoint directory containing args.yaml)
#  <baseDir>/derived/tfRecords             (assigned as the dataset dataDir for inference)
#to use the language model, make sure you've unzipped the languageModel.tar.gz file
#and have compiled the code in the LanguageModelDecoder folder
#NOTE: this is a cluster-specific absolute path; point it at your own copy of the release.
baseDir = '/oak/stanford/groups/henderj/fwillett/speechPaperRelease_08_20'

In [2]:
import os
from glob import glob
from pathlib import Path
#Configure GPU visibility before tensorflow is imported below.
#CUDA_DEVICE_ORDER=PCI_BUS_ID makes CUDA number devices in PCI bus order; the empty
#CUDA_VISIBLE_DEVICES hides all GPUs, so inference in this notebook runs on the CPU
#(this is why tensorflow logs "no CUDA-capable device is detected").
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]=""

import numpy as np
from omegaconf import OmegaConf
import tensorflow as tf
from neuralDecoder.neuralSequenceDecoder import NeuralSequenceDecoder
import neuralDecoder.utils.lmDecoderUtils as lmDecoderUtils



In [3]:
#Build the n-gram language model decoder from the files under <baseDir>/languageModel.
#Loading could take a while and requires ~60 GB of memory.
lmDir = f'{baseDir}/languageModel'
lmDecoderKwargs = {
    'acoustic_scale': 0.8,  #1.2
    'nbest': 1,
    'beam': 18,
}
ngramDecoder = lmDecoderUtils.build_lm_decoder(lmDir, **lmDecoderKwargs)

I0825 10:45:02.989113 64566 brain_speech_decoder.h:52] Reading fst /oak/stanford/groups/henderj/fwillett/speechPaperRelease_08_20/languageModel/TLG.fst


In [4]:
#evaluate the RNN on the test partition and competitionHoldOut partition
testDirs = ['test','competitionHoldOut']

#one list of sentences per entry of testDirs (index 0 -> 'test', index 1 -> 'competitionHoldOut')
trueTranscriptions = [[] for _ in testDirs]
decodedTranscriptions = [[] for _ in testDirs]

#the checkpoint directory is the same for every partition, so build the path once
ckptDir = baseDir + '/derived/rnns/baselineRelease'

def _ascii_to_text(text):
    """Convert a zero-terminated array of character codes into a string.

    Parameters
    ----------
    text : 1-D array of integer character codes. Entries from the first 0
        onward are ignored.

    Returns
    -------
    str
        The characters that precede the first 0. If the array contains no 0
        (the sentence fills the whole buffer), every entry is converted
        instead of raising an IndexError.
    """
    text = np.asarray(text)
    endIdx = np.flatnonzero(text == 0)
    numChars = endIdx[0] if endIdx.size > 0 else text.shape[0]
    return ''.join(chr(char) for char in text[:numChars])

for dirIdx, testDir in enumerate(testDirs):
    #reload the saved training configuration and switch it to inference mode
    args = OmegaConf.load(os.path.join(ckptDir, 'args.yaml'))
    args['loadDir'] = ckptDir
    args['mode'] = 'infer'
    args['loadCheckpointIdx'] = None  #presumably selects the most recent checkpoint - TODO confirm

    #zero the validation probability of every dataset entry ...
    for x in range(len(args['dataset']['datasetProbabilityVal'])):
        args['dataset']['datasetProbabilityVal'][x] = 0.0

    #... then re-enable entries 4-18 and point them at the released tfRecords
    #(presumably these are the sessions that have this partition - TODO confirm)
    for sessIdx in range(4,19):
        args['dataset']['datasetProbabilityVal'][sessIdx] = 1.0
        args['dataset']['dataDir'][sessIdx] = baseDir+'/derived/tfRecords'
    args['testDir'] = testDir

    # Initialize model (reset the default graph so each partition builds a fresh model)
    tf.compat.v1.reset_default_graph()
    nsd = NeuralSequenceDecoder(args)

    # Inference: run the RNN, then decode its output with the n-gram language model
    out = nsd.inference()
    decoder_out = lmDecoderUtils.cer_with_lm_decoder(ngramDecoder, out, outputType='speech_sil', blankPenalty=np.log(2))

    #each row of out['transcriptions'] holds one zero-padded true sentence
    trueTranscriptions[dirIdx] = [
        _ascii_to_text(out['transcriptions'][rowIdx,:])
        for rowIdx in range(out['transcriptions'].shape[0])
    ]
    decodedTranscriptions[dirIdx] = decoder_out['decoded_transcripts']


I0825 10:47:19.548393 64566 brain_speech_decoder.h:81] Reading symbol table /oak/stanford/groups/henderj/fwillett/speechPaperRelease_08_20/languageModel/words.txt
2023-08-25 10:47:20.375852: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-08-25 10:47:20.375891: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: sh03-11n03.int
2023-08-25 10:47:20.375898: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: sh03-11n03.int
2023-08-25 10:47:20.375964: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 535.54.3
2023-08-25 10:47:20.376003: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 535.54.3
2023-08-25 10:47:20.376008: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 535.54.3
2023-08-25 10:47:20.376238: I te

Model: "gru"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_1 (GRU)                 multiple                  28317696  
                                                                 
 gru_2 (GRU)                 multiple                  6297600   
                                                                 
 gru_3 (GRU)                 multiple                  6297600   
                                                                 
 gru_4 (GRU)                 multiple                  6297600   
                                                                 
 gru_5 (GRU)                 multiple                  6297600   
                                                                 
 dense (Dense)               multiple                  42025     
                                                                 
Total params: 53,551,145
Trainable params: 53,551,145
Non-train

  0%|          | 0/600 [00:00<?, ?it/s]

Model: "gru"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_1 (GRU)                 multiple                  28317696  
                                                                 
 gru_2 (GRU)                 multiple                  6297600   
                                                                 
 gru_3 (GRU)                 multiple                  6297600   
                                                                 
 gru_4 (GRU)                 multiple                  6297600   
                                                                 
 gru_5 (GRU)                 multiple                  6297600   
                                                                 
 dense (Dense)               multiple                  42025     
                                                                 
Total params: 53,551,145
Trainable params: 53,551,145
Non-train

  0%|          | 0/1200 [00:00<?, ?it/s]

In [5]:
from neuralDecoder.utils.lmDecoderUtils import _cer_and_wer as cer_and_wer

#Score the language-model decoded sentences of the test set (index 0) against the
#true sentences. With outputType='speech_sil' the value named cer is actually the
#phoneme error rate; wer is the word error rate.
cer, wer = cer_and_wer(
    decodedTranscriptions[0],
    trueTranscriptions[0],
    outputType='speech_sil',
    returnCI=True,
)

#With returnCI=True the word error rate prints as a 3-tuple; presumably
#(estimate, lower confidence bound, upper confidence bound) - confirm against _cer_and_wer.
print(wer)

(0.18875611080934274, 0.17230437541209634, 0.20561550684383054)


In [6]:
#print the sentence predictions for the test set
#(index 0 of decodedTranscriptions corresponds to testDirs[0], the 'test' partition)
print(decodedTranscriptions[0])

["it's really hard to find something that works", 'and i love the line', 'i do have a friend that run', 'this way in the back', 'actually it makes sense to a certain extent', 'so the government that he was employed', "they feel good about what they're doing", "because it's an invasion of personal privacy", 'so you travel a lot', 'they were going to write', 'any kind of a good environment to live in', 'the best of android', "i don't know what you think about the point", 'i believe it cost about ten dollars', 'what in the world dogs are you on to', 'android tv box', 'a couple of times', 'i had the back down on it', 'which i you as a riff and have been real handy', 'i do when i was working', "it's a package deal", 'i have seen it', 'the measure always passed a senate committee', 'they told me that this was a topic', 'in the red square are weaver said', 'they did a lot of play work and stuff', 'you can gain too much right', 'recent legislation', "i'm a car buff too", 'two fans from a diese

In [7]:
#print the predictions for the competition hold-out set (labels are unreleased)
#(index 1 of decodedTranscriptions corresponds to testDirs[1], the 'competitionHoldOut' partition)
print(decodedTranscriptions[1])

["i'm initially from colorado", "if i had money i'd be safe to buy a known", 'i raced in york', 'i thought the topic was pouring', 'what every drive up there', "you don't get evolve", 'i hope you enjoy', "we don't do a", 'a voice from the crowd', 'i hope you enjoy my blog', 'i never have it', 'maybe even too far', "it's had that russians were quite some time", 'quick animus or suit or dress', 'manufactured by nineteen seventy five', 'the water takes the recession', "because that's all she talked about", 'did you hear an piece about that', 'i play in several company sources', "i've heard on the news", 'maybe but in a little bit fit', 'and i guess it was late me', 'in just cause', 'the word of the confusion', 'and it evoked an emotional response', 'you want to buy something', "that's basically what we do", 'i guess a lot of people do', 'a lot of people have real water', 'it seems like he would develop pride', "but you're not", 'they make a collection on monday for yard waste', 'several o

In [8]:
#Write the competition hold-out predictions (index 1) to a .txt file that can be
#submitted: one decoded sentence per line, in decoding order.
with open('baselineCompetitionSubmission.txt', 'w') as submissionFile:
    submissionFile.writelines(sentence + '\n' for sentence in decodedTranscriptions[1])