# Decoding CTC output

In [1]:
import pickle
import torch


# Load precomputed CTC output
with open('mystery_records.pickle', 'rb') as f:
    batch = pickle.load(f)

# log probabilities of softmax layers [batch_size, T, vocab_size]
log_probs = batch["log_probs"]

# Dictionary with index to character mapping
ind2char = batch["ind2char"]

# Index of special EMPTY token
EMPTY_TOK = '^'
EMPTY_IND = 0

In [7]:
def ctc_decode(inds, ind2char):
  result = []
  last_char = EMPTY_TOK
  for ind in inds:
    if ind == EMPTY_IND:
      continue
    if last_char == ind2char[ind]:
      continue
    result.append(ind2char[ind])
    # if ind == EMPTY_IND:
    #   last_char = ind2char[ind]
    #   continue
    # if last_char != ind2char[ind]:
    #   result.append(ind2char[ind])
    last_char = ind2char[ind]
  return " ".join(result)

for i, rec in enumerate(log_probs):
    text = ctc_decode(rec.argmax(-1).numpy(), ind2char)
    print(f"{i}) {text}")



0) w e   n o s t r n g e s t o   l o v e   y o u   k n o w   t h e r o l s   a n d   s o   d o   i   a   f o l   c o m i t m e n t   w h a t   i   t h i n k i n g   o f   y o u   w o l d e n   g e t   t h i s   f r o m   a n y   a t h e r   g u y
1)   n e v e r   g o n a   g i v e   y o u   u p   n e v e r   d o n e l e t   y o u   d o w n   n e v e r   g o   a r u n   a r o u n d   a n d   d e s e t   y o u   n e v e r   g o n   a   m a k e   y o u   c r i   n e v e r   g o n a   s a y   g o d   b y


# Computing WER and CER
Task: Implemet WER and CER metrics

$$
  WER = \frac{S + D + I}{N}
$$

In [13]:
# library for fast quick calculation of edit distance
import editdistance

def calc_wer(target_text: str, pred_text: str):
    if not target_text: ### case if we don't want to recognize speech (TV show sounds)
      if pred_text:
        return 1
      return 0
    list_target = target_text.split(" ")
    list_pred = pred_text.split(" ")
    return editdistance.eval(list_target, list_pred) / len(list_target)



def calc_cer(target_text: str, pred_text: str):
    return editdistance.eval(target_text, pred_text) / len(target_text)


In [14]:
import numpy as np

for target, pred, expected_wer, expected_cer in [
    ("if you can not measure it you can not improve it",
     "if you can nt measure t yo can not i",
     0.454, 0.25),
    ("if you cant describe what you are doing as a process you dont know what youre doing",
     "if you cant describe what you are doing as a process you dont know what youre doing",
     0.0, 0.0),
    ("one measurement is worth a thousand expert opinions",
     "one  is worth thousand opinions",
     0.375, 0.392)
]:
    wer = calc_wer(target, pred)
    cer = calc_cer(target, pred)
    assert np.isclose(wer, expected_wer, atol=1e-3), f"true: {target}, pred: {pred}, expected wer {expected_wer} != your wer {wer}"
    assert np.isclose(cer, expected_cer, atol=1e-3), f"true: {target}, pred: {pred}, expected cer {expected_cer} != your cer {cer}"


Task: come up with such a pair of target-prediction texts, so the
1) WER > 1.0
2) CER > WER

In [19]:
# 1) WER > 1.0
target, prediction = "a" , "a b bdb"
assert calc_wer(target, prediction) > 1.0

# 2) CER > WER
# your code here
target, prediction = "cat", "abcs"
assert calc_wer(target, prediction) < calc_cer(target, prediction)


1.0
1.3333333333333333


# Beam search
Task: implement beam-search on CTC outputs

In [20]:
# Load precomputed CTC output
with open('lj_batch.pickle', 'rb') as f:
    batch = pickle.load(f)

# log probabilities of softmax layers [batch_size, T, vocab_size]
log_probs = batch["log_probs"]

# Dictionary with index to character mapping
ind2char = batch["ind2char"]

true_texts = batch["text"]

In [28]:
from collections import defaultdict
from tqdm import tqdm
def extand_and_merge(frame, state, ind2char):
  new_state = defaultdict(float)
  for next_ind, next_proba in enumerate(frame):
    for (pref, last_char), pref_proba in state.items():
      next_char = ind2char[next_ind]
      if next_char == last_char:
        new_pref = pref
      else:
        if next_char != EMPTY_TOK:
          new_pref = pref + next_char
        else:
          new_pref = pref
        last_char = next_char
      new_state[(new_pref, last_char)] += pref_proba * next_proba
  return new_state

def truncate(state, beam_size):
    state_list = list(state.items())
    state_list.sort(key=lambda x: -x[1])
    return dict(state_list[:beam_size])


def ctc_beam_search(probs, beam_size, ind2char):
    state = {("", EMPTY_TOK) : 1.0}
    for frame in probs:
      state = extand_and_merge(frame, state, ind2char)
      state = truncate(state, beam_size)
    return [[v[0][0], v[-1]] for v in list(state.items())]
bs_results = []
for log_probs_line in log_probs:
    bs_results.append(ctc_beam_search(log_probs_line.exp().numpy(), 100, ind2char))

In [29]:

for i in range(len(true_texts)):
    beam_search_hypos = bs_results[i][:3]
    true_text = true_texts[i]
    argmax_text = ctc_decode(log_probs[i].numpy().argmax(-1), ind2char)
    print("True: ", true_text)
    print(f"Argmax: {argmax_text} --- (CER: {calc_cer(true_text, argmax_text):.3f})")
    for ind, (hypo, score) in enumerate(beam_search_hypos):
        print(f"{ind+1}) '{hypo}' --- (CER: {calc_cer(true_text, hypo):.3f})")
    print('-' * 100)

True:  he would go to her and tell her all his family complications
Argmax: h e   w l d   g e   t o h e r   i a n d   t e l   h e r   a l   m h i s a n   l y   o m b l i c a t i o n s --- (CER: 0.983)
1) 'he wl ge to her iand tell her all hisan ly omblications' --- (CER: 0.183)
2) 'he wl ge to her and tell her all hisan ly omblications' --- (CER: 0.167)
3) 'he wl ge to her iand tell her all hisanly omblications' --- (CER: 0.183)
----------------------------------------------------------------------------------------------------
True:  he did not say the last as a boast but merely as an assurance to the liveryman who he saw was anxious on his account
Argmax: h e   d i d   n o t   s a d   t h e   l a s t   i s   a   b o s t   b u t   m e a r l i o v e s   a n   a s u r a n c e   t o   t h e   l i v e r y   m a n   w h o   r e   s a w   w a s   a n x e s   o n   h i s   a c o u n t --- (CER: 1.043)
1) 'he did not say the last is a bost but merli oves an a surance to the livery man who re 