<a href="https://colab.research.google.com/github/duckyngo/Word-Error-Rate-Visualization-with-Colab/blob/main/WERCalculationAndVisualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import json
from IPython.display import HTML, display
try:
  import jiwer
except:
  !pip install jiwer
  import jiwer

def set_css(info=None):
    """Display a ``<style>`` element that makes ``<pre>`` output wrap lines.

    Args:
        info: Value IPython passes to ``pre_run_cell`` callbacks; unused here.
            It is optional so the function works both when called with no
            arguments and when the event manager forwards its argument.
    """
    display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Turn on line wrapping for cell output on Colab: set_css is registered as a
# 'pre_run_cell' callback so the CSS is displayed again before each cell runs.
# Ref: https://github.com/jupyter/notebook/issues/6274
get_ipython().events.register('pre_run_cell', set_css)


def wer(ref, hyp, debug=False):
    """Compute the word error rate (WER) of a hypothesis against a reference.

    Both strings are split on whitespace and aligned with a word-level
    Levenshtein table in which a substitution, an insertion and a deletion
    each cost 1. The best alignment is then traced back to count every kind
    of operation.

    Args:
        ref: Reference transcript.
        hyp: Hypothesis transcript.
        debug: When true, print the colour-coded alignment (each piece
            followed by a space) and return its pieces. Matches are black,
            substitutions are the hypothesis word in green followed by the
            reference word in parentheses, insertions are blue and deletions
            are red.

    Returns:
        A ``(stats, compares)`` tuple. ``stats`` is a dict with the keys
        ``'WER'`` (rounded to 3 decimals), ``'Cor'``, ``'Sub'``, ``'Ins'`` and
        ``'Del'``. ``compares`` is the list of coloured alignment pieces in
        reading order when ``debug`` is true, otherwise an empty list.

        When the reference has no words, ``'WER'`` is ``0.0`` if the
        hypothesis is empty too and ``float('inf')`` otherwise.
    """
    ref_words = ref.split()
    hyp_words = hyp.split()
    n_ref = len(ref_words)
    n_hyp = len(hyp_words)

    OP_OK, OP_SUB, OP_INS, OP_DEL = range(4)
    DEL_PENALTY = 1
    INS_PENALTY = 1
    SUB_PENALTY = 1

    # costs[i][j]: edit cost between the first i reference words and the
    # first j hypothesis words. backtrace[i][j]: operation chosen for the cell.
    costs = [[0] * (n_hyp + 1) for _ in range(n_ref + 1)]
    backtrace = [[OP_OK] * (n_hyp + 1) for _ in range(n_ref + 1)]

    # First column: reach an empty hypothesis by deleting reference words.
    for i in range(1, n_ref + 1):
        costs[i][0] = DEL_PENALTY * i
        backtrace[i][0] = OP_DEL

    # First row: reach the hypothesis from an empty reference by inserting.
    for j in range(1, n_hyp + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    for i in range(1, n_ref + 1):
        for j in range(1, n_hyp + 1):
            if ref_words[i - 1] == hyp_words[j - 1]:
                costs[i][j] = costs[i - 1][j - 1]
                backtrace[i][j] = OP_OK
                continue

            substitution_cost = costs[i - 1][j - 1] + SUB_PENALTY
            insertion_cost = costs[i][j - 1] + INS_PENALTY
            deletion_cost = costs[i - 1][j] + DEL_PENALTY

            best = min(substitution_cost, insertion_cost, deletion_cost)
            costs[i][j] = best
            # Ties are resolved in the order substitution, insertion, deletion.
            if best == substitution_cost:
                backtrace[i][j] = OP_SUB
            elif best == insertion_cost:
                backtrace[i][j] = OP_INS
            else:
                backtrace[i][j] = OP_DEL

    # Walk back from the bottom-right corner along the chosen operations.
    i = n_ref
    j = n_hyp
    num_sub = 0
    num_del = 0
    num_ins = 0
    num_cor = 0
    # Always defined so the return value is valid when debug is off.
    compares = []
    while i > 0 or j > 0:
        op = backtrace[i][j]
        if op == OP_OK:
            num_cor += 1
            i -= 1
            j -= 1
            if debug:
                compares.append(colored(0, 0, 0, hyp_words[j]))
        elif op == OP_SUB:
            num_sub += 1
            i -= 1
            j -= 1
            if debug:
                compares.append(
                    colored(0, 255, 0, hyp_words[j])
                    + colored(0, 0, 0, f'({ref_words[i]})')
                )
        elif op == OP_INS:
            num_ins += 1
            j -= 1
            if debug:
                compares.append(colored(0, 0, 255, hyp_words[j]))
        elif op == OP_DEL:
            num_del += 1
            i -= 1
            if debug:
                compares.append(colored(255, 0, 0, ref_words[i]))

    # The trace runs end-to-start; flip it in place so the caller receives a
    # reusable list in reading order rather than a one-shot iterator.
    compares.reverse()
    if debug:
        for piece in compares:
            print(piece, end=" ")

    num_errors = num_sub + num_del + num_ins
    if n_ref:
        wer_result = round(num_errors / n_ref, 3)
    else:
        # No reference words: avoid dividing by zero.
        wer_result = 0.0 if num_errors == 0 else float('inf')

    stats = {
        'WER': wer_result,
        'Cor': num_cor,
        'Sub': num_sub,
        'Ins': num_ins,
        'Del': num_del,
    }
    return stats, compares


def colored(r, g, b, text):
    """Wrap *text* in a 24-bit ANSI foreground colour escape sequence.

    The result is the colour-setting escape for ``(r, g, b)``, the text, one
    space, and finally an escape that switches the foreground to white
    (255, 255, 255).
    """
    set_colour = f"\033[38;2;{r};{g};{b}m"
    back_to_white = "\033[38;2;255;255;255m"
    return f"{set_colour}{text} {back_to_white}"

def strike(text, color=None):
    """Decorate every character of *text* with a combining mark and colour it.

    With a truthy ``color`` each character is prefixed with U+0336 and the
    result is coloured (0, 255, 0); otherwise each character is prefixed with
    U+0332 and the result is coloured (0, 0, 0).
    """
    if color:
        mark, rgb = u'\u0336', (0, 255, 0)
    else:
        mark, rgb = u'\u0332', (0, 0, 0)

    decorated = ''.join(f'{mark}{ch}' for ch in text)
    return colored(*rgb, decorated)

#@title Calculate and visualize WER { run: "auto" }

remove_punctuation = True #@param {type:"boolean"}
input_ref='\uB2EC\uBCF4\uB2E4 \uC774 \uC810 \uCE60 \uD37C\uC13C\uD2B8\uD3EC\uC778\uD2B8 \uB0AE\uC544\uC84C\uC2B5\uB2C8\uB2E4 \uD314\uC2ED \uD37C\uC13C\uD2B8\uB97C \uB118\uC5B4 \uD3EC\uD654 \uC0C1\uD0DC\uC600\uB358 \uC218\uB3C4\uAD8C \uC911\uC9C4 \uBCD1\uC0C1 \uAC00\uB3D9\uB960\uB3C4 \uC721\uC2ED\uC774 \uC810 \uC0BC \uD37C\uC13C\uD2B8\uB85C \uB5A8\uC5B4\uC84C\uC2B5\uB2C8\uB2E4 \uC9C0\uB09C\uB2EC \uC0BC\uC2ED\uC77C \uCE60\uC2ED \uD37C\uC13C\uD2B8 \uC544\uB798\uB85C \uB5A8\uC5B4\uC9C4 \uB4A4 \uB098\uD758 \uC5F0\uC18D \uD558\uB77D \uC149\uB2C8\uB2E4 \uC9C0\uB09C\uB2EC \uC774\uC2ED \uAD6C\uC77C \uD558\uB8E8\uC5D0\uB9CC \uC624\uBC31 \uC5EC\uB4E0 \uBA85\uC0C1\uC744 \uD655\uCDA9\uD558\uB294 \uB4F1 \uBCD1\uC0C1 \uBD80\uC871 \uC0C1\uD669\uC774 \uB098\uC544\uC9C0\uBA74\uC11C \uC804\uAD6D\uC5D0\uC11C \uD558\uB8E8 \uB118\uAC8C \uC785\uC6D0\uC744 \uAE30\uB2E4\uB9AC\uB294 \uC0AC\uB78C\uC740 \uB2F7\uC0C8\uC9F8 \uD55C \uBA85\uB3C4 \uC5C6\uC5C8\uC2B5\uB2C8\uB2E4 \uBC29\uC5ED \uB2F9\uAD6D\uC740 \uC774\uB2EC \uB9D0\uAE4C\uC9C0 \uC911\uC99D \uD658\uC790 \uBCD1\uC0C1 \uCC9C \uC624\uBC31 \uC77C\uD754 \uC5EC\uB35F\uAC1C\uB97C \uBE44\uB86F\uD574 \uC721\uCC9C \uAD6C\uBC31 \uB9C8\uD754 \uB124\uAC1C\uC758 \uC785\uC6D0 \uBCD1\uC0C1\uC744 \uD655\uCDA9\uD574 \uD558\uB8E8 \uD655\uC9C4 \uB9CC \uBA85\uC5D0\uB3C4 \uB300\uC751\uD558\uACA0\uB2E4\uB294 \uBAA9\uD45C\uC785\uB2C8\uB2E4' #@param {type:"string"}
input_hyp='\uB2EC\uBCF4\uB2E4 \uC774 \uC810 \uCE60 \uD37C\uC13C\uD2B8\uD3EC\uC778\uD2B8 \uB0AE\uC544\uC84C\uC2B5\uB2C8\uB2E4 \uD314\uC2ED \uD37C\uC13C\uD2B8\uB97C \uB118\uC5B4 \uD3EC\uD55C \uC0C1\uD0DC\uC600\uB358 \uC218\uB3C4\uAD8C \uC911\uC99D\uBCD1\uC0C1 \uAC00\uB3C5\uB960\uB3C4 \uC721\uC2ED\uC774 \uC810 \uC0BC \uD37C\uC13C\uD2B8\uB85C \uB5A8\uC5B4\uC84C\uC2B5\uB2C8\uB2E4 \uC9C0\uB09C\uB2EC \uC0BC\uC2ED\uC77C \uCE60\uC2ED \uD37C\uC13C\uD2B8 \uC544\uB798\uB85C \uB5A8\uC5B4\uC9C4 \uB4A4 \uB098\uD750\uC5F0 \uC18D \uD558\uB77D\uD230\uB2C8\uB2E4 \uC9C0\uB09C \uB2EC \uC774\uC2ED \uAD6C\uC77C \uD558\uB8E8\uC5D0\uB9CC \uC624\uBC31 \uC5EC\uB4E0 \uBCD1\uC0C1\uC744 \uD655\uCDA9\uD558\uB294 \uB4F1 \uBCD1\uC0C1 \uBD80\uC871 \uC0C1\uD669\uC774 \uB098\uC544\uC9C0\uBA74\uC11C \uC804\uAD6D\uC5D0\uC11C \uD558\uB8E8 \uB118\uAC8C \uC774 \uBC88\uC744 \uAE30\uB2E4\uB9AC\uB294 \uC0AC\uB78C\uC740 \uB2E4 \uC14B\uC9F8 \uD55C \uBA85\uB3C4 \uC5C6\uC5C8\uC2B5\uB2C8\uB2E4 \uBC29\uC5ED \uB2E8\uAD6D\uC740 \uC774\uB2EC \uB9D0\uAE4C\uC9C0 \uC911\uC99D \uD658\uC790 \uBCD1\uC0C1 \uCC9C \uC624\uBC31 \uC77C\uD754 \uC5EC\uB35F\uAC1C\uB97C \uBE44\uC211\uD574 \uC721\uCC9C \uAD6C\uBC31 \uB9C8\uD754 \uB124 \uAC1C\uC5D0 \uC774\uBC88 \uBCD1\uC0C1\uC744 \uD655\uC911\uD574 \uD558\uB8E8 \uD655\uC9C4 \uB9CC\uBA85\uD574\uB3C4 \uB300\uC751\uD558\uACA0\uB2E4\uB294 \uBAA9\uD45C(\uC785)\uB2C8\uB2E4' #@param {type:"string"}
input_json = "" #@param {type:"string"}

# A non-empty JSON form field overrides the reference/hypothesis fields:
# its 'text' entry becomes the reference and 'pred_text' the hypothesis.
if input_json:
    json_data = json.loads(input_json)
    input_ref = json_data['text']
    input_hyp = json_data['pred_text']


# Strip punctuation from both transcripts before scoring when the
# `remove_punctuation` form option is enabled.
if remove_punctuation:
    ref = jiwer.RemovePunctuation()(input_ref)
    hyp = jiwer.RemovePunctuation()(input_hyp)
else:
    ref = input_ref
    hyp = input_hyp

print(f"REF: {ref}\n")
print(f"HYP: {hyp}")
print('-'* 30)

# debug=True makes wer() print the colour-coded alignment before the summary.
output, compares = wer(ref, hyp, debug=True)

# Summary counts, using the same colours as the alignment printed above.
print()
print(colored(0, 0, 0,   f"N CORRECT   : {output['Cor']}"))
print(colored(255, 0, 0, f"N DELETE    : {output['Del']}"))
print(colored(0, 255, 0, f"N SUBSTITUTE: {output['Sub']}"))
print(colored(0, 0, 255, f"N INSERT    : {output['Ins']}"))
print(colored(0, 0, 0, f"WER: {output['WER']}"))

REF: 달보다 이 점 칠 퍼센트포인트 낮아졌습니다 팔십 퍼센트를 넘어 포화 상태였던 수도권 중진 병상 가동률도 육십이 점 삼 퍼센트로 떨어졌습니다 지난달 삼십일 칠십 퍼센트 아래로 떨어진 뒤 나흘 연속 하락 셉니다 지난달 이십 구일 하루에만 오백 여든 명상을 확충하는 등 병상 부족 상황이 나아지면서 전국에서 하루 넘게 입원을 기다리는 사람은 닷새째 한 명도 없었습니다 방역 당국은 이달 말까지 중증 환자 병상 천 오백 일흔 여덟개를 비롯해 육천 구백 마흔 네개의 입원 병상을 확충해 하루 확진 만 명에도 대응하겠다는 목표입니다

HYP: 달보다 이 점 칠 퍼센트포인트 낮아졌습니다 팔십 퍼센트를 넘어 포한 상태였던 수도권 중증병상 가독률도 육십이 점 삼 퍼센트로 떨어졌습니다 지난달 삼십일 칠십 퍼센트 아래로 떨어진 뒤 나흐연 속 하락툰니다 지난 달 이십 구일 하루에만 오백 여든 병상을 확충하는 등 병상 부족 상황이 나아지면서 전국에서 하루 넘게 이 번을 기다리는 사람은 다 셋째 한 명도 없었습니다 방역 단국은 이달 말까지 중증 환자 병상 천 오백 일흔 여덟개를 비숑해 육천 구백 마흔 네 개에 이번 병상을 확중해 하루 확진 만명해도 대응하겠다는 목표입니다
------------------------------
[38;2;0;0;0m달보다 [38;2;255;255;255m [38;2;0;0;0m이 [38;2;255;255;255m [38;2;0;0;0m점 [38;2;255;255;255m [38;2;0;0;0m칠 [38;2;255;255;255m [38;2;0;0;0m퍼센트포인트 [38;2;255;255;255m [38;2;0;0;0m낮아졌습니다 [38;2;255;255;255m [38;2;0;0;0m팔십 [38;2;255;255;255m [38;2;0;0;0m퍼센트를 [38;2;255;255;255m [38;2;0;0;0m넘어 [38;2;255;255;255m [38;2;0;255;0m포한 [38;2;255;255;255m[38