# DTW TEST
The DTW module contains implementations of string-based dynamic programming (DP); its main purpose is the evaluation of speech recognition systems   
+ DP routines
    - levenshtein(): 
        - computes the vanilla Levenshtein distance, i.e. #S+#I+#D
        - no backtracking, hence only composite distance / error rate 
    - wedit(): 
        - computes a weighted edit distance
        - allows for prior normalization
        - allows Substitutions, Insertions, Deletions 
        - returns the alignment and #S, #I, #D separated out
    - both routines take lists of tokens as inputs, hence they are applicable to both word and character tokens 

In [1]:
# do all the imports
import numpy as np
import pandas as pd
import timeit
from pyspch import dtw 
#import Normalizer as Norm
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
# Let every expression statement in a cell produce output, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# help(dtw)

In [2]:
# Inject a CSS snippet that sets the widths of the notebook container,
# the menubar and the main toolbar.
notebook_css = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=notebook_css))

## Some Extra Utilities
- print_edist_results(): for printing alignment and scores of DTW matching
- score_corpus(): for global scoring of a corpus given by a list of paired sentences [reference,test]


In [27]:
def print_edist_results(cts=None,df_align=None,trellis=None,Display=True):
    """Print the results of an edit-distance match, section by section.

    A section is only produced when the corresponding argument is not None.

    Parameters
    ----------
    cts : sequence of 5 numbers, optional
        Filled into the score line in this order: #S, #I, #D, number of
        tokens, error rate (printed with a trailing '%').
        Any sequence is accepted (tuple, list, array).
    df_align : optional
        Alignment table (presumably a pandas DataFrame -- it must provide
        ``.T``).  It is shown transposed with the pandas row/column display
        limits lifted.
    trellis : optional
        Trellis to show below the " == TRELLIS == " header.
    Display : bool, default True
        When True objects are rendered with IPython's ``display()``,
        otherwise with plain ``print()``.
    """
    def show(obj):
        # Single place that decides between rich and plain-text rendering.
        if Display:
            display(obj)
        else:
            print(obj)

    if trellis is not None:
        print(" == TRELLIS == ")
        # Previously only the header was printed; show the trellis itself too.
        show(trellis)
    if df_align is not None:
        print("\n == ALIGNMENT == ")
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            show(df_align.T)
    if cts is not None:
        print("\n == SCORE ==")
        # %-formatting with several fields needs a real tuple; a list or
        # array would otherwise be treated as one single argument.
        print("#S=%d, #I=%d, #D=%d for %d tokens \nErr=%.2f%%" % tuple(cts))
        
def score_corpus(corpus, Verbose = False, Display = True):
    """Score a whole corpus and print the pooled error counts and error rate.

    Parameters
    ----------
    corpus : iterable of [reference, test] pairs
        Each element is a pair of sentences; both are tokenized with
        ``dtw.tokenizer`` and matched with ``dtw.wedit(test, reference)``.
    Verbose : bool, default False
        When True, also print the token lists and the alignment of every pair.
    Display : bool, default True
        Forwarded to ``print_edist_results`` to choose between IPython
        ``display()`` and plain ``print()`` for the alignment tables.

    Returns
    -------
    None -- all results are printed.
    """
    Nsub = Nins = Ndel = Ntot = 0
    for reference, result in corpus:
        ref = dtw.tokenizer(reference)
        tst = dtw.tokenizer(result)
        df_align, cts, _ = dtw.wedit(tst,ref)

        if Verbose:
            print("Input(reference):",ref)
            print("Output(test):    ",tst)
            # Honour the caller's Display choice (it used to be ignored).
            print_edist_results(df_align=df_align, Display=Display)

        # cts[0..3] are accumulated as #S, #I, #D and token count.
        Nsub += cts[0]
        Nins += cts[1]
        Ndel += cts[2]
        Ntot += cts[3]

    print("\n ++ CORPUS RESULTS ++ ")
    print("\n#S=%d, #I=%d, #D=%d for %d tokens" % (Nsub,Nins,Ndel,Ntot) )
    if Ntot > 0:
        print("Error Rate: %5.2f%%" % (100.*(Nsub+Nins+Ndel)/Ntot)  )
    else:
        # Empty corpus (or no tokens at all): avoid dividing by zero.
        print("Error Rate: undefined (0 tokens)")

### Levenshtein and Weighted Edit Distance for string matching (tokens=:characters)
- a character string is essentially a list of characters and can be passed directly as the input or the reference sequence
- change Verbose to True to see all essential internal data

In [28]:
# Character-level match: the strings themselves serve as token sequences.
ref, tst = "boeken", "broekske"
# y holds the reference, x the test sequence
y, x = ref, tst

# plain Levenshtein distance (single composite number)
levdist = dtw.levenshtein(x, y)
print("Levenshtein Distance: ",levdist)

# weighted edit distance: alignment plus separate counts
alignment, cts, _ = dtw.wedit(x, y, Verbose=False)
print_edist_results(cts=cts, df_align=alignment)


Levenshtein Distance:  4.0

 == ALIGNMENT == 


Unnamed: 0,0,1,2,3,4,5,6,7,8
x,b,r,o,e,k,s,k,e,_
y,b,_,o,e,_,_,k,e,n
O,M,I,M,M,I,I,M,M,D



 == SCORE ==
#S=0, #I=3, #D=1 for 6 tokens 
Err=66.67%


### Levenshtein and Weighted Edit Distance for sentence matching (tokens=:words)
The tokenizer used is simply the default Python split(), with optional conversion to lower case   

In [29]:
# Sentence matching: first on character tokens, then on word tokens.
# (optional) pd.set_option('display.max_rows', None, 'display.max_columns', None)
ref = "to recognize speech is the topic of this course"
tst =  "to wreck a nice beach seems of this month"
print("Input(reference):",ref)
print("Output(test):    ",tst)

# --- character tokens: the raw sentences are passed as-is ---
print("\n--- Character DTW Match on Sentence ----- ")
y, x = ref, tst
levdist = dtw.levenshtein(x, y)
print("Levenshtein Distance: ",levdist,"\n")

alignment, cts, _ = dtw.wedit(x, y, Verbose=False)
print_edist_results(cts=cts, df_align=alignment)

# --- word tokens: sentences go through the tokenizer first ---
print("\n--- Word DTW Match on Sentence ----- ")
y, x = dtw.tokenizer(ref), dtw.tokenizer(tst)
levdist = dtw.levenshtein(x, y)
print("Levenshtein Distance: ",levdist,"\n")

alignment, cts, _ = dtw.wedit(x, y, Verbose=False)
print_edist_results(cts=cts, df_align=alignment)

Input(reference): to recognize speech is the topic of this course
Output(test):     to wreck a nice beach seems of this month

--- Character DTW Match on Sentence ----- 
Levenshtein Distance:  24.0 


 == ALIGNMENT == 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
x,t,o,,w,r,e,c,k,,a,,n,i,c,e,,_,b,e,a,c,h,,_,s,_,_,_,e,_,_,_,e,m,s,,o,f,,t,h,i,s,,m,o,_,n,t,h
y,t,o,,_,r,e,c,_,_,o,g,n,i,z,e,,s,p,e,e,c,h,,i,s,,t,h,e,,t,o,p,i,c,,o,f,,t,h,i,s,,c,o,u,r,s,e
O,M,M,M,I,M,M,M,I,I,S,S,M,M,S,M,M,D,S,M,S,M,M,M,D,M,D,D,D,M,D,D,D,S,S,S,M,M,M,M,M,M,M,M,M,S,M,D,S,S,S



 == SCORE ==
#S=12, #I=3, #D=9 for 47 tokens 
Err=51.06%

--- Word DTW Match on Sentence ----- 
Levenshtein Distance:  6.0 


 == ALIGNMENT == 


Unnamed: 0,0,1,2,3,4,5,6,7,8
x,to,wreck,a,nice,beach,seems,of,this,month
y,to,recognize,speech,is,the,topic,of,this,course
O,M,S,S,S,S,S,M,M,S



 == SCORE ==
#S=6, #I=0, #D=0 for 9 tokens 
Err=66.67%


In [30]:
# Small test corpus: a list of [reference, test] sentence pairs.
corpus1 = [
    [
        "ASTRONOMERS SAY THAT THE EARTH'S FATE IS SEALED",
        " MR. ARMOUR SAY THAT THE EARTH'S FETISH  SEALED",
    ],
    [
        "GLASNOST HAS ALSO BEEN GOOD TO LAWRENCE LEIGHTON SMITH",
        "CLASS NOSED HAD ALSO BEEN GOOD TO LAWRENCE FLEET AND SMITH",
    ],
    [
        "AS MS. KENDALL AND MR. LOUW SEE IT SOUTH AFRICA'S CENTRAL GOVERNMENT IS LIKE A BIG LUMBERING TANK",
        "AS MS. SCANDAL AND MR. LOWE C. AT SOUTH AFRICA'S CENTRAL GOVERNMENT IS LIKE A BIG LUMBER INK TANK",
    ],
    [
        "MR. WANG IS RELATIVELY YOUNG FOR HIS JOB UPSETTING OLDER COLLEAGUES",
        "MR. WANG IS RELATIVELY YOUNG FIRST JOB KIND OF SETS OLDER COLLEAGUES",
    ],
]


In [31]:
# Score the whole corpus, printing the alignment of every sentence pair.
score_corpus(corpus=corpus1, Verbose=True)

Input(reference): ['ASTRONOMERS', 'SAY', 'THAT', 'THE', "EARTH'S", 'FATE', 'IS', 'SEALED']
Output(test):     ['MR.', 'ARMOUR', 'SAY', 'THAT', 'THE', "EARTH'S", 'FETISH', 'SEALED']

 == ALIGNMENT == 


Unnamed: 0,0,1,2,3,4,5,6,7,8
x,MR.,ARMOUR,SAY,THAT,THE,EARTH'S,_,FETISH,SEALED
y,_,ASTRONOMERS,SAY,THAT,THE,EARTH'S,FATE,IS,SEALED
O,I,S,M,M,M,M,D,S,M


Input(reference): ['GLASNOST', 'HAS', 'ALSO', 'BEEN', 'GOOD', 'TO', 'LAWRENCE', 'LEIGHTON', 'SMITH']
Output(test):     ['CLASS', 'NOSED', 'HAD', 'ALSO', 'BEEN', 'GOOD', 'TO', 'LAWRENCE', 'FLEET', 'AND', 'SMITH']

 == ALIGNMENT == 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
x,CLASS,NOSED,HAD,ALSO,BEEN,GOOD,TO,LAWRENCE,FLEET,AND,SMITH
y,_,GLASNOST,HAS,ALSO,BEEN,GOOD,TO,LAWRENCE,_,LEIGHTON,SMITH
O,I,S,S,M,M,M,M,M,I,S,M


Input(reference): ['AS', 'MS.', 'KENDALL', 'AND', 'MR.', 'LOUW', 'SEE', 'IT', 'SOUTH', "AFRICA'S", 'CENTRAL', 'GOVERNMENT', 'IS', 'LIKE', 'A', 'BIG', 'LUMBERING', 'TANK']
Output(test):     ['AS', 'MS.', 'SCANDAL', 'AND', 'MR.', 'LOWE', 'C.', 'AT', 'SOUTH', "AFRICA'S", 'CENTRAL', 'GOVERNMENT', 'IS', 'LIKE', 'A', 'BIG', 'LUMBER', 'INK', 'TANK']

 == ALIGNMENT == 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
x,AS,MS.,SCANDAL,AND,MR.,LOWE,C.,AT,SOUTH,AFRICA'S,CENTRAL,GOVERNMENT,IS,LIKE,A,BIG,LUMBER,INK,TANK
y,AS,MS.,KENDALL,AND,MR.,LOUW,SEE,IT,SOUTH,AFRICA'S,CENTRAL,GOVERNMENT,IS,LIKE,A,BIG,_,LUMBERING,TANK
O,M,M,S,M,M,S,S,S,M,M,M,M,M,M,M,M,I,S,M


Input(reference): ['MR.', 'WANG', 'IS', 'RELATIVELY', 'YOUNG', 'FOR', 'HIS', 'JOB', 'UPSETTING', 'OLDER', 'COLLEAGUES']
Output(test):     ['MR.', 'WANG', 'IS', 'RELATIVELY', 'YOUNG', 'FIRST', 'JOB', 'KIND', 'OF', 'SETS', 'OLDER', 'COLLEAGUES']

 == ALIGNMENT == 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
x,MR.,WANG,IS,RELATIVELY,YOUNG,_,FIRST,JOB,KIND,OF,SETS,OLDER,COLLEAGUES
y,MR.,WANG,IS,RELATIVELY,YOUNG,FOR,HIS,JOB,_,_,UPSETTING,OLDER,COLLEAGUES
O,M,M,M,M,M,D,S,M,I,I,S,M,M



 ++ CORPUS RESULTS ++ 

#S=12, #I=6, #D=2 for 46 tokens
Error Rate: 43.48%
