# DP1 TEST
Testing the dp1 dp routines
+ DP routines
    - levenshtein(): 
        - computes the vanilla Levenshtein distance, i.e. #S+#I+#D
        - no backtracking, hence only composite distance / error rate 
    - dtw(): 
        - computes a weighted edit distance
        - allows for prior normalization
        - allowing Substitutions, Insertion, Deletions 
        - returns the alignment and #S, #I, #D separated out
    - both routines take lists of tokens as inputs, hence applicable to both word or character tokens 
    
24/03/2022:  not fully functional yet in v0.6

In [1]:
#!pip install git+https://github.com/compi1234/pyspch.git
try:
    import pyspch
except ModuleNotFoundError:
    try:
        print(
        """
        To enable this notebook on platforms as Google Colab, 
        install the pyspch package and dependencies by running following code:

        !pip install git+https://github.com/compi1234/pyspch.git
        """
        )
    except ModuleNotFoundError:
        raise

In [2]:
# do all the imports
import numpy as np
import pandas as pd
import timeit
import pyspch.dp1 as dtw
#import Normalizer as Norm
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# help(dtw)

In [3]:
display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [4]:
help(dtw)

Help on module pyspch.dp1 in pyspch:

NAME
    pyspch.dp1

DESCRIPTION
    The modules in `dtw.py` contain basic implementations of Levenshtein and Weighted Edit Distance DP matching
    The main purpose is for didactic demonstrations of  small systems 
    
    Created on Jan 13, 2021
            
    @author: compi

FUNCTIONS
    alignment_to_counts(df)
        count nSUB/nINS/nDEL, nTOT and Err from an alignment dataframe
        Parameters:
        -----------
                    df  type DataFrame, alignment as provided e.g. by wedit()
        Returns:
        --------
                    (nsub,nins,ndel,ntot,err)   counts of SUB/INS/DEL and TOT and Err in %
    
    edit_distance(x=[], y=[], wS=1.0, wI=1.0, wD=1.0, Verbose=False)
        Weighted Edit Distance by DTW aligment allowing for SUB/INS/DEL
        
        Parameters
        ----------
        x : list (or str) 
            tokens in hypothesis/test
        y : list (or str)
            tokens in reference
        
   

## Some Extra Utilities
- print_results(): for printing alignment and scores of DTW matching
- score_corpus(): for global scoring of a corpus given by a list of paired sentences [reference,test]


In [5]:
def print_edist_results(cts=None,df_align=None,trellis=None,Display=True):
    if trellis is not None:
        print(" == TRELLIS == ")
    if df_align is not None:
        print("\n == ALIGNMENT == ")
        with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
            if(Display):
                display(df_align.T)
            else:
                print(df_align.T)
    if cts is not None:
        print("\n == SCORE ==")
        print("#S=%d, #I=%d, #D=%d for %d tokens \nErr=%.2f%%" % cts )
        
def score_corpus(corpus, Verbose = False, Display = True):
    Nsub = 0
    Nins = 0
    Ndel = 0
    Ntot = 0
    for [reference,result] in corpus:
        ref = dtw.tokenizer(reference)
        hyp = dtw.tokenizer(result)
        df_align, cts, _ = dtw.wedit(hyp,ref)
        
        if(Verbose):
            print("Reference:",ref)
            print("Output(test):    ",hyp)
            print_edist_results(df_align=df_align)
        
        Nsub += cts[0]
        Nins += cts[1]
        Ndel += cts[2]
        Ntot += cts[3]

    print("\n ++ CORPUS RESULTS ++ ")
    print("\n#S=%d, #I=%d, #D=%d for %d tokens" % (Nsub,Nins,Ndel,Ntot) )
    print("Error Rate: %5.2f%%" % (100.*(Nsub+Nins+Ndel)/Ntot)  )

### Levenshtein and Weighted Edit Distance for string matching (tokens=:characters)
- character strings are essentially a list of characters and can be passed directly as input and reference sequence
- change Verbose to True to see all essential internal data

In [6]:
ref = "boeken"
hyp =  "broekske"
y = ref
x = hyp
levdist = dtw.lev_distance(x,y)
print("Levenshtein Distance: ",levdist)
#
dist,alignment,cts,_ = dtw.edit_distance(x,y,Verbose=False)
print_edist_results(df_align=alignment,cts=cts)


Levenshtein Distance:  4.0

 == ALIGNMENT == 


Unnamed: 0,x,y,O
0,b,b,M
1,r,_,I
2,o,o,M
3,e,e,M
4,k,k,M
5,s,_,I
6,k,e,S
7,e,n,S



 == SCORE ==
#S=2, #I=2, #D=0 for 6 tokens 
Err=66.67%


### Levenshtein and Weighted Edit Distance for sentence matching (tokens=:words)
The tokenizer used is simply the default Python split(), with optional conversion to lower case   

In [7]:
# pd.set_option('display.max_rows', None, 'display.max_columns', None)
# sentence matching
ref = "to recognize speech is the topic of this course"
hyp =  "to wreck a nice beach seems of this month"
print("Input(reference):",ref)
print("Output(test):    ",hyp)
#
print("\n--- Character DTW Match on Sentence ----- ")
y = ref
x = hyp
levdist = dtw.lev_distance(x,y)
print("Levenshtein Distance: ",levdist,"\n")

dist,alignment,cts,_ = dtw.edit_distance(x,y,Verbose=False)
print_edist_results(df_align=alignment,cts=cts)

print("\n--- Word DTW Match on Sentence ----- ")
y = dtw.tokenizer(ref)
x = dtw.tokenizer(hyp)
levdist = dtw.lev_distance(x,y)
print("Levenshtein Distance: ",levdist,"\n")

dist, alignment,cts,_ = dtw.edit_distance(x,y,Verbose=False)
print_edist_results(df_align=alignment,cts=cts)

Input(reference): to recognize speech is the topic of this course
Output(test):     to wreck a nice beach seems of this month

--- Character DTW Match on Sentence ----- 
Levenshtein Distance:  24.0 


 == ALIGNMENT == 


Unnamed: 0,x,y,O
0,t,t,M
1,o,o,M
2,,,M
3,w,_,I
4,r,r,M
5,e,e,M
6,c,c,M
7,k,_,I
8,,_,I
9,a,o,S



 == SCORE ==
#S=12, #I=3, #D=9 for 47 tokens 
Err=51.06%

--- Word DTW Match on Sentence ----- 
Levenshtein Distance:  6.0 


 == ALIGNMENT == 


Unnamed: 0,x,y,O
0,to,to,M
1,wreck,recognize,S
2,a,speech,S
3,nice,is,S
4,beach,the,S
5,seems,topic,S
6,of,of,M
7,this,this,M
8,month,course,S



 == SCORE ==
#S=6, #I=0, #D=0 for 9 tokens 
Err=66.67%


In [8]:
corpus1 = [ 
[ "ASTRONOMERS SAY THAT THE EARTH'S FATE IS SEALED",
  " MR. ARMOUR SAY THAT THE EARTH'S FETISH  SEALED"],
[ 'GLASNOST HAS ALSO BEEN GOOD TO LAWRENCE LEIGHTON SMITH',
  'CLASS NOSED HAD ALSO BEEN GOOD TO LAWRENCE FLEET AND SMITH'],
[ "AS MS. KENDALL AND MR. LOUW SEE IT SOUTH AFRICA'S CENTRAL GOVERNMENT IS LIKE A BIG LUMBERING TANK",
  "AS MS. SCANDAL AND MR. LOWE C. AT SOUTH AFRICA'S CENTRAL GOVERNMENT IS LIKE A BIG LUMBER INK TANK"],
[ "MR. WANG IS RELATIVELY YOUNG FOR HIS JOB UPSETTING OLDER COLLEAGUES",
  "MR. WANG IS RELATIVELY YOUNG FIRST JOB KIND OF SETS OLDER COLLEAGUES"]
]


In [9]:
score_corpus(corpus1,Verbose=True)

AttributeError: module 'pyspch.dp1' has no attribute 'wedit'