In [67]:
import string
import sys
from collections import defaultdict

In [68]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [69]:
INFINITY = 1e16

In [70]:
def separate_punctuation(line):
    tokens = []
    # Iterate over word list
    for word in line:
        # Deal with single letter words
        if len(word) == 1:
            tokens.append(word)
        else:
            # Separate punctuation from a given word
            last_char = word[-1]
            first_char = word[0]
            if last_char in string.punctuation:
                tokens += [word[:-1], last_char]
            elif first_char in string.punctuation:
                tokens += [first_char, word[1:]]
            else:
                tokens.append(word)
    return tokens
    

In [71]:
def get_bag_of_words(text):
    '''Number of occurrences of each word in a text.
    returns:
        Dictionary with words as keys and their count as the values
    '''
    bag_of_words = defaultdict(float)
    for word in text:
        bag_of_words[word] += 1
    return bag_of_words

In [72]:
def get_matches(ref, hyp):
    '''
    Get the number of matches between the words in the references text versus the hypothesis 
    text.
    '''
    match_count = 0.0
    #  Get bag-of-word dictionary for the referenes and hypothesis text
    rbow = get_bag_of_words(ref)
    hbow = get_bag_of_words(hyp)
    # Iterate through the words in the references dict
    for rw in rbow:
        # If the reference word is in the hyp bag-of-words dict keys
        if rw in hbow:
            # Count how many times we see the word in both the ref and hyp
            # This will be the minimum
            match_count += min(rbow[rw], hbow[rw])
    
    return match_count

In [73]:
def prec_rec(match_count, length):
    '''This function will return the precision or recall, depending on length'''
    precrec = 100 * match_count/length
    return precrec

In [74]:
rtxt = open("example.ref", "r")
htxt = open("example.hyp", "r")

In [75]:
for rline, hline in zip(rtxt, htxt):
    ref = separate_punctuation(rline.strip().split())
    hyp = separate_punctuation(hline.strip().split())
    
    match_count = get_matches(ref, hyp)
    
    precision = prec_rec(match_count, len(hyp))
    recall = prec_rec(match_count, len(ref))
    
    print(rline.strip())
    print(hline.strip())
    sys.stdout.write("matches: %i\nprecision:\t%.2f\nrecall: \t%.2f\n\n\n" % (match_count, precision, recall))


It will be a sort of bridge.
It will sort of bridge be.
matches: 7
precision:	100.00
recall: 	87.50


It will be considered as a sort of bridge.
It will sort of bridge be considered as.
matches: 9
precision:	100.00
recall: 	90.00


It will be a sort of bridge.
It will act as a bridge.
matches: 5
precision:	71.43
recall: 	62.50


it was raining all day and all night
it was raining all morning and it was raining all night
matches: 7
precision:	63.64
recall: 	87.50


it was raining all day and all night
raining raining raining raining raining raining raining raining raining raining
matches: 1
precision:	10.00
recall: 	12.50


I've seen a cat and a dog
I saw a cat and dog
matches: 4
precision:	66.67
recall: 	57.14


This time the fall in stocks on Wall Street is responsible for the drop.
This time, the reason for the collapse on Wall Street.
matches: 9
precision:	75.00
recall: 	60.00


The proper functioning of the market environment and the decrease in prices.
The proper functioning of th

In [None]:
rtxt = open("example.ref", "r")
htxt = open("example.hyp", "r")

for rline, hline in zip(rtxt, htxt):
    ref = separate_punctuation(rline.strip().split())
    hyp = separate_punctuation(hline.strip().split())
    
    cost = {}
    # Let's initialise our dictionary with two empty sequences denoted by the tuple (-1, -1) 
    # as the key and 0 as the distance (indexes have to be -1 because the index of the first 
    # word is 0, not 1). 
    cost[(-1, -1)] = 0

    for nh in range(len(hyp)):
        # The cost between an empty reference and a word at index nh
        position = (-1, nh)
        # nh plus one as we index from 0
        cost[position] = nh+1
    
    for nr in range(len(ref)):
        # The cost between an empty reference and a word at index nr
        position = (nr, -1)
        cost[position] = nr+1

    # Fill the [len(ref) x len(hyp) table]
    for r in range(len(ref)):
        for h in range(len(hyp)):
            position = (r, h)
            # Initialise the cost to infinity
            cost[position] = INFINITY
            del_position = (r-1, h)
            ins_position = (r, h-1)
            sub_position = (r-1, h-1)
            
            s = 0
            # If the words don't match at this point in the sequence, we have a substitution.
            if hyp[h] != ref[r]:
                s = 1
            # minimal cost at the current position
            # If the cost at (r-1, h-1) + the current words don't match...
            # Add the current s to the cost so far at (r-1, h-1). 
            if cost[sub_position] + s < cost[position]:
                cost[position] = cost[ins_position] + s
            # If the cost at (r, h-1) + 1 the current words don't match...
            # Add the current s to the cost so far at (r, h-1). 
            if cost[ins_position] + 1 < cost[position]:
                cost[position] = cost[ins_position] + 1
            # If the cost at (r-1, h) + 1 the current words don't match...
            # Add the current s to the cost so far at (r, h-1). 
            if cost[del_position] + 1 < cost[position]:
                cost[position] = cost[del_position] + 1
        
    # Taking the final edit distance
    
    # Taking into account special case of two empty sequences
    if len(ref) == 0 and len(hyp) == 0:
        edit_distance = 0
    else:
        # Get the lower rightmost cell of the table
        edit_distance = cost[(len(ref)-1, len(hyp)-1)]
    
    # Word error ratio is the percentage ratio between the edit distance and the number
    # of reference words.
    wer = 100*edit_distance/len(ref)
    
    # Some printing to show how we are getting on for each sentence pair. 
    print(rline.strip())
    print(hline.strip())
    print()
    sys.stdout.write("edit distance:\t%i\nWER:\t%.2f\n\n" % (edit_distance, wer))
                
            

It will be a sort of bridge.
It will sort of bridge be.

edit distance:	9
WER:	112.50

It will be considered as a sort of bridge.
It will sort of bridge be considered as.

edit distance:	13
WER:	130.00

It will be a sort of bridge.
It will act as a bridge.

edit distance:	10
WER:	125.00

it was raining all day and all night
it was raining all morning and it was raining all night

edit distance:	12
WER:	150.00

it was raining all day and all night
raining raining raining raining raining raining raining raining raining raining

edit distance:	8
WER:	100.00

I've seen a cat and a dog
I saw a cat and dog

edit distance:	9
WER:	128.57

This time the fall in stocks on Wall Street is responsible for the drop.
This time, the reason for the collapse on Wall Street.

edit distance:	19
WER:	126.67

The proper functioning of the market environment and the decrease in prices.
The proper functioning of the market and a price.

edit distance:	15
WER:	115.38

