In [188]:
import string
import operator 
import functools 
import numpy as np
import pandas as pd

from tabulate import tabulate
from collections import Counter
from IPython.display import display, Math, Latex, Markdown

In [224]:
# Example sentences reused by the worked examples throughout the notebook.
ref_1 = 'The cat sat on the mat.'
cand_1 = 'The cat is on the mat.'
ref_2 = 'There is a cat on the mat.'
cand_2 = 'The the the the the the the the.'

punctuation_list = string.punctuation


def preprocess(x):
    """Lower-case ``x`` and delete every character found in ``string.punctuation``."""
    strip_table = str.maketrans('', '', string.punctuation)
    return x.lower().translate(strip_table)

def extract(sentence):
    """Return the 1-, 2-, 3- and 4-gram lists of ``sentence`` as a 4-tuple.

    The sentence is normalised with ``preprocess`` and split on whitespace.
    Each n-gram is the space-joined run of ``n`` consecutive tokens, listed in
    order of appearance; an order with more tokens than the sentence has
    yields an empty list.
    """
    tokens = preprocess(sentence).split()

    def windows(width):
        # Every run of `width` consecutive tokens, joined back into one string.
        return [' '.join(tokens[start:start + width])
                for start in range(len(tokens) - width + 1)]

    return tokens, windows(2), windows(3), windows(4)

## N-gram Evaluation

### Example 

Reference

`{{ref_1}}` 

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(ref_1)}}` 
     
$\xrightarrow[\text{}]{\text{Extract 1-gram}} $ `{{extract(ref_1)[0]}}`
     
$\xrightarrow[\text{}]{\text{Extract 2-gram}} $ `{{extract(ref_1)[1]}}` 

$\xrightarrow[\text{}]{\text{Extract 3-gram}} $ `{{extract(ref_1)[2]}}` 

Candidate

`{{cand_1}}`

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(cand_1)}}` 
     
$\xrightarrow[\text{}]{\text{Extract 1-gram}} $ `{{extract(cand_1)[0]}}` 
     
$\xrightarrow[\text{}]{\text{Extract 2-gram}} $ `{{extract(cand_1)[1]}}` 

$\xrightarrow[\text{}]{\text{Extract 3-gram}} $ `{{extract(cand_1)[2]}}` 

## Considering Recall %

### Modified Precision - Clipping

### Example 

Candidate

`{{cand_2}}` 

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(cand_2)}}` 
     
$\xrightarrow[\text{}]{\text{Extract 1-gram}} $ `{{extract(cand_2)[0]}}`
     
$\xrightarrow[\text{}]{\text{Extract 2-gram}} $ `{{extract(cand_2)[1]}}` 

$\xrightarrow[\text{}]{\text{Extract 3-gram}} $ `{{extract(cand_2)[2][:2] + ['...']}}` 

## [BLEU - Bilingual Evaluation Understudy](https://www.aclweb.org/anthology/P02-1040.pdf)

### BLEU_n Formula

$
\begin{align}
    \quad
        BLEU = BP \cdot \exp\left(\sum_{n=1}^{N} w_n \log P_n\right) \cr
\end{align}
$
    
$
\begin{align}
    \quad
        BP \quad\,\ = \begin{cases} 
                        1         &, \ c > r   \cr
                        \exp(1-\frac{r}{c}) &, \ c \leq r   \cr
                      \end{cases} 
\end{align}
$

In [3]:
def BLEU_n(candidate, reference):
    """Score ``candidate`` against a single ``reference`` sentence with BLEU.

    Both sentences are expanded by ``extract`` into their n-gram lists. The
    score is the brevity penalty times the exponential of the uniformly
    weighted sum of log n-gram precisions, one term per n-gram order that
    ``extract`` returns.

    Parameters
    ----------
    candidate, reference : str
        Raw sentences; preprocessing happens inside ``extract``.

    Returns
    -------
    float
        ``BP * exp(sum_n W_n * log(P_n))``.
    """
    candidate = extract(candidate)
    reference = extract(reference)
    # Take the brevity penalty from the untouched unigram lists *before* the
    # precision loop: P_n may consume entries of the reference list it is
    # handed, which would otherwise shrink the reference length used by BP.
    brevity_penalty = BP(candidate[0], reference[0])
    W_n = 1. / len(candidate)  # uniform weight over the n-gram orders
    log_precision = 0
    for cand, ref in zip(candidate, reference):
        log_precision += W_n * np.log(P_n(cand, ref))
    return np.exp(log_precision) * brevity_penalty

def P_n(cand, ref):
    """Clipped (modified) n-gram precision of ``cand`` with respect to ``ref``.

    Each candidate n-gram is credited at most as many times as it occurs in
    the reference, and the total is divided by the number of candidate
    n-grams. Neither input list is modified.

    Parameters
    ----------
    cand, ref : list of str
        N-gram lists of the candidate and the reference.

    Returns
    -------
    int or float
        ``matches / len(cand)``, or ``1`` when nothing matches (this also
        covers an empty ``cand``).

    NOTE(review): returning 1 for zero matches keeps ``log(P_n)`` finite for
    callers, but it means a candidate with no overlap at this order is not
    penalised -- confirm this convention is intended.
    """
    # Multiset intersection keeps min(count in cand, count in ref) per n-gram,
    # which is exactly the clipping rule, without removing items from `ref`.
    matched = sum((Counter(cand) & Counter(ref)).values())
    return 1 if matched == 0 else matched / len(cand)
    
def BP(candidate, reference):
    """Brevity penalty for a candidate/reference pair of token lists.

    Parameters
    ----------
    candidate, reference : sequence
        Only their lengths ``c`` and ``r`` are used.

    Returns
    -------
    int or float
        ``1`` when the candidate is longer than the reference,
        ``exp(1 - r / c)`` otherwise, and ``0.0`` for an empty candidate.
    """
    c, r = len(candidate), len(reference)
    if c == 0:
        # r / c is undefined for an empty candidate; exp(1 - r/c) tends to 0
        # as c shrinks, so score it as 0 instead of raising ZeroDivisionError.
        return 0.0
    return 1 if c > r else np.exp(1 - r / c)

#BLEU_n(cand_1, ref_1)

## [ROUGE - Recall-Oriented Understudy for Gisting Evaluation](https://www.aclweb.org/anthology/W04-1013.pdf)

### Rouge-N Formula

$
\begin{align}
    \quad
        \text{Rouge-N}\; = \frac{\sum\limits_{S \in \{\textit{ReferenceSummaries}\}} 
                                 \sum\limits_{gram_n \in S} Count_{match}(gram_n)}
                                {\sum\limits_{S \in \{\textit{ReferenceSummaries}\}} 
                                 \sum\limits_{gram_n \in S} Count(gram_n)}
\end{align}
$
    

In [4]:
# Sentence pair for the Rouge-N example; the candidate is the reference with
# the extra word "found".
ref_3, cand_3 = (
    'The cat was under the bed.',
    'The cat was found under the bed.',
)

def Rouge_n(candidate, reference, n=1):
    """ROUGE-N recall of ``candidate`` against a single ``reference``.

    Parameters
    ----------
    candidate, reference : str
        Raw sentences; n-grams are produced by ``extract``.
    n : int, default 1
        N-gram order; selects entry ``n - 1`` of what ``extract`` returns.

    Returns
    -------
    float
        Number of reference n-grams matched by the candidate divided by the
        number of reference n-grams; ``0.0`` when the reference has no
        n-grams of this order.
    """
    cand, ref = extract(candidate)[n-1], extract(reference)[n-1]
    if not ref:
        # No reference n-grams of this order: nothing can be recalled.
        return 0.0
    # Clip each n-gram to its count in the reference so a candidate that
    # repeats a reference word cannot push the recall above 1.
    overlap = sum((Counter(cand) & Counter(ref)).values())
    return overlap / len(ref)

#Rouge_n(cand_3, ref_3)

### LCS(Longest Common Subsequence)

#### Example 

Reference

`{{ref_1}}` 

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(ref_1)}}` 

Candidate

`{{cand_1}}`

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(cand_1)}}` 

### Rouge-L Formula

$
\begin{align}
    \cr
    \quad
        R_{lcs} = \frac{LCS(\textit{Reference}, \textit{Candidate})}{m}, 
                        \; m \;\text{for }\textit{Reference} \text{ length} \cr              
        P_{lcs} = \frac{LCS(\textit{Reference}, \textit{Candidate})}{n}, 
                        \; n \;\text{for }\textit{Candidate} \text{ length} \cr                      
\end{align}
$

$
\begin{align}
    \quad \;
        F_{lcs} = \frac{(1+\beta^2)R_{lcs}P_{lcs}}{R_{lcs} + \beta^2P_{lcs}}  
\end{align}
$

In [5]:
def Rouge_l(candidate, reference, beta=1.2):
    """ROUGE-L F-measure from the longest common subsequence of unigrams.

    Parameters
    ----------
    candidate, reference : str
        Raw sentences; unigrams are taken from ``extract``.
    beta : float, default 1.2
        Weight of recall relative to precision in the F-measure.

    Returns
    -------
    float
        ``(1 + beta^2) * R * P / (R + beta^2 * P)`` with ``R = LCS / len(ref)``
        and ``P = LCS / len(cand)``; ``0.0`` when the sentences share no word.
    """
    cand, ref = extract(candidate)[0], extract(reference)[0]
    lcs = LCS(cand, ref)
    if lcs == 0:
        # No common word (this includes either sentence being empty): recall
        # and precision are both zero and the F formula would divide by zero.
        return 0.0
    r_lcs, p_lcs = lcs / len(ref), lcs / len(cand)
    return ((1 + beta**2)*r_lcs*p_lcs) / (r_lcs + beta**2*p_lcs)
    
def LCS(cand, ref):
    """Return the length of the longest common subsequence of two sequences.

    Elements are compared with ``==``; the result is a plain ``int``.
    """
    rows, cols = len(cand) + 1, len(ref) + 1
    # lengths[i, j] holds the LCS length of cand[:i] and ref[:j]; row 0 and
    # column 0 stand for the empty prefix and stay zero.
    lengths = np.zeros(shape=(rows, cols))
    for i, cand_item in enumerate(cand, start=1):
        for j, ref_item in enumerate(ref, start=1):
            if cand_item == ref_item:
                lengths[i, j] = lengths[i - 1, j - 1] + 1
            else:
                lengths[i, j] = max(lengths[i, j - 1], lengths[i - 1, j])
    return int(lengths[-1, -1])

#Rouge_l(cand_1, ref_1)

### Rouge-W
   
   - WLCS
       - Weighted LCS-based statistics that favor consecutive LCSes.

### Rouge-S

   - Skip-gram
       - Skip-bigram based co-occurrence statistics. 
       - Skip-bigram is any pair of words in their sentence order.
       
### Rouge-SU

   - Skip-bigram plus unigram-based co-occurrence statistics.

## [CIDEr -  Consensus-based Image Description Evaluation](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Vedantam_CIDEr_Consensus-Based_Image_2015_CVPR_paper.pdf)

### TF-IDF

   - Term Frequency
       
       $ 
       \begin{align}
           \quad 
           \textit{TF}(x)
           \;\text{is the number of times term } x \text{ occurs in the document}
           \cr
       \end{align}
       $
       
       
   - Inverse Document Frequency
       
       $ 
       \begin{align}
           \quad 
           \textit{IDF}(x) = \log \frac{N + 1}{N(x) + 1} + 1
           ,\; \text{where } N \text{ is the total number of documents and } N(x) \text{ is the number of documents that contain term } x
           \cr
       \end{align}
       $
       
       
   - TF-IDF
   
       $
       \begin{align}
           \quad
           \textit{TF-IDF}(x) = \textit{TF}(x)\;\times\;\textit{IDF}(x) \cr 
       \end{align}
       $

In [272]:
def gen_matrix(documents):
    """Build a document-term count matrix.

    Parameters
    ----------
    documents : list of list of str
        One list of terms per document.

    Returns
    -------
    tuple
        ``(vocabulary, matrix)``. ``vocabulary`` maps every distinct term to
        0, in order of first appearance across the documents. ``matrix`` has
        one row per document with that document's term counts in vocabulary
        order.
    """
    vocabulary = {term: 0 for doc in documents for term in doc}
    matrix = []
    for doc in documents:
        counts = Counter(doc)
        matrix.append([counts[term] for term in vocabulary])
    return vocabulary, matrix

def gen_table(d, matrix):
    """Render a document-term matrix as a Markdown table.

    Parameters
    ----------
    d : dict
        Vocabulary whose keys label the term columns.
    matrix : list of list
        Count rows; row 0 is the candidate, the others are references.

    Returns
    -------
    IPython.display.Markdown
        Table with one row per document ('Candidate', 'Reference_1', ...) and
        a trailing 'Cos_Sim' column holding each row's cosine similarity to
        the candidate row.
    """
    # Positional column number -> header label, with the extra similarity column last.
    col_labels = dict(enumerate(list(d.keys()) + ['Cos_Sim']))
    row_labels = ['Candidate']
    row_labels.extend('Reference_{}'.format(k) for k in range(1, len(matrix)))
    rows = [row + [cosine_similiraty(matrix[0], row)] for row in matrix]
    df = pd.DataFrame(rows).rename(columns=col_labels).reset_index(drop=True)
    df.index = row_labels
    return Markdown(df.to_markdown())

def cosine_similiraty(cand, ref):
    """Cosine of the angle between two equal-length numeric vectors.

    Computed as the dot product divided by the product of the Euclidean norms.
    """
    cand_norm = np.sqrt(np.sum(np.power(cand, 2)))
    ref_norm = np.sqrt(np.sum(np.power(ref, 2)))
    return np.dot(cand, ref) / (cand_norm * ref_norm)

# One document-term table per n-gram order (index 0..3 of what `extract`
# returns), each comparing cand_1 with ref_1 and ref_2.
table = []
for i in range(0, 4):
    documents_i = [extract(sentence)[i] for sentence in (cand_1, ref_1, ref_2)]
    table.append(gen_table(*gen_matrix(documents_i)))

### Example 

Candidate

`{{cand_1}}`

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(cand_1)}}`     
$\xrightarrow[\text{}]{\text{Extract 2-gram}} $ `{{extract(cand_1)[1]}}` 

Reference_1

`{{ref_1}}` 

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(ref_1)}}` 
$\xrightarrow[\text{}]{\text{Extract 2-gram}}$ `{{extract(ref_1)[1]}}` 

Reference_2

`{{ref_2}}` 

$\xrightarrow[\text{}]{\text{ Preprocessing }}$ `{{preprocess(ref_2)}}` 
$\xrightarrow[\text{}]{\text{Extract 2-gram}}$ `{{extract(ref_2)[1]}}`

#### Doc-Term Matrix

`{{table[0]}}`

`{{table[1]}}`

### Cosine Similarity

   - The cosine of two non-zero vectors can be derived by using the Euclidean dot product formula
       
       $
       \begin{align}
           \quad
           A \cdot B = \lVert A \rVert \lVert B \rVert \cos{\theta}
           \cr
       \end{align}
       $
       
       
   - Similarity
       
       $
       \begin{align}
           \quad
           \textit{Similarity } = 
           \cos{(\theta)} = 
           \frac{A \cdot B}
                {\lVert A \rVert \lVert B \rVert} = 
           \frac{\sum\limits_{i=1}^{N}A_i B_i}
                {\sqrt{\sum\limits_{i=1}^{N}A_i^2} \sqrt{\sum\limits_{i=1}^{N}B_i^2}}, 
           \text{ where } A_i \text{ and } B_i \text{ are the components of vectors } A \text{ and } B \text{ respectively.}
       \end{align}
       $

### CIDEr_n Formula


$
\begin{align}
\cr \quad
\textit{CIDEr}_n(\textit{candidate}, 
                 \textit{references}) = 
    \frac{1}{M}\sum\limits_{i=1}^{M}
    \frac{ g^n(\textit{candidate}) \cdot g^n(\textit{reference}_i) }
         {\lVert g^n(\textit{candidate}) \rVert \times \lVert g^n(\textit{reference}_i) \rVert},
\text{ where } M \text{ is the number of reference sentences and } g^n(x) \text{ is the vector of } \textit{TF-IDF} \text{ weights of the n-grams in sentence } x \text{.}
\end{align}
$

In [None]:
    # Tabulate the term counts of `doc` as a two-column (Term, Count) frame and
    # display it through the Styler with the index hidden.
    # NOTE(review): `doc` is not defined in this cell and these lines are
    # indented as if lifted from a function body -- confirm where this
    # fragment belongs, or delete the cell.
    # NOTE(review): `Styler.hide_index()` appears to be deprecated in recent
    # pandas in favour of `Styler.hide(axis="index")` -- verify against the
    # installed version.
    df = pd.DataFrame.from_dict(Counter(doc), orient='index') \
                .reset_index().rename(columns={'index':'Term', 0:'Count'})
    display(df.style.hide_index())