# String Matching Evaluation

## Downloads and imports

In [None]:
# Install (or upgrade to the latest release of) the third-party packages
# imported further down: NLTK, gensim and jiwer.
# NOTE(review): `-U` pulls the newest gensim, but this notebook imports
# `gensim.matutils.softcossim` and calls `KeyedVectors.similarity_matrix`,
# which appear to exist only in gensim 3.x — confirm whether a pin is needed.
!pip install -U nltk gensim jiwer

In [None]:
# Copy the embeddings folder (presumably from a mounted Google Drive — verify)
# into the current working directory.
!cp -R '/content/drive/My Drive/TCC_data/embeddings/' .

# Extract every matching archive into a directory named after its embedding
# family; the loading cell below reads `word2vec/*.txt` and `wang2vec/*.txt`.
!unzip -d 'word2vec' "embeddings/word2vec_*.zip"
!unzip -d 'wang2vec' "embeddings/wang2vec_*.zip"

In [None]:
import io
import json
import os
import glob
import time
import wave
import warnings
from collections import OrderedDict
from itertools import chain
import re

import matplotlib.pyplot as plt
import nltk
from jiwer import wer
import numpy as np
import pandas as pd
import requests
from IPython import display
from nltk.translate import bleu_score, meteor_score
from tqdm import tqdm

from gensim import corpora
from gensim.matutils import softcossim
from gensim.models import KeyedVectors

In [None]:
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Enable the `progress_apply` methods used by the evaluation cells.
tqdm.pandas()

# Plot appearance: fivethirtyeight style, SVG inline figures.
plt.style.use('fivethirtyeight')
display.set_matplotlib_formats('svg')

# NLTK resources fetched up front: WordNet and the RSLP stemmer data.
for resource in ('wordnet', 'rslp'):
    nltk.download(resource)

## Aux

In [None]:
def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dict into a single-level dict.

    Keys of nested dicts are joined to their ancestors' keys with ``sep``.
    A non-dict ``d`` is wrapped as ``{parent_key: d}``; empty nested dicts
    contribute no entries.

    Args:
        d: The (possibly nested) dict to flatten, or any other value.
        parent_key: Prefix already accumulated for the keys of ``d``.
        sep: Separator placed between key segments.

    Returns:
        A new flat dict mapping joined keys to leaf values.
    """
    if not isinstance(d, dict):
        return {parent_key: d}

    flat = {}
    for key, value in d.items():
        # Only prepend the prefix (and separator) when there is one.
        path = sep.join((parent_key, key)) if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat


def flatten_columns(df, columns):
    """Expand dict-valued columns into one scalar column per flattened key.

    For every name in ``columns`` each cell is flattened with
    ``flatten_dict`` and a new column ``'<col>_<key>'`` (lower-cased) is
    created for every key seen in any row. Rows whose cell lacks a key get
    NaN in that column.

    Note that the helper and expanded columns are assigned onto ``df``
    itself; only the returned frame has the source columns removed.

    Args:
        df: DataFrame holding the dict-valued columns.
        columns: Iterable of column names to expand.

    Returns:
        The result of dropping each ``<col>`` and its temporary ``<col>_``
        helper column from ``df``.
    """
    for col in columns:
        flat_col = f'{col}_'
        df[flat_col] = df[f'{col}'].apply(flatten_dict)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # generated columns appear in a reproducible order (a set gives no
        # ordering guarantee between runs).
        keys = dict.fromkeys(chain.from_iterable(df[flat_col]))
        for key in keys:
            column_name = f'{col}_{key}'.lower()
            # np.nan rather than np.NaN: the capitalised alias was removed
            # in NumPy 2.0.
            df[column_name] = df[flat_col].apply(
                lambda cell: cell.get(key, np.nan)
            )

    cols_to_drop = [name for col in columns for name in (f'{col}', f'{col}_')]
    return df.drop(columns=cols_to_drop)

def clean_str(x):
    """Normalise a sentence for metric computation.

    Every character that is not a letter, digit or underscore is replaced
    by a single space (one space per character, so runs are not collapsed),
    and the result is lower-cased.

    Args:
        x: The text to normalise.

    Returns:
        The normalised string.
    """
    # Raw string: a bare backslash-W in a normal literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\W', ' ', x).lower()

## Metrics

In [None]:
# Two example sentences sharing most of their words in a different order;
# the cells below feed them to each metric as a quick sanity check.
sent_1 = 'o rato roeu a roupa do rei de atena'
sent_2 = 'de roma o rato roeu a roupa do rei'

### Cosine Similarity

In [None]:
# Embedding models keyed as '<family>_<architecture>_s50', each loaded from
# the matching '<family>/<architecture>_s50.txt' word2vec-format text file.
emb_models = {
    f'{family}_{arch}_s50': KeyedVectors.load_word2vec_format(f'{family}/{arch}_s50.txt')
    for family in ('wang2vec', 'word2vec')
    for arch in ('cbow', 'skip')
}

In [None]:
def cosine_similarity(reference, hypothesis, model):
    """Soft cosine similarity between two sentences under one embedding model.

    Both sentences are split on whitespace, a gensim dictionary is built
    from the two token lists, and the term similarity matrix of
    ``emb_models[model]`` over that dictionary is passed to ``softcossim``
    together with the two bag-of-words vectors. The similarity matrix is
    rebuilt on every call.

    Args:
        reference: Ground-truth sentence.
        hypothesis: Sentence being evaluated.
        model: Key into the module-level ``emb_models`` dict.

    Returns:
        The value returned by ``softcossim``.
    """
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()

    # The dictionary sees the hypothesis first, then the reference.
    dictionary = corpora.Dictionary([hyp_tokens, ref_tokens])
    term_similarities = emb_models[model].similarity_matrix(dictionary)

    hyp_bow = dictionary.doc2bow(hyp_tokens)
    ref_bow = dictionary.doc2bow(ref_tokens)
    return softcossim(hyp_bow, ref_bow, term_similarities)

In [None]:
# Sanity check: print the similarity of the two example sentences under each
# loaded embedding model.
for model in emb_models:
    print(model, cosine_similarity(sent_1, sent_2, model))

### BLEU

In [None]:
def bleu(reference, hypothesis):
    """Sentence-level BLEU with n-gram weights adapted to the reference length.

    Both sentences are split on whitespace. References of one, two or three
    tokens use weights that put no mass on n-gram orders longer than the
    reference; any other length uses (0.4, 0.3, 0.2, 0.1).

    Args:
        reference: Ground-truth sentence.
        hypothesis: Sentence being evaluated.

    Returns:
        The score returned by ``bleu_score.sentence_bleu``.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()

    short_reference_weights = {
        1: (1.0, 0.0, 0.0, 0.0),
        2: (0.5, 0.5, 0.0, 0.0),
        3: (0.4, 0.3, 0.3, 0.0),
    }
    weights = short_reference_weights.get(
        len(reference_tokens), (0.4, 0.3, 0.2, 0.1)
    )

    return bleu_score.sentence_bleu(
        [reference_tokens], hypothesis_tokens, weights=weights
    )

In [None]:
# Sanity check: BLEU of the two example sentences.
bleu(sent_1, sent_2)

### METEOR

In [None]:
# RSLP stemmer instance shared by every METEOR call.
pt_stemmer = nltk.stem.RSLPStemmer()


def meteor(reference, hypothesis):
    """METEOR score of ``hypothesis`` against a single ``reference``.

    The reference is wrapped in a one-element list and both sentences are
    handed to NLTK unchanged, with ``pt_stemmer`` as the stemmer.

    Args:
        reference: Ground-truth sentence.
        hypothesis: Sentence being evaluated.

    Returns:
        The score returned by ``meteor_score.meteor_score``.
    """
    # NOTE(review): the sentences are passed as plain strings; newer NLTK
    # releases appear to require pre-tokenized input here — confirm against
    # the installed version.
    return meteor_score.meteor_score([reference], hypothesis, stemmer=pt_stemmer)

In [None]:
# Sanity check: METEOR of the two example sentences.
meteor(sent_1, sent_2)

### WER

In [None]:
def word_error_rate(reference, hypothesis):
    """Word error rate of ``hypothesis`` against ``reference``.

    Thin wrapper that gives jiwer's ``wer`` the same
    ``(reference, hypothesis)`` signature as the other metrics here.

    Args:
        reference: Ground-truth sentence.
        hypothesis: Sentence being evaluated.

    Returns:
        The value returned by ``wer``.
    """
    error_rate = wer(reference, hypothesis)
    return error_rate

In [None]:
# Sanity check: word error rate of the two example sentences.
word_error_rate(sent_1, sent_2)

### Jaccard distance

In [None]:
def jaccard_distance(reference, hypothesis):
    """Jaccard distance between the word sets of two sentences.

    Each sentence is split on whitespace and reduced to its set of distinct
    tokens, so word order and repetition are ignored.

    Args:
        reference: Ground-truth sentence.
        hypothesis: Sentence being evaluated.

    Returns:
        The value returned by ``nltk.jaccard_distance`` for the two sets.
    """
    reference_vocab, hypothesis_vocab = (
        set(sentence.split()) for sentence in (reference, hypothesis)
    )
    return nltk.jaccard_distance(reference_vocab, hypothesis_vocab)

In [None]:
# Sanity check: Jaccard distance of the two example sentences.
jaccard_distance(sent_1, sent_2)

### Eval metrics

In [None]:
def replace_oov(text):
    """Substitute every token of ``text`` using the ``oov_words`` table.

    The text is split on whitespace and each token ``w`` is replaced by
    ``oov_words[w][0][1]``; the replacements are joined with single spaces.

    Args:
        text: Whitespace-separated sentence.

    Returns:
        The sentence rebuilt from the replacement tokens.
    """
    # NOTE(review): reads the module-level `oov_words`, which must be loaded
    # before this is called, and raises KeyError for any token missing from
    # it. Presumably each entry is a list of (score, word) pairs with the
    # best candidate first — verify against the JSON file.
    return ' '.join(oov_words[token][0][1] for token in text.split())

def eval_metrics(reference, hypotesis, oov=False):
    """Compute every metric in this notebook for one sentence pair.

    Both sentences are normalised with ``clean_str`` first. When ``oov`` is
    true, the hypothesis (only) is additionally passed through
    ``replace_oov``.

    Args:
        reference: Ground-truth sentence.
        hypotesis: Sentence being evaluated.
        oov: Whether to apply ``replace_oov`` to the hypothesis.

    Returns:
        An ``OrderedDict`` sorted by key, with one soft-cosine entry per
        model in ``emb_models`` plus 'bleu', 'meteor', 'wer' and
        'jaccard_distance'.
    """
    reference = clean_str(reference)
    hypotesis = clean_str(hypotesis)
    if oov:
        hypotesis = replace_oov(hypotesis)

    scores = {
        model: cosine_similarity(reference, hypotesis, model)
        for model in emb_models
    }
    scores.update({
        'bleu': bleu(reference, hypotesis),
        'meteor': meteor(reference, hypotesis),
        'wer': word_error_rate(reference, hypotesis),
        'jaccard_distance': jaccard_distance(reference, hypotesis),
    })

    return OrderedDict(sorted(scores.items()))

## Evaluating transcriptions

In [None]:
# Corpus whose transcriptions are evaluated; it selects the TSV read here and
# the files written by the cells below.
corpus = 'voxforge'
file_path = '/content/drive/My Drive/TCC_data/metrics/data/evaluate_metrics_{}.tsv'.format(corpus)

# Read the tab-separated table and discard rows with any missing value.
transcribed_df = pd.read_csv(file_path, sep='\t').dropna()
print(transcribed_df.shape)
transcribed_df.head()

In [None]:
# Score every (sentence, translation) pair, showing a progress bar, and store
# the resulting metric dict in a temporary 'eval' column.
transcribed_df['eval'] = transcribed_df[['sentence', 'translation']].progress_apply(
    lambda row: eval_metrics(row['sentence'], row['translation']), axis=1
)
# Expand the dicts into one 'eval_<metric>' column each and drop 'eval'.
transcribed_df = flatten_columns(transcribed_df, ['eval'])
# NOTE(review): this is the same path the table was read from, so the input
# TSV is overwritten with the added metric columns — confirm that is intended.
file_path = '/content/drive/My Drive/TCC_data/metrics/data/evaluate_metrics_{}.tsv'.format(corpus)
transcribed_df.to_csv(file_path, sep='\t', index=False)

In [None]:
# Load the OOV replacement table for this corpus; `replace_oov` reads it as a
# module-level global. JSON text is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default.
with open('/content/drive/My Drive/TCC_data/embeddings/oov_{}.json'.format(corpus), encoding='utf-8') as f:
    oov_words = json.load(f)

# Re-score every row with oov=True, i.e. with the transcription passed
# through `replace_oov` before the metrics are computed.
transcribed_df['eval'] = transcribed_df[['sentence', 'translation']].progress_apply(
    lambda row: eval_metrics(row['sentence'], row['translation'], oov=True), axis=1
)
# Expand the metric dicts into 'eval_<metric>' columns; any such columns left
# from an earlier scoring pass are overwritten by the assignment.
transcribed_df = flatten_columns(transcribed_df, ['eval'])
# Written to a separate '_oov' file alongside the regular results.
file_path = '/content/drive/My Drive/TCC_data/metrics/data/evaluate_metrics_{}_oov.tsv'.format(corpus)
transcribed_df.to_csv(file_path, sep='\t', index=False)