# Evaluate vocabulary

## Downloads and imports

In [None]:
# Install (or upgrade to the latest release of) gensim and pydub, which the
# import cell below needs.
!pip install -U gensim pydub

In [None]:
# Copy the corpus and embeddings folders into the working directory
# (the source path is presumably a mounted Google Drive -- verify the mount).
!cp -R '/content/drive/My Drive/TCC_data/corpus/' .
!cp -R '/content/drive/My Drive/TCC_data/embeddings/' .

# Extract the word2vec archives into ./word2vec and the corpus archives into
# the current directory.
!unzip -d 'word2vec' "embeddings/word2vec_*.zip"
!unzip 'corpus/*.zip'

In [None]:
import warnings
from IPython.display import display

import pandas as pd
import re
from termcolor import colored

from collections import Counter
import editdistance
from tqdm import tqdm
import json

from pydub import AudioSegment
from gensim.models import KeyedVectors

In [None]:
# Silence every Python warning from this point on in the session.
warnings.filterwarnings('ignore')

## Auxiliary functions

In [None]:
def play_audio_file(file_path):
    """Load the audio file at ``file_path`` and return it as an AudioSegment.

    The file is opened in binary mode and closed again before returning; the
    caller is responsible for displaying/playing the returned segment.
    """
    with open(file_path, 'rb') as audio_stream:
        segment = AudioSegment.from_file(audio_stream)
        return segment

def flatten_list(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the relative order of the items is
    preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def clean_str(x):
    """Return ``x`` lower-cased with every non-word character turned into a space.

    Each character matched by the regex class ``\\W`` is replaced by exactly
    one space, so runs of punctuation become runs of spaces; callers are
    expected to ``split()`` or ``strip()`` the result.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\W', ' ', x).lower()

def missing_count(c, m):
    """Print every word of ``m`` with its count in ``c``, highest count first.

    ``c`` maps words to counts and ``m`` is an iterable of words. Lines have
    the form ``word: count``; ties on the count are ordered by the word in
    descending order.
    """
    ranked = sorted(((c[word], word) for word in m), reverse=True)
    for count, word in ranked:
        print('{}: {}'.format(word, count))

## Load vocabulary and data

In [None]:
# Load the CBOW word2vec vectors from the (text) word2vec-format file.
word2vec_cbow_s50 = KeyedVectors.load_word2vec_format('word2vec/cbow_s50.txt')

# Collect the embedding vocabulary as a set of words.
# gensim 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; fall
# back to `.vocab` so the notebook also runs on gensim 3.x.
try:
    emb_vocabulary_cbow = set(word2vec_cbow_s50.key_to_index)
except AttributeError:
    emb_vocabulary_cbow = set(word2vec_cbow_s50.vocab)

In [None]:
# Name of the corpus whose evaluation metrics are analysed in this notebook.
corpus = 'voxforge'

# Read the tab-separated metrics file for that corpus.
metrics_path = '/content/drive/My Drive/TCC_data/metrics/data/evaluate_metrics_{}.tsv'.format(corpus)
df = pd.read_csv(metrics_path, sep='\t')

# Quick sanity check: shape and first rows.
print(df.shape)
df.head()

## Missing vocabulary

In [None]:
# Drop, in place, every row that has a missing value in any column. The
# surviving rows keep their original index labels, so the index may have gaps.
df.dropna(inplace=True)

In [None]:
# Tokenise both text columns: clean each string and split it on whitespace,
# producing one list of words per row.
voc_sentence = [clean_str(text).split() for text in df['sentence']]
voc_translation = [clean_str(text).split() for text in df['translation']]

In [None]:
# Word frequencies over all sentences and over all transcriptions.
voc_sentence_counter = Counter(flatten_list(voc_sentence))

voc_translation_counter = Counter(flatten_list(voc_translation))

In [None]:
def missing_vocab(emb_vocabulary):
    """Print a vocabulary coverage report and return the uncovered words.

    Compares the words of the module-level ``voc_sentence_counter`` and
    ``voc_translation_counter`` against ``emb_vocabulary`` (a set of words).

    Returns a tuple ``(sentence_words_missing, translation_words_missing)``
    holding the words absent from ``emb_vocabulary``.
    """
    sentence_words = set(voc_sentence_counter.keys())
    translation_words = set(voc_translation_counter.keys())
    dataset_words = sentence_words | translation_words

    # Words the embedding vocabulary does not cover.
    oov_sentence = sentence_words - emb_vocabulary
    oov_translation = translation_words - emb_vocabulary
    oov_dataset = dataset_words - emb_vocabulary

    # Words that appear on only one side of the dataset.
    only_in_sentences = sentence_words - translation_words
    only_in_translations = translation_words - sentence_words

    print('Dataset vocabulary: {} ({} missing)'.format(len(dataset_words), len(oov_dataset)))
    print('Sentences vocabulary: {} ({} missing)'.format(len(sentence_words), len(oov_sentence)))
    print('Transcription vocabulary: {} ({} missing)'.format(len(translation_words), len(oov_translation)))
    print('Sentences - Transcription difference: {}'.format(len(only_in_sentences)))
    print('Transcription - Sentences difference: {}'.format(len(only_in_translations)))

    return oov_sentence, oov_translation

In [None]:
# Print the coverage report against the CBOW embedding vocabulary and keep the
# sets of sentence / transcription words that it does not contain.
voc_miss_sentence_cbow, voc_miss_translation_cbow = missing_vocab(emb_vocabulary_cbow)

In [None]:
# Sentence words absent from the CBOW vocabulary, most frequent first.
missing_count(voc_sentence_counter, voc_miss_sentence_cbow)

In [None]:
# Transcription words absent from the CBOW vocabulary, most frequent first.
missing_count(voc_translation_counter, voc_miss_translation_cbow)

## Mismatched transcriptions

In [None]:
# One list of cleaned, whitespace-split words per row, for each text column.
sentences = [clean_str(text).split() for text in df['sentence'].tolist()]
translations = [clean_str(text).split() for text in df['translation'].tolist()]

In [None]:
# Reduce every row to its set of distinct words (order and repeats are dropped).
sentences = list(map(set, sentences))
translations = list(map(set, translations))

In [None]:
# For every row, record the words found on only one side. Rows whose two word
# sets are identical get a [None, None] placeholder so `misses` keeps one
# entry per row.
misses = []
for sentence_words, translation_words in zip(sentences, translations):
    only_in_sentence = list(sentence_words - translation_words)
    only_in_translation = list(translation_words - sentence_words)
    if not (only_in_sentence or only_in_translation):
        misses.append([None, None])
        continue
    misses.append([only_in_sentence, only_in_translation])

In [None]:
# `misses` holds one entry per row of `df`, in positional order. `df` keeps its
# original index labels after the earlier `dropna`, so the new frame must reuse
# `df.index`: with a default 0..n-1 index the index-based merge below would
# pair rows with the wrong labels (or lose them) whenever rows were dropped.
df_miss_transcriptions = pd.DataFrame(
    misses,
    columns=['missing sentence', 'missing transcription'],
    index=df.index,
)
df_miss_transcriptions = df.merge(df_miss_transcriptions, left_index=True, right_index=True)
# Rows whose sentence and transcription use the same words carry None in both
# new columns; drop them so only the mismatching rows remain.
df_miss_transcriptions.dropna(inplace=True)

In [None]:
# Display the rows whose sentence and transcription word sets differ.
df_miss_transcriptions

In [None]:
# Per-row lists of the words found on only one side, as plain Python lists.
missing_sentence = df_miss_transcriptions['missing sentence'].tolist()
missing_transcription = df_miss_transcriptions['missing transcription'].tolist()

In [None]:
# How often each word shows up as a one-sided word across all rows.
missing_sentence_counter = Counter(flatten_list(missing_sentence))

missing_transcription_counter = Counter(flatten_list(missing_transcription))

In [None]:
# Show a random sample of mismatching rows: the audio, then the sentence with
# its one-sided words in red and the transcription with its one-sided words in
# yellow.
# Cap the sample size: DataFrame.sample raises when asked for more rows than
# the frame holds.
sample_size = min(15, len(df_miss_transcriptions))
for _, r in df_miss_transcriptions.sample(sample_size).iterrows():
    missing_s = r['missing sentence']
    missing_t = r['missing transcription']
    # A raw token can clean into several words (e.g. around a hyphen or an
    # apostrophe); highlight it when any of those words is listed as missing.
    s = [
        colored(w, 'red') if any(p in missing_s for p in clean_str(w).split()) else w
        for w in r['sentence'].split()
    ]
    t = [
        colored(w, 'yellow') if any(p in missing_t for p in clean_str(w).split()) else w
        for w in r['translation'].split()
    ]
    s = ' '.join(s)
    t = ' '.join(t)
    display(play_audio_file(r['file']))
    print(s)
    print(t)
    print('*' * 80)

In [None]:
# One-sided sentence words, most frequent first.
ranked_sentence_words = sorted(
    missing_sentence_counter.items(),
    key=lambda word_count: word_count[1],
    reverse=True,
)
for word, count in ranked_sentence_words:
    print('{}: {}'.format(word, count))

In [None]:
# One-sided transcription words, most frequent first.
ranked_transcription_words = sorted(
    missing_transcription_counter.items(),
    key=lambda word_count: word_count[1],
    reverse=True,
)
for word, count in ranked_transcription_words:
    print('{}: {}'.format(word, count))

## Fix missing OOV

In [None]:
# List versions of the transcription vocabulary and the embedding vocabulary,
# used for iteration below.
vocab_translation_list = list(set(voc_translation_counter.keys()))
vocab_list_cbow = list(emb_vocabulary_cbow)

In [None]:
import heapq  # standard library; used only for the top-5 selection below

# For every transcription word, store candidate embedding words as
# (edit distance, word) pairs:
#   - words already in the embedding vocabulary map to themselves at distance 0;
#   - other words map to the 5 embedding words with the smallest edit distance
#     (ties broken by the word itself, as tuples compare element-wise).
oov_match = dict()
for w in tqdm(vocab_translation_list):
    if w in emb_vocabulary_cbow:
        oov_match[w] = [(0, w)]
        continue
    scored = (
        (editdistance.eval(candidate, w), candidate)
        for candidate in vocab_list_cbow
    )
    # nsmallest(5, it) returns the same pairs as sorted(it)[:5] without
    # sorting the whole embedding vocabulary for each OOV word.
    oov_match[w] = heapq.nsmallest(5, scored)

In [None]:
# Display the candidate matches collected for each transcription word.
oov_match

In [None]:
# Persist the word -> candidate-matches mapping as JSON, one file per corpus.
oov_path = '/content/drive/My Drive/TCC_data/embeddings/oov_{}.json'.format(corpus)
with open(oov_path, 'w') as out_file:
    json.dump(oov_match, out_file)