In [None]:
# Upgrade gensim to the newest release available (no version is pinned here).
!pip install -U gensim

In [None]:
# Copy the embeddings directory from the Drive path into the current directory.
# NOTE(review): presumably requires Google Drive to be mounted at /content/drive — confirm.
!cp -R '/content/drive/My Drive/TCC_data/embeddings/' .

# Extract every archive matching embeddings/word2vec_*.zip into ./word2vec.
# The pattern is quoted so the shell leaves it unexpanded and unzip matches it itself.
!unzip -d 'word2vec' "embeddings/word2vec_*.zip"

In [None]:
import pandas as pd
import re
from collections import defaultdict, Counter

from gensim.models import KeyedVectors

In [None]:
# Load the CBOW word2vec vectors from the extracted text-format file.
word2vec_cbow_s50 = KeyedVectors.load_word2vec_format('word2vec/cbow_s50.txt')

# Build the set of words known to the embedding model.
# gensim >= 4.0 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
# in favour of the `key_to_index` mapping; older releases only expose `.vocab`.
# Support both so the cell works regardless of which version pip installed.
if hasattr(word2vec_cbow_s50, 'key_to_index'):
    emb_vocabulary_cbow = set(word2vec_cbow_s50.key_to_index)
else:
    emb_vocabulary_cbow = set(word2vec_cbow_s50.vocab)

In [None]:
def clean_str(x):
    """Replace every non-word character in `x` with a space and lowercase it.

    Each non-word character becomes exactly one space (runs are not collapsed),
    so callers that need tokens should follow up with `str.split()`.

    Args:
        x: the string to normalise.

    Returns:
        The normalised, lowercased string.
    """
    # Raw string: a plain '\W' literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    return re.sub(r'\W', ' ', x).lower()

def flatten_list(l):
    """Return one flat list holding the items of every sub-iterable in `l`, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Evaluation corpora; each one is read from its own `evaluate_metrics_<name>.tsv`.
corpus = ['cod_def_cons', 'constituicao', 'voxforge', 'mozilla']

# Column name -> list of per-corpus values (one entry appended per corpus).
data = defaultdict(list)

for c in corpus:
    # The per-corpus frame has its own name so it is never confused with the
    # summary frame that is bound to `df` once the loop has finished.
    corpus_df = pd.read_csv(
        '/content/drive/My Drive/TCC_data/metrics/data/evaluate_metrics_{}.tsv'.format(c),
        sep='\t'
    )

    # Tokenise every row into a list of cleaned, lowercased words.
    # Empty TSV fields are parsed as NaN (a float), which would make `clean_str`
    # raise TypeError; treat them as empty strings so they yield no tokens.
    voc_sentence = (
        corpus_df['sentence'].fillna('')
        .apply(lambda x: clean_str(x).split())
        .values.tolist()
    )
    voc_translation = (
        corpus_df['translation'].fillna('')
        .apply(lambda x: clean_str(x).split())
        .values.tolist()
    )

    # Word frequencies for the reference sentences and for the transcriptions.
    voc_sentence_counter = Counter(flatten_list(voc_sentence))
    voc_translation_counter = Counter(flatten_list(voc_translation))

    # Distinct words per side, and their union for the whole corpus.
    voc_sent = set(voc_sentence_counter)
    voc_trans = set(voc_translation_counter)
    voc_df = voc_sent | voc_trans

    # Words that have no vector in the embedding vocabulary.
    voc_missing_sentence = voc_sent - emb_vocabulary_cbow
    voc_missing_translation = voc_trans - emb_vocabulary_cbow
    voc_missing_df = voc_df - emb_vocabulary_cbow

    data['corpus'].append(c)
    data['vocabulary size'].append(len(voc_df))
    data['vocabulary missing'].append(len(voc_missing_df))
    data['sentence vocabulary'].append(len(voc_sent))
    data['sentence missing'].append(len(voc_missing_sentence))
    data['transcription vocabulary'].append(len(voc_trans))
    data['transcription missing'].append(len(voc_missing_translation))
    data['sentences - transcription difference'].append(len(voc_sent - voc_trans))
    data['transcription - sentences difference'].append(len(voc_trans - voc_sent))
    # Averages are taken over the raw columns; `.str.len()` gives NaN for
    # missing values and `.mean()` skips them.
    data['audio length'].append(corpus_df['length'].mean())
    data['sentence length'].append(corpus_df['sentence'].str.len().mean())
    data['transcription length'].append(corpus_df['translation'].str.len().mean())

# One row per corpus, one column per statistic collected above.
df = pd.DataFrame(data=data)

In [None]:
# Write the table held in `df` to CSV, leaving out the index column.
df.to_csv(
    path_or_buf='/content/drive/My Drive/TCC_data/metrics/plots/corpus_info.csv',
    index=False,
)