In [None]:
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import numpy as np
import pandas as pd
sys.path.append('hyperwords')
from representations.embedding import Embedding
import pprint
import codecs
import chardet

## Qualitative Analysis of Wikipedia Enriched Embeddings
Arbitrarily select a set of 'bible' words and a set of 'modern' words, and investigate the top 10 nearest neighbour translations using the embeddings trained on the Bible enriched with Wikipedia data. 

In [None]:
# Hand-picked probe words, keyed by '<lang>-<lang>' pair.
# Each value is a two-element list: [words in the first language,
# words in the second language], given in matching order.
bible_words = {
    "en-fr": [
        ["man", "woman", "lamb", "shepherd", "sea"],
        ["homme", "femme", "agneau", "berger", "mer"],
    ],
    "en-es": [
        ["man", "woman", "lamb", "shepherd", "sea"],
        ["hombre", "mujer", "cordero", "pastor", "mar"],
    ],
    "en-fi": [
        ["man", "woman", "lamb", "shepherd", "sea"],
        ["mies", "nainen", "karitsa", "paimen", "meri"],
    ],
}

# Same layout as bible_words, for the 'modern' vocabulary.
modern_words = {
    "en-fr": [
        ["car", "phone", "film", "newspaper", "national"],
        ["voiture", "téléphone", "film", "journal", "nationale"],
    ],
    "en-es": [
        ["car", "phone", "film", "newspaper", "national"],
        ["coche", "teléfono", "película", "periódico", "nacional"],
    ],
    "en-fi": [
        ["car", "phone", "film", "newspaper", "national"],
        ["auto", "puhelin", "elokuva", "sanomalehti", "kansallinen"],
    ],
}

In [None]:
# Corpus names. Other cells concatenate these into relative paths of the form
# './<training_corpus>/...' and './<training_corpus>-<enrich_corpus>/...'.
training_corpus = 'bible'
# NOTE(review): the markdown in this notebook describes the enrichment data as
# Wikipedia, but the value here is 'twitter' -- confirm which one is intended.
enrich_corpus = 'twitter'

In [None]:
def translate_word(word, src_lang, trg_lang, training_corpus, sgns_dir, top_n=10):
    """Return the best-scoring target-language candidates for `word`.

    Loads the source and target embeddings from
    './<training_corpus>/<sgns_dir>/<lang>0.vecs', scores every target row by
    its dot product with the source representation of `word`, and returns the
    highest-scoring entries.

    Parameters:
        word: source-language word to look up.
        src_lang: source language code used in the vectors file name.
        trg_lang: target language code used in the vectors file name.
        training_corpus: directory name of the corpus the vectors belong to.
        sgns_dir: sub-directory holding the '.vecs' files.
        top_n: number of candidates to return (default 10, the value that was
            previously hard-coded). Must be >= 0.

    Returns:
        A list of (target_word, score) tuples, best score first. The list is
        shorter than `top_n` when the target vocabulary is smaller.

    Raises:
        ValueError: if `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative, got %r' % (top_n,))

    vecs_prefix = './' + training_corpus + '/' + sgns_dir + '/'
    # NOTE(review): the second positional argument is presumably the
    # `normalize` flag of hyperwords' Embedding -- confirm against that class.
    Es = Embedding(vecs_prefix + src_lang + '0.vecs', True)
    Et = Embedding(vecs_prefix + trg_lang + '0.vecs', True)

    vs = Es.represent(word)
    scores = vs.dot(Et.m.T)

    # argsort is ascending: reverse it, then keep the first top_n indices.
    # (Slicing with [-top_n:] would return *everything* for top_n == 0.)
    best_first = np.argsort(scores)[::-1][:top_n]
    return [(Et.iw[idx], scores[idx]) for idx in best_first]

In [None]:
def translate_words_all_langs(word_dict, training_corpus, sgns_dir):
    """Translate every word of every language pair, in both directions.

    `word_dict` maps a '<xx>-<yy>' key to a two-element list: words in
    language xx, then words in language yy. The xx words are translated into
    yy and the yy words into xx, using translate_word.

    Returns a dict keyed by '<source lang>_<word>' whose values map a target
    language code to the candidate list returned by translate_word.
    """
    translations = {}
    for pair, word_lists in word_dict.items():
        first_lang, second_lang = pair[:2], pair[3:]
        # (source language, target language, index of the source word list)
        directions = (
            (first_lang, second_lang, 0),
            (second_lang, first_lang, 1),
        )
        for src_lang, trg_lang, side in directions:
            for word in word_lists[side]:
                candidates = translate_word(word, src_lang, trg_lang, training_corpus, sgns_dir)
                per_target = translations.setdefault(src_lang + '_' + word, dict())
                per_target[trg_lang] = candidates
    return translations

In [None]:
def pprint_lang_translations(langs):
    """Print the candidate translations for each target language.

    `langs` maps a target-language code to a list of (word, score) pairs.
    For every language this prints a tab-indented language code, one line of
    '(word,score),' entries with the score formatted to three decimals, and
    a blank line.
    """
    for lang, trans_list in langs.items():
        print('\t' + lang)
        # join() avoids rebuilding the string on every iteration.
        print(''.join(
            '(' + trans[0] + ',' + "{0:.3f}".format(trans[1]) + '),'
            for trans in trans_list))
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and also parses on Python 3; print('') emits a newline.
        print('')


def _pprint_diff_side(side):
    """Print one side of a diff entry: a translations dict or a placeholder."""
    if isinstance(side, dict):
        pprint_lang_translations(side)
    else:
        # diff_dict stores a plain string ('<KEYNOTFOUND>') when a word is
        # missing on one side; calling .items() on it would raise.
        print('\t' + str(side))
        print('')


def pprint_translations(trans_dict, diff=False):
    """Print every word in `trans_dict` followed by its translations.

    Parameters:
        trans_dict: maps a word label to a translations dict (see
            pprint_lang_translations) or, when `diff` is true, to a
            (first, second) pair as produced by diff_dict.
        diff: when true, print both members of each pair under 'first:' and
            'second:' headings. A member that is not a dict (the
            '<KEYNOTFOUND>' placeholder) is printed as-is.
    """
    for word, langs in trans_dict.items():
        print(word + ':')
        if diff:
            print('  first:')
            _pprint_diff_side(langs[0])
            print('  second:')
            _pprint_diff_side(langs[1])
        else:
            pprint_lang_translations(langs)

In [None]:
def diff_dict(first, second):
    """Return the entries on which two dicts disagree.

    The result maps each differing key to a (value_in_first, value_in_second)
    tuple. When a key exists in only one dict, the missing side is the string
    '<KEYNOTFOUND>'. Keys whose values compare equal are omitted.
    """
    KEYNOTFOUND = '<KEYNOTFOUND>'

    diff = {}
    # `in` replaces dict.has_key, which is deprecated and gone in Python 3.
    for key, value in first.items():
        if key not in second:
            diff[key] = (value, KEYNOTFOUND)
        elif value != second[key]:
            diff[key] = (value, second[key])
    # Keys that only exist in the second dict.
    for key, value in second.items():
        if key not in first:
            diff[key] = (KEYNOTFOUND, value)
    return diff

In [None]:
# Baseline: 'bible' probe words, vectors from the 'sgns' directory of the
# training corpus alone.
bl_bible_word_trans = translate_words_all_langs(
    word_dict=bible_words,
    training_corpus=training_corpus,
    sgns_dir='sgns')
pprint_translations(trans_dict=bl_bible_word_trans)

In [None]:
# Baseline: 'modern' probe words, vectors from the 'sgns' directory of the
# training corpus alone.
bl_modern_word_trans = translate_words_all_langs(
    word_dict=modern_words,
    training_corpus=training_corpus,
    sgns_dir='sgns')
pprint_translations(trans_dict=bl_modern_word_trans)

In [None]:
# Enriched: 'bible' probe words, vectors from 'sgns_enriched_0' under the
# '<training_corpus>-<enrich_corpus>' directory.
er_bible_word_trans = translate_words_all_langs(
    word_dict=bible_words,
    training_corpus='-'.join([training_corpus, enrich_corpus]),
    sgns_dir='sgns_enriched_0')
pprint_translations(trans_dict=er_bible_word_trans)

In [None]:
# Enriched: 'modern' probe words, vectors from 'sgns_enriched_0' under the
# '<training_corpus>-<enrich_corpus>' directory.
er_modern_word_trans = translate_words_all_langs(
    word_dict=modern_words,
    training_corpus='-'.join([training_corpus, enrich_corpus]),
    sgns_dir='sgns_enriched_0')
pprint_translations(trans_dict=er_modern_word_trans)

### Investigate differences in nearest neighbour translations after enrichment

In [None]:
# Show only the 'bible' words whose candidate lists differ between the
# baseline ('first') and the enriched ('second') embeddings.
pprint_translations(
    trans_dict=diff_dict(first=bl_bible_word_trans, second=er_bible_word_trans),
    diff=True)

### Word Frequencies
Investigate the frequencies of the arbitrarily selected 'bible' and 'modern' word lists, first for the embeddings trained on the Bible corpus alone and then for the embeddings trained on the Bible corpus enriched with the Wikipedia corpus.

In [None]:
def build_corpus_vocab(training_corpus, enrich_corpus):
    """Load word counts from a corpus' 'counts.words.vocab' file.

    Parameters:
        training_corpus: name of the training corpus directory.
        enrich_corpus: name of the enrichment corpus, or None. When given,
            counts are read from './<training_corpus>-<enrich_corpus>/',
            otherwise from './<training_corpus>/'.

    Returns:
        A dict mapping the first space-separated field of each line to the
        second field converted to int.

    The file is decoded as UTF-8 with undecodable bytes replaced. Blank lines
    (for example a trailing empty line) are skipped instead of raising
    IndexError.
    """
    if enrich_corpus is not None:
        corpus_dir = training_corpus + '-' + enrich_corpus
    else:
        corpus_dir = training_corpus
    word_counts_file = './' + corpus_dir + '/counts.words.vocab'

    corpus_vocab = dict()
    with codecs.open(word_counts_file, 'r', encoding="utf8", errors="replace") as counts_file:
        # Stream the file; there is no need to hold every line in memory.
        for raw_line in counts_file:
            line = raw_line.strip()
            if not line:
                continue
            count_entry = line.split(' ')
            corpus_vocab[count_entry[0]] = int(count_entry[1])
    return corpus_vocab

In [None]:
def print_word_frequencies(word_lists, corpus_vocab):
    """Print the corpus frequency of every probe word.

    Parameters:
        word_lists: maps a '<xx>-<yy>' key to a two-element list: words in
            language xx, then words in language yy.
        corpus_vocab: maps '<lang>0_<word>' keys to integer counts.

    For each pair, all words of the first language are printed, then all
    words of the second, one '<lang>: <word>: <count>' line each. Words
    missing from `corpus_vocab` are reported with a count of 0.
    """
    # The loop variable no longer shadows the `word_lists` parameter.
    for lang_pair, pair_lists in word_lists.items():
        langs = (lang_pair[0:2], lang_pair[3:])
        for side, lang in enumerate(langs):
            for word in pair_lists[side]:
                # Byte strings are decoded as UTF-8; text that is already
                # unicode is used as-is (decoding it again would fail on
                # non-ASCII characters).
                enc_word = word.decode('utf-8') if isinstance(word, bytes) else word
                freq = corpus_vocab.get(lang + '0_' + enc_word, 0)
                # Single-argument print(...) prints the same on Python 2 and
                # also parses on Python 3.
                print(lang + ': ' + enc_word + ': ' + str(freq))

In [None]:
# Word counts of the training corpus alone (no enrichment corpus).
corpus_vocab = build_corpus_vocab(
    training_corpus=training_corpus,
    enrich_corpus=None)

In [None]:
# Frequencies of the 'bible' probe words in the currently loaded counts.
print_word_frequencies(word_lists=bible_words, corpus_vocab=corpus_vocab)

In [None]:
# Frequencies of the 'modern' probe words in the currently loaded counts.
print_word_frequencies(word_lists=modern_words, corpus_vocab=corpus_vocab)

In [None]:
# Rebind corpus_vocab to the counts of the enriched corpus
# ('<training_corpus>-<enrich_corpus>'); later cells read this version.
corpus_vocab = build_corpus_vocab(
    training_corpus=training_corpus,
    enrich_corpus=enrich_corpus)

In [None]:
# Frequencies of the 'bible' probe words in the currently loaded counts.
print_word_frequencies(word_lists=bible_words, corpus_vocab=corpus_vocab)

In [None]:
# Frequencies of the 'modern' probe words in the currently loaded counts.
print_word_frequencies(word_lists=modern_words, corpus_vocab=corpus_vocab)

## Repeat on a selection of words that have high enrich frequencies
Based on the quantitative analysis of frequency distributions in 'enriched-quantitative-analysis.ipynb', create a list of words that appeared with high frequency in the Wikipedia enrich corpus, but low frequency in the Bible training corpus, and then do the same qualitative analysis as above.

In [None]:
# Probe words for the high-enrich-frequency analysis, keyed by
# '<lang>-<lang>' pair. Each value is [words in the first language,
# words in the second language], given in matching order.
high_freq_enrich_words = {
    "en-fr": [
        ["it's", "video", "company", "game", "group"],
        ["c'est", "vidéo", "société", "jeu", "groupe"],
    ],
    "en-es": [
        ["game", "national", "video", "team", "season"],
        ["partido", "nacional", "vídeo", "equipo", "temporada"],
    ],
    "en-fi": [
        ["later", "band", "book", "area", "film"],
        ["myöhemmin", "yhtye", "kirja", "alue", "elokuva"],
    ],
}

In [None]:
# Frequencies of the high-enrich-frequency probe words in the currently
# loaded counts.
print_word_frequencies(
    word_lists=high_freq_enrich_words,
    corpus_vocab=corpus_vocab)

In [None]:
# Baseline: high-enrich-frequency words, vectors from the 'sgns' directory
# of the training corpus alone.
high_freq_word_trans_baseline = translate_words_all_langs(
    word_dict=high_freq_enrich_words,
    training_corpus=training_corpus,
    sgns_dir='sgns')
pprint_translations(trans_dict=high_freq_word_trans_baseline)

In [None]:
# Enriched: high-enrich-frequency words, vectors from 'sgns_enriched_0'
# under the '<training_corpus>-<enrich_corpus>' directory.
high_freq_word_trans_0 = translate_words_all_langs(
    word_dict=high_freq_enrich_words,
    training_corpus='-'.join([training_corpus, enrich_corpus]),
    sgns_dir='sgns_enriched_0')
pprint_translations(trans_dict=high_freq_word_trans_0)

In [None]:
# Show only the high-enrich-frequency words whose candidate lists differ
# between the baseline ('first') and the enriched ('second') embeddings.
pprint_translations(
    trans_dict=diff_dict(
        first=high_freq_word_trans_baseline,
        second=high_freq_word_trans_0),
    diff=True)

In [None]:
#high_freq_word_trans_1 = translate_words_all_langs(high_freq_enrich_words, training_corpus + '-' + enrich_corpus, 'sgns_enriched_1')
#pprint_translations(high_freq_word_trans_1)

In [None]:
#high_freq_word_trans_2 = translate_words_all_langs(high_freq_enrich_words, training_corpus + '-' + enrich_corpus, 'sgns_enriched_2')
#pprint_translations(high_freq_word_trans_2)

In [None]:
#pprint_translations(diff_dict(high_freq_word_trans_0, high_freq_word_trans_1), diff=True)

In [None]:
#pprint_translations(diff_dict(high_freq_word_trans_0, high_freq_word_trans_2), diff=True)

In [None]:
# Month names, keyed by '<lang>-<lang>' pair. Each value is [words in the
# first language, words in the second language], given in matching order.
# The French months use their standard accented spellings, consistent with
# the accented forms used for the other non-English probe words in this
# notebook; unaccented variants would be different vocabulary keys.
# NOTE(review): revert if the corpus was accent-stripped on purpose.
months_words = {'en-fr': [['january','february','march','april','may','june','july','august','september','october','november','december'], 
                          ['janvier','février','mars','avril','mai','juin','juillet','août','septembre','octobre','novembre','décembre']],
                'en-es': [['january','february','march','april','may','june','july','august','september','october','november','december'],
                          ['enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre']],
                'en-fi': [['january','february','march','april','may','june','july','august','september','october','november','december'],
                          ['tammikuu','helmikuu','maaliskuu','huhtikuu','toukokuu','kesäkuu','heinäkuu','elokuu','syyskuu','lokakuu','marraskuu','joulukuu']]}


In [None]:
# Frequencies of the month names in the currently loaded counts.
print_word_frequencies(word_lists=months_words, corpus_vocab=corpus_vocab)

In [None]:
# Baseline: month names, vectors from the 'sgns' directory of the training
# corpus alone.
month_word_trans = translate_words_all_langs(
    word_dict=months_words,
    training_corpus=training_corpus,
    sgns_dir='sgns')
pprint_translations(trans_dict=month_word_trans)

In [None]:
# Enriched: month names, vectors from 'sgns_enriched_0' under the
# '<training_corpus>-<enrich_corpus>' directory.
# Note: this rebinds month_word_trans, replacing the baseline result.
month_word_trans = translate_words_all_langs(
    word_dict=months_words,
    training_corpus='-'.join([training_corpus, enrich_corpus]),
    sgns_dir='sgns_enriched_0')
pprint_translations(trans_dict=month_word_trans)