In [None]:
# -*- coding: utf-8 -*-
from __future__ import division
from os import listdir
from os.path import join
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import codecs
import os

#%matplotlib notebook

In [None]:
# lang_names = {
#     'ar':'Arabic',
#     'cs':'Czech',
#     'de':'German',
#     'el':'Greek',
#     'en':'English',
#     'es':'Spanish',
#     'et':'Estonian',
#     'fa':'Farsi',
#     'fi':'Finnish',
#     'fr':'French',
#     'he':'Hebrew',
#     'hi':'Hindi',
#     'hu':'Hungarian',
#     'id':'Indonesian',
#     'is':'Icelandic',
#     'it':'Italian',
#     'ja':'Japanese',
#     'ko':'Korean',
#     'lt':'Lithuanian',
#     'mi':'Maori',
#     'my':'Myanmar',
#     'ne':'Nepali',
#     'pl':'Polish',
#     'pt':'Portuguese',
#     'ro':'Romanian',
#     'ru':'Russian',
#     'sk':'Slovak',
#     'sv':'Swedish',
#     'th':'Thai',
#     'tr':'Turkish',
#     'vi':'Vietnamese',
#     'zh':'Chinese',
#     }

In [None]:
# Display names for the two-letter language prefixes parsed from corpus file
# names. tabulate_corpus_counts only reports languages whose prefix appears
# here, so a corpus file with an unlisted prefix is left out of the tables.
# NOTE(review): several prefixes (e.g. 'cb', 'cf', 'qe', 'ww', 'zm') do not look
# like standard ISO 639-1 codes, and 'ww' maps to 'English' just like 'en' -
# presumably these follow the corpus' own file naming; confirm against the data.
lang_names = {
    'af':'Afrikaans',
    'ar':'Arabic',
    'bg':'Bulgarian',
    'cb':'Cebuano',
    'cf':'Haitian Creole',
    'cs':'Czech',
    'da':'Danish',
    'de':'German',
    'el':'Greek',
    'en':'English',
    'eo':'Esperanto',
    'es':'Spanish',
    'et':'Estonian',
    'fa':'Farsi',
    'fi':'Finnish',
    'fr':'French',
    'he':'Hebrew',
    'hi':'Hindi',
    'hr':'Croatian',
    'hu':'Hungarian',
    'id':'Indonesian',
    'is':'Icelandic',
    'it':'Italian',
    'ja':'Japanese',
    'kn':'Kannada',
    'ko':'Korean',
    'la':'Latin',
    'lt':'Lithuanian',
    'lv':'Latvian',
    'mg':'Malagasy',
    'mi':'Maori',
    'ml':'Malayalam',
    'mr':'Marathi',
    'my':'Myanmar',
    'ne':'Nepali',
    'nl':'Dutch',
    'no':'Norwegian',
    'pa':'Paite (Chin)',
    'pl':'Polish',
    'pt':'Portuguese',
    'qe':'Q’eqchi’',
    'ro':'Romanian',
    'ru':'Russian',
    'sk':'Slovak',
    'sl':'Slovene',
    'so':'Somali',
    'sq':'Albanian',
    'sr':'Serbian',
    'sv':'Swedish',
    'te':'Telugu',
    'th':'Thai',
    'tl':'Tagalog',
    'tr':'Turkish',
    'vi':'Vietnamese',
    'ww':'English',
    'xh':'Xhosa',
    'zh':'Chinese',
    'zm':'Zarma'
    }

In [None]:
def is_number(s):
    """Report whether ``s`` can be parsed by ``float()``.

    Returns True when ``float(s)`` succeeds and False when it raises
    ValueError. Any other exception propagates to the caller.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

## Training Corpus Analysis ###

Investigate the ratio of word count to vocabulary size for each language (indicator of Isolating vs Polysynthetic qualities of the language).

Determine what proportion of the overall vocabulary is above the frequency threshold for inclusion in the embeddings.

Contrast the two corpora in terms of size and breadth of vocabulary.

In [None]:
def corpus_lang_counts(dir_path, lang_counts=None):
    """Count sentences, words and vocabulary for each ``.txt`` file in a directory.

    Every line is split on tabs and the second field is taken as the sentence
    text, which is then split on single spaces into words. Files are streamed
    line by line rather than loaded whole, so memory use is bounded by the
    vocabulary size instead of the corpus size.

    Lines that have no second tab-separated field after stripping (blank
    lines, for instance) are skipped and not counted as sentences; previously
    such a line aborted the whole scan with an IndexError.

    Args:
        dir_path: directory holding the corpus files.
        lang_counts: optional dict to update in place. A new dict is created
            when omitted. An existing entry with the same language prefix is
            overwritten.

    Returns:
        The dict, mapping language prefix to
        ``[sentence_count, word_count, vocab_size, vocab_counter]``.
    """
    if lang_counts is None:
        lang_counts = dict()

    for f in listdir(dir_path):
        if not f.endswith('.txt'):
            continue
        path = join(dir_path, f)
        # The prefix is the two characters that start seven characters from
        # the end of the path, e.g. 'en' for a file name ending 'en1.txt'.
        language_prefix = path[-7:-5]

        vocab = Counter()
        sentence_count = 0
        word_count = 0
        with codecs.open(path, encoding="utf8", errors="replace") as fin:
            for line in fin:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue  # no sentence text on this line
                words = fields[1].split(' ')
                sentence_count += 1
                word_count += len(words)
                vocab.update(words)
        lang_counts[language_prefix] = [sentence_count, word_count, len(vocab), vocab]

    return lang_counts

In [None]:
## Get the word count and vocabulary size for each language file
def corpus_counts(training_corpus, enrich_corpus=None):
    """Gather corpus counts per language and attach the learnt vocabulary.

    The counts come from ``corpus_lang_counts`` over ``./<training_corpus>``.
    When ``enrich_corpus`` is given, ``./<enrich_corpus>`` is scanned into the
    same dict and the vocabulary files are read from
    ``./<training_corpus>-<enrich_corpus>/sgns``; otherwise they are read from
    ``./<training_corpus>/sgns``.

    For each ``*.vecs.vocab`` file whose two-character prefix is not numeric,
    the number of lines in the file and the set of its stripped lines are
    appended to that language's entry.

    Returns:
        dict mapping language prefix to the list built by
        ``corpus_lang_counts``, extended with ``learnt_size`` and
        ``learnt_words`` where a vocabulary file exists.

    Raises:
        KeyError: a vocabulary file's prefix has no entry in the counts.
    """
    training_dir = './' + training_corpus
    lang_counts = corpus_lang_counts(training_dir)
    if enrich_corpus is not None:
        lang_counts = corpus_lang_counts('./' + enrich_corpus, lang_counts)
        vecs_path = './' + training_corpus + '-' + enrich_corpus + '/sgns'
    else:
        vecs_path = training_dir + '/sgns'

    # Get the size of the learnt vocabulary (thresholded at word_freq > 2)
    for fname in listdir(vecs_path):
        if not fname.endswith('.vecs.vocab'):
            continue
        path = join(vecs_path, fname)
        language_prefix = path[-14:-12]
        if is_number(language_prefix):
            continue  # Exclude some extraneous files e.g. 741.vecs.vocab
        entry = lang_counts[language_prefix]
        with codecs.open(path, encoding="utf-8", errors="replace") as fin:
            learnt_words = [line.strip() for line in fin]
            entry.append(len(learnt_words))
            entry.append(set(learnt_words))

    return lang_counts

In [None]:
def tabulate_corpus_counts(counter):
    """Turn per-language count lists into display rows keyed by language prefix.

    Args:
        counter: dict mapping language prefix to a list whose items 0, 1, 2
            and 4 are the sentence count, word count, vocabulary size and
            learnt vocabulary size (the layout produced by ``corpus_counts``).

    Returns:
        dict mapping prefix to
        ``(prefix, display_name, sentences, words, vocab_size, learnt_size)``.
        Languages missing from ``lang_names`` are omitted, as are languages
        whose list has no learnt-vocabulary size (fewer than five items),
        which previously raised IndexError.
    """
    counts = dict()
    for key, values in counter.items():
        if key not in lang_names:
            continue
        if len(values) < 5:
            continue  # no learnt vocabulary was attached for this language
        name = lang_names[key]
        # Only byte strings need decoding. On Python 3 the names are already
        # text and codecs.decode() would raise TypeError on them.
        if isinstance(name, bytes):
            name = codecs.decode(name, 'utf-8', 'replace')
        counts[key] = (key, name, values[0], values[1], values[2], values[4])
    return counts

In [None]:
def print_corpus_counts(counts, enriched_counts=None):
    """Print a tab-separated table of corpus statistics per language.

    Args:
        counts: dict of rows shaped
            ``(prefix, name, sentences, words, vocab_size, learnt_size)``.
        enriched_counts: optional dict of rows in the same shape. When given,
            one line is printed per enriched language, showing the enriched
            row's sentence and word counts, the vocabulary figures from
            ``counts`` and from ``enriched_counts`` side by side, and the
            enriched learnt size as a percentage of the unenriched one.
    """
    def padded(name):
        # Names shorter than 8 characters get an extra tab to fill the column.
        return name + '\t' if len(name) < 8 else name

    def percent(part, whole):
        return round((part/whole)*100, 1)

    print(str(len(counts)) + ' languages:\n')

    if enriched_counts is not None:
        print('\t\t\t|enrich corpus\t|unenriched\t\t|enriched\t\t|')
        print('lang\tname\t\t|sents\twords\t|vocab\tlearnt\tlearnt%\t|vocab\tlearnt\tlearnt%\t|learnt_gain%')
        print('======\t==========\t|======\t======\t|======\t======\t======\t|======\t======\t======\t|============')
        for key, row in enriched_counts.items():
            name_cell = padded(row[1])
            enriched_learnt_pc = percent(row[5], row[4])
            base = counts[key]
            learnt_pc = percent(base[5], base[4])
            gain_pc = percent(row[5], base[5])
            cells = [
                row[0], name_cell,
                '|' + str(row[2]), str(row[3]),
                '|' + str(base[4]), str(base[5]), str(learnt_pc),
                '|' + str(row[4]), str(row[5]), str(enriched_learnt_pc),
                '|' + str(gain_pc),
            ]
            print('\t'.join(cells))
        return

    print('lang\tname\t\tsents\twords\tvocab\tlearnt\tlearnt%')
    print('======\t==========\t======\t======\t======\t======\t======')
    for key, row in counts.items():
        name_cell = padded(row[1])
        learnt_pc = percent(row[5], row[4])
        cells = [row[0], name_cell, str(row[2]), str(row[3]), str(row[4]),
                 str(row[5]), str(learnt_pc)]
        print('\t'.join(cells))

In [None]:
def bar_chart_corpus_counts(counts, corpus):
    """Draw and save a two-panel bar chart of corpus statistics per language.

    The upper panel shows the word count of each language. The lower panel
    shows vocabulary size and learnt vocabulary size as adjacent bars. The
    figure is shown and then written to ``corpus_counts_<corpus>.png``.

    Args:
        counts: dict of rows shaped
            ``(prefix, name, sentences, words, vocab_size, learnt_size)``.
        corpus: corpus label used in the figure title and the file name.
    """
    rows = counts.values()
    n_langs = len(rows)

    word_totals = tuple([int(row[3]) for row in rows])

    x_pos = np.arange(n_langs)  # the x locations for the groups
    bar_width = 0.3             # the width of the bars
    gap = 0.05

    # Two stacked panels sharing the x axis; width scales with language count.
    fig, (ax_words, ax_vocab) = plt.subplots(nrows=2, ncols=1, sharex=True)
    fig.set_size_inches(bar_width*n_langs*2 + gap*n_langs, 16)
    fig.suptitle(corpus + ' corpus')

    ax_words.bar(x_pos, word_totals, bar_width, color='r')

    vocab_sizes = tuple([int(row[4]) for row in rows])
    vocab_bars = ax_vocab.bar(x_pos, vocab_sizes, bar_width, color='b')

    learnt_sizes = tuple([int(row[5]) for row in rows])
    learnt_bars = ax_vocab.bar(x_pos + bar_width, learnt_sizes, bar_width, color='g')

    # add some text for labels, title and axes ticks
    ax_words.set_ylabel('Word count')
    ax_vocab.set_ylabel('Vocabulary size')
    panels = (ax_words, ax_vocab)
    for ax in panels:
        ax.set_xticks(x_pos)
    for ax in panels:
        ax.grid()
    for ax in panels:
        ax.set_xticklabels(tuple([row[1] for row in rows]), rotation=45)

    ax_vocab.legend((vocab_bars, learnt_bars), ('Vocab', 'Learnt Vocab'))

    fig.tight_layout()

    plt.show()
    fig.savefig('corpus_counts_' + corpus + '.png')

In [None]:
## Get metrics on the bible corpus
# Collects the per-language counts, tabulates the languages named in lang_names,
# then prints the table and draws/saves the bar chart.
# bible_corpus_counter is read again by compare_all_bmks and compare_wiknry_vocab.
training_corpus = 'bible'
bible_corpus_counter = corpus_counts(training_corpus)
bible_corpus_table = tabulate_corpus_counts(bible_corpus_counter)
print_corpus_counts(bible_corpus_table)
bar_chart_corpus_counts(bible_corpus_table, 'Bible')

In [None]:
## Get metrics on the europarl corpus
# Collects the per-language counts, tabulates the languages named in lang_names,
# then prints the table and draws/saves the bar chart.
# europarl_corpus_counter is read again by compare_all_bmks and compare_wiknry_vocab.
training_corpus = 'europarl'
europarl_corpus_counter = corpus_counts(training_corpus)
europarl_corpus_table = tabulate_corpus_counts(europarl_corpus_counter)
print_corpus_counts(europarl_corpus_table)
bar_chart_corpus_counts(europarl_corpus_table, 'Europarl')

In [None]:
## Get metrics on the wikipedia corpus
## DON'T RUN FOR NOW - causing out of memory errors due to the size the corpus has grown to

# training_corpus = 'wikipedia'
# wikipedia_corpus_counter = corpus_counts(training_corpus)
# wikipedia_corpus_table = tabulate_corpus_counts(wikipedia_corpus_counter)
# print_corpus_counts(wikipedia_corpus_table)
# bar_chart_corpus_counts(wikipedia_corpus_table, 'Wikipedia')

In [None]:
## Get metrics on the bible-wikipedia enriched corpus
# training_corpus = 'bible'
# enrich_corpus = 'wikipedia'
# enrich_langs = ['ar','en','es','fi','fr','he','hu','pt','tr']
# bible_wikipedia_corpus_counter = corpus_counts(training_corpus, enrich_corpus)
# bible_wikipedia_corpus_table = tabulate_corpus_counts(bible_wikipedia_corpus_counter)
# unenriched_counts = dict()
# for key, lang in bible_corpus_table.items():
#     if key in enrich_langs:
#         unenriched_counts[key] = lang
# enriched_counts = dict()
# for key, lang in bible_wikipedia_corpus_table.items():
#     if key in enrich_langs:
#         enriched_counts[key] = lang
# print_corpus_counts(unenriched_counts, enriched_counts)

## Evaluation Benchmark Analysis ###

Identify what proportion of the source & target benchmark words are missing from the learnt vocabulary for each training corpus

In [None]:
def _read_bmk_sentences(path):
    """Return the tokenised sentences of one benchmark file.

    For each line, the text between the first '>' and the following '<' is
    taken and split on whitespace. The file is closed before returning.
    """
    with codecs.open(path, 'r', "utf8", errors='ignore') as fin:
        return [l.strip().split(">")[1].split("<")[0].split() for l in fin.readlines()]


def benchmark_counts(bmk, source, target):
    """Count sentences, words and vocabulary in a benchmark's test files.

    Reads ``./eval_data/<bmk>/test.<source>`` and
    ``./eval_data/<bmk>/test.<target>``.

    Args:
        bmk: benchmark directory, relative to ``./eval_data``.
        source: file extension of the source-language test file.
        target: file extension of the target-language test file.

    Returns:
        ``[source_sentences, target_sentences, source_words, target_words,
        source_vocab_size, target_vocab_size, source_vocab, target_vocab]``
        where the last two items are ``Counter`` objects of token frequencies.

    Raises:
        IndexError: a line contains no '>' character.
    """
    base_path = './eval_data'
    s_sents = _read_bmk_sentences(base_path + '/' + bmk + '/test.' + source)
    t_sents = _read_bmk_sentences(base_path + '/' + bmk + '/test.' + target)

    s_vocab = Counter(word for sent in s_sents for word in sent)
    t_vocab = Counter(word for sent in t_sents for word in sent)

    s_word_count = sum(len(sent) for sent in s_sents)
    t_word_count = sum(len(sent) for sent in t_sents)

    bmk_counts = [len(s_sents), len(t_sents), s_word_count, t_word_count,
                  len(s_vocab), len(t_vocab), s_vocab, t_vocab]

    return bmk_counts

In [None]:
# Counts for every alignment benchmark, keyed '<benchmark>-<source>-<target>'.
# Each tuple gives the key, the directory under ./eval_data, and the source and
# target file extensions handed to benchmark_counts.
bmk_counts = dict(
    (bmk_key, benchmark_counts(bmk_dir, src, tgt))
    for bmk_key, bmk_dir, src, tgt in (
        ('cakmak-en-tr', 'cakmak', 'en', 'tr'),
        ('holmqvist-en-sv', 'holmqvist', 'en', 'sv'),
        ('mihalcea-en-ro', 'mihalcea', 'en', 'ro'),
        ('lambert-en-es', 'lambert', 'en', 'es'),
        ('hansards-en-fr', 'hansards', 'en', 'fr'),
        ('graca-en-fr', 'graca/enfr', 'en', 'fr'),
        ('graca-en-es', 'graca/enes', 'en', 'es'),
        ('graca-en-pt', 'graca/enpt', 'en', 'pt'),
    )
)

In [None]:
def _bmk_vocab_coverage(bmk_vocab, corpus_entry):
    """Return (vocab_matches, learnt_matches, vocab_pc, learnt_pc) for one language."""
    corpus_vocab = corpus_entry[3]
    corpus_learnt = corpus_entry[5]

    vocab_matches = sum(1 for word in bmk_vocab if corpus_vocab.get(word, 0))
    learnt_matches = sum(1 for word in bmk_vocab if word in corpus_learnt)

    total = len(bmk_vocab)
    if total == 0:
        # An empty benchmark vocabulary has nothing to cover; avoid dividing by zero.
        return vocab_matches, learnt_matches, 0.0, 0.0
    return (vocab_matches, learnt_matches,
            round((vocab_matches/total)*100, 1),
            round((learnt_matches/total)*100, 1))


def compare_corpus_bmk_counts(bmk_counter, corpus_counter, source, target):
    """Measure how much of a benchmark's vocabulary a training corpus covers.

    Args:
        bmk_counter: list as returned by ``benchmark_counts``; items 6 and 7
            are the source and target vocabulary counters.
        corpus_counter: dict as returned by ``corpus_counts``; item 3 of each
            entry is the corpus vocabulary counter and item 5 the set of
            learnt words.
        source: language prefix of the benchmark's source side.
        target: language prefix of the benchmark's target side.

    Returns:
        ``[s_vocab_matches, s_learnt_matches, t_vocab_matches,
        t_learnt_matches, s_vocab_pc, s_learnt_pc, t_vocab_pc, t_learnt_pc]``
        with percentages rounded to one decimal, or ``None`` when either
        language is missing from the corpus or has no learnt vocabulary
        attached. An empty benchmark vocabulary yields 0.0 percentages.
    """
    try:
        s_corpus_entry = corpus_counter[source]
        t_corpus_entry = corpus_counter[target]
    except KeyError:
        # Source or target language not available in corpus
        return None

    # Entries without a learnt vocabulary have fewer than six items.
    if len(s_corpus_entry) < 6 or len(t_corpus_entry) < 6:
        return None

    s_vocab_matches, s_learnt_matches, s_vocab_match_pc, s_learnt_match_pc = \
        _bmk_vocab_coverage(bmk_counter[6], s_corpus_entry)
    t_vocab_matches, t_learnt_matches, t_vocab_match_pc, t_learnt_match_pc = \
        _bmk_vocab_coverage(bmk_counter[7], t_corpus_entry)

    return [s_vocab_matches, s_learnt_matches, t_vocab_matches, t_learnt_matches,
            s_vocab_match_pc, s_learnt_match_pc, t_vocab_match_pc, t_learnt_match_pc]
        

In [None]:
def print_bmk_corpus_comparison(bmk_corpus_comparison):
    """Print the coverage percentages of each benchmark/corpus pairing.

    Rows are printed in sorted key order. Each row shows the key followed by
    items 4 to 7 of its statistics list (source vocab %, source learnt %,
    target vocab %, target learnt %), separated by tabs.
    """
    print('bmk/corpus\t\ts_voc%\ts_lnt%\tt_voc%\tt_lnt%')
    print('==============\t\t======\t======\t======\t======')
    for label, stats in sorted(bmk_corpus_comparison.items()):
        percentages = (stats[4], stats[5], stats[6], stats[7])
        print('\t'.join([label] + [str(pc) for pc in percentages]))

In [None]:
def compare_all_bmks(bmks):
    """Compare every benchmark against the bible and europarl corpora.

    The source and target languages are sliced from the benchmark key, which
    is expected to end in ``<source>-<target>`` with two-letter codes.

    Args:
        bmks: dict mapping benchmark key to its ``benchmark_counts`` list.

    Returns:
        dict mapping ``<key>-bible`` / ``<key>-europl`` to the comparison
        statistics. Pairings for which ``compare_corpus_bmk_counts`` returns
        ``None`` are left out.
    """
    bmk_corpus_comparison = dict()
    for bmk_key, bmk_counter in bmks.items():
        corpora = (
            ('-bible', bible_corpus_counter),
            ('-europl', europarl_corpus_counter),
            # ('-bib-wik', bible_wikipedia_corpus_counter),  # currently disabled
        )
        for suffix, corpus_counter in corpora:
            comparison = compare_corpus_bmk_counts(bmk_counter, corpus_counter,
                                                   bmk_key[-5:-3], bmk_key[-2:])
            if comparison is not None:
                bmk_corpus_comparison[bmk_key + suffix] = comparison
    return bmk_corpus_comparison

### In-Vocabulary Word Percentages - Alignment Benchmarks

In [None]:
# Print the in-vocabulary percentages for every benchmark/corpus pairing that
# compare_all_bmks could evaluate (pairings with a missing language are skipped).
print_bmk_corpus_comparison(compare_all_bmks(bmk_counts))

In [None]:
def wiktionary_counts(source, target):
    """Summarise a wiktionary word-pair file and log its identical pairs.

    Reads ``./eval_data/wiktionary/<source>-<target>-<source>wiktionary.txt``,
    where each line holds fields separated by ``|||``. The last field is used
    as the source word and the first field as the target word. Blank lines are
    ignored; previously a blank line raised IndexError.

    Pairs whose two words are identical are written to
    ``translit-upper-<source>-<target>.txt`` when the word starts with an
    upper-case character and to ``translit-lower-<source>-<target>.txt``
    otherwise. Both files are rewritten from scratch on every call.

    Returns:
        ``[pair_count, source_vocab, target_vocab, identical_count,
        identical_pc, identical_capitalised_count, identical_capitalised_pc]``
        where the vocabularies are sets and the percentages are relative to
        ``pair_count`` (0.0 when the file holds no pairs).
    """
    base_path = './eval_data/wiktionary'
    path = base_path + '/' + source + '-' + target + '-' + source + 'wiktionary.txt'

    BX = []
    with codecs.open(path, encoding='utf8') as fin:
        for l in fin.readlines():
            if not l.strip():
                continue
            fields = l.split("|||")
            BX.append((fields[-1].strip(), fields[0].strip()))

    s_vocab = set()
    t_vocab = set()

    identical_count = 0
    identical_capitalised_count = 0

    translit_u_filename = './eval_data/wiktionary/translit-upper-' + source + '-' + target + '.txt'
    translit_l_filename = './eval_data/wiktionary/translit-lower-' + source + '-' + target + '.txt'

    # Mode 'w' truncates any previous version of the output files.
    with codecs.open(translit_u_filename, 'w', encoding='utf-8', errors='replace') as translit_u_file, \
            codecs.open(translit_l_filename, 'w', encoding='utf-8', errors='replace') as translit_l_file:
        for s, t in BX:
            s_vocab.add(s)
            t_vocab.add(t)
            if s == t:
                identical_count += 1
                # s[:1] is safe for an empty word, unlike s[0].
                if s[:1].isupper():
                    translit_u_file.write(s + ' ||| ' + t + '\n')
                    identical_capitalised_count += 1
                else:
                    translit_l_file.write(s + ' ||| ' + t + '\n')

    if BX:
        identical_pc = identical_count * 100 / len(BX)
        identical_capitalised_pc = identical_capitalised_count * 100 / len(BX)
    else:
        identical_pc = 0.0
        identical_capitalised_pc = 0.0

    counts = [len(BX), s_vocab, t_vocab, identical_count, identical_pc,
              identical_capitalised_count, identical_capitalised_pc]

    return counts

In [None]:
# Wiktionary counts for each English-to-target language pair, keyed
# '<source>-<target>'.
wiknry_counts = dict(
    (pair_key, wiktionary_counts(src, tgt))
    for pair_key, src, tgt in (
        ('en-ar', 'en', 'ar'),
        ('en-es', 'en', 'es'),
        ('en-fi', 'en', 'fi'),
        ('en-fr', 'en', 'fr'),
        ('en-he', 'en', 'he'),
        ('en-hu', 'en', 'hu'),
        ('en-pt', 'en', 'pt'),
        ('en-tr', 'en', 'tr'),
    )
)

In [None]:
def print_wiktionary_counts(counts, lang_pairs):
    """Print one summary line per language pair from the wiktionary counts.

    Args:
        counts: dict keyed ``<source>-<target>`` holding lists shaped like the
            result of ``wiktionary_counts``.
        lang_pairs: iterable of ``(source, target)`` pairs to report, in
            output order.

    Each line shows the pair count, the number and percentage of identical
    pairs, and the number and percentage of those that are capitalised.
    """
    for lang_pair in lang_pairs:
        pair_key = lang_pair[0] + '-' + lang_pair[1]
        stats = counts[pair_key]
        pieces = [
            pair_key, ': ', str(stats[0]),
            '\t transliterations: ', str(stats[3]),
            ' (', str(round(stats[4],1)), '%)',
            '\t of which capitalized: ', str(stats[5]),
            ' (', str(round(stats[6],1)), '%)',
        ]
        print(''.join(pieces))

In [None]:
# Report the wiktionary statistics for English paired with each target language.
lang_pairs = list(
    ('en', target_lang)
    for target_lang in ('ar', 'es', 'fi', 'fr', 'he', 'hu', 'pt', 'tr')
)
print_wiktionary_counts(wiknry_counts, lang_pairs)

In [None]:
def _wiknry_vocab_coverage(wiknry_vocab, corpus_entry):
    """Return (vocab_matches, learnt_matches, vocab_pc, learnt_pc) for one language."""
    corpus_vocab = corpus_entry[3]
    corpus_learnt = corpus_entry[5]

    vocab_matches = sum(1 for word in wiknry_vocab if corpus_vocab.get(word, 0))
    learnt_matches = sum(1 for word in wiknry_vocab if word in corpus_learnt)

    total = len(wiknry_vocab)
    if total == 0:
        # An empty wiktionary vocabulary has nothing to cover; avoid dividing by zero.
        return vocab_matches, learnt_matches, 0.0, 0.0
    return (vocab_matches, learnt_matches,
            round((vocab_matches/total)*100, 1),
            round((learnt_matches/total)*100, 1))


def compare_corpus_wiknry_counts(wiknry_counter, corpus_counter, source, target):
    """Measure how much of a wiktionary vocabulary a training corpus covers.

    Args:
        wiknry_counter: list as returned by ``wiktionary_counts``; items 1 and
            2 are the source and target word sets.
        corpus_counter: dict as returned by ``corpus_counts``; item 3 of each
            entry is the corpus vocabulary counter and item 5 the set of
            learnt words.
        source: language prefix of the source side.
        target: language prefix of the target side.

    Returns:
        ``[s_vocab_matches, s_learnt_matches, t_vocab_matches,
        t_learnt_matches, s_vocab_pc, s_learnt_pc, t_vocab_pc, t_learnt_pc]``
        with percentages rounded to one decimal, or ``None`` when either
        language is missing from the corpus or has no learnt vocabulary
        attached. An empty word set yields 0.0 percentages.
    """
    try:
        s_corpus_entry = corpus_counter[source]
        t_corpus_entry = corpus_counter[target]
    except KeyError:
        # Source or target language not available in corpus
        return None

    # Entries without a learnt vocabulary have fewer than six items.
    if len(s_corpus_entry) < 6 or len(t_corpus_entry) < 6:
        return None

    s_vocab_matches, s_learnt_matches, s_vocab_match_pc, s_learnt_match_pc = \
        _wiknry_vocab_coverage(wiknry_counter[1], s_corpus_entry)
    t_vocab_matches, t_learnt_matches, t_vocab_match_pc, t_learnt_match_pc = \
        _wiknry_vocab_coverage(wiknry_counter[2], t_corpus_entry)

    return [s_vocab_matches, s_learnt_matches, t_vocab_matches, t_learnt_matches,
            s_vocab_match_pc, s_learnt_match_pc, t_vocab_match_pc, t_learnt_match_pc]
        

In [None]:
def print_wiknry_corpus_comparison(wiknry_corpus_comparison):
    """Print the coverage percentages of each language-pair/corpus pairing.

    Rows are printed in sorted key order. Each row shows the key followed by
    items 4 to 7 of its statistics list (source vocab %, source learnt %,
    target vocab %, target learnt %), separated by tabs.
    """
    print('lang/corpus\ts_voc%\ts_lnt%\tt_voc%\tt_lnt%')
    print('==============\t======\t======\t======\t======')
    for label, stats in sorted(wiknry_corpus_comparison.items()):
        percentages = (stats[4], stats[5], stats[6], stats[7])
        print('\t'.join([label] + [str(pc) for pc in percentages]))

In [None]:
def compare_wiknry_vocab(counts):
    """Compare every wiktionary language pair against the bible and europarl corpora.

    The source and target languages are sliced from the pair key, which is
    expected to end in ``<source>-<target>`` with two-letter codes.

    Args:
        counts: dict mapping pair key to its ``wiktionary_counts`` list.

    Returns:
        dict mapping ``<key>-bible`` / ``<key>-europarl`` to the comparison
        statistics. Pairings for which ``compare_corpus_wiknry_counts``
        returns ``None`` are left out.
    """
    wiknry_corpus_comparison = dict()
    for pair_key, wiknry_counter in counts.items():
        corpora = (
            ('-bible', bible_corpus_counter),
            ('-europarl', europarl_corpus_counter),
            # ('-bib-wik', bible_wikipedia_corpus_counter),  # currently disabled
        )
        for suffix, corpus_counter in corpora:
            comparison = compare_corpus_wiknry_counts(wiknry_counter, corpus_counter,
                                                      pair_key[-5:-3], pair_key[-2:])
            if comparison is not None:
                wiknry_corpus_comparison[pair_key + suffix] = comparison
    return wiknry_corpus_comparison

### In-Vocabulary Word Percentages - Wiktionary Benchmarks

In [None]:
# Print the in-vocabulary percentages for every language-pair/corpus pairing that
# compare_wiknry_vocab could evaluate (pairings with a missing language are skipped).
print_wiknry_corpus_comparison(compare_wiknry_vocab(wiknry_counts))