In [None]:
# -*- coding: utf-8 -*-
from __future__ import division
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#%matplotlib notebook

In [None]:
def get_training_vocab_counts(training_corpus):
    """Load the word counts of the training corpus.

    Reads ``./<training_corpus>/counts.words.vocab`` as UTF-8 (undecodable
    bytes are replaced). Each non-blank line is stripped and split on a
    single space; the first field is the word and the second its count.

    Args:
        training_corpus: directory name of the training corpus, relative to
            the current working directory.

    Returns:
        dict mapping each word to a ``(train_count, 0)`` tuple. The second
        slot is the enrichment count, initialised to zero. If a word occurs
        on several lines, the last line wins.

    Raises:
        IOError/OSError: if the counts file cannot be opened.
        IndexError, ValueError: if a non-blank line has no second field or
            the second field is not an integer.
    """
    corpus_vocab = dict()
    training_word_counts_file = './' + training_corpus + '/counts.words.vocab'
    with codecs.open(training_word_counts_file, 'r', encoding='utf-8', errors='replace') as counts_file:
        for line in counts_file:
            stripped = line.strip()
            # A blank (e.g. trailing) line carries no entry; skipping it avoids
            # an IndexError on the missing count field.
            if not stripped:
                continue
            count_entry = stripped.split(' ')
            corpus_vocab[count_entry[0]] = (int(count_entry[1]), 0)
    return corpus_vocab

In [None]:
def collate_training_enrich_vocab_counts(training_corpus, enrich_corpus):
    """Combine training and enrichment word counts into one vocabulary.

    Starts from the result of ``get_training_vocab_counts(training_corpus)``
    and then reads ``./<enrich_corpus>/counts.words.vocab`` (UTF-8,
    undecodable bytes replaced). Each non-blank line is stripped and split on
    a single space into a word and an integer count.

    Args:
        training_corpus: directory name of the training corpus.
        enrich_corpus: directory name of the enrichment corpus.

    Returns:
        dict mapping each word to ``(train_count, enrich_count)``. Words seen
        only in the enrichment file get a training count of 0; repeated
        enrichment lines for the same word are summed.

    Raises:
        IOError/OSError: if a counts file cannot be opened.
        IndexError, ValueError: if a non-blank line has no second field or
            the second field is not an integer.
    """
    corpus_vocab = get_training_vocab_counts(training_corpus)

    enrich_word_counts_file = './' + enrich_corpus + '/counts.words.vocab'
    with codecs.open(enrich_word_counts_file, 'r', encoding='utf-8', errors='replace') as counts_file:
        for line in counts_file:
            stripped = line.strip()
            # Blank lines carry no entry; skip them instead of failing on the
            # missing count field.
            if not stripped:
                continue
            count_entry = stripped.split(' ')
            word = count_entry[0]
            enrich_count = int(count_entry[1])
            # Every stored value is a 2-tuple, so a single lookup with a
            # (0, 0) default covers both known and previously unseen words.
            train_count, prior_enrich_count = corpus_vocab.get(word, (0, 0))
            corpus_vocab[word] = (train_count, prior_enrich_count + enrich_count)
    return corpus_vocab

In [None]:
def eval_subset_corpus_freq_dist(eval_subset_filename, lang_pair, corpus_vocab, s_freq_dist=None, t_freq_dist=None):
    """Collect corpus counts for the words listed in one evaluation file.

    Each line of the file is split on ``|||``. The stripped text before the
    first separator is looked up as ``lang_pair[0] + '0_' + text`` and the
    stripped text after the last separator as ``lang_pair[1] + '0_' + text``.
    Words that are present in ``corpus_vocab`` are copied, with their value,
    into the respective distribution.

    Args:
        eval_subset_filename: path of the evaluation file (read as UTF-8,
            undecodable bytes replaced).
        lang_pair: indexable whose items 0 and 1 are the language prefixes
            for the first and last field of each line.
        corpus_vocab: dict mapping prefixed words to their count values.
        s_freq_dist: optional dict to extend for the first-field words; a
            new dict is created when None.
        t_freq_dist: optional dict to extend for the last-field words; a new
            dict is created when None.

    Returns:
        ``[s_freq_dist, t_freq_dist]``. Dicts passed in are updated in place
        and returned as the same objects.

    Raises:
        IOError/OSError: if the evaluation file cannot be opened.
    """
    if s_freq_dist is None:
        s_freq_dist = dict()
    if t_freq_dist is None:
        t_freq_dist = dict()
    # The context manager closes the handle even if a lookup below raises.
    with codecs.open(eval_subset_filename, encoding='utf-8', errors='replace') as eval_file:
        for line in eval_file:
            fields = line.split("|||")
            s = fields[0].strip()
            t = fields[-1].strip()
            s_prefixed = lang_pair[0] + '0_' + s
            if s_prefixed in corpus_vocab:
                s_freq_dist[s_prefixed] = corpus_vocab[s_prefixed]
            t_prefixed = lang_pair[1] + '0_' + t
            if t_prefixed in corpus_vocab:
                t_freq_dist[t_prefixed] = corpus_vocab[t_prefixed]

    return [s_freq_dist, t_freq_dist]

In [None]:
def eval_set_corpus_freq_dist(eval_dir, corpus_vocab, lang_pairs, eval_subsets, b_merge_subsets=True):
    """Build frequency distributions for every language pair and eval subset.

    For each ``lang_pair`` and ``eval_subset`` the file
    ``./<training_corpus>/<eval_dir>/<eval_subset>-<l0>-<l1>.txt`` is passed
    to ``eval_subset_corpus_freq_dist``.

    NOTE: ``training_corpus`` is not a parameter; it is read from the
    notebook's global namespace and must be defined before calling.

    Args:
        eval_dir: name of the evaluation directory inside the training
            corpus directory.
        corpus_vocab: dict mapping prefixed words to their count values.
        lang_pairs: iterable of indexables; items 0 and 1 are the two
            language codes of a pair.
        eval_subsets: iterable of subset names used as file-name prefixes.
        b_merge_subsets: when True, all subsets of a pair accumulate into the
            keys ``'<l0>-<l1>'`` and ``'<l1>-<l0>'``. When False, each subset
            gets its own keys ``'<subset>-<l0>-<l1>'`` and
            ``'<subset>-<l1>-<l0>'``.

    Returns:
        dict mapping each key described above to its frequency distribution
        (a dict of prefixed word -> count value).
    """
    eval_path = './' + training_corpus + '/' + eval_dir + '/'
    freq_dists = dict()
    for lang_pair in lang_pairs:
        pair_s_key = lang_pair[0] + '-' + lang_pair[1]
        pair_t_key = lang_pair[1] + '-' + lang_pair[0]
        for eval_subset in eval_subsets:
            if b_merge_subsets:
                s_key = pair_s_key
                t_key = pair_t_key
                # Continue filling the distributions of earlier subsets;
                # None makes the callee start a fresh dict.
                s_freq_dist = freq_dists.get(s_key)
                t_freq_dist = freq_dists.get(t_key)
            else:
                # Derive the keys from the per-pair base keys every time so
                # that subset prefixes do not pile up across iterations.
                s_key = eval_subset + '-' + pair_s_key
                t_key = eval_subset + '-' + pair_t_key
                s_freq_dist = None
                t_freq_dist = None
            eval_file = eval_path + eval_subset + '-' + lang_pair[0] + '-' + lang_pair[1] + '.txt'
            # Source and target accumulators must be passed separately;
            # sharing one dict would mix both languages into it.
            [s_freq_dist, t_freq_dist] = eval_subset_corpus_freq_dist(eval_file, lang_pair, corpus_vocab, s_freq_dist, t_freq_dist)
            freq_dists[s_key] = s_freq_dist
            freq_dists[t_key] = t_freq_dist
    return freq_dists

In [None]:
def baseline_corpus_freq_dist(corpus_vocab, langs):
    """Split the corpus vocabulary into one frequency distribution per language.

    A vocabulary entry belongs to a language when the first two characters
    of its key equal the language code.

    Args:
        corpus_vocab: dict mapping prefixed words to their count values.
        langs: iterable of language codes.

    Returns:
        dict mapping each language code to a dict holding the matching
        vocabulary entries (empty when nothing matches).
    """
    return {
        lang: {
            word: counts
            for word, counts in corpus_vocab.items()
            if word[0:2] == lang
        }
        for lang in langs
    }

In [None]:
def eval_set_baseline_corpus_freq_dist(eval_dir, corpus_vocab, lang_pairs):
    """Build baseline frequency distributions from the 'inv' evaluation files.

    For every language pair the file
    ``./<training_corpus>/<eval_dir>/inv-<l0>-<l1>.txt`` is handed to
    ``eval_subset_corpus_freq_dist`` and its path is printed.

    NOTE: ``training_corpus`` is read from the notebook's global namespace,
    not passed as an argument.

    Args:
        eval_dir: name of the evaluation directory inside the training
            corpus directory.
        corpus_vocab: dict mapping prefixed words to their count values.
        lang_pairs: iterable of indexables; items 0 and 1 are the two
            language codes of a pair.

    Returns:
        dict with two entries per pair: ``'<l0>-<l1>'`` holds the first
        distribution returned by the helper, ``'<l1>-<l0>'`` the second.
    """
    base_dir = './' + training_corpus + '/' + eval_dir + '/'
    collected = {}
    for pair in lang_pairs:
        first_lang = pair[0]
        second_lang = pair[1]
        inv_file = base_dir + 'inv' + '-' + first_lang + '-' + second_lang + '.txt'
        print('eval_file: ' + inv_file)
        first_dist, second_dist = eval_subset_corpus_freq_dist(inv_file, pair, corpus_vocab)
        collected[first_lang + '-' + second_lang] = first_dist
        collected[second_lang + '-' + first_lang] = second_dist
    return collected

In [None]:
def freq_dists_to_data_frames(freq_dists):
    """Convert each frequency distribution into a pandas DataFrame.

    Args:
        freq_dists: dict mapping a key to a distribution, itself a dict of
            word -> two-element count value (train, enrich).

    Returns:
        dict with the same keys; each value is a DataFrame with one row per
        word and the columns 'train', 'enrich' and 'total' (their sum).
    """
    frames = dict()
    for key in freq_dists:
        # Words become columns first; transposing turns them into rows.
        counts = pd.DataFrame(freq_dists[key], index=['train', 'enrich']).transpose()
        counts['total'] = counts['train'] + counts['enrich']
        frames[key] = counts
    return frames

In [None]:
def plot_freq_dists(freq_dists):
    """Plot, show and save a stacked bar chart for each frequency distribution.

    For every entry of ``freq_dists`` the words are ordered by descending
    count value, the vocabulary size is printed, and a log-scaled bar chart
    is drawn with the first count of each word in blue and the second count
    stacked on top in red. The figure is shown, written to
    ``enrich_freq_dist/<lang_key>.png`` and then closed.

    NOTE: the legend labels come from the notebook globals
    ``training_corpus`` and ``enrich_corpus``; both must be defined before
    calling. The output directory is not created here.

    Args:
        freq_dists: dict mapping a key to a distribution, itself a dict of
            word -> (train_count, enrich_count).
    """
    for lang_key, freq_dist in freq_dists.items():

        # Sorting by the stored value orders words by their count tuple.
        sorted_keys = sorted(freq_dist, key=freq_dist.get, reverse=True)

        train_freqs = [freq_dist[word_key][0] for word_key in sorted_keys]
        enrich_freqs = [freq_dist[word_key][1] for word_key in sorted_keys]
        # Call form works on both Python 2 and Python 3.
        print('Vocabulary size: ' + str(len(train_freqs)))
        ind = np.arange(len(train_freqs))

        fig = plt.figure(figsize=(10,6))
        plt.autoscale(enable=True)

        # Apply fixed limits to the axes attached to the figure at this point.
        axes = fig.get_axes()
        for ax in axes:
            ax.set_xlim(xmax=20000)
            ax.set_ylim(1,10**5)


        p1 = plt.bar(ind, train_freqs, color='b', log=True)
        p2 = plt.bar(ind, enrich_freqs, color='r', bottom=train_freqs, log=True)


        plt.xlabel('Word Entries')
        plt.ylabel('Frequency')
        plt.title(r'Frequency Distribution: ' + lang_key)
        plt.legend((p1[0], p2[0]), (training_corpus, enrich_corpus))

        fig.tight_layout()

        plt.show()
        fig.savefig('enrich_freq_dist/' + lang_key + '.png')
        # Release the figure so that figures do not pile up across iterations.
        plt.close(fig)

In [None]:
def plot_baseline_freq_dists(freq_dists, color, eval_subset):
    """Plot, show and save a bar chart for each baseline frequency distribution.

    For every entry of ``freq_dists`` the words are ordered by descending
    count value, the vocabulary size is printed, and a log-scaled bar chart
    of the first count of each word is drawn. The figure is shown, written
    to ``baseline_freq_dist/<eval_subset><lang_key>.png`` and then closed.
    The output directory is not created here.

    Args:
        freq_dists: dict mapping a key to a distribution, itself a dict of
            word -> count value whose item 0 is plotted.
        color: bar colour handed to ``plt.bar``.
        eval_subset: string placed directly before the key in the plot title
            and in the output file name (no separator is inserted).
    """
    for lang_key, freq_dist in freq_dists.items():

        # Sorting by the stored value orders words by their count tuple.
        sorted_keys = sorted(freq_dist, key=freq_dist.get, reverse=True)

        train_freqs = [freq_dist[word_key][0] for word_key in sorted_keys]
        # Call form works on both Python 2 and Python 3.
        print('Vocabulary size: ' + str(len(train_freqs)))
        ind = np.arange(len(train_freqs))

        fig = plt.figure(figsize=(10,6))
        plt.autoscale(enable=True)

        # Apply fixed limits to the axes attached to the figure at this point.
        axes = fig.get_axes()
        for ax in axes:
            ax.set_xlim(xmax=20000)
            ax.set_ylim(1,10**5)

        p1 = plt.bar(ind, train_freqs, color=color, log=True)


        plt.xlabel('Word Entries')
        plt.ylabel('Frequency')
        plt.title(r'Frequency Distribution: ' + eval_subset + lang_key)

        fig.tight_layout()

        plt.show()
        fig.savefig('baseline_freq_dist/' + eval_subset + lang_key + '.png')
        # Release the figure so that figures do not pile up across iterations.
        plt.close(fig)