In [None]:
# -*- coding: utf-8 -*-
import alignment_eval as al_evl
import wiktionary_eval as wk_evl
import numpy as np
import pandas as pd
import scipy.stats as ss
import os

In [None]:
def alignment_eval_lang_pair(training_corpus, eval_set, sgns_dir, bmk_results, src_lang, tgt_lang):
    """Run the alignment benchmark for one language pair, once per direction.

    Two results are stored in ``bmk_results``:

    * ``<benchmark>-<src_lang>-<tgt_lang>``: ``al_evl.alignment_eval`` called
      with a trailing ``False``;
    * ``<benchmark>-<tgt_lang>-<src_lang>``: the same call with a trailing
      ``True`` (presumably the "reverse direction" switch -- confirm against
      ``alignment_eval.py``).

    Both calls receive identical file paths; only the trailing flag differs.

    Args:
        training_corpus: Directory (relative to the working directory) that
            holds the trained embedding directories.
        eval_set: Benchmark directory under ``eval_data/``. It may contain a
            sub-directory (e.g. ``'graca/enfr'``); only the part before the
            first ``'/'`` is used in the result keys.
        sgns_dir: Sub-directory of ``training_corpus`` containing the
            ``<lang>0.vecs`` files.
        bmk_results: Dict that is updated in place with the two results.
        src_lang: Source language code (e.g. ``'en'``).
        tgt_lang: Target language code (e.g. ``'fr'``).

    Returns:
        The same ``bmk_results`` dict that was passed in.
    """
    # 'graca/enfr' -> 'graca'; a name without '/' is used unchanged.
    eval_set_name = eval_set.split('/', 1)[0]

    vecs_prefix = './' + training_corpus + '/' + sgns_dir + '/'
    eval_prefix = 'eval_data/' + eval_set

    directions = (
        (src_lang, tgt_lang, False),
        (tgt_lang, src_lang, True),
    )
    for first_lang, second_lang, b_reverse in directions:
        key = eval_set_name + '-' + first_lang + '-' + second_lang
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print('\tEvaluating: ' + key)
        bmk_results[key] = al_evl.alignment_eval(vecs_prefix + src_lang + '0.vecs',
                                                 vecs_prefix + tgt_lang + '0.vecs',
                                                 eval_prefix + '/alignment',
                                                 eval_prefix + '/test.' + src_lang,
                                                 eval_prefix + '/test.' + tgt_lang,
                                                 b_reverse)
    return bmk_results

In [None]:
def alignment_eval(training_corpus, sgns_dir):
    """Run every alignment benchmark for one set of trained embeddings.

    Each benchmark is delegated to ``alignment_eval_lang_pair``, which adds
    its results to a shared dict.

    Args:
        training_corpus: Name of the training-corpus directory.
        sgns_dir: Sub-directory of ``training_corpus`` holding the vectors.

    Returns:
        The results dict as returned by the last
        ``alignment_eval_lang_pair`` call.
    """
    # (benchmark directory under eval_data/, source language, target language)
    benchmarks = [
        ('graca/enfr', 'en', 'fr'),
        ('graca/enes', 'en', 'es'),
        ('graca/enpt', 'en', 'pt'),
        ('hansards', 'en', 'fr'),
        ('lambert', 'en', 'es'),
        ('holmqvist', 'en', 'sv'),
        ('cakmak', 'en', 'tr'),
        ('mihalcea', 'ro', 'en'),
    ]

    results = {}
    for eval_set, src_lang, tgt_lang in benchmarks:
        # The en-tr benchmark is not run for the 'europarl' corpus
        # (presumably because that corpus has no Turkish vectors -- confirm).
        if eval_set == 'cakmak' and training_corpus == 'europarl':
            continue
        results = alignment_eval_lang_pair(training_corpus, eval_set, sgns_dir,
                                           results, src_lang, tgt_lang)
    return results

In [None]:
def wiktionary_eval_lang_pair(training_corpus, sgns_dir, bmk_results, src_lang, tgt_lang, b_include_oov=False, precision_at_N=1):
    """Run the Wiktionary benchmark for one language pair, once per direction.

    Two results are stored in ``bmk_results``:

    * ``wiktionary-<src_lang>-<tgt_lang>``: ``wk_evl.wiktionary_eval`` called
      with ``b_reverse=False``;
    * ``wiktionary-<tgt_lang>-<src_lang>``: the same call with
      ``b_reverse=True``.

    Both calls receive the same vector files and the same dictionary file
    ``eval_data/wiktionary/<src_lang>-<tgt_lang>-enwiktionary.txt``; only
    ``b_reverse`` differs.

    Args:
        training_corpus: Directory (relative to the working directory) that
            holds the trained embedding directories.
        sgns_dir: Sub-directory of ``training_corpus`` containing the
            ``<lang>0.vecs`` files.
        bmk_results: Dict that is updated in place with the two results.
        src_lang: Source language code.
        tgt_lang: Target language code.
        b_include_oov: Forwarded unchanged to ``wk_evl.wiktionary_eval``.
        precision_at_N: Forwarded unchanged to ``wk_evl.wiktionary_eval``.

    Returns:
        The same ``bmk_results`` dict that was passed in.
    """
    vecs_prefix = './' + training_corpus + '/' + sgns_dir + '/'
    src_vecs = vecs_prefix + src_lang + '0.vecs'
    tgt_vecs = vecs_prefix + tgt_lang + '0.vecs'
    dictionary = 'eval_data/wiktionary/' + src_lang + '-' + tgt_lang + '-enwiktionary.txt'

    directions = (
        (src_lang, tgt_lang, False),
        (tgt_lang, src_lang, True),
    )
    for first_lang, second_lang, b_reverse in directions:
        key = 'wiktionary-' + first_lang + '-' + second_lang
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print('\tEvaluating: ' + key)
        bmk_results[key] = wk_evl.wiktionary_eval(src_vecs, tgt_vecs, dictionary,
                                                  b_reverse=b_reverse,
                                                  b_include_oov=b_include_oov,
                                                  precision_at_N=precision_at_N)
    return bmk_results

In [None]:
def wiktionary_eval(training_corpus, sgns_dir, b_include_oov=False, lang_list=None, precision_at_N=1):
    """Run the Wiktionary benchmark between English and each listed language.

    Each language is delegated to ``wiktionary_eval_lang_pair`` with ``'en'``
    as the source language.

    Args:
        training_corpus: Name of the training-corpus directory.
        sgns_dir: Sub-directory of ``training_corpus`` holding the vectors.
        b_include_oov: Forwarded to ``wiktionary_eval_lang_pair``.
        lang_list: Language codes to pair with English. ``None`` selects
            ``['ar','es','fi','fr','he','hu','pt','tr']``.
        precision_at_N: Forwarded to ``wiktionary_eval_lang_pair``.

    Returns:
        The results dict as returned by the last
        ``wiktionary_eval_lang_pair`` call (an empty dict if no language
        was evaluated).
    """
    if lang_list is None:
        lang_list = ['ar','es','fi','fr','he','hu','pt','tr']

    # These three languages are never evaluated for the 'europarl' corpus
    # (presumably it has no vectors for them -- confirm).
    skipped_on_europarl = ['ar','he','tr']
    is_europarl = training_corpus == 'europarl'

    results = {}
    for tgt_lang in lang_list:
        if is_europarl and tgt_lang in skipped_on_europarl:
            continue
        results = wiktionary_eval_lang_pair(training_corpus, sgns_dir, results, 'en', tgt_lang,
                                            b_include_oov=b_include_oov,
                                            precision_at_N=precision_at_N)
    return results

In [None]:
def evaluate_sample_distribution(training_corpus, vecs_dir_prefix, b_alignment=True, b_wiktionary=True, 
                                 b_include_oov=False, lang_list=None, precision_at_N=1):
    """Evaluate every matching vector directory and collect results per benchmark.

    Each entry of ``training_corpus`` whose name starts with
    ``vecs_dir_prefix`` is treated as one sample. For each sample the
    alignment and/or Wiktionary benchmarks are run, and each result is
    appended to the list stored under its benchmark key.

    Args:
        training_corpus: Directory whose entries are scanned for samples.
        vecs_dir_prefix: Name prefix that selects the sample directories.
        b_alignment: Run ``alignment_eval`` on each sample.
        b_wiktionary: Run ``wiktionary_eval`` on each sample.
        b_include_oov: Forwarded to ``wiktionary_eval``.
        lang_list: Forwarded to ``wiktionary_eval``.
        precision_at_N: Forwarded to ``wiktionary_eval``.

    Returns:
        Dict mapping each benchmark key to a list with one result per
        sample, ordered by sample directory name.
    """
    sample_dist = dict()
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # position of each sample in the per-key lists reproducible across runs.
    for dirname in sorted(os.listdir(training_corpus)):
        if not dirname.startswith(vecs_dir_prefix):
            continue
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print('Test sample: ' + dirname)
        if b_alignment:
            alignment_results = alignment_eval(training_corpus, dirname)
            for key, value in alignment_results.items():
                sample_dist.setdefault(key, []).append(value)
        if b_wiktionary:
            wiktionary_results = wiktionary_eval(training_corpus, dirname, b_include_oov, lang_list, precision_at_N)
            for key, value in wiktionary_results.items():
                sample_dist.setdefault(key, []).append(value)
    return sample_dist

In [None]:
def calc_t_test(df_sample_dist, pop_mu):
    """Run a one-sample t-test per column and append the summary as rows.

    For every column of ``df_sample_dist`` the column values are tested
    against the hypothesised mean ``pop_mu[column]`` with
    ``scipy.stats.ttest_1samp``. Four rows are then added to the frame,
    labelled ``'mu-sample'``, ``'mu-H0'``, ``'t-statistic'`` and
    ``'p-value'``.

    Note:
        The frame is modified in place (and also returned). Calling this
        function a second time on the same frame would include the summary
        rows in the samples.

    Args:
        df_sample_dist: DataFrame with one column per benchmark key and one
            row per sample.
        pop_mu: Mapping from column label to the mean under the null
            hypothesis. It must contain every column label.

    Returns:
        ``df_sample_dist`` with the four summary rows appended.

    Raises:
        KeyError: If a column label is missing from ``pop_mu``.
    """
    sample_means = dict()
    t_statistics = dict()
    p_values = dict()

    # Use the column labels as they are. Encoding them to ASCII yields bytes
    # on Python 3, which match neither the frame's columns nor pop_mu's keys.
    for key in df_sample_dist.columns.tolist():
        sample = df_sample_dist[key]
        t_statistic, p_value = ss.ttest_1samp(sample, pop_mu[key])
        sample_means[key] = sample.mean()
        t_statistics[key] = t_statistic
        p_values[key] = p_value

    # Assigning a Series to a new row label aligns it on the column labels.
    df_sample_dist.loc['mu-sample'] = pd.Series(sample_means)
    df_sample_dist.loc['mu-H0'] = pd.Series(pop_mu)
    df_sample_dist.loc['t-statistic'] = pd.Series(t_statistics)
    df_sample_dist.loc['p-value'] = pd.Series(p_values)

    return df_sample_dist