In [None]:
import os
import re
import glob
import json
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Create the output directories up front; exist_ok=True makes re-running this cell harmless.
# 'latex' and 'sig' receive the tables written by get_results below.
# NOTE(review): nothing in the visible cells writes to 'measure_latex' — confirm it is still needed.
os.makedirs('latex', exist_ok=True)
os.makedirs('sig', exist_ok=True)
os.makedirs('measure_latex', exist_ok=True)

In [None]:
# Matches an unsigned decimal number that has digits on both sides of the point.
floating_point_number = re.compile(r'\d+\.\d+')

# Short distance-function identifier -> column label used in the LaTeX tables.
rename_dict = dict(
    cos1='Cos',
    dam='Dam',
    dice1='Dice',
    jac1='Jac',
    lcs='LCS',
    lev='Lev',
    osa='OSA',
    qg1='QG1',
    qg2='QG2',
    qg3='QG3',
    wlev_del='Lev$_D$',
    wlev_ins='Lev$_I$',
    wlev_sub='Lev$_S$',
)

# Every label rendered as a table cell ('& <label> ') with underscores LaTeX-escaped.
replace_names = [
    '& {} '.format(label.replace('_', '\\_'))
    for label in rename_dict.values()
]

# Row order of the metrics tables: AP first, then precision and recall at each cutoff.
metrics_list = ['AP'] + [
    f'{kind}@{cutoff}'
    for kind in ('P', 'R')
    for cutoff in (1, 5, 10, 20, 50, 100, 500, 1000)
]

# The short identifiers, in the same order as rename_dict's keys.
short_names = tuple(rename_dict)

# Full language set (currently disabled in favour of quick_langs below):
# langs = [
#     'all', 'amh', 'arb', 'bul', 'che', 'cop', 'cym', 'deu',
#     'ell', 'eng', 'eus', 'fin', 'fra', 'guj', 'heb', 'hin',
#     'hrv', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn',
#     'kan', 'kat', 'kaz', 'khm', 'kor', 'lao', 'lit', 'mar',
#     'mkd', 'mon', 'mri', 'msa', 'mya', 'nld', 'pol', 'por',
#     'rus', 'san', 'slv', 'som', 'spa', 'sqi', 'swe', 'tel',
#     'tha', 'tur', 'ukr', 'urd', 'vie', 'xho', 'zho', 'zul'
# ]

# Language codes actually processed by this notebook, kept in sorted order.
quick_langs = sorted(
    """
    all amh arb cym ell eng eus
    fin heb hin hun hye ind jpn kan
    khm kor lit mon mri rus spa sqi
    tha tur urd vie zho zul
    """.split()
)

def clean(tex):
    r"""Restyle LaTeX emitted by ``DataFrame.to_latex`` into a fully ruled sideways table.

    The substitutions are plain substring replacements applied in order:

    * the ``table`` float becomes a centred ``sidewaystable``;
    * the three weighted-Levenshtein labels are wrapped in ``\textbf``;
    * the 14-column and 2-column specs become bordered, centred specs
      followed by ``\hline``;
    * every ``\\`` row terminator gets an ``\hline`` on the next line;
    * the booktabs ``\toprule`` / ``\midrule`` / ``\bottomrule`` lines are dropped.

    Args:
        tex: LaTeX source as a string.

    Returns:
        The rewritten LaTeX source.
    """
    boxed_14_columns = '{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|}\n\\hline'
    substitutions = (
        ('\\begin{table}', '\\begin{sidewaystable}\n\\centering'),
        ('\\end{table}', '\\end{sidewaystable}'),
        ('Lev$_D$', '\\textbf{Lev$_D$}'),
        ('Lev$_I$', '\\textbf{Lev$_I$}'),
        ('Lev$_S$', '\\textbf{Lev$_S$}'),
        ('{llllllllllllll}', boxed_14_columns),
        ('{lrrrrrrrrrrrrr}', boxed_14_columns),
        ('{ll}', '{|c|c|}\n\\hline'),
        # Must run before the rule lines are stripped: it only touches '\\'.
        ('\\\\', '\\\\\n\\hline'),
        ('\\toprule\n', ''),
        ('\\midrule\n', ''),
        ('\\bottomrule\n', ''),
    )
    for old, new in substitutions:
        tex = tex.replace(old, new)
    return tex

# Significance level handed to the Tukey HSD tests: 0.05 divided by twice the number of
# languages, i.e. a Bonferroni-style correction for the number of tests run.
# NOTE(review): the factor 2 presumably stands for the train and test splits — confirm.
alpha = 0.05 / (2 * len(quick_langs))

In [None]:
# Display the corrected significance threshold.
alpha

In [None]:
def _metrics_caption(lang, split):
    """Return the LaTeX caption of the metrics table for ``lang`` and ``split``."""
    key_reference = ' Abbreviations and key are as in Table~\\ref{tab:train_all}.'
    if lang != 'all':
        return (
            'Metrics for the ' + split + ' split of CogNet (language = \\texttt{' + lang + '}).'
            + key_reference
        )
    if split != 'train':
        return 'Metrics for the test split of CogNet.' + key_reference
    # Only the all-languages train table spells out the abbreviation key;
    # every other caption refers back to it.
    return (
        'Metrics for the train split of CogNet. AP = mean average precision, P@k = precision with cutoff k, R@k = recall with cutoff k.'
        ' Key: Cos = cosine, Dam = Damerau-Levenshtein, Dice = Dice-S\\o rensen, Jac = Jaccard, LCS = Longest Common Subsequence, Lev = Levenshtein, Lev$_D$ ='
        ' Levenshtein$_\\text{del}$, Lev$_I$ = Levenshtein$_\\text{ins}$, Lev$_S$ = Levenshtein$_\\text{sub}$, OSA = Optimal String Alignment,'
        ' QG1-3 = $q$-Grams with $k\\in\\{1,2,3\\}$.'
    )


def _aggregate_table(per_dist):
    """Build a metrics-by-distance frame with renamed, sorted columns and rows in ``metrics_list`` order."""
    table = pd.DataFrame(per_dist).rename(columns=rename_dict)
    return table.reindex(sorted(table.columns), axis=1).reindex(metrics_list)


def _write_metrics_table(metrics_df, path, label, caption):
    """Render ``metrics_df`` as a LaTeX table and write the cleaned result to ``path``."""
    tex = metrics_df.to_latex(
        bold_rows=True,
        label=label,
        caption=caption,
        float_format='%.3f'
    )
    # Bold the column headers that appear as '& <label> ' cells.
    for name in replace_names:
        tex = tex.replace(name, '& \\textbf{' + name[2:-1] + '} ')
    # One regex pass wraps each number exactly once. Replacing number by number
    # would wrap a value again whenever it is a substring of another value.
    # A callable is used because '\g' is special in a replacement template.
    tex = floating_point_number.sub(
        lambda match: '\\gradient{' + match.group(0) + '}', tex
    )
    with open(path, 'w', encoding='utf-8') as out:
        out.write(clean(tex))


def _significance_matrix(tukey_df, means):
    """Build the square +/- matrix from Tukey HSD pairwise results.

    For every pair whose ``reject`` flag is true, the cell ``[row, col]`` is
    '+' when ``means[row]`` is strictly greater than ``means[col]`` and '-'
    otherwise; the mirrored cell gets the opposite sign. All other cells
    (including the diagonal) stay empty.
    """
    groups = sorted(set(tukey_df['group1']) | set(tukey_df['group2']))
    matrix = pd.DataFrame('', index=groups, columns=groups)
    for _, pair in tukey_df.iterrows():
        if not pair['reject']:
            continue
        g1, g2 = pair['group1'], pair['group2']
        if means.loc[g2] >= means.loc[g1]:
            matrix.at[g1, g2], matrix.at[g2, g1] = '-', '+'
        else:
            matrix.at[g1, g2], matrix.at[g2, g1] = '+', '-'
    return matrix


def get_results(lang, measure='R@100'):
    """Load the metrics of one language and write its LaTeX tables.

    Reads every ``metrics/<lang>/*.json`` file (aggregate metrics) together
    with the tab-separated ``.csv`` file of the same stem (per-query values),
    then writes:

    * ``latex/<lang>_train.tex`` and ``latex/<lang>_test.tex`` — the
      aggregate metrics tables;
    * ``sig/<lang>_<split>_<measure>.tex`` for both splits — a matrix marking
      which distance functions differ significantly on ``measure`` according
      to a Tukey HSD test at the module-level ``alpha``.

    Args:
        lang: Language code naming the sub-directory of ``metrics``.
        measure: Metric used for the significance test.

    Returns:
        A tuple ``(agg_results, unagg_results)``; both are dicts keyed by
        ``'train'`` and ``'test'`` holding, respectively, the aggregate
        metrics frame and the concatenated per-query frame.
    """
    splits = ('train', 'test')
    base_columns = ['query_id', 'measure', 'value', 'dist']

    agg_raw = {split: {} for split in splits}
    per_query_frames = {split: [] for split in splits}

    for json_path in glob.glob(f'metrics/{lang}/*.json'):
        file_name = os.path.basename(json_path)
        fields = file_name.split('_')
        # The split name is read from the sixth-from-last underscore field.
        # NOTE(review): this relies on the metrics file naming scheme — confirm.
        split = fields[-6]
        # Weighted Levenshtein variants carry an underscore in their own name.
        dist = '_'.join(fields[:2]) if 'wlev' in file_name else fields[0]

        with open(json_path, 'r', encoding='utf-8') as handle:
            agg_raw[split][dist] = json.load(handle)

        # Swap only the extension; a blanket str.replace would also rewrite
        # any other "json" occurring in the path.
        per_query = pd.read_csv(os.path.splitext(json_path)[0] + '.csv', sep='\t')
        per_query['dist'] = dist
        per_query_frames[split].append(per_query)

    # Concatenate once per split instead of growing a frame inside the loop.
    unagg_results = {}
    for split in splits:
        if per_query_frames[split]:
            combined = pd.concat(per_query_frames[split])
            extra = [col for col in combined.columns if col not in base_columns]
            # Keep the four expected columns first, as the seeded frame used to.
            unagg_results[split] = combined.reindex(columns=base_columns + extra)
        else:
            unagg_results[split] = pd.DataFrame(columns=base_columns)

    agg_results = {split: _aggregate_table(agg_raw[split]) for split in splits}

    for split in splits:
        _write_metrics_table(
            agg_results[split],
            os.path.join('latex', f'{lang}_{split}.tex'),
            f'tab:{split}_{lang}',
            _metrics_caption(lang, split),
        )

    # Table label -> short identifier, so the aggregate scores can be looked up
    # by the group names the Tukey test reports.
    short_by_label = {label: short for short, label in rename_dict.items()}
    bold_by_short = {
        short: '\\textbf{' + label + '}' for short, label in rename_dict.items()
    }
    # Longest names first so 'wlev_del' is matched before the 'lev' inside it.
    short_name_pattern = re.compile(
        '|'.join(re.escape(short) for short in sorted(rename_dict, key=len, reverse=True))
    )

    for split in splits:
        means = agg_results[split].loc[measure].rename(short_by_label)
        per_query = unagg_results[split]
        per_query = per_query[per_query['measure'] == measure]

        tukey = pairwise_tukeyhsd(per_query['value'], per_query['dist'], alpha=alpha)
        header, *rows = tukey.summary().data
        sig_matrix = _significance_matrix(pd.DataFrame(data=rows, columns=header), means)

        tex = clean(sig_matrix.to_latex(
            label=f'tab:sig_{lang}_{split}_{measure}',
            caption='Each row-indexed distance function performs significantly better (worse) than any distance function with a + (-) in its row for language \\texttt{' + lang + '}' +
            f' {measure} on the {split} split.',
            float_format='%.3f'
        ))
        # Bold the labels only after clean(), which itself wraps the Lev$_X$
        # labels in \textbf and would otherwise bold them twice.
        tex = short_name_pattern.sub(lambda match: bold_by_short[match.group(0)], tex)

        with open(f'sig/{lang}_{split}_{measure}.tex', 'w', encoding='utf-8') as out:
            out.write(tex)

    return agg_results, unagg_results

In [None]:
# Generate the metrics and significance tables for every language. The return values are
# discarded: the cell is run for the .tex files that get_results writes.
for lang in quick_langs:
    get_results(lang)

In [None]:
# Emit the LaTeX snippet that includes every per-language table, one table per page,
# inside a single-column section.
# NOTE(review): the snippet reads from tables/ while get_results writes to latex/ —
# presumably the files are copied into the paper's tables/ directory; confirm.
print('\\onecolumn')
for lang in sorted(quick_langs):
    print(
        '\\clearpage',
        '\\input{tables/' + lang + '_train.tex}',
        '\\clearpage',
        '\\input{tables/' + lang + '_test.tex}',
        sep='\n',
    )
print('\\clearpage', '\\twocolumn', sep='\n')

In [None]:
# Collect per-distance-function timing figures from the JSON files in timing/.
d = {}

for file in os.listdir('timing'):
    # Files with 'test' in their name are skipped.
    if 'test' in file:
        continue
    # Context manager so the handle is closed right after parsing.
    with open(os.path.join('timing', file), 'r', encoding='utf-8') as handle:
        timing = json.load(handle)
    # Weighted Levenshtein variants carry an underscore in their own name.
    if 'wlev' in file:
        name = '_'.join(file.split('_')[:2])
    else:
        name = file.split('_')[0]
    d[name] = {
        'Index Time Per Item': timing['tree_build_time_per_item'],
        'Time Per Query': timing['query_time_per_query']
    }

# One row per distance function, one column per timing figure.
df = pd.DataFrame(d).transpose()

In [None]:
# Make every timing column numeric so float_format applies when the table is rendered.
df = df.astype(float)

In [None]:
# Order the rows alphabetically by index, i.e. by the distance-function short names
# used as keys when the frame was built.
df = df.sort_index()

In [None]:
# Print the timing table as LaTeX with four decimals. The default 'lrr' column spec is
# swapped for bordered, centred columns before clean() applies the shared table styling.
print(clean(df.to_latex(float_format='%.4f', caption='Timing information.').replace('lrr', '|c|c|c|')))

In [None]:
# Collect R@100 per language and distance function, separately for each split.
# Both names hold nested dicts here: {lang: {distance_name: R@100}}.
df_test = {}
df_train = {}

for lang in quick_langs:
    df_test[lang] = {}
    df_train[lang] = {}
    for file in glob.glob(f'metrics/{lang}/*.json'):
        basename = os.path.basename(file)
        # Weighted Levenshtein variants carry an underscore in their own name.
        if 'wlev' in basename:
            name = '_'.join(basename.split('_')[:2])
        else:
            name = basename.split('_')[0]
        # Decide the split from the file name only, so directory names cannot
        # influence the result.
        if 'train' in basename:
            target = df_train
        else:
            assert 'test' in basename, f'cannot tell the split of {file}'
            target = df_test
        # Context manager so the handle is closed right after parsing.
        with open(file, 'r', encoding='utf-8') as handle:
            target[lang][name] = json.load(handle)['R@100']

In [None]:
# Short distance-function identifier -> bold LaTeX column header, shared by both tables.
bold_column_names = {
    'qg3': '\\textbf{QG3}',
    'qg2': '\\textbf{QG2}',
    'qg1': '\\textbf{QG1}',
    'lcs': '\\textbf{LCS}',
    'osa': '\\textbf{OSA}',
    'dice1': '\\textbf{Dice}',
    'jac1': '\\textbf{Jac}',
    'cos1': '\\textbf{Cos}',
    'dam': '\\textbf{Dam}',
    'lev': '\\textbf{Lev}',
    'wlev_ins': '\\textbf{Lev$_I$}',
    'wlev_del': '\\textbf{Lev$_D$}',
    'wlev_sub': '\\textbf{Lev$_S$}'
}

# Turn the nested dicts into frames with one row per language and one column per
# distance function. This rebinds the names, so the cell must not be re-run
# without first re-running the cell that fills the dicts.
df_train = pd.DataFrame(df_train).T.rename(columns=bold_column_names)
df_test = pd.DataFrame(df_test).T.rename(columns=bold_column_names)

In [None]:
# Order the columns alphabetically by header so both tables share the same layout.
df_train = df_train.sort_index(axis=1)
df_test = df_test.sort_index(axis=1)

In [None]:
# Render the train-split table with three decimals and the shared table styling.
tex = clean(df_train.to_latex(float_format='%.3f'))
# One regex pass wraps each number exactly once. Replacing number by number would
# wrap a value again whenever it is a substring of another value. A callable is
# used because '\g' is special in a replacement template.
tex = floating_point_number.sub(
    lambda match: '\\gradient{' + match.group(0) + '}', tex
)
print(tex)

In [None]:
# Render the test-split table with three decimals and the shared table styling.
tex = clean(df_test.to_latex(float_format='%.3f'))
# One regex pass wraps each number exactly once. Replacing number by number would
# wrap a value again whenever it is a substring of another value. A callable is
# used because '\g' is special in a replacement template.
tex = floating_point_number.sub(
    lambda match: '\\gradient{' + match.group(0) + '}', tex
)
print(tex)

In [None]:
# Emit, per language, a page holding the train and test tables side by side in two
# half-width minipages. The '%' after the first \end{minipage} is part of the output.
for lang in quick_langs:
    print(
        '\\clearpage',
        '\\noindent',
        '\\begin{minipage}[t]{0.5\\linewidth}',
        '\\input{tables/'+lang+'_train.tex}',
        '\\end{minipage}%',
        '\\begin{minipage}[t]{0.5\\linewidth}',
        '\\input{tables/'+lang+'_test.tex}',
        '\\end{minipage}',
        sep='\n',
    )