In [104]:
import os
import itertools
from collections import defaultdict
import re
import math
import sacrebleu
from numpy import cov

In [106]:
from scipy.stats import pearsonr

In [2]:
import pandas as pd
from functools import reduce
import operator

In [3]:
# Adding module to sys path
import sys
sys.path.append("/home/cfourrie/documents/software/public/CopperMT/")
# RNN imports
import pipeline
import torch, numpy as np
from fairseq import checkpoint_utils, data, options, tasks
from pipeline.neural_translation.multilingual_rnns.multilingual_rnn import MultilingualRNNModel

In [71]:
bleu = sacrebleu.corpus_bleu(["a a"], [["a a"]])
print(bleu)

BLEU = 0.00 100.0/100.0/0.0/0.0 (BP = 1.000 ratio = 1.000 hyp_len = 2 ref_len = 2)


In [4]:
copper_dir = "/home/cfourrie/documents/software/public/CopperMT/"
raw_data_path = "inputs/raw_data/" 
split_data_path = "inputs/split_data/"

In [5]:
folders = ["abrahammonpa", "allenbai", "backstromnorthernpakistan", "castrosui", "davletshinaztecan", 
           "felekesemitic", "hantganbangime", "hattorijaponic", "listsamplesize", "mannburmish"]
splits = ["0.10", "0.20", "0.30", "0.40", "0.50"]
models = ["baseline", "BiNMT", "MNMT", "SMT"]
train_name = "training"
test_in_name = "test"
test_out_name = "solutions"
save_as = "results"

In [6]:
models_dir = "/home/cfourrie/documents/software/public/CopperMT/workspace"


In [7]:
long2short_castrosui = {'AntangWesternSandong': 'Antang', 'BanliangYangAn': 'Banliang', 'DujiangEasternSandong': 'Dujiang', 'JiaoliPandong': 'Jiaoli', 'JiarongSouthernSandong': 'Jiarong', 'JiuqianSouthernSandong': 'Jiuqian', 'Pandong': 'Pandong', 'RenliEasternSandong': 'Renli', 'SanjiangEasternSandong': 'Sanjiang', 'ShuigenCentralSandong': 'Shuigen', 'ShuiweiSouthernSandong': 'Shuiwei', 'ShuiyaoSouthernSandong': 'Shuiyao', 'TangnianYangAn': 'Tangnian', 'TangzhouWesternSandong': 'Tangzhou', 'TingpaiWesternSandong': 'Tingpai', 'ZhongheCentralSandong': 'Zhonghe'}

## Utils

In [13]:
def sublist_with_unk(ls_with_unk, ls):
    ls = "".join(ls)
    ls_with_unk = "".join(ls_with_unk)
    
    for item in ls_with_unk.split("<unk>"):
        if item not in ls:
            return False

    return True

In [14]:
def get_neural_bleu_predictions(path, l_in, l_out, n_best):
    # Storage
    source = []
    target = []
    prediction = []
    confidence = []
    cur_prediction = []
    cur_confidence = []
    indices = []
    with open(
            f'{path}/bleu/bleu_checkpoint_best_{l_in}-{l_out}.{l_out}', 'r') as file:
        for i, line in enumerate(file):
            line = line.split("\t")
            # Actual source
            if "S-" in line[0]:
                word = line[1].strip(' ').split()
                source.append(word)
                indices.append(line[0])
                # We reinitialize the cur_prediction list
                if len(cur_prediction) > 0:
                    prediction.append(cur_prediction)
                    confidence.append(cur_confidence)
                    cur_prediction = []
                    cur_confidence = []
            # Actual target
            if "T-" in line[0]:
                word = line[1].strip(' ').split()
                target.append(word)
            # Hypothesis
            if "H-" in line[0] and len(cur_prediction) < n_best:
                word = line[2].strip(' ').split()
                cur_prediction.append(word)
                cur_confidence.append(math.exp(float(line[1])))
        prediction.append(cur_prediction)
        confidence.append(cur_confidence)
        try:
            prediction = [[bor[n] for bor in prediction] for n in range(n_best)]
        except IndexError as e:
            raise e

        #prediction = [[bor[n] for bor in prediction] for n in range(n_best)]

    return source, target, prediction, confidence, indices


def get_statistical_bleu_predictions(path_data, path, l_in, l_out, n_best, cur_n_best):
    target = []
    try:
        with open(f'{path_data}/test.{l_in}-{l_out}.{l_out}', 'r') as file:
            for i, line in enumerate(file):
                target.append(line.split())
    except FileNotFoundError:
        pass

    source = []
    with open(f'{path_data}/test.{l_in}-{l_out}.{l_in}', 'r') as file:
        for i, line in enumerate(file):
            source.append(line.split())

    prediction = []
    confidence = []
    cur_ix = -1
    cur_prediction = []
    cur_confidence = []
    indices = []
    with open(f'{path}/{l_in}-{l_out}/out/'
              f'test.{l_in}-{l_out}_nbest_{str(n_best)}.{l_out}', 'r') as file:
        for i, line in enumerate(file):
            line = line.split("|||")
            ix = int(line[0])
            word = line[1].strip(' ').split()

            if cur_ix != ix:
                if cur_ix != -1:
                    indices.append(cur_ix)
                    while len(cur_prediction) < cur_n_best:
                        cur_prediction.append(cur_prediction[-1])
                        cur_confidence.append(cur_confidence[-1])
                    prediction.append(cur_prediction)
                    confidence.append(cur_confidence)
                cur_prediction = [word]
                cur_confidence = [math.exp(float(line[-1]))]
                cur_ix = ix
            else:
                cur_prediction.append(word)
                cur_confidence.append(math.exp(float(line[-1])))
        # Management of last prediction
        indices.append(cur_ix)
        while len(cur_prediction) < cur_n_best:
            cur_prediction.append(cur_prediction[-1])
            cur_confidence.append(cur_confidence[-1])
        prediction.append(cur_prediction)
        confidence.append(cur_confidence)

    prediction = [[bor[n] for bor in prediction] for n in range(cur_n_best)]

    return source, target, prediction, confidence, indices

## Logic

In [15]:
def get_results_from_file(langs, model, cur_data):
    cur_results_grouped = {lang: defaultdict(dict) for lang in langs}
    cur_results_by_lang = {lang: defaultdict(dict) for lang in langs}
    # Read results
    for lang_out in langs:
        for lang_in in langs:
            if lang_in == lang_out: continue
            tmp_in = long2short_castrosui[lang_in] if cur_data == "castrosui" else lang_in
            tmp_out = long2short_castrosui[lang_out] if cur_data == "castrosui" else lang_out
            if model == "shared_bilingual":
                all_sources, _, all_predictions, all_confidences, all_indices = get_neural_bleu_predictions(
                    f"{models_dir}/{cur_data}/{model}/{tmp_in}-{tmp_out}/{split}", tmp_in, tmp_out, 10)
            elif model == "shared_multilingual":
                all_sources, _, all_predictions, all_confidences, all_indices = get_neural_bleu_predictions(
                    f"{models_dir}/{cur_data}/{model}/{split}", tmp_in, tmp_out, 10)
            else:
                all_sources, _, all_predictions, all_confidences, all_indices = get_statistical_bleu_predictions(
                    f"{split_data_path}/shared_{cur_data}/{split}",
                    f"{models_dir}/{cur_data}/{model}/{split}", tmp_in, tmp_out, 10, 10)
             
            for ix, (source, index) in enumerate(zip(all_sources, all_indices)):
                predictions = [" ".join(all_predictions[n_best][ix]) for n_best in range(10)]
                confidences = all_confidences[ix]
                cur_results_grouped[lang_out][index].update(
                    {f"{lang_in}_source": " ".join(source),
                     lang_in: sorted([(p, c) for p, c in zip(predictions, confidences)])}
                ) 
                cur_results_by_lang[lang_out][lang_in].update(
                     {" ".join(source): sorted([(p, c) for p, c in zip(predictions, confidences)])}
                ) 
                
                
    return cur_results_grouped, cur_results_by_lang

In [16]:
def get_best_prediction(row_results):
    predictions_scores = defaultdict(int)
    predictions_counts = defaultdict(int)
    for lang_res in row_results.values():
        for pred, score in lang_res:
            predictions_scores[pred] += score
            predictions_counts[pred] += 1
    # prediction scores is better for SMT models! (considerably)
    best_prediction = [k for k, v in predictions_scores.items() if v == max(predictions_scores.values())]
    if best_prediction:
        best_prediction = best_prediction[0]
    else:
        best_prediction = ""

    return best_prediction

In [100]:
def reordering(raw_data_path, cur_data, model, split, results_by_lang):
    test_df = pd.read_csv(os.path.join(raw_data_path, cur_data, f"test-{split}.tsv"), sep="\t")
    gold_df = pd.read_csv(os.path.join(raw_data_path, cur_data, f"solutions-{split}.tsv"), sep="\t", index_col="COGID")
    final_results = defaultdict(list)
    for ix, (_, row) in enumerate(test_df.iterrows()):
        row_dict = dict(row)
        cogid = row_dict.pop("COGID")
        lang_out = [k for k, v in row_dict.items() if v == "?"][0]
        gold = gold_df.loc[cogid][lang_out]
        row_total = []
        
        row_results = {}
        # Compute all predictions
        for lang, val in row_dict.items():
            if lang == lang_out: continue            
            if not isinstance(val, str): continue # nan because was empty
            try:
                row_results[lang] = results_by_lang[model][lang_out][lang][val]
            except KeyError: # some chars are only present in test, and encoded as unk
                # We extract possible keys 
                keys_with_unk = [v for v in results_by_lang[model][lang_out][lang].keys() 
                                 if "<unk>" in v and len(v.split(" ")) == len(val.split(" "))]
                possible_keys = []
                for key in keys_with_unk:
                    if sublist_with_unk(key.split(" "), val.split(" ")):
                        possible_keys.append(key)

                if len(possible_keys) > 1:
                    raise Exception("Problem! Several plausible keys!", val, possible_keys)
                elif len(possible_keys) == 0:
                    raise Exception("Problem! No plausible key!", val, lang, lang_out, results_by_lang[model][lang_out])
                else:
                    row_results[lang] = results_by_lang[model][lang_out][lang][possible_keys[0]]
                    
            # This is a gross approximation of the BLEU score - tokenizer should be none, words should be split on char, etc
            # will do for now though
            row_total.extend([(sacrebleu.corpus_bleu(
                [v[0]], [[gold]], tokenize="char", use_effective_order=True).score, v[1]) 
                                 for v in row_results[lang] if 0.3 < len(v[0])/len(val) < 3])


        # Rank best prediction, then Save
        final_results[cogid] = row_total

    return final_results

# Choosing best answer

In [101]:
loc_models = ['shared_bilingual', 'shared_multilingual', 'shared_statistical']

In [109]:
for cur_data in folders:
    for split in splits:
        df = pd.read_csv(os.path.join(raw_data_path, cur_data, f"cognates.tsv"), sep="\t")
        langs = [c for c in df.columns if c != 'COGID']
        results_grouped = {model: {lang: defaultdict(dict) for lang in langs} for model in loc_models}
        results_by_lang = {model: {lang: defaultdict(dict) for lang in langs} for model in loc_models}

        for model in loc_models:
            try:
                # Read results
                cur_results_grouped, cur_results_by_lang = get_results_from_file(langs, model, cur_data)
                results_grouped[model] = cur_results_grouped
                results_by_lang[model] = cur_results_by_lang

                # Reorder according to initial file
                final_results = reordering(raw_data_path, cur_data, model, split, results_by_lang)
                total_bleu2conf = []
                for row in final_results.values():
                    total_bleu2conf.extend(row)
                total_bleu2conf_bleu = [i[0] for i in total_bleu2conf]
                total_bleu2conf_conf = [i[1] for i in total_bleu2conf]

                print(cur_data, split, model, pearsonr(total_bleu2conf_bleu, total_bleu2conf_conf)[0])
            except Exception as e:
                print("ERROR", cur_data, split, model, e)

abrahammonpa 0.10 shared_bilingual 0.23505522855490182
abrahammonpa 0.10 shared_multilingual 0.16473269356817993
abrahammonpa 0.10 shared_statistical 0.44790996289724755
abrahammonpa 0.20 shared_bilingual 0.2154852819570765
abrahammonpa 0.20 shared_multilingual 0.0925935770372647
abrahammonpa 0.20 shared_statistical 0.3664352640342917
abrahammonpa 0.30 shared_bilingual 0.1793040426099256
abrahammonpa 0.30 shared_multilingual 0.015264590831792818
abrahammonpa 0.30 shared_statistical 0.28629163457102424
abrahammonpa 0.40 shared_bilingual 0.04349038820900493
abrahammonpa 0.40 shared_multilingual -0.04953530205465923
abrahammonpa 0.40 shared_statistical 0.22666877555138132
abrahammonpa 0.50 shared_bilingual 0.03133134910556062
abrahammonpa 0.50 shared_multilingual -0.09576671278302411
abrahammonpa 0.50 shared_statistical 0.16278401829546715
allenbai 0.10 shared_bilingual 0.3407190554721396
ERROR allenbai 0.10 shared_multilingual list index out of range
allenbai 0.10 shared_statistical 0.39

hattorijaponic 0.50 shared_bilingual -0.044711501958247954
hattorijaponic 0.50 shared_multilingual 0.00044876194525827486
hattorijaponic 0.50 shared_statistical 0.21123349611612316
listsamplesize 0.10 shared_bilingual 0.3466062191379084
listsamplesize 0.10 shared_multilingual 0.19822395407154558
listsamplesize 0.10 shared_statistical 0.331472928213488
listsamplesize 0.20 shared_bilingual 0.24140977006800246
listsamplesize 0.20 shared_multilingual 0.11025532981301045
listsamplesize 0.20 shared_statistical 0.22632892221752535
listsamplesize 0.30 shared_bilingual 0.2102263285513636
listsamplesize 0.30 shared_multilingual 0.14148683748295626
listsamplesize 0.30 shared_statistical 0.29389703512280324
listsamplesize 0.40 shared_bilingual 0.2771069199696752
listsamplesize 0.40 shared_multilingual 0.08116543616471106
listsamplesize 0.40 shared_statistical 0.27599959038637695
listsamplesize 0.50 shared_bilingual 0.2582503991663772
listsamplesize 0.50 shared_multilingual 0.044197922980569654
lis

```bash
source ~/Desktop/SIGTYP2022/venv/bin/activate
# Baseline
#model="backstromnorthernpakistan"
for model in "abrahammonpa" "allenbai" "backstromnorthernpakistan" "castrosui" "davletshinaztecan" "felekesemitic" "hantganbangime" "hattorijaponic" "listsamplesize" "mannburmish"; do
    echo ")0.10 - baseline" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/result-0.10.tsv --solution-file=${model}/solutions-0.10.tsv >> analysis_${model}.txt;
    echo ")0.10 - BiNMT" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/results-shared_bilingual-0.10.tsv --solution-file=${model}/solutions-0.10.tsv  >> analysis_${model}.txt;
    echo ")0.10 - MNMT" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/results-shared_multilingual-0.10.tsv --solution-file=${model}/solutions-0.10.tsv  >> analysis_${model}.txt;
    echo ")0.10 - SMT" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/results-shared_statistical-0.10.tsv --solution-file=${model}/solutions-0.10.tsv  >> analysis_${model}.txt;
    
    echo ")0.50 - baseline" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/result-0.50.tsv --solution-file=${model}/solutions-0.50.tsv  >> analysis_${model}.txt;
    echo ")0.50 - BiNMT" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/results-shared_bilingual-0.50.tsv --solution-file=${model}/solutions-0.50.tsv >> analysis_${model}.txt;
    echo ")0.50 - MNMT" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/results-shared_multilingual-0.50.tsv --solution-file=${model}/solutions-0.50.tsv  >> analysis_${model}.txt;
    echo ")0.50 - SMT" >> analysis_${model}.txt;
    st2022 --compare --prediction-file=${model}/results-shared_statistical-0.50.tsv --solution-file=${model}/solutions-0.50.tsv >> analysis_${model}.txt;
done
```

## Problems to manage
- sur allenbai encoding en 0.10 pour bilingue et multilingue
- castrosui languages changes for saves, need to change them when called here too
- davletshinaztecan 0.20 bilingual not launched for all language pairs?

In [75]:
results_dict = {language_family: {split: {
    model: {} for model in models
} for split in splits} for language_family in folders}

for language_family in folders:
    cur_model = "baseline"
    cur_split = "0.10"
    with open(os.path.join(copper_dir, raw_data_path, f"analysis_{language_family}.txt"), "r") as f: 
        for line in f:
            if "Language" in line: continue
            if "--" in line: continue
            if line[0] == ")":
                cur_split, cur_model = line[1:].replace("\n", "").split(" - ");
                continue
            lang, ed, norm_ed, f5 = " ".join(line.split()).split()
            results_dict[language_family][cur_split][cur_model].update(
                {lang: {"ED": ed, "Normalized ED": norm_ed, "B2 F5": f5}}
            )

KeyError: 'O.10'

In [72]:
models

['shared_bilingual', 'shared_multilingual', 'shared_statistical']

In [76]:
for k, v in results_dict.items():
    for split in ["0.10", "0.50"]:
        for model in models:
            try:
                print(k, split, v[split][model]["TOTAL"])
            except KeyError:
                print(k, model, split)

abrahammonpa 0.10 {'ED': '0.459', 'Normalized ED': '0.088', 'B2 F5': '0.884'}
abrahammonpa 0.10 {'ED': '1.150', 'Normalized ED': '0.223', 'B2 F5': '0.712'}
abrahammonpa 0.10 {'ED': '1.041', 'Normalized ED': '0.194', 'B2 F5': '0.725'}
abrahammonpa 0.10 {'ED': '0.372', 'Normalized ED': '0.069', 'B2 F5': '0.900'}
abrahammonpa 0.50 {'ED': '1.486', 'Normalized ED': '0.281', 'B2 F5': '0.687'}
abrahammonpa 0.50 {'ED': '3.483', 'Normalized ED': '0.657', 'B2 F5': '0.285'}
abrahammonpa 0.50 {'ED': '3.694', 'Normalized ED': '0.675', 'B2 F5': '0.249'}
abrahammonpa 0.50 {'ED': '0.961', 'Normalized ED': '0.168', 'B2 F5': '0.724'}
allenbai baseline 0.10
allenbai BiNMT 0.10
allenbai MNMT 0.10
allenbai SMT 0.10
allenbai baseline 0.50
allenbai BiNMT 0.50
allenbai MNMT 0.50
allenbai SMT 0.50
backstromnorthernpakistan baseline 0.10
backstromnorthernpakistan BiNMT 0.10
backstromnorthernpakistan MNMT 0.10
backstromnorthernpakistan SMT 0.10
backstromnorthernpakistan baseline 0.50
backstromnorthernpakistan Bi

In [42]:
for k, v in results_dict.items():
    for split in ["0.10", "0.50"]:
        for model in models:
            try:
                print(k, split, v[split][model]["TOTAL"])
            except KeyError:
                print(k, model, split)

abrahammonpa 0.10 {'ED': '0.459', 'Normalized ED': '0.088', 'B2 F5': '0.884'}
abrahammonpa 0.10 {'ED': '1.150', 'Normalized ED': '0.223', 'B2 F5': '0.712'}
abrahammonpa 0.10 {'ED': '1.041', 'Normalized ED': '0.194', 'B2 F5': '0.725'}
abrahammonpa 0.10 {'ED': '0.372', 'Normalized ED': '0.069', 'B2 F5': '0.900'}
abrahammonpa 0.50 {'ED': '1.486', 'Normalized ED': '0.281', 'B2 F5': '0.687'}
abrahammonpa 0.50 {'ED': '3.483', 'Normalized ED': '0.657', 'B2 F5': '0.285'}
abrahammonpa 0.50 {'ED': '3.694', 'Normalized ED': '0.675', 'B2 F5': '0.249'}
abrahammonpa 0.50 {'ED': '0.961', 'Normalized ED': '0.168', 'B2 F5': '0.724'}
allenbai 0.10 {'ED': '0.882', 'Normalized ED': '0.288', 'B2 F5': '0.743'}
allenbai BiNMT 0.10
allenbai MNMT 0.10
allenbai SMT 0.10
allenbai 0.50 {'ED': '1.192', 'Normalized ED': '0.379', 'B2 F5': '0.637'}
allenbai 0.50 {'ED': '1.273', 'Normalized ED': '0.406', 'B2 F5': '0.499'}
allenbai 0.50 {'ED': '1.206', 'Normalized ED': '0.386', 'B2 F5': '0.519'}
allenbai 0.50 {'ED': '0