In [36]:
import pandas as pd 
import os 
from scipy import spatial
import langdetect
import numpy as np 
import os 


In [13]:
df = pd.read_csv("../../data/translation/df_translated_with_tests.csv")

  df = pd.read_csv("../../data/translation/df_translated_with_tests.csv")


In [73]:
tested = df[~(df["translationTest"].isna())]
non_matched = tested[~(tested["translatedText"] == tested["translationTest"])]
print("verbatim matches:", len(tested) - len(non_matched), f"({(len(tested)-len(non_matched)) / len(tested)})")

verbatim matches: 454 (0.227)


In [17]:
non_matched.to_csv("../../data/translation/df_non_matched.csv", index=False)

In [4]:
del df

## Testing whether Gemini accurately detected English language

In [None]:
# using 
baseline_en = tested["text"].map(langdetect.detect) == "en"
gemini_en = tested["translationTestSource"] == "original_gm"
parllaw_en = tested["translationSource"] == "original_pl"

In [32]:
percent_language_non_matched = sum(~(gemini_en == parllaw_en)) / len(tested)
parllaw_correct = sum(baseline_en == parllaw_en) / len(tested)
gemini_correct = sum(baseline_en == gemini_en) / len(tested)

print("% disagreement parllaw and gemini", percent_language_non_matched)
print("% correct parllaw", parllaw_correct)
print("% correct gemini", gemini_correct)

% disagreement parllaw and gemini 0.1955
% correct parllaw 0.992
% correct gemini 0.8035


Hmm, thats not great... I looked at some examples where Gemini was wrong and it seems like it handled the cases where the original language was still included in a tag e.g. "(DE) this is the speech"

In [34]:
# false positives: 
print("Gemini false positives (eng when was not actually eng)", sum((gemini_en == True) & (baseline_en == False)) / len(tested))
print("Gemini false negatives (not eng when was not actually eng)", sum((gemini_en == False) & (baseline_en == True)) / len(tested))

Gemini false positives (eng when was not actually eng) 0.0
Gemini false negatives (not eng when was not actually eng) 0.1965


Okay thats nice, so it only was too generous by re-translating text that was already in English. Confirming this by testing whether its mistakes were made primarily when the text was of a pattern "(XX) ...", indicating that the language tag XX was before the acutal speech

In [69]:
import re

pattern = re.compile(r"^\(.{2}\)\s")
contains_langflag = (tested["text"].map(lambda s: bool(pattern.match(s))))

print("N of those who did not start with (XX):")
print("Gemini english detection accuracy with language flag present:", sum(baseline_en[contains_langflag] == gemini_en[contains_langflag]) / len(baseline_en[contains_langflag]))
print("Gemini english detection accuracy without language flag present:", sum(baseline_en[~contains_langflag] == gemini_en[~contains_langflag]) / len(baseline_en[~contains_langflag]))

N of those who did not start with (XX):
Gemini english detection accuracy with language flag present: 0.3723916532905297
Gemini english detection accuracy without language flag present: 0.9985475671750181


phew... so Gemini itself is reliable, we should have just removed the language tags before translating...

## Comparing translation embeddings to make sure they are comparable to the original and parllaw translation

In [None]:
# run on colab: 
# import os 
# model_ids = [
#     "jinaai/jina-embeddings-v4",
#   ]


# for model_id in model_ids:
#   model = AutoModel.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto").to(device)
#   modelname = model_id.split("/")[-1]
#   os.makedirs(modelname, exist_ok=True)
#   for column in ["text", "translatedText", "translationTest"]:
#     embedd_speech(model, non_matched, column, f"{modelname}/embedding_{column}.npy")


In [74]:
for rooth, dirs, files in os.walk("../../data/embeddings/translation_test"): 
    for modelname in dirs: 
        embedding_og = np.load(open(f"../../data/embeddings/translation_test/{modelname}/embedding_text.npy", "rb"))
        embedding_translated_pl = np.load(open(f"../../data/embeddings/translation_test/{modelname}/embedding_translatedText.npy", "rb"))
        embedding_translated_gm = np.load(open(f"../../data/embeddings/translation_test/{modelname}/embedding_translationTest.npy", "rb"))

        indices_english = non_matched["translationSource"] == "original_pl"

        def sim(m1, m2): 
            return np.diag(m1 @ m2.T)
        
        # first compute embedding similarity of texts that were in english already (but not matched word to word)
        # - e.g. Gemini did not respond with "the text is in English" but repeated it verbatim and removed some comments: 
        similarity_eng_og_pl = sim(embedding_og[indices_english], embedding_translated_pl[indices_english])
        similarity_eng_og_gm = sim(embedding_og[indices_english], embedding_translated_gm[indices_english])
        similarity_eng_gm_pl = sim(embedding_translated_gm[indices_english], embedding_translated_pl[indices_english])
        print("English texts: <= texts that where in English according to parllaw, but gemini's response was not the verbatim speech")
        print("og = pl", similarity_eng_og_pl.mean())
        print("og = gm", similarity_eng_og_gm.mean())
        print("gm = pl", similarity_eng_gm_pl.mean())

        # now similarity of non-english texts:
        similarity_non_eng_og_pl = sim(embedding_og[~indices_english], embedding_translated_pl[~indices_english])
        similarity_non_eng_og_gm = sim(embedding_og[~indices_english], embedding_translated_gm[~indices_english])
        similarity_non_eng_gm_pl = sim(embedding_translated_gm[~indices_english], embedding_translated_pl[~indices_english])
        print("\nNon-english texts: <= texts that where not in English according to parllaw")
        print("og = pl", similarity_non_eng_og_pl.mean())
        print("og = gm", similarity_non_eng_og_gm.mean())
        print("gm = pl", similarity_non_eng_gm_pl.mean())

        sort_indices = np.argsort(similarity_non_eng_gm_pl)

        n_examples = 5
        worst_sim_examples = non_matched.iloc[sort_indices[:n_examples]]
        worst_sims = similarity_non_eng_gm_pl[sort_indices[:n_examples]]

        for (i, ex), sim in zip(worst_sim_examples.iterrows(), worst_sims):
            print("\nSimilarity (gemini & parllaw translation):", sim)
            print("original:", ex["text"])
            print("parllaw:", ex["translatedText"])
            print("gemini:", ex["translationTest"])
        # TODO: look at examples with least similarity


English texts: <= texts that where in English according to parllaw, but gemini's response was not the verbatim speech
og = pl 0.999686
og = gm 0.99542874
gm = pl 0.9955103

Non-english texts: <= texts that where not in English according to parllaw
og = pl 0.9461046
og = gm 0.9433505
gm = pl 0.9885919

Similarity (gemini & parllaw translation): 0.9226074
original: Ich habe der Agentur die Entlastung verweigert, da sie in zahlreichen Bereichen ihre Kompetenzen überschreitet; negativ fiel mir dabei insbesondere die Einflussnahme bei Interessenkonflikten auf.
parllaw: I refused to grant the agency relief because it exceeded its authority in numerous areas; What struck me particularly negatively was the influence exerted in conflicts of interest.
gemini: I have denied discharge to the agency, as it exceeds its competences in numerous areas; I was particularly critical of its interference in conflicts of interest.

Similarity (gemini & parllaw translation): 0.9240891
original: Hemos votado a

=> gemini translations are very comparable to parllaw's machine translations (avg. similarity for jina-v4 embeddings roughly 0.99)