# Imports

In [9]:
import os
import re
import random
import pandas as pd
from openai import OpenAI
from itertools import permutations
from dotenv import load_dotenv
from ast import literal_eval

In [88]:
# Pull variables from a local .env file (if any) into the process environment,
# then read the key that the OpenAI client below is constructed with.
load_dotenv()

api_key = os.getenv("API_KEY")

# Report whether the key was found; an empty string counts as missing.
print(
    "API_KEY found successfully."
    if api_key
    else "Error: API_KEY not found. Please set API_KEY environment variable."
)

API_KEY found successfully.


In [89]:
# Shared OpenAI client, authenticated with the key read from the API_KEY
# environment variable; query_openai_triplet() sends its requests through it.
client = OpenAI(api_key=api_key)

# Utility Functions

In [90]:
def get_triplet(row, alt_ids):
    """Pick three alternative translations from ``row`` in the order given.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by
            ``"ground_truth"`` and by ``"alt<i>"`` column names.
        alt_ids: Sequence of at least three alternative ids; each id ``i``
            is turned into the column name ``"alt<i>"``.

    Returns:
        dict with the ground truth, the three selected alternatives under
        ``"altA"``/``"altB"``/``"altC"`` (first, second and third id
        respectively), and ``"alt_order"``: the list of column names built
        from ``alt_ids``.
    """
    alt_keys = []
    for alt_id in alt_ids:
        alt_keys.append(f"alt{alt_id}")

    triplet = {"ground_truth": row["ground_truth"]}
    # Slot labels A/B/C map onto positions 0/1/2 of the requested order.
    for label, position in (("altA", 0), ("altB", 1), ("altC", 2)):
        triplet[label] = row[alt_keys[position]]
    triplet["alt_order"] = alt_keys
    return triplet

In [91]:
def make_triplet_ranking_prompt(gt, a, b, c):
    """Build the user prompt asking for a ranking of three translations.

    Args:
        gt: Ground-truth sentence, shown in double quotes on the first line.
        a: Alternative presented under label ``A``.
        b: Alternative presented under label ``B``.
        c: Alternative presented under label ``C``.

    Returns:
        str: The full prompt text, ending with a trailing newline.
    """
    prompt_lines = [
        f'Ground truth: "{gt}"',
        "",
        "Below are three alternative translations of the same sentence. Please rank them from most similar to least similar in meaning compared to the ground truth.",
        "",
        f'A: "{a}"',
        f'B: "{b}"',
        f'C: "{c}"',
        "",
        "Your response should give a ranking in this format: A > B > C (or any other order). Please do not include any other text in your response.",
        "",  # yields the final newline once the lines are joined
    ]
    return "\n".join(prompt_lines)

In [92]:
def query_openai_triplet(prompt, model="gpt-4o-mini", temperature=0.3, max_tokens=100):
    """Send a ranking prompt to the chat completions API and return the reply text.

    Args:
        prompt: User message content (see make_triplet_ranking_prompt).
        model: Chat model name. Defaults to the model used for the saved results.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The first choice's message content with surrounding whitespace
        removed, or ``""`` when the API returns no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a linguistic evaluator. Your task is to rank alternative translations by their similarity to a ground truth sentence."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    content = response.choices[0].message.content
    # The message content may be None (it is nullable in the API schema);
    # return an empty string instead of raising AttributeError on .strip(),
    # so one empty reply does not abort a long batch of queries.
    return (content or "").strip()

# Ranking Test

In [10]:
# Load the header-less translations file and keep only its first 12 columns:
# the gloss, the ground-truth sentence, and ten alternatives (alt1..alt10).
df = pd.read_csv("translation_versions.csv", header=None).iloc[:, :12]
df.columns = ["gloss", "ground_truth", *(f"alt{i}" for i in range(1, 11))]

In [None]:
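# NOTE: This cell is intentionally left commented out. Running it re-queries the
# OpenAI API (5 runs x 10 sentences x 6 permutations = 300 calls) and overwrites
# triplet_ranking_results_openai.csv, which the analysis below reads.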

# triplet_permutations = list(permutations([1, 2, 3]))
# results = []

# for i in range(5):
#     for row_idx in range(10):
#         row = df.iloc[row_idx]
#         for perm_id, alt_ids in enumerate(triplet_permutations):
#             triplet = get_triplet(row, alt_ids)
#             prompt = make_triplet_ranking_prompt(
#                 triplet["ground_truth"],
#                 triplet["altA"],
#                 triplet["altB"],
#                 triplet["altC"]
#             )
#             output = query_openai_triplet(prompt)
#             results.append({
#                 "run_id": i,
#                 "row_index": row_idx,
#                 "perm_id": perm_id,
#                 "alt_order": triplet["alt_order"],
#                 "response": output
#             })
            
# pd.DataFrame(results).to_csv(f"triplet_ranking_results_openai.csv", index=False)

# Analysis

In [6]:
def parse_ranking_response(response, alt_order):
    """Translate a model reply such as ``"B > A > C"`` into alternative names.

    The first ``X > Y > Z`` pattern (letters A/B/C, case-insensitive, any
    whitespace around ``>``) found in ``response`` is used.

    Args:
        response: Raw model reply; converted with ``str()`` so non-string
            values (e.g. NaN from a CSV) are tolerated.
        alt_order: Sequence of three alternative names in the order they were
            presented, i.e. ``alt_order[0]`` was shown as A, ``[1]`` as B,
            ``[2]`` as C.

    Returns:
        str | None: The ranking expressed with lower-cased names from
        ``alt_order`` joined by ``" > "``, or ``None`` when no ranking
        pattern is found or the pattern repeats a letter (not a valid
        ranking of three distinct items).
    """
    match = re.search(
        r"([ABC])\s*>\s*([ABC])\s*>\s*([ABC])", str(response).upper()
    )
    if not match:
        return None
    # Capture groups give exactly one letter each, regardless of whether the
    # separators contained spaces, tabs or newlines.
    letters = match.groups()
    if len(set(letters)) != len(letters):
        return None
    return " > ".join(alt_order[ord(letter) - ord("A")].lower() for letter in letters)

In [4]:
def report_ranking_variation_across_runs(df):
    """Print, per sentence and permutation, the distinct rankings seen over runs.

    Args:
        df: Results frame with columns ``run_id``, ``row_index``, ``perm_id``,
            ``response`` and ``alt_order`` (the latter holding the string
            representation of a list, parsed with ``literal_eval``).

    Responses that parse_ranking_response cannot parse are ignored. Nothing is
    returned; the report is written to stdout.
    """
    run_count = df["run_id"].nunique()
    print(f"Across {run_count} runs:\n")

    for sentence_id, sentence_rows in df.groupby("row_index"):
        print(f"For sentence {sentence_id}:\n")
        for permutation, permutation_rows in sentence_rows.groupby("perm_id"):
            # Collect each successfully parsed ranking once.
            seen = set()
            for reply, order_repr in zip(
                permutation_rows["response"], permutation_rows["alt_order"]
            ):
                ranking = parse_ranking_response(reply, literal_eval(order_repr))
                if ranking:
                    seen.add(ranking)
            distinct = sorted(seen)

            print(f"Permutation {permutation}:")
            print(f"{len(distinct)} different rankings:")
            for position, ranking in enumerate(distinct, start=1):
                print(f"{position}. {ranking}")
            print()

In [13]:
def report_ranking_variation_across_permutations(df):
    """Print, per run and sentence, the distinct rankings seen over permutations.

    Args:
        df: Results frame with columns ``run_id``, ``row_index``, ``perm_id``,
            ``response`` and ``alt_order`` (the latter holding the string
            representation of a list, parsed with ``literal_eval``).

    Responses that parse_ranking_response cannot parse are ignored. Nothing is
    returned; the report is written to stdout.
    """
    permutation_count = df["perm_id"].nunique()
    print(f"Across {permutation_count} permutations:\n")

    for run, run_rows in df.groupby("run_id"):
        print(f"For run {run}:\n")
        for sentence_id, sentence_rows in run_rows.groupby("row_index"):
            parsed = [
                parse_ranking_response(reply, literal_eval(order_repr))
                for reply, order_repr in zip(
                    sentence_rows["response"], sentence_rows["alt_order"]
                )
            ]
            # Drop unparseable replies, de-duplicate, and order for stable output.
            distinct = sorted({ranking for ranking in parsed if ranking})

            print(f"Sentence {sentence_id}:")
            print(f"{len(distinct)} different rankings:")
            for position, ranking in enumerate(distinct, start=1):
                print(f"{position}. {ranking}")
            print()

In [99]:
# Load the saved model responses (the file written by the commented-out
# generation cell above) rather than re-querying the API.
results_df = pd.read_csv("triplet_ranking_results_openai.csv")
# First report: for a fixed sentence and presentation order, how rankings vary between runs.
report_ranking_variation_across_runs(results_df)
# Second report: for a fixed run and sentence, how rankings vary between presentation orders.
report_ranking_variation_across_permutations(results_df)

Across 5 runs:

For sentence 0:

Permutation 0:
1 different rankings:
1. alt3 > alt2 > alt1

Permutation 1:
1 different rankings:
1. alt2 > alt3 > alt1

Permutation 2:
1 different rankings:
1. alt1 > alt3 > alt2

Permutation 3:
1 different rankings:
1. alt1 > alt3 > alt2

Permutation 4:
1 different rankings:
1. alt1 > alt2 > alt3

Permutation 5:
1 different rankings:
1. alt1 > alt2 > alt3

For sentence 1:

Permutation 0:
1 different rankings:
1. alt2 > alt3 > alt1

Permutation 1:
1 different rankings:
1. alt2 > alt3 > alt1

Permutation 2:
2 different rankings:
1. alt3 > alt1 > alt2
2. alt3 > alt2 > alt1

Permutation 3:
2 different rankings:
1. alt1 > alt2 > alt3
2. alt1 > alt3 > alt2

Permutation 4:
2 different rankings:
1. alt2 > alt1 > alt3
2. alt2 > alt3 > alt1

Permutation 5:
1 different rankings:
1. alt1 > alt2 > alt3

For sentence 2:

Permutation 0:
1 different rankings:
1. alt3 > alt2 > alt1

Permutation 1:
1 different rankings:
1. alt3 > alt2 > alt1

Permutation 2:
1 different 