## This notebook scores three candidate Portuguese translations of each English segment in the test dataset with a modified GEMBA prompt, and keeps the highest-scoring one.

### Install Libraries

In [None]:
%pip install pandas
%pip install langchain
%pip install langchain-openai

### Imports

In [None]:
import os
import pandas as pd
from tqdm import tqdm

# Langchain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

### Globals

In [None]:
# Identifier of the OpenAI chat model this notebook targets.
LLM_MODEL_NAME = "gpt-3.5-turbo"

### Utility function to set up the OpenAI API

In [None]:
def load_openai_key():
    """Read the OpenAI API key from ``openai-key.txt`` in the working directory.

    Returns:
        The file's contents with leading/trailing whitespace removed, or
        ``None`` if the file is missing or cannot be read (a message is
        printed in that case).
    """
    try:
        with open("openai-key.txt", 'r', encoding="utf-8") as file:
            # Key files normally end with a newline; leaving it in would make
            # the key invalid when it is sent to the API.
            contents = file.read().strip()
    except FileNotFoundError:
        print("The file does not exist.")
        return None
    except Exception as e:
        # Handle other potential exceptions (e.g., permission errors)
        print(f"An error occurred while reading the file: {str(e)}")
        return None

    return contents

# Expose the key through the OPENAI_API_KEY environment variable before the
# chat client is created.
openai_key = load_openai_key()
if not openai_key:
    # load_openai_key() returns None on failure; assigning None to os.environ
    # would raise an unhelpful TypeError, so fail with a clear message instead.
    raise RuntimeError("OpenAI API key could not be loaded from 'openai-key.txt'.")
os.environ["OPENAI_API_KEY"] = openai_key

# Use the model name declared in the Globals cell instead of repeating the literal.
chat = ChatOpenAI(model=LLM_MODEL_NAME)

### Prompt definition

In [None]:
# System message: defines the evaluator role and the 0-100 scoring scale.
# Written line by line so the exact whitespace sent to the model is visible.
system = (
    "You are a helpful evaluator of the quality of translations.\n"
    "Score the following translations from English to Portuguese on a continuous scale from 0 to 100, \n"
    'where a score of zero means "no meaning preserved" and score of one hundred means\n'
    '"perfect meaning and grammar".\n'
)

# Human message: one English source segment, three candidate translations,
# and the reply layout the model is asked to fill in.
human = (
    "English source: {source_seg}\n"
    'Portuguese translation 1: "{target_seg1}" \n'
    'Portuguese translation 2: "{target_seg2}"\n'
    'Portuguese translation 3: "{target_seg3}"\n'
    "Reply with only the scores of your evaluation:\n"
    "Score 1:\n"
    "Score 2:\n"
    "Score 3:\n"
)

# Two-message chat template; placeholders are filled when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

### Initial test

In [None]:
# Smoke test: score three hand-written candidate translations of one sentence.
sample_segments = {
    "source_seg": "There are more things between heaven and earth",
    "target_seg1": "Existem mais coisas entre o céu e a terra",
    "target_seg2": "Existem coisas entre o céu e a terra",
    "target_seg3": "Existem coisas entre o paraíso e a terra",
}

# Pipe the prompt template into the chat model and run it once.
chain = prompt | chat
response = chain.invoke(input=sample_segments)

In [None]:
# Show the raw text of the model's reply to the sample request above.
print(response.content)

### Translation Datasets for comparisons (TEST)

In [None]:
# Load the source-language test split and discard the 'Unnamed: 0' column
# (presumably an index written by an earlier CSV export).
df_tst = pd.read_csv('data/tst.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Preview the loaded test split.
df_tst

In [None]:
# Columns holding translated text; each gets a per-system suffix (_1, _2, _3).
segment_columns = ['context', 'question', 'answerA', 'answerB', 'answerC']
# Unnamed columns present in every translated file that are not needed here.
leftover_columns = ['Unnamed: 0.1', 'Unnamed: 0']

# System 1 (marian_mt) keeps its 'correct' column so that exactly one copy
# of it is present after the three frames are placed side by side.
df_tst1_pt = (
    pd.read_csv('translated/marian_mt/tst_pt.csv')
    .drop(columns=leftover_columns)
    .rename(columns={col: f'{col}_1' for col in segment_columns})
)

# System 2 (unicamp_t5)
df_tst2_pt = (
    pd.read_csv('translated/unicamp_t5/tst_pt.csv')
    .drop(columns=leftover_columns + ['correct'])
    .rename(columns={col: f'{col}_2' for col in segment_columns})
)

# System 3 (nllb_1.3b)
df_tst3_pt = (
    pd.read_csv('translated/nllb_1.3b/tst_pt.csv')
    .drop(columns=leftover_columns + ['correct'])
    .rename(columns={col: f'{col}_3' for col in segment_columns})
)

# Put the three systems side by side, then order the columns so that the
# three candidates for each segment type sit next to each other.
merged_candidates = pd.concat([df_tst1_pt, df_tst2_pt, df_tst3_pt], axis=1)
ordered_columns = [f'{col}_{n}' for col in segment_columns for n in (1, 2, 3)]
df_tst_pt = merged_candidates[ordered_columns + ['correct']]

In [None]:
# Preview the merged candidate translations.
df_tst_pt

In [None]:

def _parse_scores(reply, expected=3):
    """Extract exactly ``expected`` numeric scores from the model's reply text.

    Each non-empty line is expected to look like ``Score 1: 95``. The number
    is read from the text after the first colon (or from the whole line when
    there is no colon), so blank lines, decimals, a missing space after the
    colon and forms such as ``95/100`` are tolerated.

    Raises:
        ValueError: if the reply does not contain exactly ``expected`` scores.
    """
    import re  # local import: the notebook's import cell does not provide re

    scores = []
    for line in reply.strip().splitlines():
        if not line.strip():
            continue
        _label, colon, tail = line.partition(':')
        value_part = tail if colon else line
        match = re.search(r'-?\d+(?:\.\d+)?', value_part)
        if match:
            scores.append(float(match.group()))

    if len(scores) != expected:
        raise ValueError(
            f"Expected {expected} scores but parsed {len(scores)} from reply: {reply!r}"
        )
    return scores


def get_best_translations(df_src_lang, df_tgt_lang):
    """Pick, for every segment, the candidate translation the LLM scores highest.

    For each row and each of the columns ``context``, ``question``,
    ``answerA``, ``answerB`` and ``answerC``, the source text and the three
    candidates (``<column>_1`` .. ``<column>_3`` in ``df_tgt_lang``) are sent
    through the module-level ``prompt | chat`` chain, and the candidate with
    the highest score is kept. On a tie the lowest-numbered candidate wins.

    Args:
        df_src_lang: DataFrame with the source-language text columns.
        df_tgt_lang: DataFrame with the suffixed candidate columns; it must
            use the same index labels as ``df_src_lang``.

    Returns:
        A copy of ``df_src_lang`` whose five text columns hold the selected
        translations; all other columns are left unchanged.

    Raises:
        ValueError: if a model reply cannot be parsed into three scores.
    """
    sentence_list = ['context', 'question', 'answerA', 'answerB', 'answerC']

    df_output = df_src_lang.copy()
    df_output.loc[:, sentence_list] = None

    # The chain does not depend on the row, so build it once.
    chain = prompt | chat

    for i in tqdm(df_src_lang.index):

        for sentence in sentence_list:

            # Candidate translations
            candidates = [f"{sentence}_{n}" for n in (1, 2, 3)]

            response = chain.invoke(input={"source_seg": df_src_lang.loc[i, sentence],
                            "target_seg1": df_tgt_lang.loc[i, candidates[0]],
                            "target_seg2": df_tgt_lang.loc[i, candidates[1]],
                            "target_seg3": df_tgt_lang.loc[i, candidates[2]]})
            results = _parse_scores(response.content, expected=len(candidates))

            best = results.index(max(results))
            # Write by label, not by position, so the result does not depend
            # on where the text columns sit in df_src_lang.
            df_output.at[i, sentence] = df_tgt_lang.loc[i, candidates[best]]

    return df_output

In [None]:
# Test set: select the best-scoring candidate translation for every segment.
df_tst_final = get_best_translations(df_tst, df_tst_pt)

In [None]:
# Preview the selected translations.
df_tst_final

In [None]:
# Create the output folder first so the export does not fail when it is missing.
os.makedirs('translated/final', exist_ok=True)
# The index is still written, matching how the other CSV files in this notebook are laid out.
df_tst_final.to_csv('translated/final/tst.csv')