## Setting up the Chat Completion API

In [None]:
def chat_completion_request(messages, tools=None, tool_choice=None, model="gpt-4o"):
    """Send a chat-completion request through the module-level ``client``.

    Args:
        messages: List of chat message dicts forming the conversation.
        tools: Optional list of tool (function) schemas offered to the model.
        tool_choice: Optional tool-selection setting, forwarded unchanged.
        model: Name of the chat model to query. A parameter without a
            default may not follow defaulted parameters (SyntaxError), so
            it defaults to the model the callers in this file request.

    Returns:
        The response object returned by ``client.chat.completions.create``.
        If the request raises, the exception is printed and *returned*
        (not re-raised); callers must check for an ``Exception`` instance
        before reading ``.choices``.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
            tool_choice=tool_choice,
            temperature=0.2,
        )
        return response
    except Exception as e:
        # Best-effort behaviour: report the failure and hand the exception
        # object back to the caller instead of propagating it.
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        return e

## Function description of get_wiki
This tool description is required so that the GPT model can identify the mathematical terms in a sentence and return them as an array.

In [None]:
# JSON-schema description of the single `terms` argument of get_wiki:
# an array of strings.
_GET_WIKI_TERMS_SCHEMA = {
    "type": "array",
    "items": {"type": "string"},
    "description": "List of mathematical terminology to check their context on Wikipedia before translating them to Swedish.",
}

# Function definition (name, description, parameter schema) for get_wiki.
_GET_WIKI_FUNCTION = {
    "name": "get_wiki",
    "description": "Retrieve contextual information of given mathematical terminology from English and Swedish Wikipedia.",
    "parameters": {
        "type": "object",
        "properties": {"terms": _GET_WIKI_TERMS_SCHEMA},
        "required": ["terms"],
    },
}

# Tool list handed to the chat-completion request; it contains only get_wiki.
tools_RAG = [{"type": "function", "function": _GET_WIKI_FUNCTION}]

## Helper functions for get_wiki

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag string into the matching ``wordnet`` POS constant.

    The tag is classified by its leading upper-case letter: ``J`` maps to
    adjective, ``V`` to verb, ``N`` to noun and ``R`` to adverb. Any other
    tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return pos
    # Unknown tags are treated as nouns.
    return wordnet.NOUN

def save_embedding_to_csv(word, embedding):
    """Add a ``(word, embedding)`` row to ``saved_embeddings.csv``.

    The current file is read in full, the new row is appended at the end
    and the whole table is written back without the index column. When the
    file does not exist yet, it is created with the columns ``word`` and
    ``embedding``.
    """
    cache_path = 'saved_embeddings.csv'
    try:
        cached = pd.read_csv(cache_path)
    except FileNotFoundError:
        # First write: start from an empty table with the expected columns.
        cached = pd.DataFrame(columns=['word', 'embedding'])

    row = pd.DataFrame({'word': [word], 'embedding': [embedding]})
    pd.concat([cached, row], ignore_index=True).to_csv(cache_path, index=False)

def load_embedding_from_csv(word):
    """Look up a cached embedding for ``word`` in ``saved_embeddings.csv``.

    Args:
        word: Value to match exactly against the ``word`` column.

    Returns:
        The embedding of the first matching row as a ``numpy`` array, or
        ``None`` when the file does not exist or the word is not cached.
    """
    # Local import keeps this block self-contained.
    import ast

    try:
        df = pd.read_csv('saved_embeddings.csv')
    except FileNotFoundError:
        return None

    matches = df.loc[df['word'] == word, 'embedding']
    if matches.empty:
        return None

    # Parse only the row that is returned instead of every row in the file.
    # ast.literal_eval replaces eval(): the cache stores embeddings as plain
    # list literals, so no arbitrary code from the CSV is ever executed.
    return np.array(ast.literal_eval(matches.iloc[0]))

def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the module-level ``client``.

    Newlines in ``text`` are replaced by spaces before the request. The
    embedding of the first (only) input item is returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

## get_wiki

In [None]:
def get_wiki(words):
    """Retrieve Swedish and English Wikipedia section content for each term.

    Every term is lower-cased and lemmatized, then looked up in
    ``wikipedia_content.csv`` in this order:

    1. case-insensitive exact match on ``English title``;
    2. case-insensitive exact match on ``English section title``;
    3. the row whose ``English embeddings`` vector has the highest cosine
       similarity to the term's embedding, accepted only when that
       similarity is at least 0.39.

    Args:
        words: A single term (``str``) or an iterable of terms.

    Returns:
        A dict keyed by the lemmatized term. Each value is a dict with the
        keys ``'sv_content'`` and ``'en_content'``, or ``None`` when no row
        matched well enough.
    """
    def cosine(content_emb, word_emb):
        return np.dot(content_emb, word_emb) / (norm(word_emb) * norm(content_emb))

    def as_result(row):
        # Shared shape of a successful lookup, whichever strategy found the row.
        return {
            'sv_content': row['Swedish section content'],
            'en_content': row['English section content'],
        }

    df = pd.read_csv('wikipedia_content.csv')
    # NOTE(review): eval() executes whatever expression is stored in the CSV.
    # It is kept because the on-disk format of this column is not visible
    # here; if the column only holds plain list literals, switch to
    # ast.literal_eval or json.loads.
    df['English embeddings'] = df['English embeddings'].apply(eval).apply(np.array)

    # Loop-invariant: lower-case the title columns once, not once per term.
    titles = df['English title'].str.lower()
    section_titles = df['English section title'].str.lower()

    results = {}
    lemmatizer = WordNetLemmatizer()
    threshold = 0.39
    if isinstance(words, str):
        words = [words]

    for word in words:
        # NOTE(review): get_wordnet_pos is given the lower-cased word rather
        # than a POS tag. It tests for upper-case prefixes, so a lower-cased
        # word never matches and the noun default is always used.
        word = lemmatizer.lemmatize(word.lower(), get_wordnet_pos(word.lower()))
        key = word.lower()

        title_match = df[titles == key]
        if not title_match.empty:
            results[word] = as_result(title_match.iloc[0])
            continue

        section_match = df[section_titles == key]
        if not section_match.empty:
            results[word] = as_result(section_match.iloc[0])
            continue

        # No exact match: fall back to embedding similarity, reusing a
        # cached term embedding when one exists.
        word_emb = load_embedding_from_csv(word)
        if word_emb is None:
            word_emb = get_embedding(word)
            save_embedding_to_csv(word, word_emb)

        # Keep the similarities in a separate Series and pick the maximum
        # directly. Sorting df in place here would reorder the rows and make
        # the exact-match lookups of later terms depend on earlier ones.
        similarities = df['English embeddings'].apply(lambda emb: cosine(emb, word_emb))
        best = similarities.max()

        # An empty table or all-NaN similarities give NaN, i.e. no match.
        if pd.notna(best) and best >= threshold:
            results[word] = as_result(df.loc[similarities.idxmax()])
        else:
            results[word] = None

    return results

## RAG

In [4]:
def construct_final_translation(terms):
    """Render retrieved term context as one text block for the prompt.

    Each ``term -> content`` pair becomes ``"term: <sv>: <en>"`` when
    content is available, or ``"term: 'None'."`` otherwise, followed by two
    newlines. A closing instruction sentence is appended and all pieces are
    joined with a single space.
    """
    def describe(term, content):
        if not content:
            return f"{term}: 'None'."
        return f"{term}: {content['sv_content']}: {content['en_content']}"

    segments = [describe(term, content) + "\n\n" for term, content in terms.items()]
    segments.append("Don't wait for context for terminology that have value None.")
    return " ".join(segments)

def _collect_requested_terms(tool_calls):
    """Gather the ``terms`` argument of every tool call into one flat list."""
    terms = []
    for call in tool_calls:
        try:
            arguments = json.loads(call.function.arguments)
        except (TypeError, ValueError):
            # Malformed argument JSON: skip this call instead of crashing.
            continue
        requested = arguments.get("terms") if isinstance(arguments, dict) else None
        if isinstance(requested, str):
            requested = [requested]
        if isinstance(requested, (list, tuple)):
            terms.extend(requested)
    return terms


def RAG(sentence):
    """Translate an English sentence into Swedish with Wikipedia-backed terms.

    The model is first asked to translate while being offered the
    ``get_wiki`` tool. If it requests terms, their Wikipedia context is
    retrieved with ``get_wiki`` and added to the conversation, and a second
    request (with tool use disabled) produces the final translation.

    Args:
        sentence: The English sentence to translate.

    Returns:
        The content of the final assistant message, or the string
        ``"ERROR"`` when a request fails or a tool call carries no usable
        terms.
    """
    messages = [
        {"role": "system",
         "content": "Your sole task is to translate a given English sentence into Swedish. Ensure the use of correct mathematical terminology and make sure the sentence sounds natural to a Swedish student. Careful attention should be paid to mathematical terminology, avoiding assumptions about translations. Instead, list all mathematical terms requiring verification, and corresponding Wikipedia content in both Swedish and English will be provided to ensure accuracy. If the context for terminology is unavailable or unclear, translation should proceed accurately by verifying common practices in Sweden."},
        {"role": "user",
         "content": f"Translate the following content to Swedish. Keep any LaTeX expressions and numbers as they are. Output only the translated sentence: {sentence}"},
    ]

    chat_response = chat_completion_request(messages, tools=tools_RAG, tool_choice="auto", model='gpt-4o')
    # chat_completion_request returns the caught exception instead of raising
    # it, so a failed request has no `.choices` to read.
    if isinstance(chat_response, Exception):
        return "ERROR"

    assistant_message = chat_response.choices[0].message
    if assistant_message.tool_calls:
        # This is where GPT identified the mathematical terms.
        terms = _collect_requested_terms(assistant_message.tool_calls)
        if not terms:
            return "ERROR"

        # This is where the retrieval happens.
        wikipedia_content = get_wiki(terms)
        if wikipedia_content:
            final_translation_content = construct_final_translation(wikipedia_content)
            final_translation = f"If possible and relevant, refine the translation with the help of the provided content. Output only the translated sentence: {final_translation_content}."
            # NOTE(review): the retrieved context is sent with the
            # "assistant" role and the tool-call message itself is not
            # replayed. The documented function-calling flow uses a "tool"
            # message per tool_call_id; kept as-is because changing the
            # role would change model output.
            messages.append({"role": "assistant", "content": final_translation})

    chat_response = chat_completion_request(messages, tools=tools_RAG, tool_choice="none", model='gpt-4o')
    if isinstance(chat_response, Exception):
        return "ERROR"
    return chat_response.choices[0].message.content