In [None]:
import os
from pymongo import MongoClient
import openai
import tiktoken
from dotenv import load_dotenv

# The OpenAI credentials are kept in a dedicated .env file instead of in the notebook.
env_path = '/home/msd4/aidocsMosi/openAI_Token.env'
load_dotenv(dotenv_path=env_path)

# Read the key back from the process environment and fail fast when it is
# missing or empty, so no unauthenticated API call is ever attempted.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("No API key found. Please set your OPENAI_API_KEY in the .env file.")

In [None]:
# Client for the MongoDB server on localhost, and the MIMIC-IV database on it.
client = MongoClient("mongodb://localhost:27017/")
db = client["MIMIC-IV"]

# Translated documents are written to this collection.
target_collection = db["NLP-EXPANDED-prammed-postprocessed_translation"]

# Documents to translate are read from this collection.
# Alternative source (disabled): db['NLP-EXP-pram-sampled']
source_collection = db["NLP-EXPANDED-prammed-2"]

In [None]:
# OpenAI API setup
openai.api_key = OPENAI_API_KEY
MODEL = "gpt-4o"

# Token budget for one request: whatever is not reserved for the model's
# answer is the maximum size of a single input chunk.
MAX_TOKENS = 4096
DESIRED_OUTPUT_TOKENS = 2000
MAX_INPUT_TOKENS = MAX_TOKENS - DESIRED_OUTPUT_TOKENS

# Pick the encoding that belongs to MODEL so token counts reflect how that
# model tokenizes text (gpt-4o uses o200k_base, not cl100k_base). tiktoken
# raises KeyError for model names it does not know (e.g. an older tiktoken
# release); in that case fall back to the previously used encoding.
try:
    tokenizer = tiktoken.encoding_for_model(MODEL)
except KeyError:
    tokenizer = tiktoken.get_encoding("cl100k_base")

In [None]:
def count_tokens(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def split_text(text, max_chunk_size, token_counter=None):
    """Split ``text`` into chunks whose summed per-word token counts fit a budget.

    The text is split on whitespace and words are greedily packed into chunks;
    each chunk is the packed words re-joined with single spaces (so original
    line breaks and repeated spaces are not preserved). Sizes are the sum of
    the token counts of the individual words, not the token count of the
    joined chunk.

    Args:
        text: The text to split.
        max_chunk_size: Maximum summed token count per chunk.
        token_counter: Optional callable mapping a word to its token count.
            Defaults to the module-level ``count_tokens``.

    Returns:
        A list of non-empty chunk strings, in original word order. A single
        word that alone exceeds ``max_chunk_size`` is emitted as its own
        (oversized) chunk. Empty or whitespace-only input yields ``[]``.
    """
    if token_counter is None:
        token_counter = count_tokens

    chunks = []
    current_words = []
    current_size = 0

    for word in text.split():
        word_size = token_counter(word)
        # Flush only a non-empty chunk: when the very first word is already
        # over budget there is nothing to flush, and emitting "" here would
        # send an empty chunk to the translator.
        if current_words and current_size + word_size > max_chunk_size:
            chunks.append(" ".join(current_words))
            current_words = []
            current_size = 0
        current_words.append(word)
        current_size += word_size

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

def translate_chunk(chunk, source_language="German", target_language="English"):
    """Translate one text chunk with the OpenAI chat completions API.

    Args:
        chunk: The text to translate.
        source_language: Language name inserted into the prompt as the source.
        target_language: Language name inserted into the prompt as the target.

    Returns:
        The translated text as a string. An empty string is returned when the
        API response carries no text content, so callers can always join the
        results.

    Warns:
        RuntimeWarning: when the response stopped because it reached
            ``max_tokens`` (``finish_reason == "length"``), i.e. the returned
            translation is incomplete.
    """
    import warnings  # local import; not part of the notebook's import cell

    # prompt_template = f"Translate the following {source_language} text to {target_language}:\n\n{chunk}"
    prompt_template = f"Translate the following {source_language} text to {target_language}, and keep the numerical values unchanged:\n\n{chunk}"

    completion = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant for translating medical dialogue text!"},
            {"role": "user", "content": prompt_template}
        ],
        max_tokens=DESIRED_OUTPUT_TOKENS,
        temperature=0.3,
    )

    choice = completion.choices[0]

    # The output is capped at DESIRED_OUTPUT_TOKENS; make a cut-off translation
    # visible instead of silently storing a partial text.
    if choice.finish_reason == "length":
        warnings.warn(
            "Translation was cut off at the max_tokens limit; the returned text is incomplete.",
            RuntimeWarning,
            stacklevel=2,
        )

    # message.content is optional in the API response; normalise None to ""
    # so that joining the translated chunks cannot raise TypeError.
    translated_dialogue = choice.message.content or ""

    return translated_dialogue



def translate_text(text):
    """Translate ``text`` chunk by chunk and return the pieces joined by spaces.

    The text is first split with ``split_text`` using ``MAX_INPUT_TOKENS`` as
    the budget; every chunk is then passed to ``translate_chunk`` in order.
    """
    pieces = split_text(text, MAX_INPUT_TOKENS)
    translated_parts = []
    for piece in pieces:
        translated_parts.append(translate_chunk(piece))
    return " ".join(translated_parts)

def process_documents(start_index=None, end_index=None, range_type='range'):
    """Translate documents from ``source_collection`` into ``target_collection``.

    For every selected document that has a non-empty ``final_dialogue`` field,
    the translation is stored under ``final_to_english`` and the document is
    inserted (without its original ``_id``) into ``target_collection``.

    Args:
        start_index: Zero-based position of the first document to process
            (number of documents skipped). Required unless ``range_type`` is
            ``'all'``.
        end_index: Position of the last document to process, inclusive.
            Required unless ``range_type`` is ``'all'``.
        range_type: ``'all'`` processes the whole collection; any other value
            processes the ``start_index``..``end_index`` range.

    Raises:
        ValueError: if a range is requested without both indices, or if the
            indices do not describe a valid range.

    Note:
        No sort is applied to the query, so positions refer to whatever order
        the server returns the documents in.
    """
    if range_type == 'all':
        documents = source_collection.find()
    else:
        if start_index is None or end_index is None:
            raise ValueError("Start and end indices must be provided for range processing.")
        # Guard against an empty/inverted range: it would produce limit(0),
        # which MongoDB treats as "no limit" (or a negative limit), so far
        # more documents than intended would be translated.
        if start_index < 0 or end_index < start_index:
            raise ValueError(
                "start_index must be >= 0 and end_index must be >= start_index "
                f"(got start_index={start_index}, end_index={end_index})."
            )
        documents = source_collection.find().skip(start_index).limit(end_index - start_index + 1)

    for doc in documents:
        print('doc-text is being translated')
        original_text = doc.get("final_dialogue", "")
        if original_text:  # Proceed only if there's text to translate
            translated_text = translate_text(original_text)
            doc["final_to_english"] = translated_text
            # Remove the '_id' field to avoid DuplicateKeyError
            doc.pop("_id", None)
            target_collection.insert_one(doc)

In [None]:
# Alternative invocations (disabled):
#process_documents(start_index=1, end_index=99, range_type='range')
#process_documents(range_type='all')

# Translate positions 102-127 first, then the single document at position 101.
process_documents(102, 127, 'range')
process_documents(101, 101, 'range')

# First Test: single text

# Snippets

In [None]:
#testing
#original_token_count = count_tokens(german_text)
#print(f"Original Token Count: {original_token_count}\n")

#chunks = split_text(german_text, MAX_INPUT_TOKENS)
#print_chunks_info(chunks)