# Translate the Burundi chunks to English
 Once all the databases and websites have been scraped, we need to translate the chunks written in other languages into English.
 We also create a new collection with content only in English.

Load the functions

In [7]:
import os

# Run from the repository root so that the relative `data/...` paths used in this
# notebook resolve. The location can be overridden with the IBJ_PROJECT_ROOT
# environment variable; the default is the original author's local checkout.
PROJECT_ROOT = os.environ.get(
    "IBJ_PROJECT_ROOT",
    "/Users/dianaavalos/PycharmProjects/InternationalBridgesToJustice",
)
os.chdir(PROJECT_ROOT)

from src.internationalbridgestojustice.openai_utils import (
    upload_batch_file_to_openAI,
    submit_batch_job,
    openai_client,
    retrieve_and_save_batch_results,
    check_progress_batch_id,
)
from src.internationalbridgestojustice.get_translation import (
    Translator,
    get_chunks_in_english,
    get_chunks_for_one_country,
    create_new_chunks_from_translated_results,
)
from src.internationalbridgestojustice.config import Paths
from src.internationalbridgestojustice.file_manager import save_file

from src.internationalbridgestojustice.chromadb_utils import (
    load_collection,
    batch_embed_and_add,
)

Load the chunks from the 3 sources: defensewiki.ibj.org, constitutions, and other legal documents.

In [8]:
# Each source file is split into two groups: chunks already in English and
# chunks that are not.

# Source 1: DefenseWiki pages.
defensewiki_chunks_path = "data/processed/defensewiki.ibj.org/unique_chunks.jsonl"
defense_split = get_chunks_in_english(jsonl_file_path=defensewiki_chunks_path)
defense_chunks_in_english, defense_chunks_not_in_english = defense_split

# Source 2: constitutions.
constitution_split = get_chunks_in_english(
    jsonl_file_path=Paths.PATH_JSONL_FILE_CONSTITUTION_CHUNKS
)
constitution_chunks_in_english, constitution_chunks_not_in_english = constitution_split

# Source 3: other legal documents.
other_legal_docs_split = get_chunks_in_english(
    jsonl_file_path=Paths.PATH_JSONL_FILE_LEGAL_OTHERS
)
other_legal_docs_chunks_in_english, other_legal_docs_chunks_not_in_english = (
    other_legal_docs_split
)

In [9]:
# Report, per source, how many chunks still need translating and how many are
# already in English.
chunk_counts_report = [
    ("DefenseWiki chunks not in English:", defense_chunks_not_in_english),
    ("DefenseWiki chunks in English:", defense_chunks_in_english),
    ("Constitution chunks not in English:", constitution_chunks_not_in_english),
    ("Constitution chunks in English:", constitution_chunks_in_english),
    ("Other legal chunks not in English:", other_legal_docs_chunks_not_in_english),
    ("Other legal chunks in English:", other_legal_docs_chunks_in_english),
]
for report_label, report_chunks in chunk_counts_report:
    print(report_label, len(report_chunks))


DefenseWiki chunks not in English: 3553
DefenseWiki chunks in English: 7052
Constitution chunks not in English: 0
Constitution chunks in English: 39439
Other legal chunks not in English: 431
Other legal chunks in English: 212


In [10]:
# Pool the three sources (DefenseWiki first, then constitutions, then other legal
# documents) into one collection per language group.
legal_chunks_not_in_english = (
    constitution_chunks_not_in_english + other_legal_docs_chunks_not_in_english
)
total_chunks_not_in_english = (
    defense_chunks_not_in_english + legal_chunks_not_in_english
)

legal_chunks_in_english = (
    constitution_chunks_in_english + other_legal_docs_chunks_in_english
)
total_chunks_in_english = defense_chunks_in_english + legal_chunks_in_english

# Sanity check on the pooled sizes.
for pooled_label, pooled_chunks in (
    ("Count of chunks not in English", total_chunks_not_in_english),
    ("Count of chunks in English", total_chunks_in_english),
):
    print(pooled_label, len(pooled_chunks))

Count of chunks not in English 3984
Count of chunks in English 46703


## Filter the chunks for Burundi to create a Burundi collection and run the pipeline for that country only

In [11]:
# Country whose chunks are translated and indexed in this notebook.
COUNTRY = "Burundi"

# Keep only the chunks belonging to COUNTRY, separately for each language group.
Country_chunks_not_in_english = get_chunks_for_one_country(
    total_chunks_not_in_english, country=COUNTRY
)
Country_chunks_in_english = get_chunks_for_one_country(
    total_chunks_in_english, country=COUNTRY
)

# Same "label: count" layout as the per-source counts printed earlier
# (the previous labels had a doubled space and a stray comma).
print(COUNTRY, "chunks not in English:", len(Country_chunks_not_in_english))
print(COUNTRY, "chunks in English:", len(Country_chunks_in_english))

Burundi_chunks_not_in_english:  706
Burundi_chunks_in_english:  304


Create batches to translate and submit requests.

Key limits and considerations when using GPT-4o Mini via OpenAI's Batch API:
- Maximum enqueued tokens per batch: up to 2,000,000 tokens can be enqueued at one time.
- Context window: up to 128,000 tokens per request.
- Maximum output tokens: up to 16,384 tokens per request.
- Rough estimate: one request ≈ 1,500 tokens, so the 706 Burundi chunks (≈ 1.06M tokens) should fit in a single batch.

Translate these chunks

In [None]:
# Every non-English chunk of the selected country goes into the translation batch.
filtered_chunks = Country_chunks_not_in_english
translator = Translator(model_name="gpt-4o-mini")

# One path for the batch request file: it is handed to the translator as its
# output file and then to the upload helper as the file to send.
batch_input_file_path = "data/interim/batch_input_translation_Burundi.jsonl"

translator.create_batch_file_for_translation(
    jsonl_output_file_path=batch_input_file_path,
    chunks=filtered_chunks,
)
file = upload_batch_file_to_openAI(
    client=openai_client,
    batch_file_name=batch_input_file_path,
)

# Start the batch job on the uploaded file.
batch = submit_batch_job(client=openai_client, file_id=file.id)

In [None]:
# Hard-coded ID of a previously submitted translation batch job.
# NOTE(review): a newly submitted job will have a different ID (presumably
# exposed on the object returned by submit_batch_job — confirm) and this value
# must be updated before re-running the cells below.
batch_id = "batch_6842f6bc28848190a58223b8d7c5c36b"
# Check how far the batch job identified by `batch_id` has progressed.
check_progress_batch_id(batch_id=batch_id)

In [None]:
# Fetch the output of the batch job, store it as JSONL and get it back parsed.
translation_results_path = "data/interim/translation_Burundi_results.jsonl"
parsed_results = retrieve_and_save_batch_results(
    batch_id=batch_id,
    output_file_path_jsonl=translation_results_path,
    return_parsed_results=True,
)

# Build the translated chunks from the original non-English chunks and the
# parsed translation results.
translated_chunks = create_new_chunks_from_translated_results(
    parsed_results=parsed_results,
    chunks_not_in_english=filtered_chunks,
)

# Persist the translated chunks.
# NOTE(review): "jsonl1" may be a typo for "jsonl" — check which file_type
# values save_file accepts before changing it.
save_file(
    content=translated_chunks,
    filename=Paths.PATH_TRANSLATED_CHUNKS,
    file_type="jsonl1",
)

## Create a new collection with the translated chunks of Burundi + original in English
The v2 collection will only contain chunks in English.

In [None]:
# All chunks for the country in English: those originally in English followed by
# the translated ones.
# NOTE(review): `Country_chunks` is not referenced by the later cells shown in
# this notebook, which pass the two groups separately.
Country_chunks = Country_chunks_in_english + translated_chunks

In [None]:
# Name and on-disk location of the English-only (v2) collection.
collection_name = "legal_collection_v2"
chroma_collection_file_path = "data/chroma_db_v2"

# Side files stored in the same directory and passed to the embedding step below:
# one for raw embeddings, one tracking the chunk ids present in the collection.
raw_embeddings = "data/chroma_db_v2/raw_embeddings.jsonl"
chunk_ids_present_in_chromadb_collection_file_path = "data/chroma_db_v2/seen_ids.txt"

collection = load_collection(
    collection_name=collection_name,
    chroma_collection_file_path=chroma_collection_file_path,
    new_collection=True,  # True requests a new collection
)

In [None]:
# Embed in two passes (original English chunks first, then the translated ones);
# doing it in one go failed with:
# 'Requested 310340 tokens, max 300000 tokens per request'
for chunk_group in (Country_chunks_in_english, translated_chunks):
    collection = batch_embed_and_add(
        chunk_group,
        collection,
        raw_embeddings,
        chunk_ids_present_in_chromadb_collection_file_path,
        batch_size=1000,
    )

# Final size of the collection after both passes.
print(f"Collection contains {collection.count()} documents.")