# Dataset Translation

In this notebook, I will translate the sentences from the `b-mc2/sql-create-context` dataset into Portuguese using OpenAI's GPT.

In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd
import os
from huggingface_hub import login
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import numpy as np
from tqdm import tqdm
from langchain.callbacks import get_openai_callback

In [None]:
# Set to True to force a fresh download from the Hugging Face Hub even when a
# local parquet copy already exists.
update = False

# Local parquet cache of the raw dataset.
filepath = "data/raw/sql_create_context_v4.parquet"

# Download when explicitly requested, and also when there is no local copy yet;
# otherwise the read below fails on a fresh checkout.
if update or not os.path.exists(filepath):
    login(token=os.environ["HUGGINGFACE_TOKEN"])

    REPO_ID = "b-mc2/sql-create-context"
    FILENAME = "sql_create_context_v4.json"

    dataset = pd.read_json(
        hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type="dataset", force_download=True)
    )
    # The target folder may not exist yet; `to_parquet` does not create it.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    dataset.to_parquet(filepath)

# Always load from the parquet cache so the frame is the same whether or not
# a download just happened.
dataset = pd.read_parquet(filepath)

In [None]:
# Prompt with a single `{query}` placeholder; the text to translate is inserted
# between single quotes on the second line of the prompt.
prompt = PromptTemplate(
    template="Translate the following query to portuguese:\n'{query}'",
    input_variables=["query"],
)

# Pipe the rendered prompt into the chat model. Temperature is set to 0.0,
# presumably to keep the translations as repeatable as possible.
chain = prompt | ChatOpenAI(temperature=0.0, model="gpt-3.5-turbo")


In [None]:
%%time

translated_path = "data/processed/translated"
n_batch = 100

n_iterations = 10

for j in range(n_iterations):
    batches = [int(x.replace(".parquet", "")) for x in os.listdir(translated_path)]
    new_batch = str(int(round(np.max(batches)+1))).zfill(4)
    translated = pd.read_parquet(translated_path)
    done = translated["index"].to_list()
    
    elegible = [x for x in dataset.index if x not in done]
    selected_ids = np.random.choice(elegible, n_batch)
    selected = dataset[dataset.reset_index()["index"].isin(selected_ids)]
    
    
    responses = []
    total_cost = 0
    for i, row in tqdm(selected[["question"]].iterrows()):
        query = row["question"]
        with get_openai_callback() as cb:
            response =  chain.invoke(input={"query": query})
            total_cost += cb.total_cost
            responses.append(dict(index=i, translated=response.content))
    print(f"total_cost = {total_cost}")
    
    translated = pd.DataFrame(responses)
    
    translated.to_parquet(f"{translated_path}/{new_batch}.parquet")