# Dataset Translation
In this notebook, I will translate the sentences from the `b-mc2/sql-create-context` dataset into portuguese locally using the model `facebook/nllb-200-distilled-1.3B`.

In [1]:
from huggingface_hub import hf_hub_download
import pandas as pd
import os
from huggingface_hub import login
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import numpy as np
from tqdm import tqdm
from langchain.callbacks import get_openai_callback

import torch
assert torch.cuda.is_available()

from transformers import BitsAndBytesConfig


print(f"Device name: '{torch.cuda.get_device_name()}'")
print(f"Device properties: '{torch.cuda.get_device_properties(torch.cuda.current_device())}'")
print("Suporta bfloat16." if torch.cuda.is_bf16_supported() else "Não suporta bfloat16.")

Device name: 'NVIDIA GeForce RTX 4060 Ti'
Device properties: '_CudaDeviceProperties(name='NVIDIA GeForce RTX 4060 Ti', major=8, minor=9, total_memory=16059MB, multi_processor_count=34)'
Suporta bfloat16.




In [2]:
class Translator:
    languages = {
        "en": "eng_Latn",
        "pt": "por_Latn",
    }

    def __init__(self, tokenizer, model, max_length=1000):
        self.tokenizer = tokenizer
        self.model = model
        self.max_length = max_length
        self._translate_to_en = pipeline(
            "translation",
            model=self.model,
            tokenizer=self.tokenizer,
            src_lang=self.languages["pt"],
            tgt_lang=self.languages["en"],
            max_length=self.max_length
        )
        self._translate_to_pt = pipeline(
            "translation",
            model=self.model,
            tokenizer=self.tokenizer,
            src_lang=self.languages["en"],
            tgt_lang=self.languages["pt"],
            max_length=self.max_length
        )

    def to_english(self, text):
        return self._translate(text, input_lang="pt", output_lang="en")

    def to_portuguese(self, text):
        return self._translate(text, input_lang="en", output_lang="pt")

    def _translate(self, text, input_lang, output_lang):
        if (input_lang=="en") and (output_lang=="pt"):
            translator = self._translate_to_pt
        elif (input_lang=="pt") and (output_lang=="en"):
            translator = self._translate_to_en
        else:
            print(f"Input and/or output languages not recognized.")
            
        output = translator(text)
        translated_text = output[0]['translation_text']
        return translated_text


# 4-bits

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

checkpoint = "facebook/nllb-200-distilled-1.3B" # "facebook/nllb-200-distilled-600M"

quant_config = BitsAndBytesConfig(
   load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto", quantization_config=quant_config)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, device_map="auto", quantization_config=quant_config)

translator_4bits = Translator(model=model, tokenizer=tokenizer)

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.48G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]



In [4]:
translator_4bits.to_portuguese("example of text")

'exemplo de texto'

In [5]:
translator_4bits.to_english("exemplo de texto")

'example text'

### Translate SQL database

In [None]:
update = False

filepath = "data/raw/sql_create_context_v4.parquet"

if update:
    login(token=os.environ["HUGGINGFACE_TOKEN"])
    
    REPO_ID = "b-mc2/sql-create-context"
    FILENAME = "sql_create_context_v4.json"
    
    dataset = pd.read_json(
        hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type="dataset", force_download=True)
    )
    dataset.to_parquet(filepath)

dataset = pd.read_parquet(filepath)

In [14]:
%%time

translated_path = "data/processed/translated"
n_batch = 100

n_iterations = 143

for j in range(n_iterations):
    print("interacao",j)
    try:
        batches = [int(x.replace(".parquet", "")) for x in os.listdir(translated_path)]
        new_batch = str(int(round(np.max(batches)+1))).zfill(4)
        translated = pd.read_parquet(translated_path)
        done = translated["index"].to_list()
        
        elegible = [x for x in dataset.index if x not in done]
        selected_ids = np.random.choice(elegible, n_batch)
        selected = dataset[dataset.reset_index()["index"].isin(selected_ids)]
        
        
        responses = []
        total_cost = 0
        for i, row in tqdm(selected[["question"]].iterrows()):
            query = row["question"]
            with get_openai_callback() as cb:
                # response =  chain.invoke(input={"query": query}).content
                response = translator_4bits.to_portuguese(query)
                total_cost += cb.total_cost
                responses.append(dict(index=i, translated=response))
        # print(f"total_cost = {total_cost}")
        
        translated = pd.DataFrame(responses)
        translated.to_parquet(f"{translated_path}/{new_batch}.parquet")
    except:
        pass

interacao 0


8it [00:04,  1.95it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100it [00:59,  1.68it/s]


interacao 1


100it [01:02,  1.61it/s]


interacao 2


100it [01:02,  1.59it/s]


interacao 3


100it [01:02,  1.60it/s]


interacao 4


100it [00:59,  1.68it/s]


interacao 5


99it [00:57,  1.73it/s]


interacao 6


100it [00:59,  1.67it/s]


interacao 7


100it [01:01,  1.64it/s]


interacao 8


100it [00:58,  1.71it/s]


interacao 9


97it [00:55,  1.74it/s]


interacao 10


100it [00:58,  1.70it/s]


interacao 11


100it [00:59,  1.68it/s]


interacao 12


99it [01:03,  1.57it/s]


interacao 13


98it [01:00,  1.61it/s]


interacao 14


100it [01:02,  1.61it/s]


interacao 15


100it [01:00,  1.64it/s]


interacao 16


99it [00:58,  1.70it/s]


interacao 17


99it [00:56,  1.74it/s]


interacao 18


100it [00:59,  1.69it/s]


interacao 19


100it [00:57,  1.75it/s]


interacao 20


99it [00:55,  1.78it/s]


interacao 21


100it [01:01,  1.63it/s]


interacao 22


100it [00:59,  1.68it/s]


interacao 23


100it [00:57,  1.75it/s]


interacao 24


100it [00:59,  1.67it/s]


interacao 25


99it [00:59,  1.68it/s]


interacao 26


100it [00:59,  1.69it/s]


interacao 27


99it [00:59,  1.67it/s]


interacao 28


100it [01:12,  1.38it/s]


interacao 29


100it [01:00,  1.67it/s]


interacao 30


99it [01:00,  1.63it/s]


interacao 31


100it [01:05,  1.53it/s]


interacao 32


100it [01:02,  1.61it/s]


interacao 33


100it [00:59,  1.69it/s]


interacao 34


99it [00:57,  1.71it/s]


interacao 35


100it [01:02,  1.60it/s]


interacao 36


100it [01:00,  1.65it/s]


interacao 37


99it [00:59,  1.66it/s]


interacao 38


99it [01:00,  1.63it/s]


interacao 39


99it [00:58,  1.68it/s]


interacao 40


100it [01:02,  1.59it/s]


interacao 41


100it [01:02,  1.60it/s]


interacao 42


99it [00:59,  1.67it/s]


interacao 43


100it [01:01,  1.63it/s]


interacao 44


99it [00:57,  1.71it/s]


interacao 45


100it [00:59,  1.69it/s]


interacao 46


99it [01:01,  1.60it/s]


interacao 47


99it [00:54,  1.81it/s]


interacao 48


99it [00:55,  1.78it/s]


interacao 49


100it [00:59,  1.68it/s]


interacao 50


99it [00:59,  1.66it/s]


interacao 51


100it [01:00,  1.66it/s]


interacao 52


99it [01:00,  1.63it/s]


interacao 53


98it [00:59,  1.63it/s]


interacao 54


99it [01:00,  1.64it/s]


interacao 55


100it [00:57,  1.75it/s]


interacao 56


100it [01:04,  1.54it/s]


interacao 57


99it [00:59,  1.68it/s]


interacao 58


98it [00:59,  1.65it/s]


interacao 59


99it [00:58,  1.69it/s]


interacao 60


98it [01:00,  1.63it/s]


interacao 61


98it [00:57,  1.70it/s]


interacao 62


100it [00:59,  1.67it/s]


interacao 63


100it [01:04,  1.55it/s]


interacao 64


100it [01:01,  1.61it/s]


interacao 65


99it [01:01,  1.62it/s]


interacao 66


100it [01:01,  1.63it/s]


interacao 67


99it [01:02,  1.59it/s]


interacao 68


100it [01:00,  1.66it/s]


interacao 69


98it [00:58,  1.68it/s]


interacao 70


100it [01:00,  1.65it/s]


interacao 71


98it [00:55,  1.76it/s]


interacao 72


100it [01:01,  1.63it/s]


interacao 73


99it [00:58,  1.68it/s]


interacao 74


99it [01:01,  1.62it/s]


interacao 75


100it [01:04,  1.56it/s]


interacao 76


100it [01:01,  1.62it/s]


interacao 77


99it [00:53,  1.85it/s]


interacao 78


98it [01:00,  1.62it/s]


interacao 79


100it [00:58,  1.71it/s]


interacao 80


100it [00:59,  1.69it/s]


interacao 81


99it [01:01,  1.61it/s]


interacao 82


100it [01:04,  1.54it/s]


interacao 83


99it [01:03,  1.56it/s]


interacao 84


100it [00:59,  1.68it/s]


interacao 85


99it [00:58,  1.69it/s]


interacao 86


99it [00:55,  1.77it/s]


interacao 87


99it [00:58,  1.69it/s]


interacao 88


100it [00:57,  1.74it/s]


interacao 89


100it [01:00,  1.66it/s]


interacao 90


100it [00:57,  1.73it/s]


interacao 91


100it [00:59,  1.69it/s]


interacao 92


99it [00:59,  1.66it/s]


interacao 93


98it [01:01,  1.61it/s]


interacao 94


99it [00:59,  1.68it/s]


interacao 95


99it [00:57,  1.71it/s]


interacao 96


99it [00:57,  1.72it/s]


interacao 97


100it [00:59,  1.67it/s]


interacao 98


99it [00:59,  1.67it/s]


interacao 99


100it [00:54,  1.82it/s]


interacao 100


99it [00:57,  1.71it/s]


interacao 101


99it [00:58,  1.68it/s]


interacao 102


99it [00:56,  1.76it/s]


interacao 103


97it [00:57,  1.69it/s]


interacao 104


98it [00:58,  1.68it/s]


interacao 105


98it [01:01,  1.60it/s]


interacao 106


99it [00:56,  1.76it/s]


interacao 107


98it [00:56,  1.73it/s]


interacao 108


100it [00:58,  1.70it/s]


interacao 109


99it [00:58,  1.70it/s]


interacao 110


99it [00:55,  1.77it/s]


interacao 111


98it [00:53,  1.83it/s]


interacao 112


97it [00:56,  1.72it/s]


interacao 113


97it [00:55,  1.74it/s]


interacao 114


97it [00:56,  1.72it/s]


interacao 115


99it [01:01,  1.61it/s]


interacao 116


99it [01:00,  1.63it/s]


interacao 117


96it [00:56,  1.70it/s]


interacao 118


97it [00:53,  1.80it/s]


interacao 119


98it [00:58,  1.68it/s]


interacao 120


97it [01:03,  1.53it/s]


interacao 121


100it [00:59,  1.68it/s]


interacao 122


99it [00:56,  1.75it/s]


interacao 123


96it [00:58,  1.64it/s]


interacao 124


96it [00:57,  1.67it/s]


interacao 125


99it [01:02,  1.58it/s]


interacao 126


95it [00:54,  1.74it/s]


interacao 127


96it [00:53,  1.80it/s]


interacao 128


98it [01:01,  1.59it/s]


interacao 129


95it [00:55,  1.70it/s]


interacao 130


95it [00:53,  1.76it/s]


interacao 131


97it [00:58,  1.64it/s]


interacao 132


99it [01:01,  1.62it/s]


interacao 133


96it [00:58,  1.64it/s]


interacao 134


94it [00:53,  1.75it/s]


interacao 135


93it [00:53,  1.74it/s]


interacao 136


92it [00:55,  1.67it/s]


interacao 137


91it [00:52,  1.75it/s]


interacao 138


89it [00:54,  1.64it/s]


interacao 139


85it [00:49,  1.73it/s]


interacao 140


76it [00:43,  1.75it/s]


interacao 141


56it [00:32,  1.72it/s]


interacao 142


18it [00:11,  1.54it/s]

CPU times: user 2h 45min 10s, sys: 6.75 s, total: 2h 45min 17s
Wall time: 2h 44min 48s



