In [1]:
from huggingface_hub import hf_hub_download
import pandas as pd
import os
from huggingface_hub import login
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import numpy as np
from tqdm import tqdm
from langchain.callbacks import get_openai_callback

import torch
assert torch.cuda.is_available()
from datasets import load_dataset
from transformers import BitsAndBytesConfig


print(f"Device name: '{torch.cuda.get_device_name()}'")
print(f"Device properties: '{torch.cuda.get_device_properties(torch.cuda.current_device())}'")
print("Suporta bfloat16." if torch.cuda.is_bf16_supported() else "Não suporta bfloat16.")

Device name: 'NVIDIA GeForce RTX 2060 SUPER'
Device properties: '_CudaDeviceProperties(name='NVIDIA GeForce RTX 2060 SUPER', major=7, minor=5, total_memory=7957MB, multi_processor_count=34)'
Suporta bfloat16.




In [2]:
class Translator:
    languages = {
        "en": "eng_Latn",
        "pt": "por_Latn",
    }

    def __init__(self, tokenizer, model, max_length=4000):
        self.tokenizer = tokenizer
        self.model = model
        self.max_length = max_length
        self._translate_to_en = pipeline(
            "translation",
            model=self.model,
            tokenizer=self.tokenizer,
            src_lang=self.languages["pt"],
            tgt_lang=self.languages["en"],
            max_length=self.max_length
        )
        self._translate_to_pt = pipeline(
            "translation",
            model=self.model,
            tokenizer=self.tokenizer,
            src_lang=self.languages["en"],
            tgt_lang=self.languages["pt"],
            max_length=self.max_length
        )

    def to_english(self, text):
        return self._translate(text, input_lang="pt", output_lang="en")

    def to_portuguese(self, text):
        return self._translate(text, input_lang="en", output_lang="pt")

    def _translate(self, text, input_lang, output_lang):
        if (input_lang=="en") and (output_lang=="pt"):
            translator = self._translate_to_pt
        elif (input_lang=="pt") and (output_lang=="en"):
            translator = self._translate_to_en
        else:
            print(f"Input and/or output languages not recognized.")
            
        output = translator(text)

        output_translated = [x.get('translation_text') for x in output]

        if len(output_translated) == 1:
            return output_translated[0]
        return output_translated


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

checkpoint = "facebook/nllb-200-distilled-1.3B" # "facebook/nllb-200-distilled-600M"

quant_config = BitsAndBytesConfig(
   load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto", quantization_config=quant_config)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, device_map="auto", quantization_config=quant_config)

translator_4bits = Translator(model=model, tokenizer=tokenizer, max_length=4000)

In [4]:
translator_4bits.to_portuguese("example of text")

'exemplo de texto'

In [5]:
translator_4bits.to_english("exemplo de texto")

'example text'

In [6]:
%%time

tokenizer = translator_4bits._translate_to_pt.tokenizer

def get_n_tokens(text):
    return len(tokenizer.encode(text))

REPO_ID = "databricks/databricks-dolly-15k"

dataset = load_dataset(REPO_ID, split="train")
dataset = dataset.filter(lambda row: len(str(row["context"])) == 0)
dataset = dataset.select_columns(['instruction', 'response'])
dataset = dataset.filter(lambda x: get_n_tokens(x["instruction"]) < 900 )
dataset = dataset.shuffle(seed=42)

df = dataset.to_pandas()
df.sample(5)

CPU times: user 338 ms, sys: 11.3 ms, total: 349 ms
Wall time: 5.33 s


Unnamed: 0,instruction,response
7365,Who is the most decorated olympian of all time?,Michael Phelps is the most decorated olympian ...
5099,"Categorize the following ingredients as meat, ...",mayonnaise - spread\nham - meat\nswiss - chees...
2986,What is the best University in Canada for comp...,To decide which university is the best univers...
2765,Let's play a game of tic tac toe. I'll be X an...,Fun! Here's my next move:\n\n_ _ _\n_ O X\n_ _ _
733,What is an AVA when it comes to wine?,"AVA stands for American Viticultural Area, whi..."


In [None]:
%%time

batch_size = 100
filepath = "data/processed/databricks/translated.parquet"

total_indices = len(df)
batches_bounds = [(i, min(i + batch_size, total_indices)) for i in range(0, total_indices, batch_size)]

for batch_id, (start, end) in tqdm(enumerate(batches_bounds)):
    batch_label = f"{str(batch_id).zfill(4)}_{str(start).zfill(4)}_{str(end).zfill(4)}"
    print(f"\n\nbatch {batch_label}")
    batch = df.iloc[start:end]
    instruction_en = batch["instruction"].to_list()
    response_en = batch["response"].to_list()

    print(f"\t> translating instruction")
    instruction_pt = translator_4bits.to_portuguese(instruction_en)
    print(f"\t> translating response")
    response_pt = translator_4bits.to_portuguese(response_en)
    
    output_dataframe = pd.DataFrame(
        {
            "instruction_original": instruction_en,
            "response_original": response_en,
            "instruction_translated": instruction_pt,
            "response_translated": response_pt,
        }
    )
    
    output_dataframe["batch_id"] = batch_label
    
    output_dataframe.to_parquet(filepath, partition_cols=["batch_id"])

0it [00:00, ?it/s]



batch 0000_0000_0100
	> translating instruction
	> translating response


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
1it [09:47, 587.49s/it]



batch 0001_0100_0200
	> translating instruction
	> translating response


2it [18:42, 556.41s/it]



batch 0002_0200_0300
	> translating instruction
	> translating response


3it [25:50, 497.83s/it]



batch 0003_0300_0400
	> translating instruction
	> translating response


Token indices sequence length is longer than the specified maximum sequence length for this model (1937 > 1024). Running this sequence through the model will result in indexing errors
4it [33:47, 489.78s/it]



batch 0004_0400_0500
	> translating instruction


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


	> translating response


5it [38:40, 418.71s/it]



batch 0005_0500_0600
	> translating instruction
	> translating response


6it [45:59, 425.53s/it]



batch 0006_0600_0700
	> translating instruction


In [34]:
output_dataframe

Unnamed: 0,instruction_original,response_original,instruction_translated,response_translated,batch_id
0,What do you know about the Fatimid Empire?,The Fatimid Dynasty was a Shia Ismaili Empire ...,O que sabe sobre o Império Fatimida?,A dinastia Fatimid era um império ismaelitas x...,0001_0000_0005
1,Identify which instrument is string or percuss...,"Tati is string, Thappu is percussion.",Identifique qual instrumento é de corda ou de ...,"Tati é corda, Thappu é percussão.",0001_0000_0005
2,Identify which car manufacturer is German or A...,"Audi is German, Buick is American",Identificar qual fabricante de automóveis é al...,"A Audi é alemã, o Buick é americano.",0001_0000_0005
3,What is a mirepoix?,A mirepoix is rooted in French cooking but use...,O que é um mirepoix?,"A mirepoix é uma raiz da culinária francesa, m...",0001_0000_0005
4,"How can you answer questions like ""Are certain...",Answering questions like this would require us...,"Como você pode responder a perguntas como ""Ser...","Para responder a perguntas como esta, seria ne...",0001_0000_0005


In [13]:
%%time
_list = df.sample(100)["instruction"].to_list()
translator_4bits.to_portuguese(_list)

CPU times: user 1min 8s, sys: 8.28 ms, total: 1min 8s
Wall time: 1min 8s


['Onde vivem os Kardashians?',
 'O que é um CPT em relação à saúde?',
 'Qual é a recomendação atual para uma dieta saudável?',
 "Classifique-os como opções de alimentos saudáveis ou não saudáveis ou não um alimento: batatas fritas, KFC, maçã, salada de frutas, bife, cerveja, vinho, uísque, batatas fritas, couve-flor, chave, cascalho, McDonald's, kebab turco, pêra, tijolos, aloe vera, areia.",
 'Quais são as vantagens e desvantagens de aprender Java em comparação com Python?',
 'Onde está a sede do GAFI ?',
 'Classifique os estados abaixo com base em qual costa eles estão localizados.',
 'O que motiva um director executivo?',
 'Que produto vendem as Girl Scouts?',
 'Que invenções do século 20 tiveram o maior impacto positivo na vida útil humana?',
 'escrever um poema sobre gratidão para minhas irmãs amigos que lideram Obon dançando comigo a cada ano',
 'Qual é a melhor maneira de cozinhar um bife?',
 'Quais dos seguintes são os filmes com a atriz Deepika Padukone?',
 'Identifique qual i

['Quem é o Paul McIver ?',
 'Por que é que os waffles de proteína estão tão secos?']

In [44]:
translator_4bits._translate_to_pt(["Who is Paul McIver", "Why are protein waffles so dry?"])

[{'translation_text': 'Quem é o Paul McIver ?'},
 {'translation_text': 'Por que é que os waffles de proteína estão tão secos?'}]