In [1]:
from huggingface_hub import hf_hub_download
import pandas as pd
import os
from huggingface_hub import login
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import numpy as np
from tqdm import tqdm
from langchain.callbacks import get_openai_callback

import torch
# Stop early when no CUDA device is visible: the statements below query the
# CUDA device, and a later cell loads a bitsandbytes 4-bit quantized model.
assert torch.cuda.is_available()
from datasets import load_dataset
from transformers import BitsAndBytesConfig


# Report the active CUDA device and whether torch reports bfloat16 support.
print(f"Device name: '{torch.cuda.get_device_name()}'")

device_properties = torch.cuda.get_device_properties(torch.cuda.current_device())
print(f"Device properties: '{device_properties}'")

if torch.cuda.is_bf16_supported():
    print("Suporta bfloat16.")
else:
    print("Não suporta bfloat16.")

Device name: 'NVIDIA GeForce RTX 2060 SUPER'
Device properties: '_CudaDeviceProperties(name='NVIDIA GeForce RTX 2060 SUPER', major=7, minor=5, total_memory=7957MB, multi_processor_count=34)'
Suporta bfloat16.




In [2]:
class Translator:
    """English <-> Portuguese translator backed by two ``translation`` pipelines.

    One pipeline is built per direction (pt -> en and en -> pt); both share the
    same ``model`` and ``tokenizer`` objects.
    """

    # Short language keys mapped to the codes handed to the pipelines as
    # ``src_lang`` / ``tgt_lang``.
    languages = {
        "en": "eng_Latn",
        "pt": "por_Latn",
    }

    def __init__(self, tokenizer, model, max_length=4000):
        """Build one translation pipeline per direction.

        Args:
            tokenizer: Tokenizer passed to both pipelines.
            model: Model passed to both pipelines.
            max_length: ``max_length`` forwarded to both pipelines.
        """
        # Imported locally so the class does not depend on another notebook
        # cell having imported ``pipeline`` before the first instantiation.
        from transformers import pipeline

        self.tokenizer = tokenizer
        self.model = model
        self.max_length = max_length
        self._translate_to_en = pipeline(
            "translation",
            model=self.model,
            tokenizer=self.tokenizer,
            src_lang=self.languages["pt"],
            tgt_lang=self.languages["en"],
            max_length=self.max_length
        )
        self._translate_to_pt = pipeline(
            "translation",
            model=self.model,
            tokenizer=self.tokenizer,
            src_lang=self.languages["en"],
            tgt_lang=self.languages["pt"],
            max_length=self.max_length
        )

    def to_english(self, text):
        """Translate Portuguese ``text`` (a string or a list of strings) to English."""
        return self._translate(text, input_lang="pt", output_lang="en")

    def to_portuguese(self, text):
        """Translate English ``text`` (a string or a list of strings) to Portuguese."""
        return self._translate(text, input_lang="en", output_lang="pt")

    def _translate(self, text, input_lang, output_lang):
        """Run ``text`` through the pipeline matching the requested direction.

        Args:
            text: Input handed to the pipeline unchanged.
            input_lang: Source language key, ``"en"`` or ``"pt"``.
            output_lang: Target language key, ``"pt"`` or ``"en"``.

        Returns:
            The translated string when the pipeline yields exactly one result
            (this also applies to a one-element list input); otherwise a list
            of translated strings in pipeline output order.

        Raises:
            ValueError: If the language pair is not en -> pt or pt -> en.
        """
        if (input_lang == "en") and (output_lang == "pt"):
            translator = self._translate_to_pt
        elif (input_lang == "pt") and (output_lang == "en"):
            translator = self._translate_to_en
        else:
            # Fail explicitly: falling through would hit an unbound
            # ``translator`` and surface as a confusing UnboundLocalError.
            raise ValueError(
                "Input and/or output languages not recognized: "
                f"{input_lang!r} -> {output_lang!r}."
            )

        output = translator(text)

        # Each pipeline result is a dict; keep only the translated text.
        output_translated = [x.get('translation_text') for x in output]

        if len(output_translated) == 1:
            return output_translated[0]
        return output_translated


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

checkpoint = "facebook/nllb-200-distilled-1.3B" # "facebook/nllb-200-distilled-600M"

# Load the model weights in 4-bit, computing in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto", quantization_config=quant_config)
# Quantization and device placement are model-loading options, so the
# tokenizer is loaded without them.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

translator_4bits = Translator(model=model, tokenizer=tokenizer, max_length=4000)

In [4]:
# Smoke test: translate a short English phrase to Portuguese.
translator_4bits.to_portuguese("example of text")

'exemplo de texto'

In [5]:
# Smoke test: translate a short Portuguese phrase to English.
translator_4bits.to_english("exemplo de texto")

'example text'

In [6]:
%%time

# Reuse the tokenizer attached to the en -> pt pipeline for token counting.
tokenizer = translator_4bits._translate_to_pt.tokenizer


def get_n_tokens(text):
    """Return how many token ids ``tokenizer.encode`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


REPO_ID = "databricks/databricks-dolly-15k"

# Keep rows with an empty context, keep only the two text columns, drop rows
# whose instruction encodes to 900 tokens or more, then shuffle with a fixed seed.
# NOTE(review): only the instruction length is bounded here; responses are not
# filtered by token count - confirm this is intended.
dataset = (
    load_dataset(REPO_ID, split="train")
    .filter(lambda example: str(example["context"]) == "")
    .select_columns(["instruction", "response"])
    .filter(lambda example: get_n_tokens(example["instruction"]) < 900)
    .shuffle(seed=42)
)

df = dataset.to_pandas()
df.sample(5)

CPU times: user 338 ms, sys: 11.3 ms, total: 349 ms
Wall time: 5.33 s


Unnamed: 0,instruction,response
7365,Who is the most decorated olympian of all time?,Michael Phelps is the most decorated olympian ...
5099,"Categorize the following ingredients as meat, ...",mayonnaise - spread\nham - meat\nswiss - chees...
2986,What is the best University in Canada for comp...,To decide which university is the best univers...
2765,Let's play a game of tic tac toe. I'll be X an...,Fun! Here's my next move:\n\n_ _ _\n_ O X\n_ _ _
733,What is an AVA when it comes to wine?,"AVA stands for American Viticultural Area, whi..."


In [7]:
%%time

batch_size = 100
filepath = "data/processed/databricks/translated.parquet"

total_indices = len(df)
# Half-open (start, end) row bounds for each batch; the last batch may be shorter.
batches_bounds = [(i, min(i + batch_size, total_indices)) for i in range(0, total_indices, batch_size)]

# tqdm wraps the sized list (not the enumerate generator) so the progress bar
# knows the number of batches and can show a percentage and an ETA.
for batch_id, (start, end) in enumerate(tqdm(batches_bounds)):
    batch_label = f"{batch_id:04d}_{start:04d}_{end:04d}"

    # Resume support: each batch is written to its own hive-style partition
    # directory, so a batch whose directory already exists was translated by a
    # previous run. Delete the output directory to force a full re-translation.
    if os.path.isdir(os.path.join(filepath, f"batch_id={batch_label}")):
        print(f"\n\nbatch {batch_label} already on disk, skipping")
        continue

    print(f"\n\nbatch {batch_label}")
    batch = df.iloc[start:end]
    instruction_en = batch["instruction"].to_list()
    response_en = batch["response"].to_list()

    print("\t> translating instruction")
    instruction_pt = translator_4bits.to_portuguese(instruction_en)
    print("\t> translating response")
    response_pt = translator_4bits.to_portuguese(response_en)

    output_dataframe = pd.DataFrame(
        {
            "instruction_original": instruction_en,
            "response_original": response_en,
            "instruction_translated": instruction_pt,
            "response_translated": response_pt,
        }
    )

    output_dataframe["batch_id"] = batch_label

    # Written once per batch so finished work survives an interruption.
    output_dataframe.to_parquet(filepath, partition_cols=["batch_id"])

0it [00:00, ?it/s]



batch 0000_0000_0100
	> translating instruction
	> translating response


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
1it [09:47, 587.49s/it]



batch 0001_0100_0200
	> translating instruction
	> translating response


2it [18:42, 556.41s/it]



batch 0002_0200_0300
	> translating instruction
	> translating response


3it [25:50, 497.83s/it]



batch 0003_0300_0400
	> translating instruction
	> translating response


Token indices sequence length is longer than the specified maximum sequence length for this model (1937 > 1024). Running this sequence through the model will result in indexing errors
4it [33:47, 489.78s/it]



batch 0004_0400_0500
	> translating instruction


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


	> translating response


5it [38:40, 418.71s/it]



batch 0005_0500_0600
	> translating instruction
	> translating response


6it [45:59, 425.53s/it]



batch 0006_0600_0700
	> translating instruction
	> translating response


7it [56:24, 490.90s/it]



batch 0007_0700_0800
	> translating instruction
	> translating response


8it [1:06:51, 534.08s/it]



batch 0008_0800_0900
	> translating instruction
	> translating response


9it [1:13:57, 500.53s/it]



batch 0009_0900_1000
	> translating instruction
	> translating response


10it [1:18:43, 434.19s/it]



batch 0010_1000_1100
	> translating instruction
	> translating response


11it [1:26:33, 445.01s/it]



batch 0011_1100_1200
	> translating instruction
	> translating response


12it [1:32:44, 422.66s/it]



batch 0012_1200_1300
	> translating instruction
	> translating response


13it [1:39:50, 423.53s/it]



batch 0013_1300_1400
	> translating instruction
	> translating response


14it [1:45:58, 406.72s/it]



batch 0014_1400_1500
	> translating instruction
	> translating response


15it [1:52:48, 407.87s/it]



batch 0015_1500_1600
	> translating instruction
	> translating response


16it [1:57:22, 367.57s/it]



batch 0016_1600_1700
	> translating instruction
	> translating response


17it [2:04:41, 388.90s/it]



batch 0017_1700_1800
	> translating instruction
	> translating response


18it [2:13:27, 430.13s/it]



batch 0018_1800_1900
	> translating instruction
	> translating response


19it [2:19:11, 404.30s/it]



batch 0019_1900_2000
	> translating instruction
	> translating response


20it [2:27:56, 440.61s/it]



batch 0020_2000_2100
	> translating instruction
	> translating response


21it [2:34:45, 431.20s/it]



batch 0021_2100_2200
	> translating instruction
	> translating response


22it [2:41:21, 420.56s/it]



batch 0022_2200_2300
	> translating instruction
	> translating response


23it [2:46:30, 387.14s/it]



batch 0023_2300_2400
	> translating instruction
	> translating response


24it [2:51:31, 361.32s/it]



batch 0024_2400_2500
	> translating instruction
	> translating response


25it [2:58:13, 373.35s/it]



batch 0025_2500_2600
	> translating instruction
	> translating response


26it [3:02:54, 345.63s/it]



batch 0026_2600_2700
	> translating instruction
	> translating response


27it [3:08:37, 345.02s/it]



batch 0027_2700_2800
	> translating instruction
	> translating response


28it [3:14:29, 347.14s/it]



batch 0028_2800_2900
	> translating instruction
	> translating response


29it [3:21:08, 362.67s/it]



batch 0029_2900_3000
	> translating instruction
	> translating response


30it [3:26:24, 348.55s/it]



batch 0030_3000_3100
	> translating instruction
	> translating response


31it [3:31:07, 328.91s/it]



batch 0031_3100_3200
	> translating instruction
	> translating response


32it [3:38:08, 356.47s/it]



batch 0032_3200_3300
	> translating instruction
	> translating response


33it [3:48:29, 435.89s/it]



batch 0033_3300_3400
	> translating instruction
	> translating response


34it [3:55:38, 433.75s/it]



batch 0034_3400_3500
	> translating instruction
	> translating response


35it [4:01:08, 402.73s/it]



batch 0035_3500_3600
	> translating instruction
	> translating response


36it [4:06:07, 371.67s/it]



batch 0036_3600_3700
	> translating instruction
	> translating response


37it [4:13:21, 390.30s/it]



batch 0037_3700_3800
	> translating instruction
	> translating response


38it [4:22:12, 432.49s/it]



batch 0038_3800_3900
	> translating instruction
	> translating response


39it [4:27:13, 393.07s/it]



batch 0039_3900_4000
	> translating instruction
	> translating response


40it [4:34:26, 405.08s/it]



batch 0040_4000_4100
	> translating instruction
	> translating response


41it [4:39:26, 373.40s/it]



batch 0041_4100_4200
	> translating instruction
	> translating response


42it [4:45:51, 376.86s/it]



batch 0042_4200_4300
	> translating instruction
	> translating response


43it [4:50:29, 347.33s/it]



batch 0043_4300_4400
	> translating instruction
	> translating response


44it [4:57:56, 377.07s/it]



batch 0044_4400_4500
	> translating instruction
	> translating response


45it [5:02:41, 349.63s/it]



batch 0045_4500_4600
	> translating instruction
	> translating response


46it [5:09:02, 359.09s/it]



batch 0046_4600_4700
	> translating instruction
	> translating response


47it [5:13:29, 331.38s/it]



batch 0047_4700_4800
	> translating instruction
	> translating response


48it [5:20:24, 356.59s/it]



batch 0048_4800_4900
	> translating instruction
	> translating response


49it [5:26:34, 360.60s/it]



batch 0049_4900_5000
	> translating instruction
	> translating response


50it [5:35:24, 411.41s/it]



batch 0050_5000_5100
	> translating instruction
	> translating response


51it [5:44:13, 446.72s/it]



batch 0051_5100_5200
	> translating instruction
	> translating response


52it [5:49:20, 404.56s/it]



batch 0052_5200_5300
	> translating instruction
	> translating response


53it [5:56:15, 407.94s/it]



batch 0053_5300_5400
	> translating instruction
	> translating response


54it [6:00:54, 369.16s/it]



batch 0054_5400_5500
	> translating instruction
	> translating response


55it [6:07:42, 380.68s/it]



batch 0055_5500_5600
	> translating instruction
	> translating response


56it [6:15:20, 403.90s/it]



batch 0056_5600_5700
	> translating instruction
	> translating response


57it [6:20:58, 384.11s/it]



batch 0057_5700_5800
	> translating instruction
	> translating response


58it [6:27:47, 391.81s/it]



batch 0058_5800_5900
	> translating instruction
	> translating response


59it [6:35:03, 405.05s/it]



batch 0059_5900_6000
	> translating instruction
	> translating response


60it [6:40:20, 378.46s/it]



batch 0060_6000_6100
	> translating instruction
	> translating response


61it [6:45:26, 356.79s/it]



batch 0061_6100_6200
	> translating instruction
	> translating response


62it [6:50:51, 347.28s/it]



batch 0062_6200_6300
	> translating instruction
	> translating response


63it [6:56:26, 343.63s/it]



batch 0063_6300_6400
	> translating instruction
	> translating response


64it [7:02:04, 341.79s/it]



batch 0064_6400_6500
	> translating instruction
	> translating response


65it [7:11:56, 417.01s/it]



batch 0065_6500_6600
	> translating instruction
	> translating response


66it [7:18:26, 408.85s/it]



batch 0066_6600_6700
	> translating instruction
	> translating response


67it [7:23:11, 371.78s/it]



batch 0067_6700_6800
	> translating instruction
	> translating response


68it [7:30:22, 389.46s/it]



batch 0068_6800_6900
	> translating instruction
	> translating response


69it [7:36:06, 375.85s/it]



batch 0069_6900_7000
	> translating instruction
	> translating response


70it [7:44:44, 418.33s/it]



batch 0070_7000_7100
	> translating instruction
	> translating response


71it [7:50:00, 387.89s/it]



batch 0071_7100_7200
	> translating instruction
	> translating response


72it [7:56:17, 384.33s/it]



batch 0072_7200_7300
	> translating instruction
	> translating response


73it [8:03:29, 398.72s/it]



batch 0073_7300_7400
	> translating instruction
	> translating response


74it [8:10:51, 411.81s/it]



batch 0074_7400_7500
	> translating instruction
	> translating response


75it [8:16:47, 394.94s/it]



batch 0075_7500_7600
	> translating instruction
	> translating response


76it [8:21:18, 357.88s/it]



batch 0076_7600_7700
	> translating instruction
	> translating response


77it [8:29:03, 389.93s/it]



batch 0077_7700_7800
	> translating instruction
	> translating response


78it [8:36:00, 398.24s/it]



batch 0078_7800_7900
	> translating instruction
	> translating response


79it [8:47:18, 482.07s/it]



batch 0079_7900_8000
	> translating instruction
	> translating response


80it [8:51:51, 419.43s/it]



batch 0080_8000_8100
	> translating instruction
	> translating response


81it [8:58:22, 410.79s/it]



batch 0081_8100_8200
	> translating instruction
	> translating response


82it [9:03:18, 376.37s/it]



batch 0082_8200_8300
	> translating instruction
	> translating response


83it [9:12:04, 421.10s/it]



batch 0083_8300_8400
	> translating instruction
	> translating response


84it [9:16:26, 373.48s/it]



batch 0084_8400_8500
	> translating instruction
	> translating response


85it [9:23:10, 382.52s/it]



batch 0085_8500_8600
	> translating instruction
	> translating response


86it [9:30:04, 392.24s/it]



batch 0086_8600_8700
	> translating instruction
	> translating response


87it [9:37:23, 406.09s/it]



batch 0087_8700_8800
	> translating instruction
	> translating response


88it [9:42:41, 379.82s/it]



batch 0088_8800_8900
	> translating instruction
	> translating response


89it [9:49:12, 383.12s/it]



batch 0089_8900_9000
	> translating instruction
	> translating response


90it [9:54:28, 362.95s/it]



batch 0090_9000_9100
	> translating instruction
	> translating response


91it [9:59:29, 344.25s/it]



batch 0091_9100_9200
	> translating instruction
	> translating response


92it [10:06:36, 369.30s/it]



batch 0092_9200_9300
	> translating instruction
	> translating response


93it [10:16:10, 430.52s/it]



batch 0093_9300_9400
	> translating instruction
	> translating response


94it [10:22:28, 414.89s/it]



batch 0094_9400_9500
	> translating instruction
	> translating response


95it [10:27:04, 373.28s/it]



batch 0095_9500_9600
	> translating instruction
	> translating response


96it [10:38:19, 463.56s/it]



batch 0096_9600_9700
	> translating instruction
	> translating response


97it [10:42:44, 403.97s/it]



batch 0097_9700_9800
	> translating instruction
	> translating response


98it [10:47:55, 376.16s/it]



batch 0098_9800_9900
	> translating instruction
	> translating response


99it [10:53:38, 366.33s/it]



batch 0099_9900_10000
	> translating instruction
	> translating response


100it [11:04:43, 455.73s/it]



batch 0100_10000_10100
	> translating instruction
	> translating response


101it [11:12:26, 457.98s/it]



batch 0101_10100_10200
	> translating instruction
	> translating response


102it [11:17:31, 412.10s/it]



batch 0102_10200_10300
	> translating instruction
	> translating response


103it [11:22:12, 372.85s/it]



batch 0103_10300_10400
	> translating instruction
	> translating response


104it [11:31:19, 425.02s/it]



batch 0104_10400_10500
	> translating instruction
	> translating response


105it [11:39:11, 439.10s/it]



batch 0105_10500_10543
	> translating instruction
	> translating response


106it [11:43:15, 398.07s/it]

CPU times: user 11h 54min 45s, sys: 17.1 s, total: 11h 55min 2s
Wall time: 11h 43min 15s





In [9]:
# Reload the translated batches from the partitioned parquet dataset written by
# the translation loop. Note that this rebinds `df`, which previously held the
# untranslated instruction/response frame.
df = pd.read_parquet("data/processed/databricks/translated.parquet")

In [29]:
# Spot-check one random row of the translated frame.
temp = df.sample().iloc[0].to_dict()

for field in ("instruction_translated", "response_translated"):
    print(f"{field} = {temp[field]}")

instruction_translated = Descreva estes itens como caros ou baratos de comprar: um relógio Rolex, uma caixa de cartas de jogo, um carro, um computador portátil, um litro de leite, um anel de noivado, uma casa
response_translated = Um relógio Rolex: caro, uma caixa de cartas de jogo: barato, um carro: caro, um computador portátil: caro, um litro de leite: barato, um anel de noivado: caro, uma casa: caro


In [22]:
# Save `df` to the final_version parquet path (no partition columns are passed here).
df.to_parquet("data/processed/databricks/final_version.parquet")

In [13]:
%%time
# `df` is the translated frame reloaded from parquet at this point; its English
# prompts live in "instruction_original" (it has no "instruction" column).
_list = df.sample(100)["instruction_original"].to_list()
translator_4bits.to_portuguese(_list)

CPU times: user 1min 8s, sys: 8.28 ms, total: 1min 8s
Wall time: 1min 8s


['Onde vivem os Kardashians?',
 'O que é um CPT em relação à saúde?',
 'Qual é a recomendação atual para uma dieta saudável?',
 "Classifique-os como opções de alimentos saudáveis ou não saudáveis ou não um alimento: batatas fritas, KFC, maçã, salada de frutas, bife, cerveja, vinho, uísque, batatas fritas, couve-flor, chave, cascalho, McDonald's, kebab turco, pêra, tijolos, aloe vera, areia.",
 'Quais são as vantagens e desvantagens de aprender Java em comparação com Python?',
 'Onde está a sede do GAFI ?',
 'Classifique os estados abaixo com base em qual costa eles estão localizados.',
 'O que motiva um director executivo?',
 'Que produto vendem as Girl Scouts?',
 'Que invenções do século 20 tiveram o maior impacto positivo na vida útil humana?',
 'escrever um poema sobre gratidão para minhas irmãs amigos que lideram Obon dançando comigo a cada ano',
 'Qual é a melhor maneira de cozinhar um bife?',
 'Quais dos seguintes são os filmes com a atriz Deepika Padukone?',
 'Identifique qual i

['Quem é o Paul McIver ?',
 'Por que é que os waffles de proteína estão tão secos?']

In [44]:
# Call the underlying en -> pt pipeline directly to inspect its raw output:
# a list of dicts, each carrying the text under 'translation_text'.
translator_4bits._translate_to_pt(["Who is Paul McIver", "Why are protein waffles so dry?"])

[{'translation_text': 'Quem é o Paul McIver ?'},
 {'translation_text': 'Por que é que os waffles de proteína estão tão secos?'}]