In [3]:
!pip install bitsandbytes



In [None]:
from huggingface_hub import login
login(token='<YOUR TOKEN HERE>')

In [5]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [6]:
data = pd.read_csv('en_es_corpus.txt',
                   sep='\t', header=None)
data = data[[0, 1]]
data.columns = ['EN', 'ES']
data = data[-200:]
prefix = 'Translate from English to Spanish: '
sentences_en = [f'{prefix}{s}' for s in data['EN'].values.tolist()]
sentences_es = data['ES'].values.tolist()

In [7]:
model_name = 'llama-2-7b-chat-hf'
tokenizer_name = 'llama-2-7b-chat-hf'
padding_size = 10

In [8]:
tokenizer = AutoTokenizer.from_pretrained(f'meta-llama/{model_name}',
                                          use_auth_token=True,
                                          model_max_length=padding_size)
tokenizer.pad_token = tokenizer.bos_token
tokenizer.padding_side = 'left'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
sentences_en

['Translate from English to Spanish: I was just wondering if I could borrow three hundred dollars from you. I can pay you back next Monday.',
 'Translate from English to Spanish: My father, grandfather, great-grandfather and great-great-grandfather all had the same name as I have.',
 'Translate from English to Spanish: Some people consider it a waste of time to study languages such as Klingon, Interlingua and Esperanto.',
 "Translate from English to Spanish: The Super Nintendo's graphics are amazing. They're so much better than those of the original Nintendo.",
 'Translate from English to Spanish: The iPad would be a perfect solution for me if it could properly display web pages with Flash content.',
 'Translate from English to Spanish: Tom was determined not to make the same mistakes with his children that his parents had made with him.',
 'Translate from English to Spanish: When you watch television or listen to the radio, the music which you hear is often African in origin.',
 'Tran

In [10]:
inputs = tokenizer(sentences_en, truncation=True, padding=True, return_tensors='pt')

In [11]:
inputs

{'input_ids': tensor([[    1,  4103,  9632,  ..., 29901,   306,   471],
        [    1,  4103,  9632,  ..., 29901,  1619,  4783],
        [    1,  4103,  9632,  ..., 29901,  3834,  2305],
        ...,
        [    1,  4103,  9632,  ..., 29901,   960,   366],
        [    1,  4103,  9632,  ..., 29901,   739,  1122],
        [    1,  4103,  9632,  ..., 29901,  3118,  2462]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [12]:
input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']

In [13]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16,
                                         bnb_4bit_quant_type='nf4')

In [14]:
model = AutoModelForCausalLM.from_pretrained(f'meta-llama/{model_name}',
                                             device_map='cuda:0',
                                             quantization_config=quantization_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
output_ids = model.generate(input_ids=input_ids,
                            attention_mask=attention_masks,
                            max_new_tokens=padding_size,
                            pad_token_id=tokenizer.bos_token_id,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2,
                            repetition_penalty=2.0,
               #do_sample=True,
               #temperature=0.5,
               #top_p=0.5,
               #top_k=1
                            )



In [16]:
output_ids

tensor([[    1,  4103,  9632,  ...,  8459,    13, 18627],
        [    1,  4103,  9632,  ..., 29874, 29908,   322],
        [    1,  4103,  9632,  ..., 29889, 21490,   264],
        ...,
        [    1,  4103,  9632,  ..., 23196, 29915, 29879],
        [    1,  4103,  9632,  ..., 27581, 29892,   445],
        [    1,  4103,  9632,  ...,  2023,     2,     1]])

In [18]:
out_sentences = []
for out_ids in output_ids:
    out_s = tokenizer.decode(out_ids,
                             skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
    out_sentences.append(out_s)

In [19]:
out_sentences

['Translate from English to Spanish: I was feeling very hungry and so i decided\n Hinweis',
 'Translate from English to Spanish: My father is a doctor. Unterscheidung between "a" and',
 'Translate from English to Spanish: Some people may not be able, or willing. Bedeuten',
 'Translate from English to Spanish: The Super Bowl\n obviously means "El super bowl"',
 'Translate from English to Spanish: The iTalki Platform\n sierp 21,',
 'Translate from English to Spanish: Tom was tired of eating the same old thing every day',
 "Translate from English to Spanish: When you're in a new place, it can be",
 "Translate from English to Spanish: When you're in a new city, it can be",
 'Translate from English to Spanish: You should be able. Hinweis für die Nutzer von',
 'Translate from English to Spanish: If you are looking for a professional translation service, look no',
 'Translate from English to Spanish: Disconnect\n Unterscheidung between "disconnection" and its translation',
 'Translate from En