In [3]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

In [4]:
# Identifier of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "EleutherAI/pythia-70m"
# Shared tokenizer instance; every later cell in this notebook uses it.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# Tokenize a single sentence and keep only its list of token ids.
text = "Hi, how are you?"
encode_text = tokenizer(text)["input_ids"]
pprint(encode_text)

[12764, 13, 849, 403, 368, 32]


In [8]:
# Round-trip check: decoding the ids should reproduce the original sentence.
decoded_text = tokenizer.decode(encode_text)
print(decoded_text)

Hi, how are you?


In [14]:
# Tokenize several sentences in one call. Without padding, each sentence keeps
# its own number of ids, so the resulting lists have different lengths.
list_texts = ["Hi, how are you?", "I am good", "Yes"]
encoded_texts = tokenizer(list_texts)
print(encoded_texts["input_ids"])

[[12764, 13, 849, 403, 368, 32], [42, 717, 1175], [4374]]


In [15]:
# Use the end-of-sequence token as the padding token. This assignment mutates
# the shared `tokenizer`, so it stays in effect for all later cells.
tokenizer.pad_token = tokenizer.eos_token
# padding=True pads every sequence up to the longest one in the batch.
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding:", encoded_texts_longest["input_ids"])

Using padding: [[12764, 13, 849, 403, 368, 32], [42, 717, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]


In [17]:
# Cap every sequence at 3 tokens; sequences that are already short enough are
# left unchanged (no padding is requested here).
encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation:", encoded_texts_truncation["input_ids"])

Using truncation: [[12764, 13, 849], [42, 717, 1175], [4374]]


In [18]:
# Truncate from the left, i.e. keep the last tokens of a too-long sequence.
# The setting is stored on the shared `tokenizer`, so later cells inherit it.
tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation:", encoded_texts_truncation_left["input_ids"])

Using truncation: [[403, 368, 32], [42, 717, 1175], [4374]]


In [19]:
# Combine truncation (at most 3 tokens) with padding to the longest sequence.
# Truncation is still left-sided because `tokenizer.truncation_side` was set
# to "left" on the shared tokenizer earlier in the notebook.
encoded_texts_truncation_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using truncation:", encoded_texts_truncation_both["input_ids"])

Using truncation: [[403, 368, 32], [42, 717, 1175], [4374, 0, 0]]


In [22]:
from datasets import load_dataset
# Load the "train" split of the question/answer dataset. Despite the `_df`
# suffix, the printed repr shows this is a `Dataset`, not a pandas DataFrame.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# dataset repository to run locally — confirm it is needed and the source is trusted.
instruction_dataset_df = load_dataset("kotzeje/lamini_docs.jsonl", split = "train", trust_remote_code=True)
print(instruction_dataset_df)

Dataset({
    features: ['question', 'answer'],
    num_rows: 1400
})


In [24]:
# Convert the dataset to a dict of columns, then build one record per row in
# which the question is wrapped in the prompt template and the answer is kept
# verbatim.
examples = instruction_dataset_df.to_dict()
prompt_template = """### Question:
{question}
### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i, (question, answer) in enumerate(zip(examples["question"], examples["answer"])):
    # Only the question goes through the template; the answer is appended
    # to it later, when the training text is assembled.
    text_with_prompt_template = prompt_template.format(question=question)
    finetuning_dataset.append(
        {
            "question": text_with_prompt_template,
            "answer": answer,
        }
    )

print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '### Answer:'}


In [25]:
# Assemble the full training string for the first datapoint: templated
# question immediately followed by its answer.
text = "".join(
    [finetuning_dataset[0]["question"], finetuning_dataset[0]["answer"]]
)
# Tokenize to NumPy arrays; the ids come back as a 2-D array with one row.
tokenizer_inputs = tokenizer(text, return_tensors="np", padding=True)
print(tokenizer_inputs["input_ids"])

[[ 4118 19782    27   187  2347   476   309  7472   253  3045   285  3290
    273   253  4561  2505   432   418  4988    74  3210    32   187  4118
  37741    27  2512   403  2067 17082   326   476   320   908   281  7472
    253  3045   285  3290   273  4561  2505   432   418  4988    74  3210
     13  1690 44229   414    13   378  1843    54  4868    13   285  1966
   7103    15  3545 12813   414  5593   849   973   253  1566 26295   253
   1735  3159   275   247  3425    13  1223   378  1843    54  4868  5593
    253 14259   875   253  4561  2505   285   247  3806  2505    15  8801
   7103  8687  1907  1966 16006  2281   253  3290   273   253  4561  2505
   1754   327  2616   824   347 25253    13  2938  1371    13   285 17200
     15   733   310  8521   281   897   247  5019   273   841 17082   323
    247 11088  7103   273   253  1566   434  3045    15]]


In [28]:
# Use the actual token count of `text` as the limit, but never more than 2048.
max_length = min(tokenizer_inputs["input_ids"].shape[1], 2048)
# Tokenize again, this time truncating to the limit computed above.
tokenizer_inputs = tokenizer(
    text, return_tensors="np", truncation=True, max_length=max_length
)

In [29]:
def tokenizer_function(examples):
    """Tokenize one question/answer example; intended for ``Dataset.map``.

    Designed for ``batched=True, batch_size=1``: ``examples`` is then a dict of
    columns in which every column is a one-element list. Only the first
    element of each column is used.

    Args:
        examples: Mapping with ``"question"`` and ``"answer"`` columns, each a
            list of strings.

    Returns:
        The tokenizer output for ``question + answer`` (NumPy tensors), with
        the number of tokens capped at 2048.

    Note:
        The columns are concatenated as-is; no prompt template is applied
        here. NOTE(review): confirm whether the mapped dataset should already
        contain templated questions.
    """
    # Build the text from the batch being mapped. Previously this function
    # read the notebook-global `text`, so every row got the tokens of the
    # same single string regardless of its own content.
    text = examples["question"][0] + examples["answer"][0]

    tokenizer.pad_token = tokenizer.eos_token
    # First pass: tokenize without a limit just to learn the token count.
    tokenizer_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True
    )

    # Never ask for more tokens than the text has, and never more than 2048.
    max_length = min(
        tokenizer_inputs["input_ids"].shape[1],
        2048
    )
    # Second pass: tokenize with truncation. Which end is cut depends on
    # `tokenizer.truncation_side`, which is configured outside this function.
    tokenizer_inputs = tokenizer(
        text,
        return_tensors="np",
        max_length=max_length,
        truncation=True
    )
    return tokenizer_inputs

    
    

In [35]:
# Show the first raw row of the dataset (question and answer, no prompt template).
pprint(instruction_dataset_df[0])

{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': 'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?'}


In [30]:

# Run `tokenizer_function` over the dataset, one example per call
# (batched=True with batch_size=1). The printed result shows the original
# columns plus `input_ids` and `attention_mask`.
# NOTE(review): with batch_size=1 every batch is full, so drop_last_batch=True
# presumably drops nothing (the output still reports 1400 rows) — confirm.
tokenized_dataset = instruction_dataset_df.map(
    tokenizer_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
print(tokenized_dataset)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [32]:
# Inspect the first tokenized row, including its attention mask.
pprint(tokenized_dataset[0])

{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
   