<a href="https://colab.research.google.com/github/gusat/-lab-agile-planning-/blob/main/course/en/chapter2/section6_pt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Putting it all together (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [3]:
sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)

In [4]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

model_inputs = tokenizer(sequences)

In [5]:
# Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [6]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [7]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

# Returns TensorFlow tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

In [8]:
sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [9]:
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [28]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Template for F1 scoring the answers generated thru WD
def calculate_f1_scores(data):
    """
    Calculate F1 scores for Q&A pairs based on reference answers and LLM responses.

    Args:
        data (dict): A dictionary containing Q&A pairs, reference answers, and LLM responses.

    Returns:
        pandas.DataFrame: A DataFrame with questions and F1 scores for each pair.
    """
    # Create a DataFrame from the data
    df = pd.DataFrame(data)

    # Calculate F1 scores for each question
    f1_scores = []
    for i, row in df.iterrows():
        reference_tokens = row["reference_answer"].split()
        response_tokens = row["response"].split()

        # Calculate F1 score
        f1 = f1_score(reference_tokens, response_tokens,  average='micro')
        f1_scores.append(f1)

    # Add F1 scores to the DataFrame
    df["F1_Score"] = f1_scores

    return df[["question", "F1_Score"]]

# Example data dictionary
# Define the reference Q&A pairs and our WD LLM responses
data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    "question": [
        "What is NLP?",
        "How do I cite the course?",
        "When was the transformer architecture introduced?",
        "When was GPT-3 introduced?",
        "What is an encoder model?",
        "What is a decoder model?",
        "What are Encoder-decoder models?",
        "What does a word-based tokenizer do?",
        "What does a character-based tokenizer do?",
        "How do Subword tokenization algorithms work?",
        "What is the Hugging Face hub?",
        "What is a model card?",
        "How do I load a tokenizer?",
        "How do I save a tokenizer?",
        "What is named entity recognition?",
        "What is part-of-speech tagging?",
        "What type of task is translation?",
        "Why should I write a minimal reproducible example?",
        "How do I get my environment information with the transformers-cli command?",
        "Are all the libraries available for Python?",
    ],
    "reference_answer": [
        "Nlp is a field of linguistics and machine learning focused on understanding everything related to human language.",
        "@mischuggingfacecourse, author = hugging face, title = the hugging face course, 2022, howpublished = 'urlhttps://huggingface.co/course', year = 2022, note = '[online; accessed ]'",
        "June 2017.",
        "May 2020",
        "Encoder models use only the encoder of a transformer model.",
        "Decoder models use only the decoder of a transformer model.",
        "Encoder-decoder models (also called sequence-to-sequence models ) use both parts of the transformer architecture",
        "Can simply split a raw text into words on whitespace and punctuation",
        "Character-based tokenizers split the text into characters, rather than words",
        "Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords.",
        "The hugging face hub –- our main website –- is a central platform that enables anyone to discover, use, and contribute new state-of-the-art models and datasets",
        "Model cards are a way to describe your model and its hyperparameters.",
        "From transformers import autotokenizer, datacollatorwithpadding raw_datasets = load_dataset('glue', 'mrpc') checkpoint = 'bert-base-uncased' tokenizer = autotokenizer.from_pretrained(checkpoint) def tokenize_function(example): return tokenizer",
        "You can save it with the save_pretrained() method, or upload it to the hub with the push_to_hub() method",
        "Named entity recognition (ner) is a task where the model has to find which parts of the input text correspond to entities such as persons, locations, or organizations.",
        "Mark each word in a sentence as corresponding to a particular part of speech",
        "Translation is a sequence-to-sequence task.",
        "It's very important to isolate the piece of code that produces the bug, as no one in the hugging face team is a magician (yet), and they can't fix what they can't see.",
        "Transformers-cli env",
        "All the libraries that we'll be using in this course are available as python packages.",
    ],
    "response": [
        "Nlp is a field of linguistics and machine learning focused on understanding everything related to human language.",
        "@mischuggingfacecourse, author = hugging face, title = the hugging face course, 2022, howpublished = 'urlhttps://huggingface.co/course', year = 2022, note = '[online; accessed ]'",
        "June 2017.",
        "May 2020",
        "Encoder models use only the encoder of a transformer .",
        "Decoder models use only the decoder of a transformer .",
        "Encoder-decoder models (also called sequence-to-sequence models ) use both parts of the transformer model",
        "Can simply split a raw text into words on whitespace and punctuation",
        "Character-based tokenizers split the text into characters, rather than words",
        "Subword tokenization methods rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords.",
        "The hugging face hub –- our main website –- is a central platform that enables anyone to discover, use, and contribute new state-of-the-art models and datasets",
        "Model cards can be used to describe your model and its hyperparameters.",
        "From transformers import autotokenizer, datacollatorwithpadding raw_datasets = load_dataset('glue', 'mrpc') checkpoint = 'bert-base-uncased' tokenizer = autotokenizer.from_pretrained(checkpoint) def tokenize_function(example): return tokenizer",
        "One can save it with the save_pretrained() method, or / and upload it back with the push_to_hub() method",
        "Named entity recognition (ner) is a task where the model has to find which parts of the input text correspond to entities such as persons, locations, or organizations.",
        "Mark each word in a sentence as corresponding to a particular part of speech",
        "Translation is a sequence-to-sequence task.",
        "It's very important to isolate the piece of code that produces the bug, as no one in the hugging face team is a magician (yet), and they can't fix what they can't see.",
        "Transformers-cli env",
        "All the libraries seen and used here in this course are available as py packages.",
    ],
}


# Call the calculate_f1_scores function with our data
result_df = calculate_f1_scores(data)

# Display the DataFrame with questions and F1 scores
print(result_df)

#                                              question  F1_Score
# 0                                        What is NLP?  1.000000
# 1                           How do I cite the course?  1.000000
# 2   When was the transformer architecture introduced?  1.000000
# 3                          When was GPT-3 introduced?  1.000000
# 4                           What is an encoder model?  0.900000
# 5                            What is a decoder model?  0.900000
# 6                    What are Encoder-decoder models?  0.928571
# 7                What does a word-based tokenizer do?  1.000000
# 8           What does a character-based tokenizer do?  1.000000
# 9        How do Subword tokenization algorithms work?  0.962963
# 10                      What is the Hugging Face hub?  1.000000
# 11                              What is a model card?  0.750000
# 12                         How do I load a tokenizer?  1.000000
# 13                         How do I save a tokenizer?  0.666667
# 14                  What is named entity recognition?  1.000000
# 15                    What is part-of-speech tagging?  1.000000
# 16                  What type of task is translation?  1.000000
# 17  Why should I write a minimal reproducible exam...  1.000000
# 18  How do I get my environment information with t...  1.000000
# 19        Are all the libraries available for Python?  0.666667


                                             question  F1_Score
0                                        What is NLP?  1.000000
1                           How do I cite the course?  1.000000
2   When was the transformer architecture introduced?  1.000000
3                          When was GPT-3 introduced?  1.000000
4                           What is an encoder model?  0.900000
5                            What is a decoder model?  0.900000
6                    What are Encoder-decoder models?  0.928571
7                What does a word-based tokenizer do?  1.000000
8           What does a character-based tokenizer do?  1.000000
9        How do Subword tokenization algorithms work?  0.962963
10                      What is the Hugging Face hub?  1.000000
11                              What is a model card?  0.750000
12                         How do I load a tokenizer?  1.000000
13                         How do I save a tokenizer?  0.666667
14                  What is named entity