In [None]:
pip install sentence-transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60

In [None]:
import heapq
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertForQuestionAnswering, BertTokenizer, AutoTokenizer, pipeline, AutoModel
from datasets import load_dataset

In [None]:
# Inspect the architecture of a pretrained encoder checkpoint: number of
# transformer layers, hidden size, attention heads and trainable parameters.
# Change `model_name` to report the same figures for another checkpoint.
# NOTE: `model` is a notebook-wide name; later cells rebind it to other models.
from transformers import AutoModel

model_name = "sentence-transformers/msmarco-distilbert-base-v3"
model = AutoModel.from_pretrained(model_name)

print(model.config.num_hidden_layers)
print(model.config.hidden_size)
print(model.config.num_attention_heads)
# Count only the parameters that require gradients (the trainable ones).
print(sum(p.numel() for p in model.parameters() if p.requires_grad))



Downloading (…)lve/main/config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

6
768
12
66362880


In [None]:
# Fetch the book-paragraph dataset from the Hugging Face Hub. It is cached
# locally after the first download, so re-running this cell is cheap.
dataset = load_dataset("GroNLP/ik-nlp-22_slp")

Downloading builder script:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.97k [00:00<?, ?B/s]



Downloading and preparing dataset ik-nlp-22_slp/paragraphs to /root/.cache/huggingface/datasets/GroNLP___ik-nlp-22_slp/paragraphs/1.0.0/6c89281b2028a8a126102dda2c3fb94b1a5ccea59943d26857ae138c7aa782f8...


Downloading data:   0%|          | 0.00/741k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset ik-nlp-22_slp downloaded and prepared to /root/.cache/huggingface/datasets/GroNLP___ik-nlp-22_slp/paragraphs/1.0.0/6c89281b2028a8a126102dda2c3fb94b1a5ccea59943d26857ae138c7aa782f8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Evaluation set: a CSV of questions read straight from the dataset repository.
# Later cells use its "question", "paragraph" and "answer" columns.
url = "https://huggingface.co/datasets/GroNLP/ik-nlp-22_slp/raw/main/slp_questions.csv"
test_data = pd.read_csv(url)


In [None]:
# Show the available splits together with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['n_chapter', 'chapter', 'n_section', 'section', 'n_subsection', 'subsection', 'text'],
        num_rows: 1697
    })
})


In [None]:
# Flatten the train split into a list of plain dicts, one per paragraph,
# keeping the chapter/section/subsection numbers next to the paragraph text.
train_data = dataset["train"]

book_content = [
    {
        "chapter": record["n_chapter"],
        "section": record["n_section"],
        "subsection": record["n_subsection"],
        "paragraph": record["text"],
    }
    for record in train_data
]


In [None]:
## Knowledge graph

import networkx as nx

def build_kg(book_content, model, edge_threshold=0.8):
    """Build an undirected paragraph-similarity graph for the book.

    Every paragraph becomes a node, keyed by its index in ``book_content`` and
    carrying the paragraph text and its embedding. Two nodes are linked when
    the cosine similarity of their embeddings is greater than
    ``edge_threshold``.

    Args:
        book_content: Sequence of dicts, each with a "paragraph" text entry.
        model: Encoder whose ``encode(texts, convert_to_tensor=True)`` returns
            a 2-D tensor with one embedding row per text.
        edge_threshold: Exclusive lower bound on cosine similarity for adding
            an edge. Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        A ``networkx.Graph`` whose nodes have ``text`` and ``embedding``
        attributes and whose edges have a float ``weight`` (the similarity).
        The graph is empty when ``book_content`` is empty.
    """
    G = nx.Graph()
    if len(book_content) == 0:
        return G

    # Encode all paragraphs in one batch and keep each embedding on its node.
    paragraphs = [entry["paragraph"] for entry in book_content]
    paragraph_embeddings = model.encode(paragraphs, convert_to_tensor=True)
    for i, text in enumerate(paragraphs):
        G.add_node(i, text=text, embedding=paragraph_embeddings[i])

    # Unit-normalise the rows so that a plain matrix product yields cosine
    # similarities; `normalize` guards against division by a zero norm.
    normalized = torch.nn.functional.normalize(paragraph_embeddings, p=2, dim=-1)
    similarity_matrix = torch.mm(normalized, normalized.T).cpu().numpy()

    # The matrix is symmetric, so visiting only the strict upper triangle
    # skips self-similarities and yields every unordered pair exactly once.
    rows, cols = np.triu_indices(similarity_matrix.shape[0], k=1)
    keep = similarity_matrix[rows, cols] > edge_threshold
    for i, j in zip(rows[keep].tolist(), cols[keep].tolist()):
        G.add_edge(i, j, weight=float(similarity_matrix[i, j]))

    return G

def retrieve_relevant_paragraphs(query, book_content, model, top_k=5, threshold=0.3):
    """Return the texts of the paragraphs most similar to ``query``.

    Builds the paragraph graph with ``build_kg`` and ranks every node by the
    cosine similarity between its stored embedding and the query embedding.
    The graph is rebuilt (and every paragraph re-encoded) on each call.

    Args:
        query: Question text.
        book_content: Sequence of dicts, each with a "paragraph" text entry.
        model: Encoder exposing ``encode(..., convert_to_tensor=True)``.
        top_k: Maximum number of paragraphs to return.
        threshold: If the best similarity is below this value the query is
            treated as irrelevant.

    Returns:
        A list of up to ``top_k`` paragraph strings, best match first, or
        ``None`` when ``book_content`` is empty or no paragraph reaches
        ``threshold``.
    """
    if not book_content:
        return None

    G = build_kg(book_content, model)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # cosine_similarity normalises both inputs itself, so no manual
    # normalisation is needed; .item() stores plain Python floats.
    similarities = {
        node: torch.nn.functional.cosine_similarity(
            query_embedding, G.nodes[node]["embedding"], dim=-1
        ).item()
        for node in G.nodes
    }

    if max(similarities.values()) < threshold:
        return None

    top_k_nodes = sorted(similarities, key=similarities.get, reverse=True)[:top_k]
    return [book_content[node]["paragraph"] for node in top_k_nodes]


# Retrieve relevant paragraphs for a single example query.
# `retrieve_relevant_paragraphs` returns None when even the best match scores
# below its similarity threshold; that case is reported as an irrelevant query.
query = "How can we estimate the probability of a word?"
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
relevant_paragraphs = retrieve_relevant_paragraphs(query, book_content, model)

if relevant_paragraphs is None:
    print("Query is irrelevant.")
else:
    print(relevant_paragraphs)

Downloading (…)da7dc/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3fc4bda7dc/README.md:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading (…)c4bda7dc/config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)da7dc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading (…)3fc4bda7dc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)4bda7dc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

["The chain rule shows the link between computing the joint probability of a sequence and computing the conditional probability of a word given previous words. Equation 3.4 suggests that we could estimate the joint probability of an entire sequence of words by multiplying together a number of conditional probabilities. But using the chain rule doesn't really seem to help us! We don't know any way to compute the exact probability of a word given a long sequence of preceding words, P(w n |w n−1 1 ). As we said above, we can't just estimate by counting the number of times every word occurs following every long string, because language is creative and any particular context might have never occurred before!", 'Hopefully, most of you concluded that a very likely word is in, or possibly over, but probably not refrigerator or the. In the following sections we will formalize this intuition by introducing models that assign a probability to each possible next word. The same models will also ser

In [None]:
# Print the same result as a NumPy array, which shows one paragraph per element.
print(np.asarray(relevant_paragraphs))

["The chain rule shows the link between computing the joint probability of a sequence and computing the conditional probability of a word given previous words. Equation 3.4 suggests that we could estimate the joint probability of an entire sequence of words by multiplying together a number of conditional probabilities. But using the chain rule doesn't really seem to help us! We don't know any way to compute the exact probability of a word given a long sequence of preceding words, P(w n |w n−1 1 ). As we said above, we can't just estimate by counting the number of times every word occurs following every long string, because language is creative and any particular context might have never occurred before!"
 'Hopefully, most of you concluded that a very likely word is in, or possibly over, but probably not refrigerator or the. In the following sections we will formalize this intuition by introducing models that assign a probability to each possible next word. The same models will also ser

In [None]:
def build_kg(book_content, model, edge_threshold=0.8):
    """Build an undirected paragraph-similarity graph for the book.

    Every paragraph becomes a node, keyed by its index in ``book_content`` and
    carrying the paragraph text and its embedding. Two nodes are linked when
    the cosine similarity of their embeddings is greater than
    ``edge_threshold``.

    Args:
        book_content: Sequence of dicts, each with a "paragraph" text entry.
        model: Encoder whose ``encode(texts, convert_to_tensor=True)`` returns
            a 2-D tensor with one embedding row per text.
        edge_threshold: Exclusive lower bound on cosine similarity for adding
            an edge. Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        A ``networkx.Graph`` whose nodes have ``text`` and ``embedding``
        attributes and whose edges have a float ``weight`` (the similarity).
        The graph is empty when ``book_content`` is empty.
    """
    G = nx.Graph()
    if len(book_content) == 0:
        return G

    # Encode all paragraphs in one batch and keep each embedding on its node.
    paragraphs = [entry["paragraph"] for entry in book_content]
    paragraph_embeddings = model.encode(paragraphs, convert_to_tensor=True)
    for i, text in enumerate(paragraphs):
        G.add_node(i, text=text, embedding=paragraph_embeddings[i])

    # Unit-normalise the rows so that a plain matrix product yields cosine
    # similarities; `normalize` guards against division by a zero norm.
    normalized = torch.nn.functional.normalize(paragraph_embeddings, p=2, dim=-1)
    similarity_matrix = torch.mm(normalized, normalized.T).cpu().numpy()

    # The matrix is symmetric, so visiting only the strict upper triangle
    # skips self-similarities and yields every unordered pair exactly once.
    rows, cols = np.triu_indices(similarity_matrix.shape[0], k=1)
    keep = similarity_matrix[rows, cols] > edge_threshold
    for i, j in zip(rows[keep].tolist(), cols[keep].tolist()):
        G.add_edge(i, j, weight=float(similarity_matrix[i, j]))

    return G

def retrieve_relevant_paragraphs(query, G, model, top_k=5, threshold=0.3):
    """Rank the nodes of a prebuilt paragraph graph against ``query``.

    Args:
        query: Question text.
        G: Graph whose nodes carry ``embedding`` and ``text`` attributes
            (as produced by ``build_kg``).
        model: Encoder exposing ``encode(query, convert_to_tensor=True)``.
        top_k: Maximum number of paragraphs to return.
        threshold: If the best similarity is below this value the query is
            treated as irrelevant.

    Returns:
        A list of up to ``top_k`` dicts of the form ``{"paragraph": text}``,
        best match first, or ``None`` when the graph has no nodes or no node
        reaches ``threshold``.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)

    # cosine_similarity normalises both inputs itself, so no manual
    # normalisation is needed; .item() stores plain Python floats.
    similarities = {}
    for node in G.nodes:
        similarities[node] = torch.nn.functional.cosine_similarity(
            query_embedding, G.nodes[node]["embedding"], dim=-1
        ).item()

    if not similarities or max(similarities.values()) < threshold:
        return None

    top_k_nodes = sorted(similarities, key=similarities.get, reverse=True)[:top_k]

    # Take the text from the graph itself rather than from a global list, so
    # the result always matches the graph that was actually passed in.
    return [{"paragraph": G.nodes[node]["text"]} for node in top_k_nodes]


def create_prompt(query, relevant_paragraphs):
    """Concatenate the query and the retrieved paragraphs into one prompt.

    The prompt is the query followed by " \\n", then every paragraph text in
    order, each prefixed with ". ".

    Args:
        query: Question text.
        relevant_paragraphs: Iterable of dicts with a "paragraph" entry.

    Returns:
        The assembled prompt string.
    """
    context = "".join(f". {item['paragraph']}" for item in relevant_paragraphs)
    return f"{query} \n" + context


def generate_answer(prompt):
    """Generate an answer for ``prompt`` with the UnifiedQA T5 model.

    The text2text pipeline is created on the first call and then reused, so
    the model weights are not reloaded for every question.

    Args:
        prompt: Prompt text (question plus context).

    Returns:
        The ``generated_text`` of the first result, or ``None`` if that key
        is absent.
    """
    # Cache the pipeline on the function object; re-defining the function
    # (e.g. re-running the cell) naturally resets the cache.
    generator = getattr(generate_answer, "_generator", None)
    if generator is None:
        generator = pipeline("text2text-generation", model="allenai/unifiedqa-t5-base")
        generate_answer._generator = generator

    return generator(prompt)[0].get('generated_text')


def kg_retrieval_pipeline(model, questions):
    """Run graph-based retrieval and answer generation over a question set.

    Reads the notebook-level ``test_data`` (columns "question" and
    "paragraph") and ``book_content``. The i-th query is scored against the
    i-th ground-truth paragraph of ``test_data``.

    Args:
        model: Encoder passed to ``build_kg`` and
            ``retrieve_relevant_paragraphs``.
        questions: Optional sequence of queries, index-aligned with
            ``test_data``. ``None`` means "use every question in test_data".

    Returns:
        A tuple ``(performance, no_answers, answers)``:
        performance is the fraction of evaluated queries whose ground-truth
        paragraph was among the retrieved ones (0.0 if there were no queries);
        no_answers lists the queries that were judged irrelevant or whose
        ground truth was not retrieved; answers holds one generated answer
        (or "QUERY IRRELEVANT") per query.
    """
    correct_paragraphs = test_data["paragraph"].tolist()

    # `is None` rather than `== None`: the latter is evaluated element-wise
    # for array-like inputs and then cannot be used as a condition.
    queries = test_data["question"].tolist() if questions is None else list(questions)

    # Build the knowledge graph once and reuse it for every query.
    G = build_kg(book_content, model)

    count = 0
    no_answers = []
    answers = []

    for idx, query in enumerate(queries):
        relevant_paragraphs = retrieve_relevant_paragraphs(query, G, model, top_k=5)
        gt = correct_paragraphs[idx]

        if relevant_paragraphs is None:
            print("Query is irrelevant.")
            no_answers.append(query)
            answers.append("QUERY IRRELEVANT")
            continue

        if gt in [p["paragraph"] for p in relevant_paragraphs]:
            count += 1
        else:
            no_answers.append(query)

        # An answer is generated whether or not retrieval hit the ground truth.
        prompt = create_prompt(query, relevant_paragraphs)
        answers.append(generate_answer(prompt))

    # Normalise by the number of queries actually evaluated, so passing a
    # subset of questions does not deflate the score.
    performance = count / len(queries) if queries else 0.0
    return performance, no_answers, answers


# Evaluate graph-based retrieval plus answer generation on the full test set
# (passing None as `questions` selects every question in `test_data`).
model = SentenceTransformer("sentence-transformers/msmarco-roberta-base-v2")
performance, no_answers, answers = kg_retrieval_pipeline(model, None)



Downloading (…)lve/main/config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (863 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (923 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (947 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (798 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [None]:
# Retrieval score returned by the pipeline: share of questions whose
# ground-truth paragraph was among the retrieved paragraphs.
print(performance)

0.8305084745762712


In [None]:
# Generated answers, one per test question, in question order.
print(np.asarray(answers))

['zero or more occurrences of the immediately previous character or regular expression'
 'pattern' 'tokenizing (segmenting) words'
 'token learner, and token segmenter'
 'token segmenter is a tool used to tokenize test sentences.'
 'learning a vocabulary'
 'the task of putting words/tokens in a standard format'
 'lemmatization is performed by removing suffixes from the end of the word'
 'the task of determining that two words have the same root, despite their surface differences.'
 'the minimum number of operations it takes to edit one into the other.'
 'a neural network' 'by multiplying the conditional probabilities.'
 'a model Markov chain' 'counting in a corpus and normalizing'
 'bigram models are discounted, while trigram models are unipolated.'
 'how well they fit' 'the inverse probability of the test set'
 'by shaven off a bit of probability mass from more frequent events and give it'
 'pruning'
 'if the probabilities assigned to a Stationary sequence are invariant with respect t

In [None]:
# Persist the generated answers, one per line.
# NOTE: values are written unquoted, so an answer that itself contains a comma
# will look like several columns to a CSV reader.
np.savetxt(
    "answers_msmarco-roberta-base-v2_top_5.csv",
    np.asarray(answers),
    delimiter=",",
    fmt="%s",
)

In [None]:
def retrieve_relevant_paragraphs(query, book_content, model, top_k=5, threshold=0.3,
                                 paragraph_embeddings=None):
    """Return the book entries whose paragraphs are most similar to ``query``.

    Args:
        query: Question text.
        book_content: Sequence of dicts, each with a "paragraph" text entry.
        model: Encoder exposing ``encode(..., convert_to_tensor=True)``.
        top_k: Maximum number of entries to return.
        threshold: If the best similarity is below this value the query is
            treated as irrelevant.
        paragraph_embeddings: Optional precomputed embeddings, one row per
            entry of ``book_content`` in the same order. When omitted, every
            paragraph is encoded on this call; callers issuing many queries
            can encode once and pass the result here.

    Returns:
        A list of up to ``top_k`` entries of ``book_content``, best match
        first, or ``None`` when ``book_content`` is empty or no paragraph
        reaches ``threshold``.
    """
    if not book_content:
        return None

    query_embedding = model.encode(query, convert_to_tensor=True)

    if paragraph_embeddings is None:
        paragraphs = [entry["paragraph"] for entry in book_content]
        paragraph_embeddings = model.encode(paragraphs, convert_to_tensor=True)

    # cosine_similarity normalises its inputs and broadcasts the single query
    # vector against every paragraph row.
    similarities = (
        torch.nn.functional.cosine_similarity(query_embedding, paragraph_embeddings, dim=-1)
        .cpu()
        .numpy()
        .flatten()
    )

    # Check if the highest similarity score is below the threshold
    if similarities.max() < threshold:
        return None

    # Reverse first, then slice: `[-top_k:]` would return everything for
    # top_k == 0 because -0 == 0.
    top_k_indices = similarities.argsort()[::-1][:top_k]
    return [book_content[i] for i in top_k_indices]

# Retrieve relevant paragraphs for the example query with the plain
# (graph-free) retriever defined above. A None result means that no paragraph
# reached the similarity threshold.
query = "How can we estimate the probability of a word?"
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
relevant_paragraphs = retrieve_relevant_paragraphs(query, book_content, model)

if relevant_paragraphs is None:
    print("Query is irrelevant.")
else:
    print(relevant_paragraphs)

[{'chapter': '3', 'section': '3.1', 'subsection': 'nan', 'paragraph': "The chain rule shows the link between computing the joint probability of a sequence and computing the conditional probability of a word given previous words. Equation 3.4 suggests that we could estimate the joint probability of an entire sequence of words by multiplying together a number of conditional probabilities. But using the chain rule doesn't really seem to help us! We don't know any way to compute the exact probability of a word given a long sequence of preceding words, P(w n |w n−1 1 ). As we said above, we can't just estimate by counting the number of times every word occurs following every long string, because language is creative and any particular context might have never occurred before!"}, {'chapter': '3', 'section': 'nan', 'subsection': 'nan', 'paragraph': 'Hopefully, most of you concluded that a very likely word is in, or possibly over, but probably not refrigerator or the. In the following sections

In [None]:
def get_top_k_paragraphs(query, book_content, tokenizer, model, top_k=5, max_length=512):
    """Rank book entries against ``query`` using mean-pooled encoder states.

    Each text is tokenized, passed through ``model``, and represented by the
    mean over the sequence dimension of the first model output (the last
    hidden state). Entries are ranked by cosine similarity to the query.

    Args:
        query: Question text.
        book_content: Sequence of dicts, each with a "paragraph" text entry.
        tokenizer: Callable tokenizer returning model inputs as tensors.
        model: Encoder called as ``model(**tokens)``; its first output must
            have the sequence on dimension 1.
        top_k: Maximum number of entries to return.
        max_length: Inputs are truncated to this many tokens so paragraphs
            longer than the encoder's position limit do not fail.

    Returns:
        A list of up to ``top_k`` entries of ``book_content``, best match
        first (empty when ``book_content`` is empty).
    """
    def _embed(text):
        # One mean-pooled vector of shape (1, hidden) per text.
        tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        with torch.no_grad():
            return model(**tokens)[0].mean(dim=1)

    if not book_content:
        return []

    query_embedding = _embed(query)
    paragraph_embeddings = torch.cat(
        [_embed(entry["paragraph"]) for entry in book_content], dim=0
    )

    # Stay in torch (as the other retrievers in this notebook do) instead of
    # handing tensors to scikit-learn; the (1, hidden) query broadcasts
    # against every paragraph row.
    similarities = torch.nn.functional.cosine_similarity(
        query_embedding, paragraph_embeddings, dim=-1
    )

    # Get the indices of the top_k paragraphs, most similar first.
    top_k_indices = similarities.argsort(descending=True)[:top_k].tolist()

    # Return the top_k most relevant paragraphs
    return [book_content[i] for i in top_k_indices]

# Same example query, but embedding the text with a plain BERT checkpoint via
# `get_top_k_paragraphs` instead of a SentenceTransformer.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt, so
# `relevant_paragraphs` may not have been updated by it.
query = "How can we estimate the probability of a word?"
tokenizer = AutoTokenizer.from_pretrained("twmkn9/bert-base-uncased-squad2")
model = AutoModel.from_pretrained("twmkn9/bert-base-uncased-squad2")

relevant_paragraphs = get_top_k_paragraphs(query, book_content, tokenizer, model)
print(np.asarray(relevant_paragraphs))

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at twmkn9/bert-base-uncased-squad2 were not used when initializing BertModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: ignored

In [None]:
def convert_to_string(relevant_paragraphs):
    """Extract the paragraph text from each retrieved entry.

    Args:
        relevant_paragraphs: Iterable of dicts with a "paragraph" entry.

    Returns:
        A list with the "paragraph" value of every entry, in input order.
    """
    return [item['paragraph'] for item in relevant_paragraphs]

# Keep only the paragraph texts of the retrieved entries for inspection.
# NOTE(review): the recorded run raised a TypeError here -- presumably
# `relevant_paragraphs` did not hold dict entries at that point; confirm by
# running the notebook top to bottom.
eval_list = np.asarray(convert_to_string(relevant_paragraphs))
print(eval_list)

TypeError: ignored

In [None]:
def generate_answer(prompt):
    """Generate an answer for ``prompt`` with the UnifiedQA T5 model.

    The text2text pipeline is created on the first call and then reused, so
    the model weights are not reloaded for every question.

    Args:
        prompt: Prompt text (question plus context).

    Returns:
        The ``generated_text`` of the first result, or ``None`` if that key
        is absent.
    """
    # Cache the pipeline on the function object; re-defining the function
    # (e.g. re-running the cell) naturally resets the cache.
    generator = getattr(generate_answer, "_generator", None)
    if generator is None:
        generator = pipeline("text2text-generation", model="allenai/unifiedqa-t5-base")
        generate_answer._generator = generator

    return generator(prompt)[0].get('generated_text')


def create_prompt(query, relevant_paragraphs):
    """Build the generation prompt from a query and retrieved paragraphs.

    The result is the query followed by " \\n", then each paragraph text in
    order, every one prefixed with ". ".

    Args:
        query: Question text.
        relevant_paragraphs: Iterable of dicts with a "paragraph" entry.

    Returns:
        The assembled prompt string.
    """
    pieces = [f"{query} \n"]
    pieces.extend(f". {entry['paragraph']}" for entry in relevant_paragraphs)
    return "".join(pieces)


# Create the prompt for T5 from the example query and the paragraphs retrieved
# for it (each retrieved entry must be a dict with a "paragraph" key).
prompt = create_prompt(query, relevant_paragraphs)

print(prompt)
# Generate an answer using T5
answer = generate_answer(prompt)
print(answer)


TypeError: ignored

In [None]:
from transformers import T5Tokenizer

# Tokenizer of the answer-generation model; `create_prompt` below reads this
# notebook-level name to measure prompt length in tokens.
# NOTE: other cells rebind `tokenizer` to different tokenizers.
tokenizer = T5Tokenizer.from_pretrained("allenai/unifiedqa-t5-base")

def create_prompt(query, relevant_paragraphs, max_length=512, prompt_tokenizer=None):
    """Build a prompt that fits within a token budget.

    Starts from the query followed by " \\n" and appends paragraphs in order
    (each prefixed with ". ") while the tokenized prompt stays within
    ``max_length``. It stops at the first paragraph that does not fit.

    The prompt is assembled as text and the tokenizer is used only to measure
    its length. Encoding and decoding each piece instead would insert
    end-of-sequence markers mid-prompt, drop the newline separator and turn
    out-of-vocabulary characters into unknown-token markers.

    Args:
        query: Question text.
        relevant_paragraphs: Iterable of dicts with a "paragraph" entry.
        max_length: Maximum prompt length in tokens, as counted by
            ``encode`` (including any special tokens it adds).
        prompt_tokenizer: Object with an ``encode(text)`` method returning a
            token sequence. Defaults to the notebook-level ``tokenizer``.

    Returns:
        The prompt string. The query part is always returned, even if it
        alone exceeds ``max_length``.
    """
    counter = tokenizer if prompt_tokenizer is None else prompt_tokenizer

    prompt = f"{query} \n"
    for paragraph in relevant_paragraphs:
        candidate = f"{prompt}. {paragraph['paragraph']}"

        # Measure the whole candidate so the count matches what the model
        # will see when the final prompt is tokenized.
        if len(counter.encode(candidate)) > max_length:
            # Stop adding paragraphs if the limit is exceeded
            break
        prompt = candidate

    return prompt

# Create the prompt for T5, limited to 512 tokens, and show it.
prompt = create_prompt(query, relevant_paragraphs, max_length=512)

print(prompt)

How can we estimate the probability of a word?</s>. The chain rule shows the link between computing the joint probability of a sequence and computing the conditional probability of a word given previous words. Equation 3.4 suggests that we could estimate the joint probability of an entire sequence of words by multiplying together a number of conditional probabilities. But using the chain rule doesn't really seem to help us! We don't know any way to compute the exact probability of a word given a long sequence of preceding words, P(w n |w n<unk> 1 1 ). As we said above, we can't just estimate by counting the number of times every word occurs following every long string, because language is creative and any particular context might have never occurred before!</s>. Hopefully, most of you concluded that a very likely word is in, or possibly over, but probably not refrigerator or the. In the following sections we will formalize this intuition by introducing models that assign a probability 

In [None]:
## Paraphraser tool to generate answer - Not good
import spacy
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def extract_detailed_answer(question, paragraph, model, n_sentences=2):
    """Return the sentence closest to the question plus its neighbours.

    The paragraph is split into sentences with spaCy's ``en_core_web_sm``
    pipeline, every sentence and the question are embedded with ``model``,
    and the sentence with the highest cosine similarity to the question is
    selected together with up to ``n_sentences`` sentences on each side.

    Args:
        question: Question text.
        paragraph: Context text to extract from.
        model: Encoder exposing ``encode`` for a string and a list of strings.
        n_sentences: Number of neighbouring sentences kept on each side.

    Returns:
        The selected sentences joined by single spaces.
    """
    sentence_splitter = spacy.load("en_core_web_sm")

    question_vector = model.encode(question)
    sentences = [span.text for span in sentence_splitter(paragraph).sents]
    sentence_vectors = model.encode(sentences)

    scores = cosine_similarity(question_vector.reshape(1, -1), sentence_vectors)
    best = np.argmax(scores)

    # Window of sentences around the best match, clipped to the paragraph.
    window = slice(
        max(0, best - n_sentences),
        min(len(sentences), best + n_sentences + 1),
    )
    return " ".join(sentences[window])

# Example usage: join all retrieved paragraphs into a single context and
# extract the sentences closest to the example query from it.
sentence_transformer_model = SentenceTransformer("paraphrase-distilroberta-base-v2")
context = " ".join([p["paragraph"] for p in relevant_paragraphs])
detailed_answer = extract_detailed_answer(query, context, sentence_transformer_model)
print(detailed_answer)



Downloading (…)2b9e5/.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3c1ed2b9e5/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)1ed2b9e5/config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)c1ed2b9e5/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)2b9e5/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)c1ed2b9e5/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)ed2b9e5/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Equation 3.4 suggests that we could estimate the joint probability of an entire sequence of words by multiplying together a number of conditional probabilities. But using the chain rule doesn't really seem to help us! We don't know any way to compute the exact probability of a word given a long sequence of preceding words, P(w n |w n−1 1 ). As we said above, we can't just estimate by counting the number of times every word occurs following every long string, because language is creative and any particular context might have never occurred before! Hopefully, most of you concluded that a very likely word is in, or possibly over, but probably not refrigerator or the.


In [None]:
def my_pipeline(model, questions):
    """Run dense retrieval and answer generation over a question set.

    Reads the notebook-level ``test_data`` (columns "question" and
    "paragraph") and ``book_content``, and uses the currently defined
    ``retrieve_relevant_paragraphs``, ``create_prompt`` and
    ``generate_answer``. The i-th query is scored against the i-th
    ground-truth paragraph of ``test_data``.

    Args:
        model: Encoder passed to ``retrieve_relevant_paragraphs``.
        questions: Optional sequence of queries, index-aligned with
            ``test_data``. ``None`` means "use every question in test_data".

    Returns:
        A tuple ``(performance, no_answers, answers)``:
        performance is the fraction of evaluated queries whose ground-truth
        paragraph was among the retrieved ones (0.0 if there were no queries);
        no_answers lists the queries that were judged irrelevant or whose
        ground truth was not retrieved; answers holds one generated answer
        (or "QUERY IRRELEVANT") per query.
    """
    correct_paragraphs = test_data["paragraph"].tolist()

    # `is None` rather than `== None`: the latter is evaluated element-wise
    # for array-like inputs and then cannot be used as a condition.
    queries = test_data["question"].tolist() if questions is None else list(questions)

    count = 0
    no_answers = []
    answers = []

    for idx, query in enumerate(queries):
        relevant_paragraphs = retrieve_relevant_paragraphs(query, book_content, model, top_k=5)
        gt = correct_paragraphs[idx]

        if relevant_paragraphs is None:
            print("Query is irrelevant.")
            no_answers.append(query)
            answers.append("QUERY IRRELEVANT")
            continue

        # Retrieved entries are dicts, while the ground truth is the paragraph
        # text, so compare against the "paragraph" values. Testing `gt`
        # against the dicts themselves can never match.
        if gt in [p["paragraph"] for p in relevant_paragraphs]:
            count += 1
        else:
            no_answers.append(query)

        # An answer is generated whether or not retrieval hit the ground truth.
        prompt = create_prompt(query, relevant_paragraphs)
        answers.append(generate_answer(prompt))

    # Normalise by the number of queries actually evaluated, so passing a
    # subset of questions does not deflate the score.
    performance = count / len(queries) if queries else 0.0
    return performance, no_answers, answers


# Evaluate dense retrieval plus answer generation on the full test set
# (passing None as `questions` selects every question in `test_data`).
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
performance, no_answers, answers = my_pipeline(model, None)


NameError: ignored

In [None]:
# Retrieval score, followed by the number of questions the pipeline recorded
# in `no_answers` (judged irrelevant, or ground truth not retrieved).
print(f"Retrieval performance: {performance}")
print(len(no_answers))

Retrieval performance: 0.7966101694915254
12


In [None]:
# List the questions for which retrieval missed the ground-truth paragraph.
print(np.asarray(no_answers))

['What is a language model?'
 'How can we estimate the probability of a word?'
 'What can be done to optimize Naive Bayes when an insufficient amount of labeled data is present?'
 'What is the purpose of logistic regression?'
 'What is the other name of multinomial logistic regression?'
 'What is the idea behind vector semantics?'
 "What is the output of the output layer's softmax in a neural language model?"
 'What is a Hidden Markov Model?' 'What is a lexical gap?'
 'What is the difference between cross-attention and multi-head self-attention?'
 'How is the chrF evaluation metric for MT computed?'
 'What role does the [CLS] token play in BERT?']


In [None]:
# Keep the generated answers as an array for the side-by-side comparison
# below. The export line is left disabled; it targets a Google Drive path.
np_answers = np.asarray(answers)
#np.savetxt(f"/content/drive/MyDrive/nlp/answers_msmarco-distilbert-base-v3_top_3.csv", np_answers, delimiter=",", fmt="%s")
print(np_answers)

["zero or more a's or bs" '.' 'normalizing word formats'
 'token learner, and token segmenter'
 'to make use of the current and previous output tokens'
 'to learn a vocabulary' 'putting words in a standard format'
 'lemmatization is the task of determining that two words have the same root, '
 'text normalization'
 'the edit distance between two strings is defined as the minimum number of operations needed to change one string'
 'predicting upcoming words from prior word context'
 'by multiplying all the previous words.' 'a hidden Markov model'
 'n-grams' 'bigram and trigram models' 'the better model'
 'the number of possible next words that can follow any word'
 'we can give them the correct distribution t.' 'pruning'
 'when the probability distributions are invariant'
 'naive Bayes classifiers make two simplifying assumptions.'
 'naive Bayes is a probabilistic classifier.'
 'naive Bayes can express any property of the input text.'
 'there are many different types of text classificati

In [None]:
# Print each test question next to its generated answer and the reference
# answer from the test set. `zip` stops at the shortest of the three inputs.
str_compare = zip(np.asarray(test_data['question']),np_answers, np.asarray(test_data['answer']))
print("(Query, Generated Answer, True Answer)")
for item in str_compare:
    print(item)

(Query, Generated Answer, True Answer)
('What is the meaning of the Kleene star in Regex?', "zero or more a's or bs", 'The Kleene star means "zero or more occurrences of the immediately previous character or regular expression"')
('What is the usage of the Regex lookahead operator "?="?', '.', 'The operator (?= pattern) is true if pattern occurs, but is zero-width, i.e. the match pointer doesn’t advance.')
('What are the most common steps in a text normalization process?', 'normalizing word formats', '1. Tokenizing (segmenting) words 2. Normalizing word formats 3. Segmenting sentences')
('What are the two most common components of a tokenization scheme?', 'token learner, and token segmenter', 'a token learner, and a token segmenter')
('What is the purpose of a token segmenter?', 'to make use of the current and previous output tokens', 'The token segmenter takes a raw test sentence and segments it into the tokens in the vocabulary.')
('What is the purpose of a token learner in the BPE a

In [None]:
# Question-generation model that takes only the context paragraph as input.
# NOTE: this rebinds the notebook-level `model_name`, `tokenizer` and `model`
# that earlier cells also use.

from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer

model_name = "allenai/t5-small-squad2-question-generation"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Filled by `run_model` with the generated questions.
questions_context = []


def run_model(**generator_args):
    """Generate questions for every ground-truth paragraph of the test set.

    Uses the notebook-level ``tokenizer`` and ``model`` and appends every
    decoded output to the notebook-level ``questions_context`` list.

    Args:
        **generator_args: Keyword arguments forwarded to ``model.generate``.
    """
    for context in test_data["paragraph"].tolist():
        encoded = tokenizer.encode(context, return_tensors="pt")
        generated = model.generate(encoded, **generator_args)
        decoded = tokenizer.batch_decode(
            generated, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        questions_context.extend(decoded)


# Generate the questions with default generation settings and report how
# many were produced.
run_model()
print(len(questions_context))

59


In [None]:
# Compare the questions generated by the context-only model with the ones
# from the test set: each generated question is printed directly above the
# reference question at the same index.
questions = test_data['question'].to_list()
for i in range(len(questions_context)):
  print(questions_context[i])
  print(questions[i])

What is the Kleene * erally pronounced?
What is the meaning of the Kleene star in Regex?
What is the name of the lookahead assertions that make use of?
What is the usage of the Regex lookahead operator "?="?
What is the first step in a normalization process?
What are the most common steps in a text normalization process?
What is the name of the first two tokenization schemes?
What are the two most common components of a tokenization scheme?
What is the name of the first two tokenization schemes?
What is the purpose of a token segmenter?
What is the token learner part of the BPE algorithm?
What is the purpose of a token learner in the BPE algorithm?
What is the task of putting words/tokens in a standard format?
What is word normalization?
What is the study of the way words are built up from smaller meaning-bearing units
How is lemmatization performed?
What is the task of Lemmatization?
What is lemmatization?
What is the minimum edit distance between two strings?
How is the minimum edit 

In [None]:
print(len(test_data['question'].to_list()))
!pip install evaluate
import evaluate


from nltk.tokenize import word_tokenize
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=questions_context, references=test_data['question'].to_list(), max_order=1)
print(results)
results = bleu.compute(predictions=questions_context, references=test_data['question'].to_list(), max_order=2)
print(results)
results = bleu.compute(predictions=questions_context, references=test_data['question'].to_list(), max_order=3)
print(results)
results = bleu.compute(predictions=questions_context, references=test_data['question'].to_list(), max_order=4)
print(results)

59
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
{'bleu': 0.39473684210526316, 'precisions': [0.39473684210526316], 'brevity_penalty': 1.0, 'length_ratio': 1.0704225352112675, 'translation_length': 608, 'reference_length': 568}
{'bleu': 0.2557928231274908, 'precisions': [0.39473684210526316, 0.1657559198542805], 'brevity_penalty': 1.0, 'length_ratio': 1.0704225352112675, 'translation_length': 608, 'reference_length': 568}
{'bleu': 0.167192281767045, 'precisions': [0.39473684210526316, 0.1657559198542805, 0.07142857142857142], 'brevity_penalty': 1.0, 'length_ratio': 1.0704225352112675, 'translation_length': 608, 'reference_length': 568}
{'bleu': 0.10204526342252244, 'precisions': [0.39473684210526316, 0.1657559198542805, 0.07142857142857142, 0.02320185614849188], 'brevity_penalty': 1.0, 'length_ratio': 1.0704225352112675, 'translation_length': 608, 'reference_length': 568}


In [None]:
# METEOR score of the context-only generated questions against the reference
# questions from the test set (same predictions/references as the BLEU cell).
meteor = evaluate.load('meteor')
results = meteor.compute(predictions=questions_context, references=test_data['question'].to_list())
print(results)

Downloading builder script:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


{'meteor': 0.37917441773901245}


In [None]:
# Use the context-only generated questions as queries for the retrieval pipeline.
# NOTE: this rebinds the module-level `model` that run_model() reads as the T5
# generator; re-running run_model() after this cell would pick up the
# SentenceTransformer instead.
model = SentenceTransformer("sentence-transformers/msmarco-roberta-base-v2")
# kg_retrieval_pipeline is defined outside this section. Judging by how the
# results are used below it presumably returns (score, unanswered queries,
# retrieved answers) — TODO confirm against its definition.
performance, no_answers, answers = kg_retrieval_pipeline(model, questions_context)


Token indices sequence length is longer than the specified maximum sequence length for this model (870 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (999 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [None]:
# Report the results of the preceding kg_retrieval_pipeline run.
# `performance` is presumably the fraction of queries that were answered
# (it equals 1 - len(no_answers)/59 in the recorded output) — TODO confirm.
print(f"Retrieval performance: {performance}")
print(len(no_answers))

# The entries of no_answers themselves (printed as generated question strings).
print(np.asarray(no_answers))

np_answers = np.asarray(answers)
# NOTE(review): the disabled export below uses a filename that names the
# distilbert model, while this cell reports the msmarco-roberta-base-v2 run —
# adjust the name before re-enabling it.
#np.savetxt(f"/content/drive/MyDrive/nlp/answers_msmarco-distilbert-base-v3_top_3.csv", np_answers, delimiter=",", fmt="%s")
print(np_answers)

# Pair each query with the retrieved answer and the gold answer by position.
# Assumes `answers` holds one entry per query in query order — TODO confirm.
# zip() yields a one-shot iterator, so str_compare is exhausted after the loop.
str_compare = zip(questions_context,np_answers, np.asarray(test_data['answer']))
print("(Query, Generated Answer, True Answer)")
for item in str_compare:
    print(item)

Retrieval performance: 0.6610169491525424
20
['What is the task of putting words/tokens in a standard format?'
 'What is the probability of a corpus being a factor in the probability of a'
 'What is the general equation for this n-gram approximation to the conditional'
 'What is the inverse probability of a test set?'
 'What is the probability distribution for words at time t?'
 'What is the probability that the document has negative sentiment?'
 'What is the method that finds a minimum of a function?'
 'What is an online algorithm that minimizes the loss function?'
 'What is the derivative of | |?'
 'What is the term for the word "choosing from 10, 30, or even 50 different'
 'What is the name of the cell in the matrix?'
 'What is the name of the cell in the matrix?' 'What is the dot product?'
 'What is the probability of the next word w t being V 42?'
 'What is the name of the word that is a part of speech?'
 'What is the main problem of n-gram models of Chapter 3?'
 'What is the name

In [None]:
# Same retrieval run on the context-only generated questions, with a different
# sentence encoder. `model` is rebound again, and `performance`, `no_answers`
# and `answers` from the previous run are overwritten.
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
performance, no_answers, answers = kg_retrieval_pipeline(model, questions_context)

Token indices sequence length is longer than the specified maximum sequence length for this model (782 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (664 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (841 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Query is irrelevant.


Token indices sequence length is longer than the specified maximum sequence length for this model (783 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (881 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (723 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (827 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [None]:
# Report the results of the preceding kg_retrieval_pipeline run
# (msmarco-distilbert-base-v3 encoder, context-only generated questions).
# `performance` is presumably the fraction of queries that were answered
# (it equals 1 - len(no_answers)/59 in the recorded output) — TODO confirm.
print(f"Retrieval performance: {performance}")
print(len(no_answers))

# The entries of no_answers themselves (printed as generated question strings).
print(np.asarray(no_answers))

np_answers = np.asarray(answers)
# Disabled export of the retrieved answers to a hard-coded Google Drive path.
#np.savetxt(f"/content/drive/MyDrive/nlp/answers_msmarco-distilbert-base-v3_top_3.csv", np_answers, delimiter=",", fmt="%s")
print(np_answers)

# Pair each query with the retrieved answer and the gold answer by position.
# Assumes `answers` holds one entry per query in query order — TODO confirm.
# zip() yields a one-shot iterator, so str_compare is exhausted after the loop.
str_compare = zip(questions_context,np_answers, np.asarray(test_data['answer']))
print("(Query, Generated Answer, True Answer)")
for item in str_compare:
    print(item)

Retrieval performance: 0.711864406779661
17
['What is the study of the way words are built up from smaller meaning-bearing units'
 'What is the general equation for this n-gram approximation to the conditional'
 'What is the name of the modification that is called smoothing or discounting?'
 'What is the probability distribution for words at time t?'
 'What is the probability of every possible combination of features?'
 'What is the opinion lexicon?' 'What is a naive Bayes model?'
 'What percentage of the observations that our system labels correctly?'
 'What is the probability that the document has negative sentiment?'
 'What is the derivative of | |?' 'What is the word sentiment?'
 'What is the dot product?'
 'What is the probability of the next word w t being V 42?'
 'What is the name of the word that is a part of speech?'
 'What is the main problem of n-gram models of Chapter 3?'
 'What is the name of the model that predicts a value at time t?'
 'What is the dimension of referentia

In [None]:
# Question generation model that receives both the answer and its context as input.
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Model weights and tokenizer are loaded from the same checkpoint identifier.
trained_model_path = 'ZhangCheng/T5-Base-Fine-Tuned-for-Question-Generation'
trained_tokenizer_path = 'ZhangCheng/T5-Base-Fine-Tuned-for-Question-Generation'
# Rebinds the module-level `model` / `tokenizer`, which generate() reads.
model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer_path)
# Module-level list that generate() fills with one question per test row.
questions_answer = []

def generate():
    """Generate one question per test row from its gold answer and paragraph.

    For every row of ``test_data`` the prompt
    ``'<answer> {answer} <context> {paragraph} '`` is tokenized with the
    module-level ``tokenizer`` and passed to the module-level
    ``model.generate``. The first returned sequence is decoded and stored, in
    row order, in the module-level ``questions_answer`` list.

    The list is emptied first so that re-running the cell does not pile up
    duplicates; the list is later compared position-by-position with the
    reference questions.

    Returns:
        None. Results are delivered through ``questions_answer``.
    """
    paragraphs = test_data["paragraph"].tolist()
    # Convert to a plain list so answers are paired with paragraphs by
    # position. Indexing the Series directly with a loop counter looks rows up
    # by index *label*, which only coincides with the position when the frame
    # has a default 0..n-1 index.
    answers = test_data["answer"].tolist()

    # Reset in place so every existing reference to the list sees fresh results.
    questions_answer.clear()
    for answer, context in zip(answers, paragraphs):
        input_text = '<answer> %s <context> %s ' % (answer, context)
        # Calling the tokenizer directly replaces the deprecated encode_plus
        # and returns the same input_ids / attention_mask tensors.
        encoding = tokenizer(input_text, return_tensors='pt')
        outputs = model.generate(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
        )
        # Only the first generated sequence is kept for each row.
        question = tokenizer.decode(
            outputs[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        questions_answer.append(question)

# Fill questions_answer using the module-level model and tokenizer.
generate()
#print(test_data)







In [None]:
# Reference questions are materialised here so this cell does not depend on
# the `questions` list left behind by an earlier cell.
questions = test_data['question'].to_list()
# Print every answer-aware generated question directly above the reference
# question stored at the same position.
for i, generated_question in enumerate(questions_answer):
    print(generated_question)
    print(questions[i])

bleu = evaluate.load("bleu")
# BLEU of the answer-aware generated questions against the reference
# questions, computed for maximum n-gram orders 1 through 4. `results` ends up
# holding the max_order=4 scores, as before.
for max_order in range(1, 5):
    results = bleu.compute(predictions=questions_answer, references=questions, max_order=max_order)
    print(results)

What does the Kleene star mean?
What is the meaning of the Kleene star in Regex?
What is the result of the operator?
What is the usage of the Regex lookahead operator "?="?
What are the three tasks that are commonly applied as part of any normalization process?
What are the most common steps in a text normalization process?
What are the two parts of most tokenization schemes?
What are the two most common components of a tokenization scheme?
What is the token segmenter?
What is the purpose of a token segmenter?
What is the token learner part of the BPE algorithm used for?
What is the purpose of a token learner in the BPE algorithm?
What is the task of word normalization?
What is word normalization?
What is the most sophisticated method for lemmatization?
How is lemmatization performed?
What is the task of determining that two words have the same root?
What is lemmatization?
What is the minimum edit distance between two strings defined as?
How is the minimum edit distance between two str

In [None]:
# METEOR score of the answer-aware generated questions against the reference
# questions from the test set (same predictions/references as the BLEU cell).
meteor = evaluate.load('meteor')
results = meteor.compute(predictions=questions_answer, references=test_data['question'].to_list())
print(results)

{'meteor': 0.5605552780231466}


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Use the answer-aware generated questions as queries for the retrieval pipeline.
# NOTE: this rebinds the module-level `model` that generate() reads as the T5
# generator; re-running generate() after this cell would pick up the
# SentenceTransformer instead.
model = SentenceTransformer("sentence-transformers/msmarco-roberta-base-v2")
# kg_retrieval_pipeline is defined outside this section. Judging by how the
# results are used below it presumably returns (score, unanswered queries,
# retrieved answers) — TODO confirm against its definition.
performance, no_answers, answers = kg_retrieval_pipeline(model, questions_answer)


Token indices sequence length is longer than the specified maximum sequence length for this model (774 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (836 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (763 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (797 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (799 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [None]:
# Report the results of the preceding kg_retrieval_pipeline run.
# `performance` is presumably the fraction of queries that were answered
# (it equals 1 - len(no_answers)/59 in the recorded output) — TODO confirm.
print(f"Retrieval performance: {performance}")
print(len(no_answers))

# The entries of no_answers themselves (printed as generated question strings).
print(np.asarray(no_answers))

np_answers = np.asarray(answers)
# NOTE(review): the disabled export below uses a filename that names the
# distilbert model, while this cell reports the msmarco-roberta-base-v2 run —
# adjust the name before re-enabling it.
#np.savetxt(f"/content/drive/MyDrive/nlp/answers_msmarco-distilbert-base-v3_top_3.csv", np_answers, delimiter=",", fmt="%s")
print(np_answers)

# Pair each query with the retrieved answer and the gold answer by position.
# Assumes `answers` holds one entry per query in query order — TODO confirm.
# zip() yields a one-shot iterator, so str_compare is exhausted after the loop.
str_compare = zip(questions_answer,np_answers, np.asarray(test_data['answer']))
print("(Query, Generated Answer, True Answer)")
for item in str_compare:
    print(item)

Retrieval performance: 0.7627118644067796
14
['What is the result of the operator?'
 'What is the task of determining that two words have the same root?'
 'What are LMs?' 'How can one estimate probability?'
 'What are Markov models?'
 'What is the modification of probability mass called?'
 'What is the problem with Eq. 4.6?'
 'What is the method called that finds a minimum of a function?'
 'What is an online algorithm that minimizes the loss function?'
 'What is the idea of vector semantics?'
 'What is the dot product used for?' 'What does the output layer produce?'
 'What is the main weakness of n-gram approaches?'
 'Where do the keys and values come from?']
['zero or more occurrences of the immediately previous character or regular expression'
 'the output is 1 for the inputs'
 'normalizing word formats 3. Normalizing sentence formats'
 'token learner, and token segmenter'
 'token segmenter is a set of tokens that are used to tokenize a test'
 'learning a vocabulary' 'putting words/t

In [None]:
# Same retrieval run on the answer-aware generated questions, with a different
# sentence encoder. `model` is rebound again, and `performance`, `no_answers`
# and `answers` from the previous run are overwritten.
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
performance, no_answers, answers = kg_retrieval_pipeline(model, questions_answer)

Token indices sequence length is longer than the specified maximum sequence length for this model (772 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (627 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (776 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (722 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (620 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Query is irrelevant.


Token indices sequence length is longer than the specified maximum sequence length for this model (760 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (892 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (889 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [None]:
# Report the results of the preceding kg_retrieval_pipeline run
# (msmarco-distilbert-base-v3 encoder, answer-aware generated questions).
# `performance` is presumably the fraction of queries that were answered
# (it equals 1 - len(no_answers)/59 in the recorded output) — TODO confirm.
print(f"Retrieval performance: {performance}")
print(len(no_answers))

# The entries of no_answers themselves (printed as generated question strings).
print(np.asarray(no_answers))

np_answers = np.asarray(answers)
# Disabled export of the retrieved answers to a hard-coded Google Drive path.
#np.savetxt(f"/content/drive/MyDrive/nlp/answers_msmarco-distilbert-base-v3_top_3.csv", np_answers, delimiter=",", fmt="%s")
print(np_answers)

# Pair each query with the retrieved answer and the gold answer by position.
# Assumes `answers` holds one entry per query in query order — TODO confirm.
# zip() yields a one-shot iterator, so str_compare is exhausted after the loop.
str_compare = zip(questions_answer,np_answers, np.asarray(test_data['answer']))
print("(Query, Generated Answer, True Answer)")
for item in str_compare:
    print(item)

Retrieval performance: 0.7966101694915254
12
['What is the result of the operator?'
 'What is the most sophisticated method for lemmatization?'
 'What are LMs?' 'How can one estimate probability?'
 'What is the modification of probability mass called?'
 'What is the problem with Eq. 4.6?'
 'What is another name for multinomial logistic regression?'
 'What is the idea of vector semantics?'
 'What is the dot product used for?'
 'What does the comparison of an item of interest to a collection of other items reveal?'
 'What is a lexical gap?' 'Where do the keys and values come from?']
['zero or more occurrences of the immediately previous character or regular expression'
 'a score'
 'tokenizing (segmenting) words 2. Normalizing word formats 3. Segment'
 'token learner, and token segmenter'
 'token segmenter is a part of a tokenization scheme that takes a test sentence'
 'learning a vocabulary' 'putting words/tokens in a standard format'
 'symbolics' 'lemmatization'
 'the minimum number of 

In [None]:
# Analyse the contexts: inspect individual test paragraphs that were fed to
# the question generators, starting with the paragraph of test row 0.

print(test_data['paragraph'][0])

This language consists of strings with a b, followed by at least two a's, followed by an exclamation point. The set of operators that allows us to say things like "some number of as" are based on the asterisk or *, commonly called the Kleene * (gen-Kleene * erally pronounced "cleany star"). The Kleene star means "zero or more occurrences of the immediately previous character or regular expression". So /a*/ means "any string of zero or more as". This will match a or aaaaaa, but it will also match Off Minor since the string Off Minor has zero a's. So the regular expression for matching one or more a is /aa*/, meaning one a followed by zero or more as. More complex patterns can also be repeated. So /[ab]*/ means "zero or more a's or b's" (not "zero or more right square braces"). This will match strings like aaaa or ababab or bbbb.


In [None]:
# Paragraph of test row 1.
print(test_data['paragraph'][1])

These lookahead assertions make use of the (? syntax that we saw in the previous section for non-capture groups. The operator (?= pattern) is true if pattern occurs, but is zero-width, i.e. the match pointer doesn’t advance. The operator (?! pattern) only returns true if a pattern does not match, but again is zero-width and doesn’t advance the cursor. Negative lookahead is commonly used when we are parsing some complex pattern but want to rule out a special case. For example suppose we want to match, at the beginning of a line, any single word that doesn’t start with “Volcano”. We can use negative lookahead to do this: /ˆ(?!Volcano)[A-Za-z]+/


In [None]:
# Show the paragraph and the reference question of test row 3.
for field in ('paragraph', 'question'):
    print(test_data[field][3])

Most tokenization schemes have two parts: a token learner, and a token segmenter. The token learner takes a raw training corpus (sometimes roughly preseparated into words, for example by whitespace) and induces a vocabulary, a set of tokens. The token segmenter takes a raw test sentence and segments it into the tokens in the vocabulary. Three algorithms are widely used: byte-pair encoding (Sennrich et al., 2016) , unigram language modeling (Kudo, 2018) , and WordPiece (Schuster and Nakajima, 2012) ; there is also a SentencePiece library that includes implementations of the first two of the three (Kudo and Richardson, 2018) .
What are the two most common components of a tokenization scheme?


In [None]:
# Show the paragraph and the reference question of test row 4.
for field in ('paragraph', 'question'):
    print(test_data[field][4])

Most tokenization schemes have two parts: a token learner, and a token segmenter. The token learner takes a raw training corpus (sometimes roughly preseparated into words, for example by whitespace) and induces a vocabulary, a set of tokens. The token segmenter takes a raw test sentence and segments it into the tokens in the vocabulary. Three algorithms are widely used: byte-pair encoding (Sennrich et al., 2016) , unigram language modeling (Kudo, 2018) , and WordPiece (Schuster and Nakajima, 2012) ; there is also a SentencePiece library that includes implementations of the first two of the three (Kudo and Richardson, 2018) .
What is the purpose of a token segmenter?


In [None]:
# Show the paragraph, gold answer and reference question of test row 50.
for field in ('paragraph', 'answer', 'question'):
    print(test_data[field][50])

The self-attention calculation lies at the core of what's called a transformer block, which, in addition to the self-attention layer, includes additional feedforward layers, residual connections, and normalizing layers. The input and output dimensions of these blocks are matched so they can be stacked just as was the case for stacked RNNs. Fig. 9 .18 illustrates a standard transformer block consisting of a single attention layer followed by a fully-connected feedforward layer with residual connections and layer normalizations following each. We've already seen feedforward layers in Chapter 7, but what are residual connections and layer norm? In deep networks, residual connections are connections that pass information from a lower layer to a higher layer without going through the intermediate layer. Allowing information from the activation going forward and the gradient going backwards to skip a layer improves learning and gives higher level layers direct access to information from lowe

In [None]:
# Show the paragraph, gold answer and reference question of test rows 42, 57
# and 9, in that order.
for row in (42, 57, 9):
    for field in ('paragraph', 'answer', 'question'):
        print(test_data[field][row])

An HMM is a probabilistic sequence model: given a sequence of units (words, letters, morphemes, sentences, whatever), it computes a probability distribution over possible sequences of labels and chooses the best label sequence.
An HMM is a probabilistic sequence model
What is a Hidden Markov Model?
The MLM training objective is to predict the original inputs for each of the masked tokens using a bidirectional encoder of the kind described in the last section. The cross-entropy loss from these predictions drives the training process for all the parameters in the model. Note that all of the input tokens play a role in the selfattention process, but only the sampled tokens are used for learning.
predict the original inputs for each of the masked tokens
What is the Masked Language Modeling training objective?
Again, the fact that these two strings are very similar (differing by only one word) seems like useful evidence for deciding that they might be coreferent. Edit distance gives us a wa

In [None]:

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the context-only T5 checkpoint to report its architecture and size.
model_name = "allenai/t5-small-squad2-question-generation"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Layer count, hidden size and attention-head count as exposed by the config.
for config_field in ("num_hidden_layers", "hidden_size", "num_attention_heads"):
    print(getattr(model.config, config_field))

# Total number of elements across all parameters that require gradients.
trainable_parameter_count = 0
for parameter in model.parameters():
    if parameter.requires_grad:
        trainable_parameter_count += parameter.numel()
print(trainable_parameter_count)

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

6
512
8
60506624


In [None]:
# Reload the answer-aware T5 checkpoint to report its architecture and size.
model_path = 'ZhangCheng/T5-Base-Fine-Tuned-for-Question-Generation'
# tokenizer_path is assigned but not used in this cell; only the model is reloaded.
tokenizer_path = 'ZhangCheng/T5-Base-Fine-Tuned-for-Question-Generation'
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Layer count, hidden size and attention-head count as exposed by the config.
for config_field in ("num_hidden_layers", "hidden_size", "num_attention_heads"):
    print(getattr(model.config, config_field))

# Total number of elements across all parameters that require gradients.
trainable_parameter_count = 0
for parameter in model.parameters():
    if parameter.requires_grad:
        trainable_parameter_count += parameter.numel()
print(trainable_parameter_count)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

12
768
12
222903552
