In [None]:
!pip install sentence-transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34

In [None]:
import heapq
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertForQuestionAnswering, BertTokenizer, AutoTokenizer, pipeline, AutoModel
from datasets import load_dataset
import networkx as nx

In [None]:
# Book paragraphs come from the Hugging Face dataset; the evaluation questions are read
# straight from the CSV file hosted in the same dataset repository.
dataset = load_dataset("GroNLP/ik-nlp-22_slp")
url = "https://huggingface.co/datasets/GroNLP/ik-nlp-22_slp/raw/main/slp_questions.csv"
test_data = pd.read_csv(url)
train_data = dataset["train"]

# One record per paragraph, keeping its position in the book next to the text.
book_content = [
    {
        "chapter": row["n_chapter"],
        "section": row["n_section"],
        "subsection": row["n_subsection"],
        "paragraph": row["text"],
    }
    for row in train_data
]

Downloading builder script:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.97k [00:00<?, ?B/s]



Downloading and preparing dataset ik-nlp-22_slp/paragraphs to /root/.cache/huggingface/datasets/GroNLP___ik-nlp-22_slp/paragraphs/1.0.0/6c89281b2028a8a126102dda2c3fb94b1a5ccea59943d26857ae138c7aa782f8...


Downloading data:   0%|          | 0.00/741k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset ik-nlp-22_slp downloaded and prepared to /root/.cache/huggingface/datasets/GroNLP___ik-nlp-22_slp/paragraphs/1.0.0/6c89281b2028a8a126102dda2c3fb94b1a5ccea59943d26857ae138c7aa782f8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
## Knowledge graph
def build_kg(book_content, model, similarity_threshold=0.8):
    """Build an undirected paragraph-similarity graph.

    Every entry of ``book_content`` becomes a node keyed by its position in the
    list. Each node stores the paragraph text (``text``) and the embedding
    produced by ``model`` (``embedding``). Two nodes are linked when the cosine
    similarity of their embeddings is strictly greater than
    ``similarity_threshold``; the similarity is stored as the edge ``weight``.

    Args:
        book_content: Sequence of dicts that each provide a ``"paragraph"`` key.
        model: Object exposing ``encode(list_of_str, convert_to_tensor=True)``
            and returning one embedding row per input string.
        similarity_threshold: Minimum (exclusive) cosine similarity for an edge.

    Returns:
        An ``nx.Graph``; it is empty when ``book_content`` is empty.
    """
    G = nx.Graph()
    if not book_content:
        return G

    # compute and store embeddings
    paragraphs = [entry["paragraph"] for entry in book_content]
    paragraph_embeddings = model.encode(paragraphs, convert_to_tensor=True)

    for i, (text, embedding) in enumerate(zip(paragraphs, paragraph_embeddings)):
        G.add_node(i, text=text, embedding=embedding)

    # Unit-length rows; normalize() clamps the norm so an all-zero embedding
    # yields zeros instead of NaN.
    unit_embeddings = torch.nn.functional.normalize(paragraph_embeddings, dim=-1)

    # compute pairwise similarity matrix
    similarity_matrix = torch.mm(unit_embeddings, unit_embeddings.T).cpu().numpy()

    # The matrix is symmetric and the graph is undirected, so only the strict
    # upper triangle is needed: k=1 skips self-similarities and avoids adding
    # every edge twice.
    linked = np.triu(similarity_matrix > similarity_threshold, k=1)
    i_indices, j_indices = np.nonzero(linked)
    for i, j in zip(i_indices.tolist(), j_indices.tolist()):
        # Plain Python int/float keep node ids and weights free of NumPy scalars.
        G.add_edge(i, j, weight=float(similarity_matrix[i, j]))

    return G

In [None]:
def retrieve_relevant_paragraphs(query, G, model, top_k=5, threshold=0.3):
    """Return the paragraphs in ``G`` most similar to ``query``.

    The query is embedded with ``model`` and compared, by cosine similarity,
    with the ``embedding`` attribute of every node in ``G``. The paragraph text
    is read from the node's ``text`` attribute, so the function depends only
    on the graph it is given.

    Args:
        query: Question text to embed.
        G: Graph whose nodes carry ``embedding`` (1-D tensor) and ``text``.
        model: Object exposing ``encode(str, convert_to_tensor=True)``.
        top_k: Maximum number of paragraphs to return.
        threshold: If the best similarity is below this value the query is
            treated as irrelevant.

    Returns:
        A list of ``{"paragraph": text}`` dicts ordered from most to least
        similar, or ``None`` when the graph has no nodes, nothing is selected,
        or the best match scores below ``threshold``.
    """
    nodes = list(G.nodes)
    if not nodes:
        return None

    query_embedding = model.encode(query, convert_to_tensor=True)

    # One batched comparison instead of a per-node loop; cosine_similarity
    # normalises internally, so no manual normalisation is required.
    node_embeddings = torch.stack([G.nodes[node]["embedding"] for node in nodes])
    scores = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(0), node_embeddings, dim=-1
    ).cpu().tolist()

    ranked = sorted(zip(nodes, scores), key=lambda pair: pair[1], reverse=True)[:top_k]

    if not ranked or ranked[0][1] < threshold:
        return None

    return [{"paragraph": G.nodes[node]["text"]} for node, _ in ranked]

# Smoke test: build the graph once and retrieve paragraphs for one hand-written query.
query = "How can we estimate the probability of a word?"
model = SentenceTransformer("sentence-transformers/msmarco-roberta-base-v2")
# Embeds every paragraph in book_content with `model` and links similar paragraphs.
G = build_kg(book_content, model)

# top_k and threshold keep their defaults (5 and 0.3); the result is None when the
# best match falls below the threshold.
relevant_paragraphs = retrieve_relevant_paragraphs(query, G, model)

Downloading (…)e7783/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)46fdee7783/README.md:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading (…)fdee7783/config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)6fdee7783/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)e7783/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)6fdee7783/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)dee7783/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Wrap the retrieved {"paragraph": ...} dicts in an array so each one prints on its own line.
print(np.asarray(relevant_paragraphs))

[{'paragraph': 'Given the bigram assumption for the probability of an individual word, we can compute the probability of a complete word sequence by substituting Eq. 3.7 into Eq. 3.4:'}
 {'paragraph': 'Let\'s begin with the task of computing P(w|h), the probability of a word w given some history h. Suppose the history h is "its water is so transparent that" and we want to know the probability that the next word is the:'}
 {'paragraph': 'To learn the probability P( f i |c), we\'ll assume a feature is just the existence of a word in the document\'s bag of words, and so we\'ll want P(w i |c), which we compute as the fraction of times the word w i appears among all words in all documents of topic c. We first concatenate all documents with category c into one big "category c" text. Then we use the frequency of w i in this concatenated document to give a maximum likelihood estimate of the probability:'}
 {'paragraph': 'Similarly, if we wanted to know the joint probability of an entire sequen

In [None]:
from tqdm import tqdm

def create_prompt(query, relevant_paragraphs):
    """Assemble a generation prompt from a question and its retrieved paragraphs.

    The prompt is the query followed by a space and a newline, then every
    paragraph text prefixed with ". " and concatenated without separators.
    """
    pieces = [f"{query} \n"]
    for item in relevant_paragraphs:
        pieces.append(f". {item['paragraph']}")
    return "".join(pieces)

def generate_answer(prompt, text_generator=None):
    """Generate an answer for ``prompt`` with a text-generation pipeline.

    Args:
        prompt: Text passed to the pipeline.
        text_generator: Callable returning a list of dicts with a
            ``"generated_text"`` key. When omitted, the notebook-level
            ``generator`` variable is used, as in the original one-argument call.

    Returns:
        The ``generated_text`` value of the first result (``None`` if absent).

    Raises:
        NameError: If no pipeline is passed and ``generator`` is not defined.
    """
    if text_generator is None:
        # Existing callers rely on the module-level `generator`; look it up
        # explicitly so a missing pipeline produces an actionable message.
        text_generator = globals().get("generator")
        if text_generator is None:
            raise NameError(
                "name 'generator' is not defined; create the text-generation "
                "pipeline first or pass it as text_generator"
            )
    answer = text_generator(prompt)[0].get('generated_text')
    return answer

def kg_retrieval_pipeline(model, top_k=3):
    """Evaluate paragraph retrieval on the questions in ``test_data``.

    Builds the graph from ``book_content`` with ``model``, retrieves up to
    ``top_k`` paragraphs for every question, and counts a hit when the gold
    paragraph text is exactly one of the retrieved texts.

    Args:
        model: Sentence encoder passed to ``build_kg`` and to retrieval.
        top_k: Number of paragraphs retrieved per question.

    Returns:
        A tuple ``(performance, no_answers, answers)``:
        ``performance`` is hits divided by the number of questions (0.0 when
        there are no questions); ``no_answers`` lists the questions that were
        judged irrelevant or missed the gold paragraph; ``answers`` only
        receives the "QUERY IRRELEVANT" placeholder, because this variant
        performs no answer generation.
    """
    correct_paragraphs = test_data["paragraph"].tolist()
    queries = test_data["question"].tolist()

    G = build_kg(book_content, model)

    count = 0
    no_answers = []
    answers = []

    for query, gt in tqdm(zip(queries, correct_paragraphs), total=len(queries)):
        relevant_paragraphs = retrieve_relevant_paragraphs(query, G, model, top_k=top_k)

        if relevant_paragraphs is None:
            print("Query is irrelevant.")
            no_answers.append(query)
            answers.append("QUERY IRRELEVANT")
        elif gt in [p["paragraph"] for p in relevant_paragraphs]:
            count += 1
        else:
            no_answers.append(query)

    # Guard the division so an empty question set reports 0.0 instead of raising.
    performance = count / len(correct_paragraphs) if correct_paragraphs else 0.0
    return performance, no_answers, answers

# Instantiate the text-generation pipeline
# generator = pipeline("text2text-generation", model="allenai/unifiedqa-t5-base")
# NOTE: the pipeline called below never invokes generate_answer(), so this run
# measures retrieval only.

# Candidate sentence encoders; keep exactly one `model = ...` line active.
#model = SentenceTransformer("sentence-transformers/msmarco-roberta-base-v2")
#model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
#model = SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v1")
model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L-12-v3")

# Rebinds the notebook-level `model`, `performance`, `no_answers` and `pred_answers`.
performance, no_answers, pred_answers = kg_retrieval_pipeline(model)

100%|██████████| 59/59 [00:25<00:00,  2.33it/s]


In [None]:
# Fraction of test questions whose gold paragraph was among the retrieved paragraphs.
print(performance)

0.6949152542372882


In [None]:
def generate_answer(prompt, context):
    """Extract an answer to ``prompt`` from ``context``.

    Sends the question/context pair to the notebook-level ``qa_pipeline`` and
    returns the ``"answer"`` field of its result.
    """
    qa_input = {"question": prompt, "context": context}
    prediction = qa_pipeline(qa_input)
    return prediction["answer"]

def kg_retrieval_pipeline(model, top_k=5):
    """Evaluate retrieval and produce an extractive answer for every question.

    Builds the graph from ``book_content`` with ``model`` and retrieves up to
    ``top_k`` paragraphs per question in ``test_data``. The retrieved texts,
    joined with single spaces, form the context passed to ``generate_answer``.
    A hit is counted when the gold paragraph text is exactly one of the
    retrieved texts.

    Args:
        model: Sentence encoder passed to ``build_kg`` and to retrieval.
        top_k: Number of paragraphs retrieved per question.

    Returns:
        A tuple ``(performance, no_answers, answers)``:
        ``performance`` is hits divided by the number of questions (0.0 when
        there are no questions); ``no_answers`` lists the questions that were
        judged irrelevant or missed the gold paragraph; ``answers`` holds one
        entry per question, either the generated answer or the
        "QUERY IRRELEVANT" placeholder.
    """
    correct_paragraphs = test_data["paragraph"].tolist()
    queries = test_data["question"].tolist()

    G = build_kg(book_content, model)

    count = 0
    no_answers = []
    answers = []

    for query, gt in tqdm(zip(queries, correct_paragraphs), total=len(queries)):
        relevant_paragraphs = retrieve_relevant_paragraphs(query, G, model, top_k=top_k)

        if relevant_paragraphs is None:
            print("Query is irrelevant.")
            no_answers.append(query)
            answers.append("QUERY IRRELEVANT")
            continue

        # Built once and reused for both the QA context and the hit check.
        retrieved_texts = [p["paragraph"] for p in relevant_paragraphs]
        answers.append(generate_answer(query, " ".join(retrieved_texts)))

        if gt in retrieved_texts:
            count += 1
        else:
            no_answers.append(query)

    # Guard the division so an empty question set reports 0.0 instead of raising.
    performance = count / len(correct_paragraphs) if correct_paragraphs else 0.0
    return performance, no_answers, answers

# Extractive QA model; generate_answer() in this cell reads it as a notebook-level variable,
# so it must be created before the pipeline call below.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Candidate sentence encoders; keep exactly one `model = ...` line active.
#model = SentenceTransformer("sentence-transformers/msmarco-roberta-base-v2")
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
#model = SentenceTransformer("sentence-transformers/paraphrase-distilroberta-base-v1")
#model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L-12-v3")

# Rebinds the notebook-level `model`, `performance`, `no_answers` and `pred_answers`.
performance, no_answers, pred_answers = kg_retrieval_pipeline(model)


100%|██████████| 59/59 [03:02<00:00,  3.09s/it]


In [None]:
# Share of questions whose gold paragraph appeared among the retrieved paragraphs.
print(f"Retrieval accuracy: {performance}")

Retrieval accuracy: 0.8305084745762712


In [None]:
# Convert the answers returned by kg_retrieval_pipeline to an array so they can be
# concatenated with the gold answers when scoring.
pred_answers = np.asarray(pred_answers)

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK English stop-word list.
nltk.download('stopwords')
stop_words = stopwords.words('english')
# NOTE(review): the answer-scoring cell rebinds `vectorizer` to a default TfidfVectorizer(),
# so this stop-word-aware vectorizer is not the one used for scoring -- confirm which is intended.
vectorizer = TfidfVectorizer(stop_words=stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Gold answers from the `answer` column of the questions CSV, in the same row order as
# the questions read from that DataFrame.
gt_answers = np.asarray(test_data["answer"].tolist())

In [None]:
# Score each predicted answer against its gold answer with TF-IDF cosine similarity.
# The i-th prediction is compared with the i-th gold answer, so both arrays must be
# aligned; fail loudly instead of silently scoring mismatched pairs.
if len(pred_answers) != len(gt_answers):
    raise ValueError(
        f"pred_answers ({len(pred_answers)}) and gt_answers ({len(gt_answers)}) "
        "must have the same length; re-run the QA pipeline cell before scoring."
    )

corpus = np.concatenate((pred_answers, gt_answers))

# NOTE(review): this rebinds `vectorizer` to a default TfidfVectorizer without stop words,
# replacing any previously configured instance -- confirm that this is intended.
vectorizer = TfidfVectorizer()

# Vectorize the corpus: rows [0, n) are predictions, rows [n, 2n) are gold answers.
tfidf = vectorizer.fit_transform(corpus)
n_answers = len(pred_answers)

# Calculate the cosine similarity between each predicted answer and its gold answer
similarity_scores = [
    cosine_similarity(tfidf[i], tfidf[i + n_answers])[0][0]
    for i in range(n_answers)
]

# Calculate the average similarity score
avg_similarity = np.mean(similarity_scores)

print("Average similarity score: ", avg_similarity)

Average similarity score:  0.5218948012939791
