In [1]:
import os
import time
import numpy as np
from datasets import load_dataset
import pandas as pd
import random

from scipy.spatial.distance import cosine
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from pprint import pprint
from project_utility import prepare_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("rony/climate-change-MRC")
train_ds = ds["train"]
valid_ds = ds["validation"]
test_ds = ds["test"]

# each is a 1-item list, so take first index
train_ds = train_ds[0]
valid_ds = valid_ds[0]
test_ds = test_ds[0]

# take the 'data' key of the dict, ignoring 'version' (there's just one)
train_ds = train_ds['data'][0]['paragraphs']
valid_ds = valid_ds['data'][0]['paragraphs']
test_ds = test_ds['data'][0]['paragraphs']
# each dataset is a list of dicts, where each list item is a context paragraph ('context' key) with qas ('qas' key) which contain questions, id, and answer

train_df = pd.DataFrame(prepare_data(train_ds))
print(f"{train_df.shape=}")

valid_df = pd.DataFrame(prepare_data(valid_ds))
print(f"{valid_df.shape=}")

test_df = pd.DataFrame(prepare_data(test_ds))
print(f"{test_df.shape=}")

train_df.shape=(14756, 5)
valid_df.shape=(4229, 5)
test_df.shape=(2096, 5)


### Baseline: TF-IDF retrieval

In [3]:
corpus = [train_df['context'][i] for i in train_df.index]
vectorizer = TfidfVectorizer() # default is at word level rather than n-gram
X = vectorizer.fit_transform(corpus)

In [4]:
def get_context(corpus, question, context_vectorized, vectorizer, top_k=1):
    question_vec = vectorizer.transform([question])
    # use pairwise cosine similarity
    similarity_scores = cosine_similarity(question_vec, context_vectorized).flatten()
    top_k_indices = similarity_scores.argsort()[::-1][:top_k]

    return ';'.join([corpus[i] for i in top_k_indices])

In [5]:
tqdm.pandas(desc="tfidf retrieval")
train_df['retrieved_context'] = train_df['question'].progress_apply(
    lambda q: get_context(corpus, q, X, vectorizer, top_k=1))

tfidf retrieval: 100%|██████████| 14756/14756 [27:13<00:00,  9.04it/s]   


In [29]:
train_df['retrieval_correct'] = train_df.apply(
    lambda row: row['context'].lower() in row['retrieved_context'].lower(), axis=1
)

In [7]:
train_df.head()

Unnamed: 0,context,question,id,answer,answer_start,retrieved_context,retrieval_correct
0,"outside of the clean air act, there is support...",Is a mandate for electric production to come f...,665,"and there is ongoing interest in a ""national r...",222,energy efficiency and renewable energy policie...,False
1,"outside of the clean air act, there is support...",How wide ranging are climate policies across t...,666,"as of 2010, climate policies were being contem...",618,while these two conclusions are not mutually e...,False
2,"outside of the clean air act, there is support...",What kind of energy standards have been implem...,667,"at latest count, 30 states have implemented re...",734,"as commented, discharge standards vary from co...",False
3,acknowledgments. we thank jim haywood for the ...,What was Jim Haywood thanked for?,1731,we thank jim haywood for the aerosol forcing pdf,17,acknowledgments. we thank jim haywood for the ...,True
4,acknowledgments. we thank jim haywood for the ...,Where was Jonathan Gregory supported?,1732,jonathan gregory was supported at the universi...,277,acknowledgments. we thank jim haywood for the ...,True


In [8]:
# evaluate on test
corpus_test = [test_df['context'][i] for i in test_df.index]
X_test = vectorizer.transform(corpus_test)
tqdm.pandas(desc="tfidf retrieval")
test_df['retrieved_context'] = test_df['question'].progress_apply(
    lambda q: get_context(corpus_test, q, X_test, vectorizer, top_k=1))

tfidf retrieval: 100%|██████████| 2096/2096 [00:03<00:00, 622.63it/s]


In [10]:
test_df['retrieval_correct'] = test_df.apply(
    lambda row: row['context'].lower() in row['retrieved_context'].lower(), axis=1
)
print(f"percent correct: {test_df.retrieval_correct.astype(int).sum()/len(test_df):.2%}")

percent correct: 65.03%


### Model 1: out of the box semantic search model

In [35]:
# performing asymmetric semantic search (https://www.sbert.net/examples/sentence_transformer/applications/semantic-search/README.html)
# selecting a model that has high performance scores on MS Marco https://www.sbert.net/docs/pretrained-models/msmarco-v5.html
encoder_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-cos-v5')

In [None]:
train_context_embeddings = encoder_model.encode([train_df['context'][i] for i in train_df.index])
train_question_embeddings = encoder_model.encode([train_df['question'][i] for i in train_df.index])

In [None]:
print(train_context_embeddings.shape)
print(train_question_embeddings.shape)

(14756, 768)
(14756, 768)


In [None]:
corpus = train_df['context'].to_list()
nn = NearestNeighbors(n_neighbors=1, metric='cosine')
nn.fit(train_context_embeddings)

In [None]:
def retrieve_knn_context(corpus, question_embedding):
    distances, indices = nn.kneighbors([question_embedding], return_distance=True)
    return corpus[indices[0][0]]

retrieved_context_knn_ls = []
for i in tqdm(range(len(train_question_embeddings))):
    ret_context = retrieve_knn_context(corpus, train_question_embeddings[i])
    retrieved_context_knn_ls.append(ret_context)
train_df['retrieved_context_knn'] = retrieved_context_knn_ls

100%|██████████| 14756/14756 [05:07<00:00, 48.00it/s]


In [None]:
train_df['retrieval_correct_knn'] = train_df.apply(
    lambda row: row['context'].lower() in row['retrieved_context_knn'].lower(), axis=1
)

In [31]:
train_df.head()

Unnamed: 0,context,question,id,answer,answer_start,retrieved_context,retrieval_correct,retrieved_context_knn,retrieval_correct_knn
0,"outside of the clean air act, there is support...",Is a mandate for electric production to come f...,665,"and there is ongoing interest in a ""national r...",222,energy efficiency and renewable energy policie...,False,"outside of the clean air act, there is support...",True
1,"outside of the clean air act, there is support...",How wide ranging are climate policies across t...,666,"as of 2010, climate policies were being contem...",618,while these two conclusions are not mutually e...,False,addressing climate change in the united states...,False
2,"outside of the clean air act, there is support...",What kind of energy standards have been implem...,667,"at latest count, 30 states have implemented re...",734,"as commented, discharge standards vary from co...",False,"outside of the clean air act, there is support...",True
3,acknowledgments. we thank jim haywood for the ...,What was Jim Haywood thanked for?,1731,we thank jim haywood for the aerosol forcing pdf,17,acknowledgments. we thank jim haywood for the ...,True,acknowledgements we wish to thank all our frie...,False
4,acknowledgments. we thank jim haywood for the ...,Where was Jonathan Gregory supported?,1732,jonathan gregory was supported at the universi...,277,acknowledgments. we thank jim haywood for the ...,True,acknowledgements this research benefited from ...,False


In [39]:
encoder_model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')

In [40]:
test_context_embeddings = encoder_model.encode([test_df['context'][i] for i in test_df.index])
test_question_embeddings = encoder_model.encode([test_df['question'][i] for i in test_df.index])
print(test_context_embeddings.shape)
print(test_question_embeddings.shape)

corpus = test_df['context'].to_list()
nn = NearestNeighbors(n_neighbors=1, metric='cosine')
nn.fit(test_context_embeddings)

(2096, 384)
(2096, 384)


In [41]:
retrieved_context_knn_ls = []
for i in tqdm(range(len(test_question_embeddings))):
    ret_context = retrieve_knn_context(corpus, test_question_embeddings[i])
    retrieved_context_knn_ls.append(ret_context)
test_df['retrieved_context_knn'] = retrieved_context_knn_ls

test_df['retrieval_correct_knn'] = test_df.apply(
    lambda row: row['context'].lower() in row['retrieved_context_knn'].lower(), axis=1
)

print(f"percent correct: {test_df.retrieval_correct_knn.astype(int).sum()/len(test_df):.2%}")

100%|██████████| 2096/2096 [00:04<00:00, 473.66it/s]

percent correct: 44.13%





k = 1 results
| Encoder model                                       | Embedding size | % correct |
|-----------------------------------------------------|----------------|-----------|
| sentence-transformers/msmarco-distilbert-cos-v5     | 768            | 47.66%    |
| sentence-transformers/msmarco-distilbert-base-tas-b | 768            | 47.33%    |
| sentence-transformers/msmarco-MiniLM-L6-cos-v5      | 384            | 44.13%    |