In [1]:
import os
import time
import numpy as np
from datasets import load_dataset
import pandas as pd
import random

from scipy.spatial.distance import cosine
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load data
def prepare_data(data):
    """Flatten SQuAD-style paragraphs into one record per (question, answer) pair.

    Args:
        data: iterable of paragraph dicts. Each has a 'context' string and a
            'qas' list; every qa carries 'question', 'id' and an 'answers'
            list whose items hold 'text' and 'answer_start'.

    Returns:
        list of dicts with the keys 'context', 'question', 'id', 'answer'
        and 'answer_start' (in that order). A question whose 'answers' list
        is empty contributes no records.
    """
    records = []

    for paragraph in data:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question, qa_id = qa['question'], qa['id']
            # one output row per annotated answer of this question
            records.extend(
                {
                    'context': context,
                    'question': question,
                    'id': qa_id,
                    'answer': ans['text'],
                    'answer_start': ans['answer_start'],
                }
                for ans in qa['answers']
            )

    return records

In [3]:
ds = load_dataset("rony/climate-change-MRC")

# Per the original notes: every split is a 1-item list, and that item's 'data'
# list has a single entry (the 'version' field is ignored). Its 'paragraphs'
# are dicts holding a 'context' plus 'qas' (question, id and answers).
train_ds = ds["train"][0]['data'][0]['paragraphs']
valid_ds = ds["validation"][0]['data'][0]['paragraphs']
test_ds = ds["test"][0]['data'][0]['paragraphs']

# one DataFrame row per (context, question, answer) triple
train_df = pd.DataFrame(prepare_data(train_ds))
print(f"{train_df.shape=}")

valid_df = pd.DataFrame(prepare_data(valid_ds))
print(f"{valid_df.shape=}")

test_df = pd.DataFrame(prepare_data(test_ds))
print(f"{test_df.shape=}")

train_df.shape=(14756, 5)
valid_df.shape=(4229, 5)
test_df.shape=(2096, 5)


### Baseline: TF-IDF retrieval

In [4]:
# Fit the TF-IDF vocabulary and weights on the training contexts only.
corpus_train = train_df['context'].tolist()
vectorizer = TfidfVectorizer()  # default is at word level rather than n-gram
# fit() hands back the fitted vectorizer; keep it under an explicit name
vectorizer_fit_train = vectorizer.fit(corpus_train)

In [5]:
def get_context(corpus, question, context_vectorized, vectorizer, top_k=1):
    """Return the top_k corpus entries most similar to the question, joined by ';'.

    Args:
        corpus: sequence of context strings; entry i corresponds to row i of
            context_vectorized.
        question: question text, vectorized with `vectorizer`.
        context_vectorized: vectorized contexts to score against.
        vectorizer: fitted vectorizer exposing transform().
        top_k: how many contexts to return.

    Returns:
        The selected contexts, highest cosine similarity first, concatenated
        with ';'.
    """
    query_vec = vectorizer.transform([question])
    scores = cosine_similarity(query_vec, context_vectorized).flatten()
    # ascending argsort, reversed -> row indices from best to worst score
    ranked = scores.argsort()[::-1]
    best = ranked[:top_k]

    return ';'.join(corpus[idx] for idx in best)

In [6]:
# Candidate pool: the context of every test row, projected with the train-fitted TF-IDF.
corpus_test = test_df['context'].tolist()
corpus_test_vectorized = vectorizer_fit_train.transform(corpus_test)

# top-1 retrieval per test question, with a progress bar
tqdm.pandas(desc="tfidf retrieval on test")
retrieved_contexts = test_df['question'].progress_apply(
    lambda q: get_context(corpus_test, q, corpus_test_vectorized, vectorizer_fit_train, top_k=1))
test_df['retrieved_context'] = retrieved_contexts

tfidf retrieval on test: 100%|██████████| 2096/2096 [00:03<00:00, 673.10it/s]


In [7]:
# A retrieval is a hit when the gold context appears (case-insensitively)
# inside the retrieved text.
test_df['retrieval_correct'] = [
    gold.lower() in retrieved.lower()
    for gold, retrieved in zip(test_df['context'], test_df['retrieved_context'])
]
print(test_df.retrieval_correct.value_counts(normalize=True))

retrieval_correct
True     0.650286
False    0.349714
Name: proportion, dtype: float64


In [9]:
# Persist the TF-IDF retrieval results (index column omitted) and preview the first rows.
test_df.to_csv('test_df_tfidf_retrieval.csv', index=False)
test_df.head()

Unnamed: 0,context,question,id,answer,answer_start,retrieved_context,retrieval_correct
0,some more detailed work has been done at natio...,The 9 percent reduction of rice in Bangladesh ...,14095,flooding damage and climate variability,514,some more detailed work has been done at natio...,True
1,some more detailed work has been done at natio...,What kind of model of Bangladesh was had been ...,14096,a dynamic economywide model,70,some more detailed work has been done at natio...,True
2,some more detailed work has been done at natio...,What approach did Ahmed use to estimate how ch...,14097,a modelling approach,639,some more detailed work has been done at natio...,True
3,extreme sea level height fluctuations are also...,Where height fluctuations are large?,2843,extreme sea level height fluctuations are also...,0,"d [?]t ln rz, which is derived by dividing rzf...",False
4,extreme sea level height fluctuations are also...,How non-tide sea levels are obtained?,2844,the non-tide sea levels are obtained by spectr...,167,extreme sea level height fluctuations are also...,True


### Model 1: out of the box semantic search model

In [8]:
# Asymmetric semantic search, following the sentence-transformers guide:
# https://www.sbert.net/examples/sentence_transformer/applications/semantic-search/README.html
# The encoder is picked from the models with high performance scores on MS MARCO:
# https://www.sbert.net/docs/pretrained-models/msmarco-v5.html
encoder_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-cos-v5')

In [12]:
# Embed every training context and question with the current encoder;
# row order follows train_df.
train_context_embeddings = encoder_model.encode(train_df['context'].tolist())
train_question_embeddings = encoder_model.encode(train_df['question'].tolist())

In [None]:
# Sanity check: print the shapes of the context and question embedding matrices.
print(train_context_embeddings.shape)
print(train_question_embeddings.shape)

(14756, 768)
(14756, 768)


In [None]:
# Retrieval pool for the kNN search: the training contexts, in train_df row order.
corpus = train_df['context'].to_list()
# Cosine-distance nearest-neighbour index returning a single neighbour per query.
# `nn` is read as a module-level name by retrieve_knn_context.
nn = NearestNeighbors(n_neighbors=1, metric='cosine')
nn.fit(train_context_embeddings)

In [13]:
def retrieve_knn_context(corpus, question_embedding):
    """Return the corpus entry nearest to the given question embedding.

    The lookup goes through the module-level `nn` index. The neighbour index
    it returns is used to index `corpus`, so `corpus` is expected to be
    ordered like the embeddings `nn` was fitted on.

    Args:
        corpus: sequence of contexts.
        question_embedding: 1-D embedding vector of the question.

    Returns:
        corpus[j], where j is the first neighbour reported by `nn`.
    """
    # only the neighbour indices are needed, so distances are not requested
    neighbor_ids = nn.kneighbors([question_embedding], return_distance=False)
    return corpus[neighbor_ids[0][0]]

In [None]:
# Top-1 nearest-neighbour retrieval for every training question.
# retrieve_knn_context accepts only (corpus, question_embedding) and the index
# is fitted with n_neighbors=1, so no `k` keyword is passed (passing one raised
# a TypeError).
retrieved_context_knn_ls = [
    retrieve_knn_context(corpus, question_embedding)
    for question_embedding in tqdm(train_question_embeddings)
]
train_df['retrieved_context_knn'] = retrieved_context_knn_ls

In [None]:
# Hit when the gold context appears (case-insensitively) in the kNN-retrieved context.
train_df['retrieval_correct_knn'] = [
    gold.lower() in retrieved.lower()
    for gold, retrieved in zip(train_df['context'], train_df['retrieved_context_knn'])
]

In [31]:
# Preview the first training rows, including the retrieval columns added so far.
train_df.head()

Unnamed: 0,context,question,id,answer,answer_start,retrieved_context,retrieval_correct,retrieved_context_knn,retrieval_correct_knn
0,"outside of the clean air act, there is support...",Is a mandate for electric production to come f...,665,"and there is ongoing interest in a ""national r...",222,energy efficiency and renewable energy policie...,False,"outside of the clean air act, there is support...",True
1,"outside of the clean air act, there is support...",How wide ranging are climate policies across t...,666,"as of 2010, climate policies were being contem...",618,while these two conclusions are not mutually e...,False,addressing climate change in the united states...,False
2,"outside of the clean air act, there is support...",What kind of energy standards have been implem...,667,"at latest count, 30 states have implemented re...",734,"as commented, discharge standards vary from co...",False,"outside of the clean air act, there is support...",True
3,acknowledgments. we thank jim haywood for the ...,What was Jim Haywood thanked for?,1731,we thank jim haywood for the aerosol forcing pdf,17,acknowledgments. we thank jim haywood for the ...,True,acknowledgements we wish to thank all our frie...,False
4,acknowledgments. we thank jim haywood for the ...,Where was Jonathan Gregory supported?,1732,jonathan gregory was supported at the universi...,277,acknowledgments. we thank jim haywood for the ...,True,acknowledgements this research benefited from ...,False


In [28]:
# Asymmetric semantic search, following the sentence-transformers guide:
# https://www.sbert.net/examples/sentence_transformer/applications/semantic-search/README.html
# Model taken from the MS MARCO list: https://www.sbert.net/docs/pretrained-models/msmarco-v5.html
# NOTE: this rebinds `encoder_model`, so every cell run afterwards encodes with the MiniLM model.
encoder_model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')

In [29]:
# Retrieval pool: the test contexts, in test_df row order.
corpus = test_df['context'].to_list()

# Embed contexts and questions with the current encoder.
test_context_embeddings = encoder_model.encode(corpus)
test_question_embeddings = encoder_model.encode(test_df['question'].tolist())
print(test_context_embeddings.shape)
print(test_question_embeddings.shape)

# Cosine-distance 1-NN index over the test contexts (rebinds the global `nn`).
nn = NearestNeighbors(n_neighbors=1, metric='cosine')
nn.fit(test_context_embeddings)

(2096, 384)
(2096, 384)


In [30]:
def retrieve_knn_context(corpus, question_embedding):
    """Look up the context closest to a question embedding.

    Queries the module-level `nn` index and uses the first neighbour index it
    reports to pick an entry from `corpus`; `corpus` is therefore expected to
    be in the same order as the embeddings `nn` was fitted on.

    Args:
        corpus: sequence of contexts.
        question_embedding: 1-D embedding vector of the question.

    Returns:
        The corpus entry at the nearest neighbour's position.
    """
    # distances are not used, so ask for indices only
    nearest = nn.kneighbors([question_embedding], return_distance=False)
    best_row = nearest[0][0]
    return corpus[best_row]

In [31]:
# Top-1 nearest-neighbour retrieval for every test question.
retrieved_context_knn_ls = [
    retrieve_knn_context(corpus, question_embedding)
    for question_embedding in tqdm(test_question_embeddings)
]
test_df['retrieved_context_knn'] = retrieved_context_knn_ls

# Hit when the gold context appears (case-insensitively) in the retrieved context.
test_df['retrieval_correct_knn'] = [
    gold.lower() in retrieved.lower()
    for gold, retrieved in zip(test_df['context'], test_df['retrieved_context_knn'])
]

n_correct = test_df.retrieval_correct_knn.astype(int).sum()
print(f"percent correct: {n_correct/len(test_df):.2%}")

100%|██████████| 2096/2096 [00:04<00:00, 464.56it/s]

percent correct: 44.13%





In [15]:
# Preview the first test rows together with the kNN retrieval columns.
test_df.head()

Unnamed: 0,context,question,id,answer,answer_start,retrieved_context_knn,retrieval_correct_knn
0,some more detailed work has been done at natio...,The 9 percent reduction of rice in Bangladesh ...,14095,flooding damage and climate variability,514,some more detailed work has been done at natio...,True
1,some more detailed work has been done at natio...,What kind of model of Bangladesh was had been ...,14096,a dynamic economywide model,70,some more detailed work has been done at natio...,True
2,some more detailed work has been done at natio...,What approach did Ahmed use to estimate how ch...,14097,a modelling approach,639,the following sections highlight the possible ...,False
3,extreme sea level height fluctuations are also...,Where height fluctuations are large?,2843,extreme sea level height fluctuations are also...,0,"helianthella plants along trail 401, a few hun...",False
4,extreme sea level height fluctuations are also...,How non-tide sea levels are obtained?,2844,the non-tide sea levels are obtained by spectr...,167,extreme sea level height fluctuations are also...,True


k = 1 results
| Encoder model                                       | Embedding size | % correct |
|-----------------------------------------------------|----------------|-----------|
| sentence-transformers/msmarco-distilbert-cos-v5     | 768            | 47.66%    |
| sentence-transformers/msmarco-distilbert-base-tas-b | 768            | 47.33%    |
| sentence-transformers/msmarco-MiniLM-L6-cos-v5      | 384            | 44.13%    |

In [20]:
# look at nearest neighbors for a sample question
encoder_model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')

# random row position; no seed is set, so the sampled question differs between runs
sample_idx = random.randrange(len(test_df))
question_embedding = encoder_model.encode(test_df['question'][sample_idx])
context_embeddings = encoder_model.encode(test_df['context'].tolist())

# 5 nearest contexts by cosine distance (rebinds the global `nn`)
nn = NearestNeighbors(n_neighbors=5, metric='cosine').fit(context_embeddings)
distances, indices = nn.kneighbors([question_embedding], return_distance=True)


In [24]:
# Cosine distances of the 5 nearest contexts to the sampled question.
# NOTE(review): repeated values are presumably duplicate contexts, since the pool
# holds one copy of a context per question/answer row — confirm.
distances

array([[0.711156 , 0.711156 , 0.711156 , 0.7184627, 0.7184627]],
      dtype=float32)

In [32]:
# Show the sampled question, its gold context, and each neighbour with its distance.
print(f"question: {test_df['question'][sample_idx]}")
print(f"correct context: {test_df['context'][sample_idx]}")
for dist, idx in zip(distances[0], indices[0]):
    pprint(f"({dist}) {test_df['context'][idx]}")
    print()

question: What did he forsee could possibly happen?
correct context: sir crispin tickell is chancellor of the university of kent at canterbury, chairman of the climate institute in washington, dc and director of the green college centre for environmental policy and understanding. he is the former united kingdom permanent representative to the united nations, british ambassador to mexico and permanent secretary of the overseas development administration. he serves as convener of the british government panel on sustainable development and is a member of the china council for international cooperation on environment and development. he is the author of climatic change and world affairs, which two decades ago pointed to the possibility that climate change could affect international stability.
('(0.7111560106277466) nosema apis is a microsporidian that attacks the midgut '
 'wall of adult honey bees. the disease can develop with no visible symptoms '
 'or manifest itself as a weakening of t

In [None]:
# Questions are formulated with a specific context in mind. The above question is incredibly vague if you don't have the associated context, so the embedding model struggles.
# TF-IDF may do better because of extremely specific words in the question that only show up in the matching context.

### Evaluate TF-IDF retrieval on best-performing extractive and abstractive QA methods

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BartForConditionalGeneration, BartForQuestionAnswering,TrainingArguments, Trainer
import evaluate
from sentence_transformers import SentenceTransformer, util

In [9]:
def get_context(corpus, question, context_vectorized, vectorizer, top_k=1):
    """Pick the top_k contexts most similar to a question and join them with ';'.

    Behaves the same as the definition in the TF-IDF baseline section earlier
    in this notebook.

    Args:
        corpus: sequence of context strings; entry i corresponds to row i of
            context_vectorized.
        question: question text, vectorized with `vectorizer`.
        context_vectorized: vectorized contexts to score against.
        vectorizer: fitted vectorizer exposing transform().
        top_k: how many contexts to return.

    Returns:
        The chosen contexts, best cosine similarity first, joined by ';'.
    """
    question_vec = vectorizer.transform([question])
    # pairwise cosine similarity between the question and every context row
    scores = cosine_similarity(question_vec, context_vectorized).flatten()
    best_first = scores.argsort()[::-1]

    selected = [corpus[row] for row in best_first[:top_k]]
    return ';'.join(selected)

def tfidf_retrieval(corpus, input_df):
    """Run top-1 TF-IDF retrieval for every question and print retrieval metrics.

    A fresh TfidfVectorizer is fitted on `corpus`. For each question in
    `input_df` the best-matching corpus entry is stored in 'retrieved_context',
    and 'retrieval_correct' records whether the row's own context occurs
    (case-insensitively) in it.

    Args:
        corpus: list of context strings to fit on and retrieve from.
        input_df: DataFrame with 'question' and 'context' columns. It is
            modified in place: both result columns are added to it.

    Returns:
        The same `input_df` object, with the two new columns.
    """
    tfidf = TfidfVectorizer()
    corpus_matrix = tfidf.fit_transform(corpus)

    def _retrieve(question):
        return get_context(corpus, question, corpus_matrix, tfidf, top_k=1)

    def _is_hit(row):
        return row['context'].lower() in row['retrieved_context'].lower()

    input_df['retrieved_context'] = input_df['question'].apply(_retrieve)
    input_df['retrieval_correct'] = input_df.apply(_is_hit, axis=1)

    # evaluation
    # each row has a correctly matching question & context in labeled data, so
    # the target is always True: there can be no false positives, and accuracy
    # equals recall
    y_true = [True] * len(input_df)
    y_pred = input_df['retrieval_correct'].astype(bool).to_list()
    print(f"retrieval accuracy: {accuracy_score(y_true, y_pred):.2%}")
    print(f"retrieval precision: {precision_score(y_true, y_pred):.2%}")
    print(f"retrieval recall: {recall_score(y_true, y_pred):.2%}")
    print(f"retrieval f1: {f1_score(y_true, y_pred):.2%}")

    return input_df

In [11]:
# Retrieval pool = the test contexts (66% accuracy per the original note);
# using the train contexts as the pool instead was noted as 48% accuracy.
corpus = test_df['context'].tolist()
test_df_tfidf = tfidf_retrieval(corpus, test_df)

retrieval accuracy: 66.08%
retrieval precision: 100.00%
retrieval recall: 66.08%
retrieval f1: 79.57%


In [None]:
# vectorize on all datasets, run on test

In [16]:
# run recall and then top extractive
# The tokenizer comes from the SQuAD-fine-tuned BART checkpoint, while the QA
# model weights are loaded from `dir_path`.
# NOTE(review): dir_path is presumably a local directory holding the
# climate-fine-tuned model — confirm it exists before running.
dir_path = "bart_squad_ft_climate/model"
model_checkpoint = 'valhalla/bart-large-finetuned-squadv1'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BartForQuestionAnswering.from_pretrained(dir_path)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


In [17]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# As the last expression of the cell, this also displays the module repr.
model.to(device)

BartForQuestionAnswering(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): lora.Linear(
              (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=1024, bias=False)
              )
              (lora_embedding_A):

In [19]:
# Extractive QA: answer each question from its TF-IDF-retrieved context.
for i in tqdm(test_df_tfidf.index):
    question = test_df_tfidf.at[i, 'question']
    text = test_df_tfidf.at[i, 'retrieved_context']

    inputs = tokenizer(question, text, return_tensors='pt').to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    # most likely start and end token positions of the answer span
    span_start = outputs.start_logits.argmax()
    span_end = outputs.end_logits.argmax()

    # inclusive slice; it is empty when the predicted end precedes the start
    span_token_ids = inputs.input_ids[0][span_start:span_end + 1]
    test_df_tfidf.at[i, 'bart_answer'] = tokenizer.decode(span_token_ids, skip_special_tokens=True)

100%|██████████| 2096/2096 [02:27<00:00, 14.24it/s]


In [24]:
# ROUGE between the predicted and the gold answer strings.
rouge = evaluate.load('rouge')

predictions = test_df_tfidf['bart_answer']
references = test_df_tfidf['answer']

rouge_res = rouge.compute(predictions=predictions, references=references)
print(f"rouge scores:\n{rouge_res}")

# Semantic similarity: embed predictions and references with a sentence encoder,
# compare them pairwise with cosine similarity, and report the mean.
# NOTE: this rebinds `encoder_model`, replacing the retrieval encoder used earlier.
encoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
candidate_embeddings = encoder_model.encode(predictions)
reference_embeddings = encoder_model.encode(references)
similarity = util.pairwise_cos_sim(candidate_embeddings, reference_embeddings)
print(f"average semantic similarity:\n{torch.mean(similarity)}")

rouge scores:
{'rouge1': np.float64(0.356875374394934), 'rouge2': np.float64(0.2712327563222511), 'rougeL': np.float64(0.3173676659962066), 'rougeLsum': np.float64(0.31731259196610734)}
average semantic similarity:
0.5509738326072693


In [42]:
# look at examples
rouge = evaluate.load('rouge')

# NOTE(review): no cell visible in this notebook writes this parquet file;
# it must already exist on disk.
test_df_tfidf = pd.read_parquet('test_bart_tfidf_retrieval.parquet')
for i in test_df_tfidf.sample(5).index:  # unseeded sample: different rows on each run
    row = test_df_tfidf.loc[i]
    predicted, gold = row['bart_answer'], row['answer']
    print(f"question: {row['question']}\n")
    print(f"correct context: {row['context']}\n")
    print(f"retrieved context: {row['retrieval_correct']} -- {row['retrieved_context']}\n")
    print(f"bart answer: {predicted}\n")
    print(f"correct answer: {gold}")
    print(f"rouge scores: {rouge.compute(predictions=[predicted], references=[gold])}")
    print('='*100)

question: What is the lapse rate used to convert temperature change in elevation?

correct context: for this analysis, we assume that the current position of the birch treeline in the swedish mountain region (fig. 1) is determined by, and in balance with, climatic factors, and more specifically the mean summer (june-august) temperature. summer temperature was chosen as it correlates with treeline position over large scales (korner 1999, grace et al. 2002). furthermore, we assume that the trees have a migration rate that is fast enough to track changes in summer temperature, at least within the scale of the analysis. this implies that a change in mean summer temperature will produce a change in the position of the treeline. we convert changes in temperature to changes in elevation by using a lapse rate of 0.6degc/100 m. this lapse rate is typical for the area (swedish national atlas 1995). it is possible that the lapse rate varies over both space and time, but we have no data from our s

In [43]:
# compare to bart fine tuned on squad (no additional fine tuning)
df = pd.read_parquet('test_bart_qa_scored.parquet')
predictions = df['bart_answer']
references = df['answer']

# Score the baseline answers with the ROUGE metric loaded earlier, so that the
# "rouge scores" output recorded for this cell is reproduced when it is re-run.
rouge_res = rouge.compute(predictions=predictions, references=references)
print(f"rouge scores:\n{rouge_res}")

rouge scores:
{'rouge1': np.float64(0.4340199341181842), 'rouge2': np.float64(0.379092584951664), 'rougeL': np.float64(0.4321657881093896), 'rougeLsum': np.float64(0.4323644487595499)}


In [46]:
# Reloaded as in the original cell; the loop below reads `df`, not this frame.
test_df_tfidf = pd.read_parquet('test_bart_tfidf_retrieval.parquet')
for i in df.sample(5).index:  # unseeded sample: different rows on each run
    row = df.loc[i]
    predicted, gold = row['bart_answer'], row['answer']
    print(f"question: {row['question']}\n")
    print(f"correct context: {row['context']}\n")
    print(f"bart answer: {predicted}\n")
    print(f"correct answer: {gold}")
    print(f"rouge scores: {rouge.compute(predictions=[predicted], references=[gold])}")
    print('='*100)

question: Explain flux?

correct context: overall, the models demonstrate the ability to represent a wide range of coupling between climate and the carbon cycle. having described all of the various components of these models, we now turn back to the comparison of carbon inventory and flux estimates in fig. 1. both models demonstrate excellent agreement with previous estimates in terms of the partitioning of carbon between the various reservoirs, though the land vegetation is quite a bit higher and the ocean biota quite a bit lower than those in the observational synthesis and box model analysis of siegenthaler and sarmiento (1993), which is used as a quasi-consensus estimate used in the intergovernmental panel on climate change (ipcc) fourth assessment report (randall et al. 2007). these estimates remain highly uncertain, however. while the sabine et al. (2004) land gross primary production

bart answer:  flux estimates

correct answer: having described all of the various components of