In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [2]:
passages_df = pd.read_csv("./passages.csv")

In [3]:
passages_df.head()

Unnamed: 0,#,id,title,context
0,0,572ec434c246551400ce463c,Endangered_Species_Act,"The ""Safe Harbor"" agreement is a voluntary agr..."
1,1,573383e94776f41900660c5b,University_of_Notre_Dame,"Besides its prominence in sports, Notre Dame i..."
2,2,56e7894300c9c71400d77268,Nanjing,It is believed that Nanjing was the largest ci...
3,3,57327c59b9d445190005eb4c,Humanism,"In the 6th century BCE, Taoist teacher Lao Tzu..."
4,4,5727a440ff5b5019007d91bb,Child_labour,"According to Milton Friedman, before the Indus..."


In [4]:
documents = passages_df['context'].tolist()

In [5]:
documents[0]

'The "Safe Harbor" agreement is a voluntary agreement between the private landowner and FWS. The landowner agrees to alter the property to benefit or even attract a listed or proposed species in exchange for assurances that the FWS will permit future "takes" above a pre-determined level. The policy relies on the "enhancement of survival" provision of Section §1539(a)(1)(A). A landowner can have either a "Safe Harbor" agreement or an Incidental Take Permit, or both. The policy was developed by the Clinton Administration in 1999.'

In [6]:
model = SentenceTransformer('sentence-transformers/roberta-base-nli-stsb-mean-tokens')
document_embeddings = model.encode(documents)

document_embeddings[0][0]

0.0346643

In [7]:
document_embeddings.astype

<function ndarray.astype>

In [8]:
questions_df = pd.read_csv('./questions.csv')

questions_df.head()

Unnamed: 0,#,question
0,0,Which presidential administration developed Sa...
1,1,How many individual colleges are part of Notre...
2,2,Where was the capital moved to?
3,3,Where could you read this information?
4,4,What did parents do when the wages were finall...


In [9]:
questions = questions_df['question'].tolist()

In [10]:
questions[0]

'Which presidential administration developed Safe Harbor policy?'

In [11]:
question_embeddings = model.encode(questions)

question_embeddings[0][0]

0.19474915

In [12]:
import faiss

In [13]:
dimension = document_embeddings.shape[1]

dimension

768

In [14]:
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)

In [15]:
k = 3  # Number of nearest neighbors to retrieve
D, I = index.search(question_embeddings, k)  # D: distances, I: indices of the documents

retrieved_docs = [documents[i] for i in I[0]]  # Retrieve documents based on indices


In [16]:
retrieved_docs

['The "Safe Harbor" agreement is a voluntary agreement between the private landowner and FWS. The landowner agrees to alter the property to benefit or even attract a listed or proposed species in exchange for assurances that the FWS will permit future "takes" above a pre-determined level. The policy relies on the "enhancement of survival" provision of Section §1539(a)(1)(A). A landowner can have either a "Safe Harbor" agreement or an Incidental Take Permit, or both. The policy was developed by the Clinton Administration in 1999.',
 'The US Congress was urged to create the exemption by proponents of a conservation plan on San Bruno Mountain, California that was drafted in the early 1980s and is the first HCP in the nation. In the conference report on the 1982 amendments, Congress specified that it intended the San Bruno plan to act "as a model" for future conservation plans developed under the incidental take exemption provision and that "the adequacy of similar conservation plans shoul

In [17]:
context = " ".join(retrieved_docs)
question = "Which presidential administration developed Safe Harbor policy?"

In [18]:
#!pip install sentencepiece

In [19]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Prepare input for the model
input_text = f"Given the context: {context}, answer the question: {question}"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate an answer
outputs = model.generate(input_ids, max_length=200, num_beams=5, early_stopping=True)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(answer)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors


Clinton


In [24]:
import numpy as np
# Number of nearest neighbors to retrieve
k = 3

# Prepare a list to hold answers and retrieved documents for each question
answers_and_docs = []

for idx, question_embedding in enumerate(question_embeddings):
    # Search for the k nearest neighbors (retrieved documents)
    D, I = index.search(np.array([question_embedding]), k)
    
    # Retrieve documents based on indices
    retrieved_docs = [documents[i] for i in I[0]]
    
    # Concatenate retrieved documents for context
    context = " ".join(retrieved_docs)
    
    # Prepare input for the T5 model
    input_text = f"Given the context: {context}, answer the question: {questions[idx]}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    
    # Generate an answer
    outputs = model.generate(input_ids, max_length=200, num_beams=5, early_stopping=True)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Append the answer and retrieved documents to the list
    answers_and_docs.append({
        "question": questions[idx],
        "answer": answer,
        "retrieved_docs": retrieved_docs
    })

# Convert the list of answers and documents to a DataFrame for easy viewing/exporting
answers_df = pd.DataFrame(answers_and_docs)

# Optionally, save the DataFrame to a new CSV file
answers_df.to_csv('answers_and_retrieved_docs.csv', index=False)

print("Done generating answers and retrieving documents.")

Done generating answers and retrieving documents.
