# QA Using LangChain

In [1]:
!pip install --quiet -U langchain-community 

In [2]:
import os

import numpy as np
import pandas as pd
import transformers

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
#from langchain.document_loaders import CSVDLoader
#from langchain.vector_stores import FAISSVectorStore

In [3]:
# Hosted Flan-T5 endpoint used as the LLM by every chain in this notebook.
# The access token is read from the HUGGINGFACEHUB_API_TOKEN environment
# variable so that it never ends up in the notebook file; a missing variable
# fails here with a KeyError naming it. Any token that was previously pasted
# into this cell should be revoked.
# NOTE(review): flan-t5 is a seq2seq model - confirm that "text-generation"
# (rather than "text2text-generation") is the intended task.
model = HuggingFaceHub(
        repo_id="google/flan-t5-base",
        task="text-generation",
        model_kwargs={"temperature": 1, "max_length": 256},
        huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
    )

  warn_deprecated(


In [7]:
# Validation questions; the preview below shows the columns '#', 'question',
# 'answer' and 'required_context'.
questions = pd.read_csv('val_questions.csv')

In [8]:
# Quick look at the first rows of the validation set.
questions.head()

Unnamed: 0,#,question,answer,required_context
0,0,In what century was a new settlement set up at...,['10th'],Viking raids from 840 onwards contributed to t...
1,1,What monuments were the some of most visible o...,['temples'],The Latin word templum originally referred not...
2,2,Are demand side solar technologies generally a...,['Passive'],"Active solar techniques use photovoltaics, con..."
3,3,What did the National Academy of Sciences inde...,['deeply flawed'],"For over 40 years, the FBI crime lab in Quanti..."
4,4,Which Oppidan Houses did not change from their...,"[""Godolphin House, Jourdelay's""]","The Oppidan Houses are named Godolphin House, ..."


In [9]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate



# Baseline without retrieval: the model only sees the question text.
output_parser = StrOutputParser()
prompt = ChatPromptTemplate.from_template("Answer this {question}")
chain = prompt | model | output_parser

# One chain call per validation question, collected as records so the
# DataFrame is built in a single step.
qa_pairs = [
    {'question': question_text, 'answer': chain.invoke({"question": question_text})}
    for question_text in questions['question']
]

answers_df = pd.DataFrame(qa_pairs)

In [10]:
# Preview the answers produced without retrieval.
answers_df.head()

Unnamed: 0,question,answer
0,In what century was a new settlement set up at...,15th
1,What monuments were the some of most visible o...,sphinx
2,Are demand side solar technologies generally a...,passive
3,What did the National Academy of Sciences inde...,The FBI's analytical model is a flawed tool fo...
4,Which Oppidan Houses did not change from their...,oppidan house


In [11]:
# Persist the no-retrieval answers. With pandas' default settings the row
# index is written as an extra, unnamed first column.
answers_df.to_csv('val_no_rag.csv')

# Retrieval Augmentation with LangChain

In [12]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [13]:
# Load ./passages.csv through CSVLoader. `data` holds the resulting documents;
# as the printed sample further down shows, each document's text lists the
# row's columns as "name: value" lines and its metadata records the source
# file and row number.
loader = CSVLoader(file_path="./passages.csv")

data = loader.load()

In [14]:
!pip install --upgrade --quiet langchain sentence_transformers
#!pip install faiss-cpu

In [15]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [16]:
# Embedding model for the vector store. No model name is passed, so the
# class's default model is used.
embeddings = HuggingFaceEmbeddings()

In [17]:
# Show the first loaded document to check what CSVLoader produced.
print(data[:1])

[Document(page_content='#: 0\nid: 572ec434c246551400ce463c\ntitle: Endangered_Species_Act\ncontext: The "Safe Harbor" agreement is a voluntary agreement between the private landowner and FWS. The landowner agrees to alter the property to benefit or even attract a listed or proposed species in exchange for assurances that the FWS will permit future "takes" above a pre-determined level. The policy relies on the "enhancement of survival" provision of Section §1539(a)(1)(A). A landowner can have either a "Safe Harbor" agreement or an Incidental Take Permit, or both. The policy was developed by the Clinton Administration in 1999.', metadata={'source': './passages.csv', 'row': 0})]


In [18]:
# passages = []

# for document in data:
    
#     lines = document.page_content.split('\n')
#     context_line = next((line for line in lines if line.startswith('context:')), None)
    
#     if context_line:
#         _, context = context_line.split('context: ', 1)
#         passages.append(context)

In [19]:
# embedded_passages = []

# for text in passages:
#     query_result = embeddings.embed_query(text)
#     embedded_passages.append(query_result)

In [20]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS

In [21]:
# Split the loaded documents (chunk_size=1000, no overlap), then embed the
# chunks with `embeddings` and index them in a FAISS vector store.
# NOTE(review): the loaded page_content uses single "\n" between fields -
# confirm the splitter's default separator actually splits these documents.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
vector_store = FAISS.from_documents(docs, embeddings)

In [22]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [23]:
# Retriever over the FAISS index, created with its default search settings.
retriever = vector_store.as_retriever()
# RAG prompt: retrieved text fills {context}, the question fills {question}.
# Note: this rebinds `prompt`, replacing the no-retrieval prompt defined in
# the baseline cell.
prompt = ChatPromptTemplate.from_template("Given the context you have {context}, answer this {question}")
# Same hosted LLM as the baseline, under a second name.
llm = model


def format_docs(docs):
    """Merge the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable gives an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# RAG pipeline. The incoming question is routed two ways: to the retriever,
# whose documents are flattened into one string by format_docs and fill
# {context}, and unchanged (RunnablePassthrough) into {question}. The filled
# prompt is sent to the LLM and the result is parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [24]:
#rag_chain.invoke("Which presidential administration developed Safe Harbor policy?")

In [26]:
# Answer every validation question with retrieval and keep the passages that
# were used, so each answer can be traced back to its sources.
#
# The retriever is queried once per question and its documents are passed to
# the prompt directly. This is the same computation rag_chain performs, but
# calling rag_chain.invoke() and then querying the retriever a second time
# would embed and search every question twice.
answer_chain = prompt | llm | StrOutputParser()

qas_pairs = []

for question in questions['question']:
    # Kept under its own name so the `docs` list built by the text splitter
    # is not overwritten.
    source_docs = retriever.invoke(question)
    answer = answer_chain.invoke(
        {"context": format_docs(source_docs), "question": question}
    )

    qas_pairs.append({'question': question, 'answer': answer, 'sources': source_docs})


answer_sources_df = pd.DataFrame(qas_pairs)

In [27]:
# Persist the RAG answers together with their source documents, then preview
# them. With pandas' default settings the row index is written as an extra,
# unnamed first column.
answer_sources_df.to_csv('val_rag_answers.csv')
answer_sources_df.head()

Unnamed: 0,question,answer,sources
0,In what century was a new settlement set up at...,10th century,[page_content='#: 500\nid: 56f837eba6d7ea1400e...
1,What monuments were the some of most visible o...,temples,[page_content='#: 501\nid: 5731aca5e99e3014001...
2,Are demand side solar technologies generally a...,passive,[page_content='#: 502\nid: 56ce5d70aab44d1400b...
3,What did the National Academy of Sciences inde...,deeply flawed,[page_content='#: 503\nid: 570c301c6b808914004...
4,Which Oppidan Houses did not change from their...,Hawtrey House,"[page_content=""#: 504\nid: 5727bad64b864d19001..."


# Task 2: Passage embeddings

## First way: Transformers model with manual mean pooling

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Tokenizer and encoder for the same sentence-transformers checkpoint.
# NOTE(review): this rebinds `model`, which earlier cells use for the
# HuggingFaceHub LLM; re-running those cells after this one would pick up the
# encoder instead. Consider a distinct name.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/roberta-base-nli-stsb-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/roberta-base-nli-stsb-mean-tokens')

In [None]:
# Pull the raw passage text back out of the loaded documents: each document's
# text is a set of "name: value" lines, and only the value of the first
# "context:" line is kept. Documents without such a line are skipped.
passages = []

for document in data:
    context_lines = [
        line
        for line in document.page_content.split('\n')
        if line.startswith('context:')
    ]
    if not context_lines:
        continue

    # Split once so a later "context: " inside the passage stays intact.
    _, context = context_lines[0].split('context: ', 1)
    passages.append(context)

In [None]:
# Tokenize all passages in one call, with padding and truncation enabled,
# returning PyTorch tensors.
encoded_input = tokenizer(passages, padding=True, truncation=True, return_tensors='pt')

In [None]:
# Forward pass with gradient tracking disabled (inference only).
# NOTE(review): every passage goes through the model in a single call;
# consider batching if memory becomes a problem.
with torch.no_grad():
    model_output = model(**encoded_input)

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (presumably shaped batch x tokens x hidden).
        attention_mask: Tensor with one entry per token; positions with 0
            contribute nothing to the average.

    Returns:
        Tensor with the token axis (dim 1) reduced away: the mask-weighted
        sum of the embeddings divided by the mask total. The divisor is
        clamped to at least 1e-9, so a fully masked sequence gives zeros
        instead of a division by zero.
    """
    token_embeddings = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' size so
    # it can be multiplied element-wise.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count

In [None]:
# One pooled embedding per passage, using the tokenizer's attention mask so
# that padding tokens do not affect the average.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
# Spot check: first component of the first passage's embedding.
sentence_embeddings[0][0]

## Second way: SentenceTransformer `encode`

In [None]:
from sentence_transformers import SentenceTransformer

# Same checkpoint, loaded through SentenceTransformer, which produces the
# passage embeddings with a single encode() call.
# NOTE(review): `model` and `embeddings` are rebound here; earlier cells use
# these names for the HuggingFaceHub LLM and the HuggingFaceEmbeddings object,
# so those cells will see different objects if re-run after this one.
model = SentenceTransformer('sentence-transformers/roberta-base-nli-stsb-mean-tokens')
embeddings = model.encode(passages)

# Spot check: first component of the first passage's embedding.
embeddings[0][0]