# QA Using LangChain

In [1]:
!pip install --quiet -U langchain-community 

In [2]:
import numpy as np
import pandas as pd
import transformers

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
#from langchain.document_loaders import CSVDLoader
#from langchain.vector_stores import FAISSVectorStore

In [3]:
import os

# Hosted FLAN-T5 endpoint used as the LLM by the chains in this notebook.
# The API token is read from the environment instead of being stored in the
# notebook; set HUGGINGFACEHUB_API_TOKEN before starting the kernel (a missing
# variable fails fast with a KeyError).
# NOTE(review): the token that used to be hardcoded here has been exposed and
# should be revoked.
# NOTE(review): FLAN-T5 is a seq2seq model - confirm "text-generation" (rather
# than "text2text-generation") is the intended task.
model = HuggingFaceHub(
        repo_id="google/flan-t5-base",
        task="text-generation",
        model_kwargs={"temperature": 1, "max_length": 256},
        huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    )

  warn_deprecated(


In [4]:
# Load the evaluation questions; the loops below read its 'question' column.
questions = pd.read_csv('questions.csv')

In [5]:
questions.head()

Unnamed: 0,#,question
0,0,Which presidential administration developed Sa...
1,1,How many individual colleges are part of Notre...
2,2,Where was the capital moved to?
3,3,Where could you read this information?
4,4,What did parents do when the wages were finall...


In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate



# Baseline (no retrieval): prompt template -> LLM -> plain-string answer.
prompt = ChatPromptTemplate.from_template("Answer this {question}")
output_parser = StrOutputParser()
chain = prompt | model | output_parser

# Ask every question in turn and keep one record per question.
qa_pairs = []
for question in questions['question']:
    answer = chain.invoke({"question": question})
    qa_pairs.append(dict(question=question, answer=answer))

# One row per question, with columns 'question' and 'answer'.
answers_df = pd.DataFrame(qa_pairs)

In [7]:
answers_df.head()

Unnamed: 0,question,answer
0,Which presidential administration developed Sa...,u.s.
1,How many individual colleges are part of Notre...,ten
2,Where was the capital moved to?,sydney
3,Where could you read this information?,in a book
4,What did parents do when the wages were finall...,repaid


In [8]:
# Persist the baseline answers. With pandas' default settings the row index is
# written as the first (unnamed) CSV column.
answers_df.to_csv('baseline_answers.csv')

# Retrieval Augmentation with LangChain

In [9]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [10]:
# Read the passages CSV; `data` holds the loaded documents (one is printed
# further down to show their layout).
loader = CSVLoader(file_path="./passages.csv")

data = loader.load()

In [11]:
!pip install --upgrade --quiet langchain sentence_transformers
#!pip install faiss-cpu

In [12]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [13]:
# Embedding model for the vector store; no model name is passed, so the
# library's default sentence-embedding model is used.
embeddings = HuggingFaceEmbeddings()

In [14]:
print(data[:1])

[Document(page_content='#: 0\nid: 572ec434c246551400ce463c\ntitle: Endangered_Species_Act\ncontext: The "Safe Harbor" agreement is a voluntary agreement between the private landowner and FWS. The landowner agrees to alter the property to benefit or even attract a listed or proposed species in exchange for assurances that the FWS will permit future "takes" above a pre-determined level. The policy relies on the "enhancement of survival" provision of Section §1539(a)(1)(A). A landowner can have either a "Safe Harbor" agreement or an Incidental Take Permit, or both. The policy was developed by the Clinton Administration in 1999.', metadata={'source': './passages.csv', 'row': 0})]


In [15]:
# passages = []

# for document in data:
    
#     lines = document.page_content.split('\n')
#     context_line = next((line for line in lines if line.startswith('context:')), None)
    
#     if context_line:
#         _, context = context_line.split('context: ', 1)
#         passages.append(context)

In [16]:
# embedded_passages = []

# for text in passages:
#     query_result = embeddings.embed_query(text)
#     embedded_passages.append(query_result)

In [17]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS

In [18]:
# Split the loaded documents into chunks (chunk_size=1000, no overlap), then
# embed the chunks and index them in a FAISS vector store.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
vector_store = FAISS.from_documents(docs, embeddings)

In [19]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [20]:
# Expose the FAISS index as a retriever with its default search settings.
retriever = vector_store.as_retriever()
# RAG prompt. This rebinds `prompt`, replacing the baseline template defined earlier.
prompt = ChatPromptTemplate.from_template("Given the context you have {context}, answer this {question}")
# Alias for whatever `model` is bound to when this cell runs (the hosted LLM
# when the notebook is executed top to bottom).
llm = model


def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# RAG pipeline. The input question is routed two ways: through the retriever,
# whose results format_docs joins into the {context} string, and unchanged
# (RunnablePassthrough) into the {question} slot. The filled prompt is sent to
# the LLM and the result is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [21]:
rag_chain.invoke("Which presidential administration developed Safe Harbor policy?")

'Clinton'

In [22]:
# Answer every question with the RAG chain and record the passages retrieved
# for it. Each iteration issues one hosted-inference request, so a long question
# list may hit the provider's rate limit.
qas_pairs = []


for question in questions['question']:
    answer = rag_chain.invoke(question)
    # `retriever.invoke` replaces the deprecated `get_relevant_documents`.
    # A dedicated name is used so the notebook-level `docs` chunk list that
    # feeds the FAISS index is not overwritten by the loop.
    source_docs = retriever.invoke(question)

    qas_pairs.append({'question': question, 'answer': answer, 'sources': source_docs})


# One row per question, with columns 'question', 'answer' and 'sources'.
answer_sources_df = pd.DataFrame(qas_pairs)

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/google/flan-t5-base (Request ID: DphT7KALPtQu7AfVhzTy7)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

In [None]:
# Persist the RAG answers together with their retrieved sources, then preview
# the first rows.
answer_sources_df.to_csv('rag_answers.csv')
answer_sources_df.head()

# Task 2

## First way: Transformers model with manual mean pooling

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Load the tokenizer and encoder weights for the sentence-embedding checkpoint.
# NOTE(review): this rebinds `model`, which earlier in the notebook referred to
# the hosted LLM; re-running the earlier chain-building cells after this one
# would pick up the encoder instead.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/roberta-base-nli-stsb-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/roberta-base-nli-stsb-mean-tokens')

In [None]:
# Pull the raw passage text out of each loaded document: page_content is a
# block of "column: value" lines, and only the first line starting with
# 'context:' is used.
passages = []

for document in data:
    for line in document.page_content.split('\n'):
        if not line.startswith('context:'):
            continue
        # Keep everything after the first 'context: ' marker on that line.
        _, context = line.split('context: ', 1)
        passages.append(context)
        break  # only the first matching line per document

In [None]:
# Tokenize all passages as a single batch, with padding and truncation enabled,
# returning PyTorch tensors.
encoded_input = tokenizer(passages, padding=True, truncation=True, return_tensors='pt')

In [None]:
# Run the encoder once over the whole tokenized batch; gradients are disabled
# because the outputs are only used as embeddings.
with torch.no_grad():
    model_output = model(**encoded_input)

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, ignoring positions masked out by ``attention_mask``.

    ``model_output[0]`` is taken as the token embeddings. The mask is given a
    trailing dimension, expanded to the embeddings' size and cast to float, so
    positions with mask 0 add nothing to the sum. The divisor (the mask summed
    over dim 1) is clamped to at least 1e-9 so a fully masked row does not
    divide by zero.
    """
    # First element of model_output contains all token embeddings
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total

In [None]:
# Pool the per-token outputs into one vector per passage, using the tokenizer's
# attention mask to ignore padding.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
sentence_embeddings[0][0]

## Second way: SentenceTransformer `encode`

In [None]:
from sentence_transformers import SentenceTransformer

# Same checkpoint loaded through sentence-transformers, which encodes the
# passages in a single call.
# NOTE(review): this rebinds both `model` and `embeddings`; `embeddings` earlier
# referred to the embedding object passed to FAISS, so re-running the
# vector-store cell after this one would receive the result of `encode` instead.
model = SentenceTransformer('sentence-transformers/roberta-base-nli-stsb-mean-tokens')
embeddings = model.encode(passages)

# First component of the first passage's embedding, for comparison with the first way.
embeddings[0][0]