# QA with LangChain

In [1]:
!pip install --quiet -U langchain-community 

In [2]:
import numpy as np
import pandas as pd
import transformers

#from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [3]:
import os

# Hosted flan-t5-base model on the Hugging Face Hub; it is the LLM used by the chains in this notebook.
# The API token is read from the environment so the secret never lives in the notebook:
# export HUGGINGFACEHUB_API_TOKEN before starting the kernel (a missing variable raises KeyError here).
# NOTE(review): a token was previously committed in this cell - treat it as leaked and revoke it.
# NOTE(review): the Hub lists flan-t5 under "text2text-generation" - confirm "text-generation" is intended.
model = HuggingFaceHub(
    repo_id="google/flan-t5-base",
    task="text-generation",
    model_kwargs={"temperature": 1, "max_length": 256},
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
)

  warn_deprecated(


In [4]:
# Load the evaluation questions; later cells iterate over the 'question' column of this frame.
questions = pd.read_csv('val_questions.csv')

In [5]:
# Baseline (no retrieval): each question is sent to the LLM with no supporting context.
prompt = ChatPromptTemplate.from_template("Answer this {question}")
output_parser = StrOutputParser()

# prompt -> model -> plain string
chain = prompt | model | output_parser

# One chain invocation per question; collect the records first and build the frame once.
qa_pairs = [
    {'question': question, 'answer': chain.invoke({"question": question})}
    for question in questions['question']
]

answers_df = pd.DataFrame(qa_pairs)

In [6]:
# Persist the baseline answers; index=True (the pandas default) keeps the row index as the first CSV column.
answers_df.to_csv('baseline_answers.csv', index=True)

# Retrieval Augmentation with LangChain

In [7]:
!pip install --upgrade --quiet langchain sentence_transformers

In [8]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

In [9]:
# Load the passages CSV into LangChain documents for indexing.
# NOTE(review): CSVLoader presumably yields one Document per CSV row - confirm against the loader docs.
loader = CSVLoader(file_path="./passages.csv")

data = loader.load()

In [10]:
# Embedding model handed to FAISS when the passage index is built.
# NOTE(review): no model_name is given, so the library default is used - consider pinning it for reproducibility.
embeddings = HuggingFaceEmbeddings()

In [11]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS

In [12]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [13]:
# Split the loaded passages into chunks and index them in an in-memory FAISS store.
# NOTE(review): CharacterTextSplitter only splits on its separator (default presumably "\n\n");
# if the CSV-derived documents do not contain it, chunk_size=1000 will not actually split them - confirm.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
vector_store = FAISS.from_documents(docs, embeddings)

In [14]:
def format_docs(docs):
    """Concatenate the page_content of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Building blocks of the retrieval-augmented chain.
retriever = vector_store.as_retriever()
llm = model
prompt = ChatPromptTemplate.from_template(
    "Given the context you have {context}, answer this {question}"
)

# The incoming question is used twice: piped through the retriever (and flattened to text)
# to fill {context}, and passed through unchanged to fill {question}.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [15]:
# Answer every question with the RAG chain and record which passages were retrieved for it.
qas_pairs = []

for question in questions['question']:
    answer = rag_chain.invoke(question)
    # `invoke` is the Runnable entry point of the retriever and replaces the deprecated
    # `get_relevant_documents`. The chain performs its own retrieval internally, so this is a
    # second lookup for the same query, done only to record the sources.
    # Stored under its own name so the split `docs` used to build the index are not overwritten.
    source_docs = retriever.invoke(question)

    qas_pairs.append({'question': question, 'answer': answer, 'sources': source_docs})

answer_sources_df = pd.DataFrame(qas_pairs)

In [16]:
# Save the RAG answers with their sources (row index kept as the first CSV column), then preview the frame.
answer_sources_df.to_csv('rag_answers.csv', index=True)
answer_sources_df.head()

Unnamed: 0,question,answer,sources
0,In what century was a new settlement set up at...,10th century,[page_content='#: 500\nid: 56f837eba6d7ea1400e...
1,What monuments were the some of most visible o...,temples,[page_content='#: 501\nid: 5731aca5e99e3014001...
2,Are demand side solar technologies generally a...,passive,[page_content='#: 502\nid: 56ce5d70aab44d1400b...
3,What did the National Academy of Sciences inde...,deeply flawed,[page_content='#: 503\nid: 570c301c6b808914004...
4,Which Oppidan Houses did not change from their...,Hawtrey House,"[page_content=""#: 504\nid: 5727bad64b864d19001..."
