In [None]:
!pip install langchain openai weaviate-client ragas

In [None]:
# !pip install langchain
# !pip install -U langchain-community
# !pip install weaviate-client
# !pip install openai
# !pip install tiktoken
!pip install faiss-cpu

In [None]:
OPENAI_API_KEY="<YOUR_OPENAI_API_KEY>"

In [12]:
import requests
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

url = "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt"
res = requests.get(url)
with open("state_of_the_union.txt", "w") as f:
    f.write(res.text)

sample_text = "The president addressed various national issues in his speech."
with open("state_of_the_union.txt", "w") as f:
    f.write(sample_text)

# Load the data
loader = TextLoader('./state_of_the_union.txt')
documents = loader.load()

# Chunk the data
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

In [13]:
res

<Response [404]>

In [14]:
chunks

[Document(metadata={'source': './state_of_the_union.txt'}, page_content='The president addressed various national issues in his speech.')]

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
# import weaviate
# from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv,find_dotenv

# Load OpenAI API key from .env file
load_dotenv(find_dotenv())

# Setup vector database
client = Weaviate.Client(
  embedded_options = EmbeddedOptions()
)

# Populate vector database
vectorstore = Weaviate.from_documents(
    client = client,    
    documents = chunks,
    embedding = OpenAIEmbeddings(),
    by_text = False
)

# # Setup vector database directly with Langchain's Weaviate class
# vectorstore = Weaviate.from_documents(
#     documents=chunks,
#     embedding=OpenAIEmbeddings(),
# )

# Define vectorstore as retriever to enable semantic search
retriever = vectorstore.as_retriever()

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Initialize embeddings and create vectorstore with the embeddings model
embedding_model = OpenAIEmbeddings()
embeddings = embedding_model.embed_documents(chunks)
vectorstore = FAISS.from_embeddings(embeddings, embedding=embedding_model)

# Set up retriever for semantic search
retriever = vectorstore.as_retriever()

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Define LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Define prompt template
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [12]:
from datasets import Dataset

questions = ["What did the president say about Justice Breyer?", 
             "What did the president say about Intel's CEO?",
             "What did the president say about gun violence?",
            ]
ground_truths = [["The president said that Justice Breyer has dedicated his life to serve the country and thanked him for his service."],
                ["The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion."],
                ["The president asked Congress to pass proven measures to reduce gun violence."]]
answers = []
contexts = []

# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# To dict
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

/Users/leiyu/opt/anaconda3/envs/myenv/lib/python3.10/site-packages/pydantic/main.py:1087: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/
/Users/leiyu/opt/anaconda3/envs/myenv/lib/python3.10/site-packages/pydantic/main.py:1087: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/
/Users/leiyu/opt/anaconda3/envs/myenv/lib/python3.10/site-packages/pydantic/main.py:1087: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/
/Users/leiyu/opt/anaconda3/envs/myenv/lib/python3.10/site-packages/pydantic/main.py:1

In [13]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

result = evaluate(
    dataset = dataset, 
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

df = result.to_pandas()

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|███████████████████████████████████████████████████████| 12/12 [00:04<00:00,  2.98it/s]


In [14]:
df

Unnamed: 0,question,answer,contexts,ground_truths,ground_truth,context_precision,context_recall,faithfulness,answer_relevancy
0,What did the president say about Justice Breyer?,"I'm sorry, I don't have the information about ...",[404: Not Found],[The president said that Justice Breyer has de...,The president said that Justice Breyer has ded...,0.0,0.0,1.0,0.0
1,What did the president say about Intel's CEO?,"I'm sorry, I don't have access to the specific...",[404: Not Found],[The president said that Pat Gelsinger is read...,The president said that Pat Gelsinger is ready...,0.0,0.0,0.5,0.0
2,What did the president say about gun violence?,"I'm sorry, I don't have that information as th...",[404: Not Found],[The president asked Congress to pass proven m...,The president asked Congress to pass proven me...,0.0,0.0,0.5,0.0
