In [1]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from datasets import load_dataset

# Load variables from a local .env file into the process environment before
# any client that reads its credentials from the environment is constructed.
load_dotenv()

True

In [2]:
# Load the first 1,000 rows of the validation split of CNN/DailyMail
# (config "3.0.0"); downloaded files are cached under ./cache.
dataset = load_dataset("cnn_dailymail", "3.0.0", split="validation[:1000]", cache_dir="./cache")

In [10]:
from langchain_core.documents import Document

# Wrap every dataset row in a Document: the article text becomes the page
# content and the row id is kept as metadata so a chunk can be traced back.
articles = [
    Document(
        page_content=row['article'],
        metadata={"id": row['id']},
    )
    for row in dataset
]

In [11]:
# Preview the first five Document objects built above.
articles[:5]

[Document(metadata={'id': 'a4942dd663020ca54575471657a0af38d82897d6'}, page_content='(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard\'s gift was data processing of genetic profiles from donor-recipient pairs. It works on a simple swapping princ

In [None]:
class RAGSystem:
    """Small retrieval-augmented question-answering pipeline.

    Usage order:
        1. ``load_and_process_data()`` - load articles, chunk them and build
           the Chroma vector store.
        2. ``setup_retrieval_qa()`` - wire the retrieval and QA chains.
        3. ``query(question)`` - answer a question from retrieved context.

    Attributes:
        embeddings: Embedding model handed to the vector store.
        articles: Raw article texts loaded by ``load_and_process_data``.
        vector_store: Chroma store, or ``None`` until data has been loaded.
        retrieval_chain: retriever -> ``format_docs`` chain, or ``None``.
        qa_chain: Full context -> prompt -> LLM -> string chain, or ``None``.
    """

    def __init__(self):
        self.embeddings = OpenAIEmbeddings()
        # Defined up front so the attribute exists before any data is loaded.
        self.articles = []
        self.vector_store = None
        self.qa_chain = None
        self.retrieval_chain = None

    def format_docs(self, docs):
        """Join the ``page_content`` of ``docs`` into one string.

        Args:
            docs: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            The contents separated by blank lines ("" for an empty iterable).
        """
        return "\n\n".join(doc.page_content for doc in docs)

    def load_and_process_data(self, num_articles=1000):
        """Load articles, split them into chunks and index the chunks.

        Args:
            num_articles: Number of rows to take from the start of the
                CNN/DailyMail validation split. Must be at least 1.

        Raises:
            ValueError: If ``num_articles`` is smaller than 1, or if the
                dataset slice comes back empty.
        """
        num_articles = int(num_articles)
        if num_articles < 1:
            raise ValueError("num_articles must be a positive integer.")

        # Honour the caller's limit; the slice used to be hard-coded to 1000,
        # silently ignoring the num_articles argument.
        dataset = load_dataset(
            "cnn_dailymail",
            "3.0.0",
            split=f"validation[:{num_articles}]",
            cache_dir="./cache",
        )
        self.articles = [item['article'] for item in dataset]

        if not self.articles:
            # Fail with a clear message instead of an IndexError on
            # self.articles[0] in the print below.
            raise ValueError("No articles were loaded from the dataset.")

        print(len(self.articles), "articles loaded", "The Test article is:", self.articles[0])

        # Overlapping chunks, so text cut at a chunk boundary also appears
        # at the start of the next chunk.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        docs = text_splitter.create_documents(self.articles)

        print(len(docs), "chunks created")

        self.vector_store = Chroma.from_documents(docs, self.embeddings)

    def create_context_prompt(self):
        """Build the chat prompt that restricts the answer to the context.

        Returns:
            A ``ChatPromptTemplate`` with ``context`` and ``question`` inputs.
        """

        # Prompt
        template = """Answer the question based only on the following context:
        {context}

        Question: {question}
        """

        prompt = ChatPromptTemplate.from_template(template)

        return prompt

    def setup_retrieval_qa(self):
        """Create ``retrieval_chain`` and ``qa_chain`` from the vector store.

        Raises:
            ValueError: If the vector store has not been built yet.
        """
        if self.vector_store is None:
            # Same error style as query(): without this guard the failure is
            # an AttributeError on None.as_retriever().
            raise ValueError("Vector store not initialized. Call load_and_process_data() first.")

        retriever = self.vector_store.as_retriever()
        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
        # Prompt
        prompt = self.create_context_prompt()
        # retrieval chain: question -> retrieved documents -> single string
        self.retrieval_chain = retriever | self.format_docs
        # qa chain: the same question feeds both the retriever and the prompt
        self.qa_chain = (
            {"context": self.retrieval_chain, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

    def query(self, question: str) -> str:
        """Answer ``question`` using the QA chain.

        Args:
            question: Natural-language question to answer.

        Returns:
            The output of the QA chain for ``question``.

        Raises:
            ValueError: If ``setup_retrieval_qa()`` has not been called.
        """
        # Explicit None check: the guard should not depend on how the chain
        # object evaluates in a boolean context.
        if self.qa_chain is None:
            raise ValueError("QA chain not initialized. Call setup_retrieval_qa() first.")
        return self.qa_chain.invoke(question)

In [1]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader
# Coordinates of the Hugging Face dataset and the column used as page content.
dataset_name = "abisee/cnn_dailymail"
name = '1.0.0'
page_content_column = "article"

# Constructing the loader does not load documents; that happens on .load().
loader = HuggingFaceDatasetLoader(
    dataset_name,
    page_content_column=page_content_column,
    name=name,
    cache_dir='./cache',
)

# documents = loader.load()

In [2]:
# Materialize every record the loader yields into a list of documents.
# NOTE(review): the loader is created without a split restriction, so this
# presumably reads the whole dataset and may be slow - confirm this is intended.
documents = loader.load()



In [None]:
# Inspect the first loaded document.
documents[0]

In [None]:
# Instantiate the pipeline; the constructor creates the OpenAIEmbeddings client
# and leaves the vector store and chains unset.
rag_system = RAGSystem()

In [None]:
# Load the articles, split them into chunks and index the chunks in Chroma
# (default arguments).
rag_system.load_and_process_data()

In [None]:
# Build the retrieval and QA chains; must run after load_and_process_data()
# because the retriever is created from the vector store.
rag_system.setup_retrieval_qa()

In [None]:
# Check retrieval on its own: this chain returns the retrieved chunks joined
# into one string and does not go through the prompt or the LLM.
rag_system.retrieval_chain.invoke("Who decided to give one of her kidneys?")

In [None]:
# End-to-end example: run one question through the QA chain and show the
# question and the answer on separate lines.
question = "Who decided to give one of her kidneys?"
answer = rag_system.query(question)
print(
    f"Question: {question}",
    f"Answer: {answer}",
    sep="\n",
)