In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
os.environ['OPEN_API_KEY']=os.getenv("OPENAI_API_KEY")
## LangSmith Tracking
os.environ['LANGCHAIN_API_KEY']=os.getenv("LANGCHAIN_API_KEY")
os.environ['LANGCHAIN_TRACKING_V2']="true"
os.environ['LANGCHAIN_PROJECT']=os.getenv("LANGCHAIN_PROJECT")

In [3]:
## Loading the document
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader('https://en.wikipedia.org/wiki/OpenAI')

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
docs = loader.load()

In [5]:
# Splitting Data
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=550,chunk_overlap=150)
documents = text_splitter.split_documents(docs)

In [6]:
documents_page_content = []
for i in documents :
    documents_page_content.append(i.page_content)

In [7]:
clean_docs = [item.replace("\n", "").strip() for item in documents_page_content]

In [8]:
clean_docs

['OpenAI - WikipediaJump to contentMain menuMain menumove to sidebarhide\t\tNavigation\tMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\t\tContribute\tHelpLearn to editCommunity portalRecent changesUpload fileSpecial pagesSearchSearchAppearanceDonateCreate accountLog inPersonal toolsDonate Create account Log in\t\tPages for logged out editors learn more',
 'DonateCreate accountLog inPersonal toolsDonate Create account Log in\t\tPages for logged out editors learn moreContributionsTalkContentsmove to sidebarhide(Top)1Founding2Corporate StructureToggle Corporate Structure subsection2.1Transition from non-profit2.2Partnership with Microsoft2.3Finances2.4Firing of Altman2.5Acquisitions2.6Corporate partnerships2.7Government contracting',
 '2.3Finances2.4Firing of Altman2.5Acquisitions2.6Corporate partnerships2.7Government contracting3ServicesToggle Services subsection3.1Products3.2Development3.3Transparency3.4Alignment3.5Leaked conversations4ManagementToggle Management

In [9]:
# Embedding the document (Converting Document to Vectors of Numbers)
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [10]:
embeddings_documents = embeddings.embed_documents(clean_docs)

In [11]:
from langchain.schema import Document
# Converting the plain list of string into the document
langchain_document  = [Document(page_content=text) for text in clean_docs]

In [12]:
## Storing in the FAISS database
from langchain_community.vectorstores import FAISS
vectorstoredb = FAISS.from_documents(langchain_document,embeddings)

In [13]:
# Loading the model from OpenAI
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o") # Calling the specific model
print(llm)

client=<openai.resources.chat.completions.completions.Completions object at 0x00000179D1907380> async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x00000179D1907770> root_client=<openai.OpenAI object at 0x00000179D1A0C910> root_async_client=<openai.AsyncOpenAI object at 0x00000179D1A0D1D0> model_name='gpt-4o' model_kwargs={} openai_api_key=SecretStr('**********')


# Different type of chain

1:- stuff-> simple concatenation
Meaning: “Just stuff all the documents together into the prompt.”

Behavior: It literally concatenates all retrieved docs + your query into one big prompt.

Pros: Simple, fast.

Cons: If documents are too many or too large, you might exceed the LLM’s token limit, and it may get confused.

2:- map_reduce-> process separately and then combine
Meaning: Process each document individually with the LLM (“map”), then combine the intermediate results (“reduce”).

Pros: Handles many documents, avoids token limit issues.

Cons: Slightly slower (because LLM is called multiple times).

3:- refine-> incremental refinement
Meaning: Start with the first document as initial answer, then refine the answer with each subsequent document.

Pros: Works well if you want an iteratively improved answer.

Cons: Slower, but can give better synthesis.


In [14]:
retriver = vectorstoredb.as_retriever(search_kwargs={"k":3})

In [15]:
from langchain.chains import RetrievalQA
retriever = vectorstoredb.as_retriever(search_kwargs={"k": 3})
# Build Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)

In [16]:
query1 = "Why Sam Altman fired from the Open AI and on which date he got fired ?"
result = qa_chain.run(query1)
print(result)

  result = qa_chain.run(query1)


Sam Altman was removed as CEO of OpenAI on November 17, 2023, by the board of directors, which cited a lack of confidence in him.
