In [1]:
from dotenv import load_dotenv
import os
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS, DocArrayInMemorySearch
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

In [2]:
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

In [3]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(
  openai_api_key=API_KEY,
  model_name="gpt-4",
)
response_text = llm.invoke("What is coding?")
print(response_text)

  llm = ChatOpenAI(


content='Coding, also known as computer programming, is the process of creating instructions for computers using programming languages. It involves writing source code in a specific programming language which is then translated into machine language that the computer can understand and execute. Coding is used to develop software, websites, apps, and other digital platforms or tools.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 64, 'prompt_tokens': 11, 'total_tokens': 75, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run--695ed5e5-fa92-4605-ba29-31f613c9c630-0'


In [4]:
parser = StrOutputParser()
chain = llm | parser
chain.invoke("What is coding?")

'Coding, also known as programming, is the process of creating instructions for computers to follow. It involves writing scripts, functions, and algorithms that guide the computer to perform certain tasks. This can include creating software, apps, websites, video games, and more. Coding languages include Python, Java, C++, JavaScript, and many others.'

In [5]:
file_loader = PyPDFLoader("Knn and Prob-1.pdf")
page = file_loader.load_and_split()
page

[Document(metadata={'producer': 'Adobe PDF Library 20.4.68', 'creator': 'Acrobat PDFMaker 20 for PowerPoint', 'creationdate': '2022-02-21T16:21:50-05:00', 'author': 'Larry Medsker', 'moddate': '2022-02-21T16:22:38-05:00', 'title': 'PowerPoint Presentation', 'source': 'Knn and Prob-1.pdf', 'total_pages': 88, 'page': 0, 'page_label': '1'}, page_content='KNN and Probability'),
 Document(metadata={'producer': 'Adobe PDF Library 20.4.68', 'creator': 'Acrobat PDFMaker 20 for PowerPoint', 'creationdate': '2022-02-21T16:21:50-05:00', 'author': 'Larry Medsker', 'moddate': '2022-02-21T16:22:38-05:00', 'title': 'PowerPoint Presentation', 'source': 'Knn and Prob-1.pdf', 'total_pages': 88, 'page': 1, 'page_label': '2'}, page_content='Classification\n\uf0d8Definition\n\uf076Classification can take two distinct meanings in Machine Learning\n\uf0d8Unsupervised Learning \n\uf076We may be given a set of observations with the aim of establishing \nthe existence of classes or clusters in the data\n\uf0d8S

In [6]:
spliter = RecursiveCharacterTextSplitter(
    chunk_size=100, # how many small pieces you want the pdf to split into
    chunk_overlap=40)
pages = spliter.split_documents(page)

In [7]:
pages

[Document(metadata={'producer': 'Adobe PDF Library 20.4.68', 'creator': 'Acrobat PDFMaker 20 for PowerPoint', 'creationdate': '2022-02-21T16:21:50-05:00', 'author': 'Larry Medsker', 'moddate': '2022-02-21T16:22:38-05:00', 'title': 'PowerPoint Presentation', 'source': 'Knn and Prob-1.pdf', 'total_pages': 88, 'page': 0, 'page_label': '1'}, page_content='KNN and Probability'),
 Document(metadata={'producer': 'Adobe PDF Library 20.4.68', 'creator': 'Acrobat PDFMaker 20 for PowerPoint', 'creationdate': '2022-02-21T16:21:50-05:00', 'author': 'Larry Medsker', 'moddate': '2022-02-21T16:22:38-05:00', 'title': 'PowerPoint Presentation', 'source': 'Knn and Prob-1.pdf', 'total_pages': 88, 'page': 1, 'page_label': '2'}, page_content='Classification\n\uf0d8Definition\n\uf076Classification can take two distinct meanings in Machine Learning'),
 Document(metadata={'producer': 'Adobe PDF Library 20.4.68', 'creator': 'Acrobat PDFMaker 20 for PowerPoint', 'creationdate': '2022-02-21T16:21:50-05:00', 'auth

In [8]:
vector_storage = FAISS.from_documents(
    pages,
    OpenAIEmbeddings(),
)
retriever = vector_storage.as_retriever() 

In [9]:
question_retriever = """
You are a helpful assistant. You will be given a question and a context.
Your task is to answer the question based on the context provided.
If the context does not contain enough information to answer the question, say "I don't know".

context: {context}
question: {question}

"""

In [11]:
prompt = PromptTemplate.from_template(template=question_retriever)
print(prompt.format(context = 'Here is the context',
              question = 'Answer the question based on the context'
              ))


You are a helpful assistant. You will be given a question and a context.
Your task is to answer the question based on the context provided.
If the context does not contain enough information to answer the question, say "I don't know".

context: Here is the context
question: Answer the question based on the context




In [12]:
results = RunnableParallel(context = retriever, question = RunnablePassthrough())

In [13]:
chain = results | prompt | llm | parser

In [14]:
chain.invoke('what is coding?')

"I don't know."

In [15]:
chain.invoke('what is knn?')

'KNN, or K-Nearest Neighbors, is a supervised method for classification.'

In [16]:
retriever.invoke('what is knn?')

[Document(id='77fadf2c-902c-4aa7-8a4b-821d4cae1fcd', metadata={'producer': 'Adobe PDF Library 20.4.68', 'creator': 'Acrobat PDFMaker 20 for PowerPoint', 'creationdate': '2022-02-21T16:21:50-05:00', 'author': 'Larry Medsker', 'moddate': '2022-02-21T16:22:38-05:00', 'title': 'PowerPoint Presentation', 'source': 'Knn and Prob-1.pdf', 'total_pages': 88, 'page': 0, 'page_label': '1'}, page_content='KNN and Probability'),
 Document(id='9d77f66d-63b2-460e-956c-e37431526b08', metadata={'producer': 'Adobe PDF Library 20.4.68', 'creator': 'Acrobat PDFMaker 20 for PowerPoint', 'creationdate': '2022-02-21T16:21:50-05:00', 'author': 'Larry Medsker', 'moddate': '2022-02-21T16:22:38-05:00', 'title': 'PowerPoint Presentation', 'source': 'Knn and Prob-1.pdf', 'total_pages': 88, 'page': 1, 'page_label': '2'}, page_content='one of the existing classes\n\uf076k-NN is a supervised method for classification \n2'),
 Document(id='09f06267-067a-4758-b636-79b10dbd4c21', metadata={'producer': 'Adobe PDF Library 