Installing required libraries

In [1]:
# Install the third-party packages this notebook relies on; %pip installs into
# the running kernel's environment.
# NOTE(review): versions are not pinned, so a later run may pull different
# releases — consider pinning them.
%pip install --upgrade --quiet langchain openai weaviate-client tiktoken chainlit pypdf

Note: you may need to restart the kernel to use updated packages.


In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from kaggle_secrets import UserSecretsClient
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Typesense
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import weaviate
from weaviate.embedded import EmbeddedOptions
from langchain.vectorstores import Weaviate

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory, then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/books-pdf/fepw101.pdf
/kaggle/input/books-pdf/fepw1ps.pdf
/kaggle/input/books-pdf/fepw107.pdf
/kaggle/input/books-pdf/fepw104.pdf
/kaggle/input/books-pdf/fepw102.pdf
/kaggle/input/books-pdf/fepw106.pdf
/kaggle/input/books-pdf/fepw105.pdf
/kaggle/input/books-pdf/fepw103.pdf


Setting up OpenAI API Key

In [16]:
# Fetch the key from Kaggle's secret store and publish it through the
# OPENAI_API_KEY environment variable, so the key never appears in the notebook.
secrets_client = UserSecretsClient()
os.environ["OPENAI_API_KEY"] = secrets_client.get_secret("OPENAI-API-KEY")

Loading every page of all the PDFs into a single list.

In [17]:
# Accumulates the Document objects produced by the PDF loaders in the next cell.
documents = []

In [18]:
def find_pdf_paths(root):
    """Return the sorted paths of every PDF file found below ``root``.

    Args:
        root: Directory that is walked recursively.

    Returns:
        A sorted list of file paths whose name ends in ``.pdf`` (matched
        case-insensitively). Empty when ``root`` has no PDFs or does not exist.
    """
    return sorted(
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(root)
        for filename in filenames
        if filename.lower().endswith(".pdf")
    )


# Rebuild the list from scratch so that re-running this cell does not append
# every page a second time. Only PDFs are handed to PyPDFLoader, and the sorted
# path order keeps the document order stable across runs.
documents = []
for pdf_path in find_pdf_paths('/kaggle/input'):
    documents.extend(PyPDFLoader(pdf_path).load())

Total number of pages from all the PDFs is 40.

In [19]:
# Count the loaded Document objects (one per PDF page, judging by the 'page'
# entry in each Document's metadata).
len(documents)

40

Viewing the first page.

In [20]:
# Inspect the first loaded Document: its page_content plus source/page metadata.
documents[0]

Document(page_content='THERE once lived a bir d and her two new-bor n babies in a\nforest. They had a nest in a tall, shady tree and there the\nmother bird took care of her little ones day and night.\nOne day, ther e was a big stor m. Ther e was thunder ,\nlightning and rain, and the wind blew down many trees. The\ntall tree in which the birds lived also came down. A big, heavy\nbranch hit the nest and killed the bird. Fortunately for the\nbaby birds, the strong wind blew them away to the other side\nof the forest. One of them came down near a cave where a\ngang of robbers lived. The other landed outside a rishi’s\nashram a little distance away.•A mother bird and her two young ones lived in a forest.\n•The mother was killed in a stor m and the young bir ds wer e\nseparated fr om each other .\n•Each found a dif ferent home.1 11111A Tale of Two\nBirds\nRationalised 2023-24\n', metadata={'source': '/kaggle/input/books-pdf/fepw101.pdf', 'page': 0})

Chunking the text

In [21]:
# Split the loaded pages into chunks (target size 1000 characters, no overlap
# between consecutive chunks).
# NOTE(review): CharacterTextSplitter presumably splits only on its default
# separator, so text lacking that separator may produce chunks larger than
# chunk_size — confirm against the LangChain documentation.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
# Embedding model client.
# NOTE(review): presumably picks up the OPENAI_API_KEY environment variable set earlier — confirm.
embeddings = OpenAIEmbeddings()

Generating vector embeddings and storing them in the Weaviate vector database.

In [22]:
# Start (or attach to) an embedded Weaviate instance; the log output shows it
# running as a local process listening on port 8079.
client = weaviate.Client(
    embedded_options=EmbeddedOptions()
)

# Embed every chunk and store it in Weaviate. The `embeddings` object created in
# the chunking cell is reused rather than constructing a second, identical
# OpenAIEmbeddings client.
# NOTE(review): by_text=False presumably makes similarity search use the query's
# embedding vector instead of Weaviate's own text search — confirm.
vectorstore = Weaviate.from_documents(
    client=client,
    documents=docs,
    embedding=embeddings,
    by_text=False,
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            
{"level":"info","msg":"Created shard langchain_5646fc6966ad4652b1aa546ca590902e_Qw2oP3uObj0K in 2.196131ms","time":"2024-04-11T11:00:10Z"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-11T11:00:10Z","took":144079}


embedded weaviate is already listening on port 8079


/opt/conda/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/opt/conda/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


Creating the retriever used to fetch relevant chunks

In [23]:
# Wrap the vector store in a retriever (default settings) so it can be used as
# the context source of the RAG chain.
retriever = vectorstore.as_retriever()

Setting the prompt

In [24]:
# Prompt layout: a short system-style instruction followed by the user's
# question, the retrieved context and an "Answer:" cue for the model.
# The pieces are written as adjacent literals so the trailing spaces and the
# newlines that are part of the template stay visible.
template = (
    "You are an assistant for question-answering tasks in context of school English \n"
    "stories textbooks. You are expected to generate answer based on a user query.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Show the resulting prompt object, including its detected input variables.
print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='You are an assistant for question-answering tasks in context of school English \nstories textbooks. You are expected to generate answer based on a user query.\nQuestion: {question} \nContext: {context} \nAnswer:\n'))]


Building the generation (RAG) chain

In [25]:
def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt's ``{context}`` slot receives the retriever's
    raw list of Document objects, which is rendered as a Python repr (metadata
    included) instead of plain passage text.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string,
            i.e. the Documents returned by the retriever.

    Returns:
        The page contents separated by blank lines; ``""`` when nothing was
        retrieved.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# temperature=0 requests the least random sampling the model offers.
llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)

# The chain receives the question string, fans it out to the retriever (whose
# results are flattened to text) and to a passthrough, fills the prompt, calls
# the chat model and finally extracts the reply as a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

{"action":"restapi_management","level":"info","msg":"Shutting down... ","time":"2024-04-11T11:00:28Z"}
{"action":"restapi_management","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2024-04-11T11:00:28Z"}


# Function that answers a question using the LangChain RAG chain

In [26]:
def question_answer(question):
    """Answer ``question`` by running it through the notebook's ``rag_chain``.

    Args:
        question: The user's question, passed unchanged to ``rag_chain.invoke``.

    Returns:
        Whatever ``rag_chain.invoke`` returns for the question.
    """
    return rag_chain.invoke(question)

Invoking the function and printing the answer

In [29]:
# Sample multiple-choice question that is passed to question_answer() below.
question = """Which one of the following sums up the story best?
(i) A bird in hand is worth two in the bush.
(ii) One is known by the company one keeps.
(iii) A friend in need is a friend indeed."""

In [28]:
# Run the RAG chain on the sample question and print the returned answer.
print(question_answer(question))

/opt/conda/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/opt/conda/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-04-11T11:00:42Z"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-04-

Embedded weaviate wasn't listening on ports http:8079 & grpc:50060, so starting embedded weaviate again
Started /root/.cache/weaviate-embedded: process ID 123


{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-04-11T11:00:42Z"}
{"level":"info","msg":"Completed loading shard langchain_5646fc6966ad4652b1aa546ca590902e_Qw2oP3uObj0K in 7.01219ms","time":"2024-04-11T11:00:42Z"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-11T11:00:42Z","took":959935}
{"level":"info","msg":"Completed loading shard langchain_dcd6d7f654b7482784c750c20fb56931_yPX2Ik58jMyV in 2.462945ms","time":"2024-04-11T11:00:43Z"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-11T11:00:43Z","took":1347091}


The story best sums up with the moral: (ii) One is known by the company one keeps. This is directly stated by the holy man (rishi) in the story when explaining the behavior of the two birds that were separated during a storm and grew up in vastly different environments—one with robbers and the other at a rishi's ashram. Their behaviors reflected the company they kept, illustrating the impact of one's environment and associations on their actions and character.


/opt/conda/lib/python3.10/site-packages/langchain_community/chat_models/openai.py:460: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/opt/conda/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
