# Install all the necessary Python packages

In [None]:
%pip install langchain
%pip install python-dotenv
%pip install pinecone-client
%pip install pypdf
%pip install openai
%pip install tiktoken

In [1]:
import os

from dotenv import find_dotenv, load_dotenv

# Locate the local .env file and load the variables it defines
# (the OpenAI and Pinecone settings read by later cells).
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [2]:
# OpenAI API key from the environment; None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Populating knowledge base

## Step 1 — Get data

In [3]:
# Path of the PDF that is loaded into the knowledge base.
FILE_NAME = "ML.pdf"

In [4]:
from langchain.document_loaders import PyPDFLoader

# Read the PDF named by FILE_NAME; `data` holds whatever documents the loader returns.
loader = PyPDFLoader(file_path=FILE_NAME)
data = loader.load()



## Step 2 — Split data into chunks

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings are gathered in one place so they are easy to tune.
splitter_options = {"chunk_size": 500, "chunk_overlap": 10}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks for embedding.
texts = text_splitter.split_documents(data)

## Step 3 — Generate embeddings

In [6]:
# Import and instantiate the OpenAI embeddings client.
# No arguments are passed; presumably the API key is taken from the
# OPENAI_API_KEY environment variable loaded earlier — TODO confirm.

from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

## Step 4 — Store the embeddings in a vector DB

In [7]:
# Import and instantiate PineCone
import pinecone
from langchain.vectorstores import Pinecone

# Pinecone connection settings come from the environment (see the .env file);
# either value is None when its variable is not set.
pinecone_settings = {
    "api_key": os.getenv('PINECONE_API_KEY'),
    "environment": os.getenv('PINECONE_ENV'),
}
pinecone.init(**pinecone_settings)

  from tqdm.autonotebook import tqdm


In [8]:
import hashlib

index_name = "starter-index"

# Give every chunk a deterministic id derived from its source, position and text.
# Without explicit ids the vector store generates random ones, so every re-run of
# this cell would add a second copy of each chunk to the index and similarity
# search would start returning duplicates. With stable ids a re-run overwrites
# the existing vectors instead.
# NOTE: changing the splitter settings still produces new ids; clear the index
# first if the chunking changes.
chunk_ids = [
    hashlib.sha256(
        f"{doc.metadata.get('source')}|{position}|{doc.page_content}".encode("utf-8")
    ).hexdigest()
    for position, doc in enumerate(texts)
]

# Embed the chunks and upsert them into the Pinecone index.
search = Pinecone.from_documents(
    texts,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

# Question Answering Flow

In [9]:
# Question used for both the similarity search and the QA chain.
query = "Explain novelty detection in simple terms?"

## Step 1 — Search the vector DB

In [10]:
# Retrieve the chunks most similar to the query (4 of them).
top_k = 4
docs = search.similarity_search(query, k=top_k)
print(docs)

[Document(page_content='Novelty Detection is a rather ill-deﬁned problem. It describes the issue\nof determining “unusual” observations given a set of past measurements.\nClearly, the choice of what is to be considered unusual is very subjective.\nA commonly accepted notion is that unusual events occur rarely. Hence a\npossible goal is to design a system which assigns to each observation a rating', metadata={'page': 18.0, 'source': 'ML.pdf'}), Document(page_content='other hand, a hobby athlete might only care that our estimate of the heart\nrate matches the actual on average.\nNovelty Detection is a rather ill-deﬁned problem. It describes the issue\nof determining “unusual” observations given a set of past measurements.\nClearly, the choice of what is to be considered unusual is very subjective.\nA commonly accepted notion is that unusual events occur rarely. Hence a\npossible goal is to design a system which assigns to each observation a rating', metadata={'page': 18.0, 'source': 'ML.

## Step 2 — Set up the LLM

In [11]:
# Load the chat model used to answer the question.
from langchain.chat_models import ChatOpenAI

llm_settings = {"model": 'gpt-3.5-turbo-16k', "temperature": 0.0}
llm = ChatOpenAI(**llm_settings)


## Step 3 — Pass the retrieved documents and the query to the LLM

In [12]:
# Build a question-answering chain of type "stuff" on top of the LLM.
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain(llm, chain_type="stuff")

# Run the chain over the retrieved documents and the original question.
qa_inputs = {"input_documents": docs, "question": query}
answer = chain.run(**qa_inputs)

In [13]:
# Show the answer produced by the question-answering chain.
print(answer)

Novelty detection is the process of identifying unusual or rare observations based on a set of past measurements. It involves determining what is considered unusual, which can be subjective. The goal is to design a system that assigns a rating or score to each observation, indicating how unusual it is compared to the past measurements. For example, in the context of optical character recognition, novelty detection could be used to identify unusual or unexpected characters in a database.
