In [1]:
# Step 1: Importing the libraries
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Step 2: Read the source PDF from disk into LangChain documents
pdf_path = "../bookref/article/TopK_Ganesan.pdf"
loader = UnstructuredPDFLoader(pdf_path)
data = loader.load()

# Sanity check: number of loaded documents and the character count of the first one
print(f'You have {len(data)} documents in the given external source')
print(f'There are {len(data[0].page_content)} characters in this document')

You have 1 documents in the given external source
There are 16111 characters in this document


In [3]:
# Step 3: Split the loaded documents into smaller chunks
# Splitter settings are named so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
split_text = text_splitter.split_documents(data)

# Report how many chunks the splitter produced
print(f'Now we have {len(split_text)} split documents using the given chunk size and overlap')

Now we have 19 split documents using the given chunk size and overlap


In [4]:
# Step 4: Importing the libraries for Embedding + Vectorization
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone, os, tqdm, openai

  from tqdm.autonotebook import tqdm


In [5]:
# Step 5: Configure the OpenAI client and create the embedding model
# The API key is read from the OPENAI_API_KEY environment variable (None if it is unset).
# No text is embedded in this cell; `ai_embeddings` is only constructed here.
openai.api_key = os.environ.get("OPENAI_API_KEY")
ai_embeddings = OpenAIEmbeddings()

In [6]:
# Step 6: Embed the chunks and store the vectors in a Pinecone index
# Pinecone credentials and environment name are read from environment variables.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV")
)
myindex = "bookindex"
# from_documents takes the Document objects directly, so each chunk's metadata
# is passed along with its text instead of only the raw page_content strings.
# NOTE(review): presumably the "bookindex" index must already exist in the
# Pinecone project, and re-running this cell likely inserts the chunks again
# -- confirm both before re-executing.
docsearch = Pinecone.from_documents(
    split_text,
    ai_embeddings,
    index_name=myindex)

In [7]:
# Step 7: Build the LLM and the question-answering chain used by the query cells
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# The API key comes from the environment; temperature=0 is passed to the LLM
# (presumably to keep answers as deterministic as possible -- confirm).
openai_key = os.getenv("OPENAI_API_KEY")
llm = OpenAI(openai_api_key=openai_key, temperature=0)
chain = load_qa_chain(llm, chain_type="stuff")

In [8]:
# Step 8: Ask a question about the research described in the document
query = "What is the recommended next steps in this research?"
# Fetch the stored chunks most similar to the question
docs = docsearch.similarity_search(query)

# Run the QA chain over the retrieved chunks; the answer is shown as the cell output
answer = chain.run(question=query, input_documents=docs)
answer

' Extend the existing analysis pattern to the rest of commonly available financial stock exchange data.'

In [9]:
# Step 9: Ask a question about a person (Ganesan) rather than the research content
query = "tell me about Ganesan?"
# Fetch the stored chunks most similar to the question
docs = docsearch.similarity_search(query)

# Run the QA chain over the retrieved chunks; the answer is shown as the cell output
answer = chain.run(question=query, input_documents=docs)
answer

' Ganesan Senthilvel is a research scholar in the Department of Computer Science at IIT Madras. He is working on a project to analyze large data sets from the NYSE using a Big Data Hadoop framework.'