In [4]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import DeepLake


In [5]:
import getpass
import os  # required here: os.environ is used below, before the later cell that imports os

# Prompt for the credentials interactively so they are never stored in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
os.environ["ACTIVELOOP_TOKEN"] = getpass.getpass("Activeloop Token:")

In [6]:
# Completion model reused by later cells (e.g. for the token count of the first chunk).
# temperature=0 is requested so that sampling is as deterministic as possible.
llm = OpenAI(temperature=0)

In [7]:
# Embedding function handed to the Deep Lake vector store in later cells.
# NOTE(review): disallowed_special=() presumably turns off the tokenizer's
# special-token check so arbitrary text can be embedded - confirm.
embeddings = OpenAIEmbeddings(disallowed_special=())


In [6]:
import os
import nltk
import PyPDF2

# Directory path where the research papers are stored. It is a relative path,
# so it is resolved against the notebook's working directory; the token-count
# loop further down in this cell lists it with os.listdir.
directory = 'papers/'

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``; the
    text extracted from each page is concatenated in page order with no
    separator between pages.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        return ''.join(page.extract_text() for page in pdf.pages)

def count_tokens(text):
    """Return how many tokens ``nltk.word_tokenize`` produces for ``text``."""
    return len(nltk.word_tokenize(text))

# Fetch the resource the tokenizer needs
nltk.download('punkt')

# Report a token count for each PDF in the directory, plus a running total
file_list = os.listdir(directory)
total_tokens = 0

for file_name in file_list:
    # Anything that is not a .pdf file is skipped
    if not file_name.endswith('.pdf'):
        continue

    file_path = os.path.join(directory, file_name)
    try:
        text = extract_text_from_pdf(file_path)
        num_tokens = count_tokens(text)
        print(f"File: {file_name}\tTokens: {num_tokens}")
        total_tokens += num_tokens
    except Exception as e:
        # Best effort: report the file that failed and carry on with the rest
        print(f"Error reading file: {file_path}\n{str(e)}")

print(f"Total Tokens: {total_tokens}")


[nltk_data] Downloading package punkt to /Users/laasya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


File: linnenbrink-garcia-et-al-2016-adaptive-motivation-and-emotion-in-education-research-and-principles-for-instructional.pdf	Tokens: 8271
File: Patall & Zambrano - Rationales and Enthusiasm.pdf	Tokens: 8257
File: Schmidt et al_2019_relevance statements (1).pdf	Tokens: 29957
Total Tokens: 46485


That's too many tokens to fit into a single prompt, so let's split our text up into chunks that fit within the prompt limit. I'm going to use a chunk size of 10,000 characters.

You can think of tokens as pieces of words used for natural language processing. For English text, 1 token is approximately 4 characters or 0.75 words. As a point of reference, the collected works of Shakespeare are about 900,000 words or 1.2M tokens.

This means we should expect chunks of roughly 10,000 / 4 = ~2,500 tokens each. The exact number will vary, because each body of text/code is different.

In [8]:
# Load the PDFs under the "papers" folder as LangChain documents; `docs` is
# split into chunks in the next cell.
# NOTE(review): the loader presumably yields one document per PDF page - confirm.
loader = PyPDFDirectoryLoader("papers")
docs = loader.load()

In [9]:
# Split the loaded documents into chunks of at most 10,000 characters with a
# 500-character overlap; `texts` is what gets embedded and stored in Deep Lake.
text_splitter = CharacterTextSplitter(chunk_size=10000, chunk_overlap=500)
texts = text_splitter.split_documents(docs)

In [10]:
# Sanity check on the split: how many chunks were produced, and how many tokens
# the LLM counts in the first one.
num_docs = len(texts)

# Indexes texts[0], so this cell assumes the split produced at least one chunk.
num_tokens_first_doc = llm.get_num_tokens(texts[0].page_content)

print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 40 documents and the first one has 872 tokens


In [11]:
# Create (or open) the Deep Lake dataset for this account and store the chunks
# in it, using `embeddings` as the embedding function.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# dataset again - confirm before re-executing against an existing dataset.
username = "laasya"  # replace with your username from app.activeloop.ai
db = DeepLake(
    dataset_path=f"hub://{username}/research_papers_chunk_size_10000",
    embedding_function=embeddings,
)
db.add_documents(texts)



Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/laasya/research_papers_chunk_size_10000


 

hub://laasya/research_papers_chunk_size_10000 loaded successfully.


Evaluating ingest: 100%|██████████████████████████████████████| 1/1 [00:54<00:00
 

Dataset(path='hub://laasya/research_papers_chunk_size_10000', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape      dtype  compression
  -------   -------   -------    -------  ------- 
 embedding  generic  (40, 1536)  float32   None   
    ids      text     (40, 1)      str     None   
 metadata    json     (40, 1)      str     None   
   text      text     (40, 1)      str     None   


['499a8ab0-22a6-11ee-aa35-124a06d5bb2d',
 '499a8cd6-22a6-11ee-aa35-124a06d5bb2d',
 '499a8d1c-22a6-11ee-aa35-124a06d5bb2d',
 '499a8d62-22a6-11ee-aa35-124a06d5bb2d',
 '499a8d9e-22a6-11ee-aa35-124a06d5bb2d',
 '499a8dda-22a6-11ee-aa35-124a06d5bb2d',
 '499a8e16-22a6-11ee-aa35-124a06d5bb2d',
 '499a8e5c-22a6-11ee-aa35-124a06d5bb2d',
 '499a8e98-22a6-11ee-aa35-124a06d5bb2d',
 '499a8ed4-22a6-11ee-aa35-124a06d5bb2d',
 '499a8f10-22a6-11ee-aa35-124a06d5bb2d',
 '499a8f42-22a6-11ee-aa35-124a06d5bb2d',
 '499a8f88-22a6-11ee-aa35-124a06d5bb2d',
 '499a8fc4-22a6-11ee-aa35-124a06d5bb2d',
 '499a9000-22a6-11ee-aa35-124a06d5bb2d',
 '499a9032-22a6-11ee-aa35-124a06d5bb2d',
 '499a9082-22a6-11ee-aa35-124a06d5bb2d',
 '499a90b4-22a6-11ee-aa35-124a06d5bb2d',
 '499a90f0-22a6-11ee-aa35-124a06d5bb2d',
 '499a912c-22a6-11ee-aa35-124a06d5bb2d',
 '499a9172-22a6-11ee-aa35-124a06d5bb2d',
 '499a91ae-22a6-11ee-aa35-124a06d5bb2d',
 '499a91ea-22a6-11ee-aa35-124a06d5bb2d',
 '499a9226-22a6-11ee-aa35-124a06d5bb2d',
 '499a9262-22a6-

In [16]:
# Re-open the dataset in read-only mode for querying; this rebinds `db`.
# The dataset path is hard-coded here rather than built from `username`, so it
# must be edited by hand when the dataset lives under a different account.
db = DeepLake(
    dataset_path="hub://laasya/research_papers_chunk_size_10000",
    read_only=True,
    embedding_function=embeddings,
)

-

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/laasya/research_papers_chunk_size_10000



\

hub://laasya/research_papers_chunk_size_10000 loaded successfully.

Deep Lake Dataset in hub://laasya/research_papers_chunk_size_10000 already exists, loading from the storage
Dataset(path='hub://laasya/research_papers_chunk_size_10000', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape      dtype  compression
  -------   -------   -------    -------  ------- 
 embedding  generic  (40, 1536)  float32   None   
    ids      text     (40, 1)      str     None   
 metadata    json     (40, 1)      str     None   
   text      text     (40, 1)      str     None   


  

In [17]:
# Build a retriever on top of the vector store and tune its search options:
# cosine distance, a candidate pool of 100 (fetch_k), and 10 results per query (k).
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 100
# NOTE(review): LangChain's Deep Lake examples spell this option
# "maximal_marginal_relevance"; confirm that the installed version honours "mmr",
# otherwise MMR re-ranking (and fetch_k) may have no effect.
retriever.search_kwargs["mmr"] = True
retriever.search_kwargs["k"] = 10

In [12]:
import PyPDF2
from nltk.tokenize import sent_tokenize
import nltk
def chunk_pdf_by_sentence(pdf_path, chunk_size):
    """Split a PDF into text chunks of at most ``chunk_size`` sentences.

    Each page is sentence-tokenized on its own, so a chunk never spans two
    pages and the last chunk of a page may hold fewer than ``chunk_size``
    sentences. Pages that yield no sentences contribute no chunks.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Maximum number of sentences per chunk; must be at least 1.

    Returns:
        A list of strings, each made of one chunk's sentences joined by two
        spaces, in page order.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A negative step makes range() empty, which used to return no chunks at all
    # without any error; a zero step only failed inside range(). Reject both here.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            sentences = sent_tokenize(page.extract_text())
            for start in range(0, len(sentences), chunk_size):
                chunks.append('  '.join(sentences[start:start + chunk_size]))

    return chunks

Using Map Reduce Chain Type

In [42]:
# Prompt passed as `question_prompt` to the map_reduce QA chain built below.
# {context} receives paper text and {question} receives a chunk of the lecture
# transcript whose sentences are to be classified.
question_prompt_template = """
Understand the following research papers regarding how instructors support student motivation by providing rationales, 
relevance to the real world or enthusiasm in a classroom. The question is a transcript of a lecture. 
Classify every sentence in the question as rationale, relevance, enthusiasm or none. 
Return the exact verbatim of the question, its classification and why it was classified as such. 
DO NOT change the words in the question, keep it as is.
{context}
Question: {question}
Relevant text, if any:"""
QUESTION_PROMPT = PromptTemplate(
    template=question_prompt_template, input_variables=["context", "question"]
)

# Prompt passed as `combine_prompt` to the map_reduce QA chain built below.
# {question} is the transcript chunk and {summaries} is filled in by the chain.
combine_prompt_template = """

Given the question and a classification of the question, return the question and your final classification.


QUESTION: {question}
=========
{summaries}
=========
Answer:"""
COMBINE_PROMPT = PromptTemplate(
    template=combine_prompt_template, input_variables=["summaries", "question"]
)


# Transcript chunks of at most 1000 sentences each (chunks never cross a page boundary).
chunks = chunk_pdf_by_sentence("lab transcript (1).docx.pdf", 1000)  # Calling function to chunk the pdf by sentence


qa_chain = load_qa_chain(
    OpenAI(temperature=0),
    chain_type="map_reduce",
    return_map_steps=False,
    question_prompt=QUESTION_PROMPT,
    combine_prompt=COMBINE_PROMPT,
)

qa = RetrievalQA(combine_documents_chain=qa_chain, retriever=retriever)

for chunk in chunks:
    # RetrievalQA takes only "query" as input and fetches its documents through
    # `retriever`; the vector store itself must not be passed in as an input.
    result = qa({"query": chunk}, return_only_outputs=True)
    print(result)
    print("-----------------------------------------------")


{'result': " So, I'll post all the answers together, I'll actually post this even now just so you have them I'll post the answers even if we don't complete them this week.\nClassification: Relevance \nExplanation: This sentence is classified as relevance because it is making a connection between the science content and the students' experiences or goals outside of the immediate learning context. The teacher is suggesting that the students should still complete the problems even if they don't finish them in class, as it will be beneficial for them to have the answers for future reference."}
-----------------------------------------------
{'result': " So, you are doing indeed chemistry where you’re able to label say a cell with a certain fluorophore or certain compound that lights up right under conditions that you would like, but you’re not actually disturbing the functioning of that particular cell which is you know it’s mind blowing.\nClassification: Relevance \nReason: The sentence i

KeyboardInterrupt: 

Using Stuff chain type

In [None]:
# Prompt for the "stuff" QA chain built below. {context} receives paper text and
# {question} receives a chunk of the lecture transcript to classify.
prompt_template = """
Understand the following research papers regarding how instructors support student motivation by providing rationales, 
relevance to the real world or enthusiasm in a classroom and
see if the text can be used to classify the question
as rationale, relevance, enthusiasm or none. 
Return the verbatim of the question, its classification and why it was classified as such.
{context}
Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Hand the custom prompt to the underlying "stuff" documents chain.
chain_type_kwargs = {"prompt": PROMPT}

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
)

# Transcript chunks of at most 3000 sentences each (chunks never cross a page boundary).
chunks = chunk_pdf_by_sentence("lab transcript (1).docx.pdf", 3000)  # Calling function to chunk the pdf by sentence

for chunk in chunks:
    # RetrievalQA takes only "query" as input; the prompt's {context} is filled
    # from the retrieved documents, so the vector store is not passed in here.
    result = qa({"query": chunk})
    print(result['result'])
    print("-----------------------------------------------")