In [None]:
!pip install pdf2image
!pip install pytesseract
!pip install ocrmypdf
!pip install opencv-python

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import DeepLake

In [None]:
import os
import getpass

# Prompt for each secret interactively (input is not echoed) and export it as an
# environment variable; nothing is hard-coded in the notebook.
for _env_name, _prompt_text in (
    ("OPENAI_API_KEY", "OpenAI API Key:"),
    ("ACTIVELOOP_TOKEN", "Activeloop Token:"),
):
    os.environ[_env_name] = getpass.getpass(_prompt_text)

In [None]:
# OpenAI LLM wrapper created with temperature=0; used further down through
# llm.get_num_tokens(...) to measure the size of a chunk.
llm = OpenAI(temperature=0)

In [None]:
# Embedding function handed to the DeepLake vector store below.
# disallowed_special=() passes an empty tuple — presumably so that no special
# token is rejected while tokenizing the papers; TODO confirm against the
# OpenAIEmbeddings documentation.
embeddings = OpenAIEmbeddings(disallowed_special=())


In [None]:
import os
import nltk
import PyPDF2

# Directory path where the research papers are stored (a relative path, so it is
# resolved against the current working directory). Only the entries ending in
# '.pdf' are processed by the counting loop below.
directory = 'papers/'

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode and
            closed again before the function returns.

    Returns:
        One string holding each page's extracted text back to back, with no
        separator inserted between pages.

    Exceptions raised by ``open`` or by PyPDF2 are not caught here and
    propagate to the caller.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Defensive: a page that yields no text (None or '') contributes nothing
        # instead of breaking the concatenation. join() also avoids rebuilding
        # the accumulated string once per page.
        return ''.join(page.extract_text() or '' for page in reader.pages)

# Function to count tokens in text
def count_tokens(text):
    """Return how many tokens ``nltk.word_tokenize`` produces for ``text``."""
    return len(nltk.word_tokenize(text))

# Fetch the NLTK 'punkt' resource needed for tokenization
nltk.download('punkt')

# Tally the tokens of every PDF found directly inside `directory`
file_list = os.listdir(directory)
total_tokens = 0

for file_name in file_list:
    # Entries without a '.pdf' suffix are ignored
    if not file_name.endswith('.pdf'):
        continue

    file_path = os.path.join(directory, file_name)
    try:
        text = extract_text_from_pdf(file_path)
        num_tokens = count_tokens(text)
        print(f"File: {file_name}\tTokens: {num_tokens}")
        total_tokens += num_tokens
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the next one
        print(f"Error reading file: {file_path}\n{str(e)}")

print(f"Total Tokens: {total_tokens}")


That's too many tokens, so let's split the text into chunks that fit within the prompt limit. I'm going with a chunk size of 10,000 characters.

In [None]:
# Load the PDFs found under "papers" as LangChain documents
# (presumably one document per PDF page — verify against the loader docs).
loader = PyPDFDirectoryLoader("papers")
docs = loader.load()

In [None]:
# Split the loaded documents into chunks, asking for 10,000 characters per chunk
# with a 500-character overlap between chunks.
# NOTE(review): CharacterTextSplitter presumably splits on a single separator, so
# a chunk may end up larger than chunk_size when the separator is rare — confirm.
# RecursiveCharacterTextSplitter is imported above but not used here.
text_splitter = CharacterTextSplitter(chunk_size=10000, chunk_overlap=500)
texts = text_splitter.split_documents(docs)

In [None]:
# Sanity check on the split: number of chunks, and the token count of the first
# chunk as reported by llm.get_num_tokens.
# Note: texts[0] raises IndexError when the split produced no chunks at all.
num_docs = len(texts)

num_tokens_first_doc = llm.get_num_tokens(texts[0].page_content)

print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

In [None]:
# Open the Deep Lake dataset under the given Activeloop account and add the
# chunks to it, using `embeddings` as the embedding function.
# NOTE(review): re-running this cell presumably adds the same chunks a second
# time — confirm before re-executing against an already populated dataset.
username = "laasya"  # replace with your username from app.activeloop.ai
db = DeepLake(
    dataset_path=f"hub://{username}/research_papers_chunk_size_10000",
    embedding_function=embeddings,
)
db.add_documents(texts)

In [None]:
# Re-open the dataset written by the previous cell in read-only mode for querying.
# The path is built from `username` (defined in that cell) rather than a
# hard-coded account name, so this opens the same dataset that was just
# populated after `username` has been replaced.
db = DeepLake(
    dataset_path=f"hub://{username}/research_papers_chunk_size_10000",
    read_only=True,
    embedding_function=embeddings,
)

In [None]:
# Turn the vector store into a retriever and set its search options in one go.
# NOTE(review): "cos" presumably selects cosine distance, and the "mmr" key is
# passed through as-is — confirm that the DeepLake search actually honours a key
# with this name.
retriever = db.as_retriever()
retriever.search_kwargs.update(
    {
        "distance_metric": "cos",
        "fetch_k": 100,
        "mmr": True,
        "k": 10,
    }
)

In [None]:
import PyPDF2
from nltk.tokenize import sent_tokenize
import nltk
def chunk_pdf_by_sentence(pdf_path, chunk_size, verbose=False):
    """Split a PDF into chunks of at most ``chunk_size`` sentences.

    Each page is tokenized into sentences on its own, so a chunk never mixes
    sentences from two different pages.

    Args:
        pdf_path: Path of the PDF file; opened in binary mode.
        chunk_size: Maximum number of sentences per chunk; must be positive.
        verbose: When True, print each page's extracted text followed by a
            separator line (debugging aid). Off by default so that calling the
            function does not dump the whole document into the cell output.

    Returns:
        A list of strings, each made of up to ``chunk_size`` sentences joined
        by two spaces, in page order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # A page that yields no text is treated as empty rather than
            # failing on None further down.
            text = page.extract_text() or ''
            if verbose:
                print("TEXT: " + text)
                print("*******")

            sentences = sent_tokenize(text)
            for start in range(0, len(sentences), chunk_size):
                chunks.append('  '.join(sentences[start:start + chunk_size]))

    return chunks

In [None]:
def listToString(s):
    """Concatenate the strings in ``s`` into one string, with no separator.

    Args:
        s: An iterable of strings (for example the list returned by
            ``file.readlines()``).

    Returns:
        The items of ``s`` joined back to back; ``""`` when ``s`` is empty.
    """
    # str.join builds the result in one pass instead of growing a string
    # item by item.
    return "".join(s)

In [None]:
from nltk.tokenize import sent_tokenize
def chunk_text_by_sentence(file_path, chunk_size, encoding="utf-8"):
    """Split a plain-text file into chunks of at most ``chunk_size`` sentences.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of sentences per chunk; must be positive.
        encoding: Text encoding used to read the file. Defaults to UTF-8 so the
            result does not depend on the platform's default encoding.

    Returns:
        A list of strings, each made of up to ``chunk_size`` consecutive
        sentences joined by two spaces.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Read the whole file at once; no need to split it into lines and glue
    # them back together before sentence tokenization.
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()

    sentences = sent_tokenize(text)
    return [
        '  '.join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]


Using Map Reduce Chain Type

In [None]:
# Prompt for the map step of the map_reduce chain: it is filled with one
# retrieved document ({context}) and one transcript chunk ({question}).
# The misspelling "provifing" in the instructions was corrected to "providing".
question_prompt_template = """
Understand the following research papers regarding how instructors support student motivation by providing rationales, 
relevance to the real world or enthusiasm in a classroom. The question is a transcript of a lecture. 
Classify every sentence in the question as rationale, relevance, enthusiasm or none. 
Return the exact verbatim of the question, its classification and why it was classified as such. 
DO NOT change the words in the question, keep it as is.
{context}
Question: {question}
Relevant text, if any:"""
QUESTION_PROMPT = PromptTemplate(
    template=question_prompt_template, input_variables=["context", "question"]
)

# Prompt for the combine (reduce) step of the map_reduce chain: it receives the
# transcript chunk ({question}) and the per-document outputs ({summaries}).
# The misspelling "occuring" in the instructions was corrected to "occurring".
combine_prompt_template = """

Given the question and a classification of the question, return the question and your final classification based on 
the most occurring classification of this question.


QUESTION: {question}
=========
{summaries}
=========
Answer:"""
COMBINE_PROMPT = PromptTemplate(
    template=combine_prompt_template, input_variables=["summaries", "question"]
)


# Split the lecture transcript (a plain-text file) into chunks of 10 sentences each
chunks = chunk_text_by_sentence("lab transcript (1).docx.txt", 10)

# Map-reduce QA chain: QUESTION_PROMPT is applied to each retrieved document,
# COMBINE_PROMPT merges the per-document outputs into one answer.
qa_chain = load_qa_chain(
    OpenAI(temperature=0),
    chain_type="map_reduce",
    return_map_steps=False,
    question_prompt=QUESTION_PROMPT,
    combine_prompt=COMBINE_PROMPT,
)

qa = RetrievalQA(combine_documents_chain=qa_chain, retriever=retriever)

for chunk in chunks:
    # Fixed: the original printed the undefined name `chunl` (NameError)
    print("CHUNK:" + chunk)
    # RetrievalQA obtains its documents through `retriever`, so "query" is the
    # only input it needs; the vector store is not a valid "input_documents" value.
    result = qa({"query": chunk}, return_only_outputs=True)
    print(result)
    print("-----------------------------------------------")
