In [1]:
# %pip install -q gradio openai pypdf tiktoken langchain python-dotenv

In [7]:
import os 
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv) before reading the key.
load_dotenv()

# Look the key up in the process environment; the value is None when it is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [8]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str, verbose: bool = True) -> int:
    """Return the number of tokens in ``string`` under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``
            (for example ``"cl100k_base"``).
        verbose: When true (the default, which keeps the original behaviour),
            the encoded token list is printed before the count is returned.
            Pass ``False`` to count silently.

    Returns:
        The length of the token list produced by ``encoding.encode(string)``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Encode once and reuse the result for both the optional print and the count.
    tokens = encoding.encode(string)
    if verbose:
        print(tokens)
    return len(tokens)

# Quick sanity check: count the tokens of a short sample string under the cl100k_base encoding.
num_tokens_from_string("tiktoken is great!", "cl100k_base")

[83, 1609, 5963, 374, 2294, 0]


6

In [9]:
import gradio as gr 
from langchain import OpenAI, PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader

# Notebook-level LLM client, built once and reused by summarize_pdf(); temperature is fixed at 0.
# NOTE(review): no API key is passed here — presumably it is read from the
# OPENAI_API_KEY environment variable; confirm.
llm = OpenAI(temperature=0)

## LangChain part

A function that takes a PDF file path as input and returns a summary of that PDF.

- LangChain's `PyPDFLoader` loads the PDF.
- After that, we split the document into smaller chunks.
- We then use `load_summarize_chain` to create a summarization chain.

In [10]:
def summarize_pdf(pdf_file_path):
    """Summarize the PDF found at ``pdf_file_path``.

    The file is read with ``PyPDFLoader`` and split via ``load_and_split()``;
    the resulting documents are passed to a ``map_reduce`` summarize chain
    built on the notebook-level ``llm``.

    Args:
        pdf_file_path: Location of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        Whatever the chain's ``run`` call returns for the split documents
        (a summary string in the example run in this notebook).
    """
    pages = PyPDFLoader(pdf_file_path).load_and_split()
    summarizer = load_summarize_chain(llm, chain_type='map_reduce')
    return summarizer.run(pages)

In [11]:
# Summarize the sample PDF (a relative path, so the file must be in the working
# directory) and show the returned text as the cell output.
summarize = summarize_pdf("human-nutrition-text.pdf")
summarize

' "Human Nutrition: 2020 Edition" is a comprehensive guide on maintaining a healthy diet and lifestyle, covering topics such as nutrient requirements, dietary guidelines, and the impact of nutrition on overall health. It emphasizes the importance of a balanced diet and traditional foods, and offers information on lifespan nutrition, physical activity, and food safety. The book also discusses the importance of nutrition for different organ systems and includes interactive learning activities. It addresses issues such as food insecurity and the role of nutrition professionals. '

In [12]:
# Illustration of the loading step used inside summarize_pdf(): load the same
# PDF, split it with load_and_split(), and print how many pieces came back.
loader = PyPDFLoader("human-nutrition-text.pdf")
doc = loader.load_and_split()
print(len(doc))

1179


In [13]:
# Peek at the first element returned by load_and_split().
doc[0]

Document(page_content='Human Nutrition: 2020 Edition', metadata={'source': 'human-nutrition-text.pdf', 'page': 0})

## Create a simple Gradio UI (optional)

In [14]:
# Plain text boxes: the user types a PDF path and the summary comes back as text.
input_pdf_path = gr.components.Textbox(label="Provide the PDF file path")
output_summary = gr.components.Textbox(label="Summary")

# Bind `interface` to the Interface object itself rather than to the value
# returned by launch(), so the running server can be stopped later with
# interface.close() or relaunched from another cell.
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=input_pdf_path,
    outputs=output_summary,
    title="PDF Summarizer",
    description="Provide PDF file path to get the summary.",
)

# NOTE(review): share=True publishes a public URL. Whoever has the link can
# submit a path, which is handed to summarize_pdf() on the machine running this
# notebook. Use share=False unless remote access is really needed.
# The trailing semicolon keeps launch()'s return value out of the cell output.
interface.launch(share=True);

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://81d7883a3ae1bff16c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
