#### This notebook runs in Colab Pro (Gradio and bigger size pdf summarization) with A100 GPU with 145 compute units, RAM - 83.5 GB, GPU RAM - 40 GB and Disk - 167GB

In [1]:
## GPU size
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Jan 29 18:21:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              43W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# Memory size
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [5]:
# import gradio at first before importing other langchain libraries, else there might be error in importing gradio due to a bug in Colab
#!pip install gradio
import gradio as gr
print(gr.__version__)


4.16.0


In [8]:
#!pip install -q transformers einops accelerate langchain bitsandbytes


In [10]:
 #!pip install pypdfium2

In [11]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain import LLMChain
from langchain import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFium2Loader
import transformers
import torch

This notebook using Llama-2-7b-chat model from Hugging Face. Authethication will be required from Hugging Face

In [13]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [14]:
model = "meta-llama/Llama-2-7b-chat-hf"

# Autotokenizer from transformers library used Hugging Face authentication:
tokenizer = AutoTokenizer.from_pretrained(model)

pipeline = transformers.pipeline(
    "text-generation", #task
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [15]:
def summarize_pdf(pdf_filepath):
    loader = PyPDFium2Loader(pdf_filepath)
    data = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts_FM1 = text_splitter.split_documents(data)

    template = """
              Write a concise summary of the following text delimited by triple backquotes.
              Return your response in bullet points which covers the key points of the text.
              ```{text}```
              BULLET POINT SUMMARY:
           """

    prompt = PromptTemplate(template=template, input_variables=["text"])

    llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})

    llm_chain = LLMChain(prompt=prompt, llm=llm)

    return llm_chain.run(texts_FM1)

In [17]:
# Mount the Google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Get the summary of the one page sample Fannie Mae public pdf document:
pdf_filepath = '/content/drive/MyDrive/Colab Notebooks/GenerativeAI-projects/Data/6_extracted_FM-esg-report-2022.pdf'
print(summarize_pdf(pdf_filepath))

  warn_deprecated(


 • Fannie Mae is a purpose-driven company chartered by Congress to provide liquidity and stability to the residential mortgage market.
            • Fannie Mae's mission is to provide mortgage financing to help people buy or rent a home, while promoting access to mortgage credit for low- and moderate-income families.
            • Fannie Mae does not originate mortgage loans but works with lenders to acquire and securitize loans into mortgage-backed securities that it guarantees.
            • Fannie Mae's revenues are primarily driven by guaranty fees it receives for assuming the credit risk on loans underlying its MBS.
            • As of December 31, 2022, Fannie Mae owned or guaranteed mortgage assets representing 27% of single-family mortgage debt outstanding and 21% of multifamily mortgage debt outstanding


In [19]:
# Setting up the Interface using Gradio:

input_pdf_path = gr.components.Textbox(label="Provide the PDF file path")
output_summary = gr.components.Textbox(label="Summary")


In [21]:
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=input_pdf_path,
    outputs=output_summary,
    title="PDF Summarizer",
    description="Provide PDF file path to get the summary."
).queue().launch(share=True, debug = False)

# Provide the PDF file path in Gradio app : /content/drive/MyDrive/Colab Notebooks/GenerativeAI-projects/Data/6_extracted_FM-esg-report-2022.pdf
# Takes around 8s to see the Summary output

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://be926f7042403638b7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


#### Summarizing Large Documents Using Map Reduce

In [23]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [26]:
#!pip install pdfreader
#!pip install PyPDF2

In [27]:
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from typing_extensions import Concatenate

In [28]:
def summarize_large_pdf(pdf_filepath):

    # provide the path of  pdf file/files.
    pdfreader = PdfReader(pdf_filepath)

    # read text from pdf
    text = ''
    for i, page in enumerate(pdfreader.pages):
        content = page.extract_text()
        if content:
            text += content
    llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})
    print(f"The number of tokens: {llm.get_num_tokens(text)}")

    ## Splittting the text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=20)
    chunks = text_splitter.create_documents([text])
    print(f"length of chunks: {len(chunks)}")
    chain = load_summarize_chain(llm,chain_type='map_reduce',verbose=False)
    summary = chain.run(chunks)
    return summary

In [29]:
# # Get the summary of the 13 pages sample Fannie Mae public pdf document:
pdf_filepath2 = "/content/drive/MyDrive/Colab Notebooks/GenerativeAI-projects/Data/1_updated_FM-esg-report-2022.pdf"
summarize_large_pdf(pdf_filepath2)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (7937 > 1024). Running this sequence through the model will result in indexing errors


The number of tokens: 7937
length of chunks: 4




"\nFannie Mae's Social Index scores assess the proportion of loans in a pool that meet certain social criteria, such as income, borrower, property, and minority tract. The scores are calculated based on the composition of pools issued since 2010 and include eight criteria. The scores are capped at 2.5 for pool-level metrics. The summary provides historical Social Index scores for most active and inactive pools issued since 2010 and highlights the elements of the Social Index methodology."

In [None]:
# Change the model parameters to get a better summary for larger pdf documents

In [30]:
# Setting up the Interface using Gradio:

input_pdf_path = gr.components.Textbox(label="Provide the PDF file path")
output_summary = gr.components.Textbox(label="Summary")

In [1]:
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=input_pdf_path,
    outputs=output_summary,
    title="PDF Summarizer",
    description="Provide PDF file path to get the summary."
).queue().launch(share=True, debug = True)

# Provide the PDF file path in Gradio app : /content/drive/MyDrive/Colab Notebooks/GenerativeAI-projects/Data/1_updated_FM-esg-report-2022.pdf
# There might be some error in Gradio : Summary output
# Stop executing this cell if you get error

In [None]:
# you might get Out of Memory error
# torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.89 GiB. GPU 0 has a total capacty of 39.56 GiB of which 7.39 GiB is free.
# Process 14479 has 32.17 GiB memory in use. Of the allocated memory 23.03 GiB is allocated by PyTorch, and 8.62 GiB is reserved by PyTorch but unallocated.
# If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.
# See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

End