In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The input PDF used later in this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install PyPDF2
!pip install pdfminer.six
!pip install fpdf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.0-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 27.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.1 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 54.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading PyPDF2-2.10.8-py3-none-any.whl (217 kB)
[K     |████████████████████████████████| 217 kB 24.9 M

# Importing Dependencies


Model description

BigBird relies on block sparse attention instead of normal full attention (i.e., BERT's attention) and can handle sequences up to a length of 4096 at a much lower compute cost than BERT. It has achieved SOTA results on various tasks involving very long sequences, such as long-document summarization and question answering with long contexts.




In [None]:
from transformers import pipeline
import PyPDF2
from pdfminer.high_level import extract_text
import resource
import re
import textwrap
from fpdf import FPDF

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Build the Hugging Face summarization pipeline once, as a module-level global;
# model_summary() below calls this `summarizer` for every text chunk.
# The first run downloads the files of the checkpoint named here.
summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-bigpatent")

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.51M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

# Individual Functions for summarization

In [None]:
# Restores apostrophes in common contractions before the text is saved to the PDF
def prep_b4_save(text):
  """Re-insert apostrophes into contractions that have lost them.

  Every key of the table below is replaced only where it occurs as a whole
  word. Longer words that merely contain a key are left untouched, e.g.
  "significant" (contains "cant"), "parent" (contains "arent"),
  "whatsoever" (contains "whats") or "whose" (contains "whos").
  Matching is case-sensitive.

  Args:
    text: The string to clean up.

  Returns:
    A new string in which the listed apostrophe-less words are replaced
    by their contracted spelling.
  """
  # NOTE(review): 'yours', 'Gods' and 'whore' are valid words on their own;
  # these mappings are kept from the original table -- confirm they are wanted.
  contractions = {
      'Gods': "God's",
      'yours': "your's",
      'dont': "don't",
      'doesnt': "doesn't",
      'isnt': "isn't",
      'havent': "haven't",
      'hasnt': "hasn't",
      'wouldnt': "wouldn't",
      'theyre': "they're",
      'youve': "you've",
      'arent': "aren't",
      'youre': "you're",
      'cant': "can't",
      'whore': "who're",
      'whos': "who's",
      'whatre': "what're",
      'whats': "what's",
      'hadnt': "hadn't",
      'didnt': "didn't",
      'couldnt': "couldn't",
      'theyll': "they'll",
      'youd': "you'd",
  }
  # \b on both sides restricts every alternative to whole-word matches.
  pattern = r'\b(?:' + '|'.join(map(re.escape, contractions)) + r')\b'
  return re.sub(pattern, lambda match: contractions[match.group(0)], text)

In [None]:
# Renders plain text into a PDF and saves it under the given file name
def text_to_pdf(text, filename):
    """Write ``text`` to a portrait A4 PDF in 11 pt Courier.

    The text is split on newlines; each resulting line is wrapped to a fixed
    number of characters and written one wrapped line per PDF cell. Empty
    input lines produce a line break in the PDF.

    Args:
        text: The text to render. Newlines separate the input lines.
        filename: Path of the PDF file to write.
    """
    # Layout constants (values kept from the original implementation).
    a4_width_mm = 200
    pt_to_mm = 0.35
    fontsize_pt = 11
    fontsize_mm = fontsize_pt * pt_to_mm
    margin_bottom_mm = 10
    character_width_mm = 7 * pt_to_mm
    # textwrap slices strings with the width when it has to break a word that
    # is longer than a line, so the width must be an int: a float raises
    # TypeError there. Truncating does not change how ordinary lines wrap.
    width_text = int(a4_width_mm / character_width_mm)

    pdf = FPDF(orientation='P', unit='mm', format='A4')
    pdf.set_auto_page_break(True, margin=margin_bottom_mm)
    pdf.add_page()
    pdf.set_font(family='Courier', size=fontsize_pt)

    for line in text.split('\n'):
        wrapped_lines = textwrap.wrap(line, width_text)

        # textwrap returns [] for empty/whitespace-only lines; keep the gap.
        if not wrapped_lines:
            pdf.ln()

        for wrapped in wrapped_lines:
            pdf.cell(0, fontsize_mm, wrapped, ln=1)

    pdf.output(filename, 'F')
    print("PDF of summary Saved!!")

In [None]:
# Splits a huge corpus of text into small chunks made of whole sentences
def text_chunking(new_text, max_chunk=500):
  """Group the sentences of ``new_text`` into chunks of limited word count.

  Sentences are split only at whitespace that follows '.', '?' or '!', so
  tokens such as "3.14" or "www.example.com" stay intact. Words are counted
  with a whitespace split, so newlines and repeated spaces neither hide
  words nor add empty ones. Consecutive sentences are packed into the
  current chunk while its word count stays at or below ``max_chunk``.
  Whitespace inside a chunk is normalized to single spaces.

  A single sentence with more than ``max_chunk`` words is kept whole and
  forms a chunk of its own, so that chunk exceeds the limit.

  Args:
    new_text: The full text to split.
    max_chunk: Maximum number of words per chunk (default 500).

  Returns:
    A list of chunk strings; empty when the text contains no words.
  """
  chunks = []
  for sentence in re.split(r'(?<=[.?!])\s+', new_text):
    words = sentence.split()
    if not words:
      # Nothing but whitespace (e.g. empty input or trailing blanks).
      continue
    if chunks and len(chunks[-1]) + len(words) <= max_chunk:
      chunks[-1].extend(words)
    else:
      chunks.append(words)

  chunks = [' '.join(words) for words in chunks]
  print("Total chunks of text are: ", len(chunks))
  return chunks

In [None]:
# Summarizes every chunk with the global `summarizer` and collects the results in one list
def model_summary(chunks):
  """Run the summarizer over each chunk, in order.

  Args:
    chunks: Iterable of text chunks to summarize.

  Returns:
    A flat list holding the items returned by ``summarizer`` for every
    chunk, in chunk order.
  """
  print("Summarizing the text. Please wait .......")
  collected = []
  for position, piece in enumerate(chunks, start=1):
    print("Summarizing Chunk NO: ", position)
    collected.extend(
        summarizer(piece, max_length=150, min_length=30, do_sample=False)
    )
  return collected


# Combining all the individual parts into a single function
* The input to this function is the path to the PDF.
* This function does all the pre-processing, gets the summary, and saves it as a PDF.
* The only parameter to this function is the path to the PDF.

In [None]:
def find_summary(pdf_path):
  """Summarize the PDF at ``pdf_path`` and save the summary as a new PDF.

  Steps: extract the text, split it into chunks, summarize each chunk, join
  the chunk summaries, drop characters that cannot be encoded as Latin-1,
  restore apostrophes in contractions, and write the result to
  "<input name without extension>_summary.pdf".

  Args:
    pdf_path: Path to the PDF file to summarize.
  """
  import os  # local import: the notebook's import cell does not provide os

  raw_text = extract_text(pdf_path)      # text of the whole PDF
  chunks = text_chunking(raw_text)       # pieces small enough for the model
  all_summaries = model_summary(chunks)  # one summary entry per chunk
  joined_summary = ' '.join(summ['summary_text'] for summ in all_summaries)
  # Silently drop every character that has no Latin-1 encoding (this is what
  # removes the typographic apostrophes the original author found problematic).
  txt_to_save = joined_summary.encode('latin1', 'ignore').decode('latin1')
  txt_to_save_prep = prep_b4_save(txt_to_save)
  # Derive the output name from the input file name, whatever the length of
  # its extension: ".../the_alchemist.pdf" -> "the_alchemist_summary.pdf".
  base_name = os.path.splitext(os.path.basename(pdf_path))[0]
  file_name = base_name + "_summary.pdf"
  text_to_pdf(txt_to_save_prep, file_name)

In [None]:
# Path of the PDF to summarize, located on the mounted Google Drive.
pdf_path_malizia = "/content/drive/MyDrive/Colab Notebooks/contents/mensitieri2020.pdf"
# Run the full pipeline: extract, chunk, summarize and save the summary PDF.
find_summary(pdf_path_malizia)

Attention type 'block_sparse' is not possible if sequence_length: 699 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Total chunks of text are:  142
Summarizing the text. Please wait .......
Summarizing Chunk NO:  1
Summarizing Chunk NO:  2
Summarizing Chunk NO:  3
Summarizing Chunk NO:  4
Summarizing Chunk NO:  5
Summarizing Chunk NO:  6
Summarizing Chunk NO:  7
Summarizing Chunk NO:  8
Summarizing Chunk NO:  9
Summarizing Chunk NO:  10
Summarizing Chunk NO:  11
Summarizing Chunk NO:  12
Summarizing Chunk NO:  13
Summarizing Chunk NO:  14
Summarizing Chunk NO:  15
Summarizing Chunk NO:  16
Summarizing Chunk NO:  17
Summarizing Chunk NO:  18
Summarizing Chunk NO:  19
Summarizing Chunk NO:  20
Summarizing Chunk NO:  21
Summarizing Chunk NO:  22
Summarizing Chunk NO:  23
Summarizing Chunk NO:  24
Summarizing Chunk NO:  25
Summarizing Chunk NO:  26
Summarizing Chunk NO:  27
Summarizing Chunk NO:  28
Summarizing Chunk NO:  29
Summarizing Chunk NO:  30
Summarizing Chunk NO:  31
Summarizing Chunk NO:  32
Summarizing Chunk NO:  33
Summarizing Chunk NO:  34
Summarizing Chunk NO:  35
Summarizing Chunk NO:  36
