In [None]:
import re

import nltk
nltk.download('punkt')
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from PyPDF2 import PdfReader

from transformers import BartForConditionalGeneration, BartTokenizerFast


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, Markdown

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below, so the
# tokenizer and the model are loaded from the same checkpoint.
checkpoint_summary = "facebook/bart-large-cnn"
# Tokenizer and generation model; `tokenizer_dict` and `model_dict` wrap these
# two objects for use by `get_summary`.
tokenizer = BartTokenizerFast.from_pretrained(checkpoint_summary)
model = BartForConditionalGeneration.from_pretrained(checkpoint_summary)

In [None]:
# Settings read by `get_summary` when calling `model.generate`:
# the model itself plus the max/min length bounds passed to generation.
model_dict = dict(model=model, max_length=512, min_length=120)

# Settings read by `get_summary` when tokenizing its input:
# the tokenizer itself plus the `max_length` used together with truncation.
tokenizer_dict = dict(tokenizer=tokenizer, max_length=1024)


def get_extractive_summary_from_text(text: str, language: str = 'ita', sentence_count: int = 15) -> str:
  """Build an extractive summary of `text` with sumy's Edmundson summarizer.

  Args:
    text: Plain text to summarise.
    language: Language identifier handed to sumy's `Tokenizer`, `Stemmer` and
      `get_stop_words`.
    sentence_count: Number of sentences requested from the summarizer.

  Returns:
    The sentences selected by the summarizer, joined by a single space.
    An empty string when `text` is empty or contains only whitespace.
  """
  # Nothing to summarise: return early rather than running the summarizer
  # on an empty document.
  if not text or not text.strip():
    return ''

  parser = PlaintextParser.from_string(text, Tokenizer(language))
  summarizer = EdmundsonSummarizer(Stemmer(language))
  summarizer.null_words = get_stop_words(language)
  summarizer.bonus_words = ['importante', 'incredibile']
  summarizer.stigma_words = ['impossibile', 'difficile', 'complicato']

  selected_sentences = summarizer(parser.document, sentence_count)
  # Join with a space so consecutive sentences do not run into each other
  # ("...fine.Inizio..."); str() avoids reaching into the private `_text` attribute.
  return ' '.join(str(sentence) for sentence in selected_sentences)

def get_summary(text_content: str) -> str:
  """Generate an abstractive summary of `text_content`.

  The tokenizer and model are taken from the module-level `tokenizer_dict`
  and `model_dict`, together with their length settings.

  Args:
    text_content: Text to summarise. It is truncated by the tokenizer to
      `tokenizer_dict['max_length']` tokens.

  Returns:
    The decoded summary with special tokens removed.
  """
  tokenizer = tokenizer_dict['tokenizer']
  model = model_dict['model']

  inputs = tokenizer(
      text_content,
      max_length=tokenizer_dict['max_length'],
      truncation=True,
      return_tensors="pt",
  )
  outputs = model.generate(
      inputs["input_ids"],
      # Pass the mask explicitly so generation does not have to infer it.
      attention_mask=inputs["attention_mask"],
      max_length=model_dict['max_length'],
      min_length=model_dict['min_length'],
  )

  # Let the tokenizer drop its own special tokens instead of stripping
  # "<s>" / "</s>" by regex, which missed other special tokens and did not
  # match across newlines.
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# p1 = reader.pages[0].extract_text()
# p1 = p1.split('Abstract')[1].split('1 Introduction')
# abstract = p1[0]
# intro = p1[1].split('2 Model')[0]

In [None]:
# Open the PDF; the bare `len(...)` expression is shown as cell output because
# `ast_node_interactivity` is set to "all" in the imports cell.
fname = "prova.pdf"
reader = PdfReader(fname)
len(reader.pages)

# Text of the whole document: the extracted text of every page, in page order,
# separated by single spaces.
text = ' '.join(pdf_page.extract_text() for pdf_page in reader.pages)

# LANG = 'ita'
# parser = PlaintextParser.from_string(text, Tokenizer(LANG))
# stemmer = Stemmer(LANG)
# summarizer = Summarizer(stemmer)
# summarizer.stop_words = get_stop_words(LANG)
# s = [s._text for s in summarizer(parser.document, 20)]
# len(s)
# ss = ''.join(s)
# ss

In [None]:
fname = "prova.pdf"
reader = PdfReader(fname)
len(reader.pages)

# Literal text stripped from the accumulated text of each section.
PAGE_HEADER = 'ASPI Programma di ammodernamento  Area Traffico  Capitolato tecnico'

# One entry per section:
# (section key, first page index, end page index (exclusive), heading of the following section).
# The heading of the following section marks where the current section's text is cut.
idxs = [
    ('1 intro', 5, 17, '2 Sistema Traffico AS-IS '), 
    ('2 sistema traffico as-is', 17, 28, '3 Sistema Traffico TO-BE'), 
    ('3 sistema traffico to-be', 28, 46, '4 Oggetto della fornitura'),
    ('4 oggetto della fornitura', 46, 51, '5 Figure e competenze richieste'),
    ('5 figure e competenze richieste', 51, 57, '6 Condizioni generali del contratto')]

# section key -> {'original': section text, 'extractive_summary': ..., 'abstractive_summary': ...}
total = {}
for section_key, first_page, end_page, next_heading in idxs:
    section_text = ''
    for page_index in range(first_page, end_page):
        page = reader.pages[page_index].extract_text()
        # Replace accented letters with their apostrophe spelling.
        page = page.replace("è", "e'").replace("à", "a'")
        section_text += page
        section_text = section_text.replace(PAGE_HEADER, '')
        # Keep only what precedes the heading of the following section.
        section_text = section_text.split(next_heading)[0]

    total[section_key] = {
        'original': section_text,
        # Summarise once per section, after all its pages have been collected,
        # instead of re-running the summarizer after every single page.
        'extractive_summary': get_extractive_summary_from_text(section_text) if section_text else '',
        'abstractive_summary': '',
    }
    # Abstractive step currently disabled:
    # total[section_key]['abstractive_summary'] = get_summary(total[section_key]['extractive_summary'])

In [None]:
# Write every section's extractive summary to one text file, each preceded by
# a ' ----- <SECTION KEY> ------ ' marker. The context manager closes the file
# even if a write fails, and the explicit UTF-8 encoding keeps accented
# characters intact regardless of the platform's default encoding.
with open("exctractive_summary.txt", 'w', encoding='utf-8') as f:
    for key, section in total.items():
        f.write(' ----- ' + key.upper() + ' ------ ' + section['extractive_summary'])

# f = open("abstractive_summary.txt", '+w')
# for key in total.keys():
#     s = ' ----- ' + key.upper() + ' ------ ' + total[key]['abstractive_summary']
#     f.write(s)
# f.close()