# BART LARGE CNN

In [None]:
import os
import sys
import json

import torch
torch.random.manual_seed(0)

import spacy

from transformers import AutoTokenizer, BartForConditionalGeneration, BartTokenizerFast
from transformers import logging as T_LOGGER
T_LOGGER.set_verbosity_error()

In [None]:
from newspaper import Article

from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

# Load the tokenizer published with the BigBird-Pegasus PubMed checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-pubmed")

# by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed")

# decoder attention type can't be changed & will be "original_full"
# you can change `attention_type` (encoder only) to full attention like this:
# model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed", attention_type="original_full")

# you can change `block_size` & `num_random_blocks` like this:
# model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed", block_size=16, num_random_blocks=2)


# NOTE(review): `text` is not assigned anywhere in the visible source before this
# line - confirm it is defined by an earlier cell, otherwise this raises NameError.
inputs = tokenizer(text, return_tensors='pt')
# Generate with no extra generation arguments, then decode the generated ids back to strings.
prediction = model.generate(**inputs)
prediction = tokenizer.batch_decode(prediction)


In [None]:
from IPython.core.interactiveshell import InteractiveShell
# With "all", IPython displays the value of every expression statement in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# # checkpoint = "google/bigbird-pegasus-large-pubmed"  # NO
# checkpoint = "knkarthick/MEETING_SUMMARY" # interesting
# checkpoint = "philschmid/bart-large-cnn-samsum" # seems very similar to the regular bart-large-cnn but bigger
# checkpoint = "facebook/bart-large-cnn"

# tokenizer = BartTokenizerFast.from_pretrained(checkpoint)
# model = BartForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
# Shared spaCy pipeline; get_nest_sentences and extractive_summary_pipeline read it
# as the global `nlp` to parse text and split it into sentences.
nlp = spacy.load("en_core_web_sm")

def get_summary_pegasus(sentence, model, tokenizer):
    """
    Summarize a text with a seq2seq model and return the decoded generations.

    ---Params
    - sentence: the text to summarize.
    - model: object exposing `generate(**encoded_inputs)`.
    - tokenizer: callable that encodes the text (called with return_tensors='pt')
      and exposes `batch_decode`.

    ---Returns
    Whatever `tokenizer.batch_decode` returns for the generated output
    (special tokens are not stripped here).
    """
    encoded = tokenizer(sentence, return_tensors='pt')
    generated = model.generate(**encoded)
    return tokenizer.batch_decode(generated)

def get_summary(sentence, model, tokenizer):
    """
    Summarize a text using the model's own summarization generation settings.

    ---Params
    - sentence: the text to summarize; it is encoded with truncation and padding enabled.
    - model: object exposing `generate` and a `config.task_specific_params`
      mapping that contains a 'summarization' entry (used as generation kwargs).
    - tokenizer: object exposing `encode` and `decode`.

    ---Returns
    The decoded first generated sequence, with special tokens skipped.
    """
    encoded = tokenizer.encode(
        sentence,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    generation_kwargs = model.config.task_specific_params['summarization']
    generated = model.generate(encoded, **generation_kwargs)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length = 1024):
    """
    Split a long document into consecutive chunks of whole sentences, such that the
    summed per-sentence token counts of each chunk do not exceed token_max_length.

    Sentences are never split: a single sentence that is longer than
    token_max_length on its own becomes a chunk by itself (and is therefore over
    the limit). Sentences inside a chunk are joined with a single space.

    ---Params
    - document: the long text (str)
    - tokenizer: the pre-trained tokenizer used to count tokens; each sentence is
      tokenized on its own, so the count includes whatever the tokenizer adds
      per call.
    - token_max_length: the maximum number of tokens accepted by the NLP model (int)

    ---Returns
    A list of chunk strings in document order. An empty document yields [''].
    """
    chunks = []
    current_sentences = []  # sentences of the chunk currently being filled
    current_length = 0      # tokens accumulated in the current chunk
    for sentence in nlp(document).sents:
        # Index by "input_ids" rather than [0]: integer indexing is only
        # available on the output of fast tokenizers.
        encoded = tokenizer(sentence.text, truncation=False, padding=False)
        n_tokens = len(encoded["input_ids"])
        # Close the current chunk before it would overflow. The `current_sentences`
        # guard avoids emitting an empty chunk when the very first sentence is
        # already longer than the limit.
        if current_sentences and current_length + n_tokens > token_max_length:
            chunks.append(' '.join(current_sentences))
            current_sentences = []
            current_length = 0
        current_sentences.append(sentence.text)
        # The sentence that opens a new chunk counts towards that chunk's budget.
        current_length += n_tokens
    # Flush the trailing chunk; for a document without sentences keep returning
    # a single empty string, as callers may rely on a non-empty list.
    if current_sentences or not chunks:
        chunks.append(' '.join(current_sentences))
    print(f'Returning {len(chunks)} number of chunk strings')
    return chunks

In [None]:
message = """Solar physics is the branch of astrophysics that specializes in the study of the Sun. 
It deals with detailed measurements that are possible only for our closest star. It intersects with many disciplines of pure physics, 
astrophysics, and computer science, including fluid dynamics, plasma physics including magnetohydrodynamics, seismology, particle physics, atomic physics, nuclear physics, 
stellar evolution, space physics, spectroscopy, radiative transfer, applied optics, signal processing, computer vision, computational physics, stellar physics and solar astronomy.

Because the Sun is uniquely situated for close-range observing (other stars cannot be resolved with anything like the spatial or temporal resolution that the Sun can), 
there is a split between the related discipline of observational astrophysics (of distant stars) and observational solar physics.

The study of solar physics is also important as it provides a "physical laboratory" for the study of plasma physics. Babylonians were keeping a record of solar eclipses, 
with the oldest record originating from the ancient city of Ugarit, in modern-day Syria. This record dates to about 1300 BC.[2] Ancient Chinese astronomers were also observing solar phenomena 
(such as solar eclipses and visible sunspots) with the purpose of keeping track of calendars, which were based on lunar and solar cycles. 
Unfortunately, records kept before 720 BC are very vague and offer no useful information.
However, after 720 BC, 37 solar eclipses were noted over the course of 240 years. Astronomical knowledge flourished in the Islamic world during medieval times. 
Many observatories were built in cities from Damascus to Baghdad, where detailed astronomical observations were taken. Particularly, a few solar parameters were measured 
and detailed observations of the Sun were taken. Solar observations were taken with the purpose of navigation, but mostly for timekeeping. Islam requires its followers 
to pray five times a day, at specific position of the Sun in the sky. As such, accurate observations of the Sun and its trajectory on the sky were needed. 
In the late 10th century, Iranian astronomer Abu-Mahmud Khojandi built a massive observatory near Tehran. There, he took accurate measurements of a series of meridian transits of the Sun, 
which he later used to calculate the obliquity of the ecliptic.[4] 
Following the fall of the Western Roman Empire, Western Europe was cut from all sources of ancient scientific knowledge, 
especially those written in Greek. This, plus de-urbanisation and diseases such as the Black Death led to a decline in scientific knowledge in Medieval Europe, 
especially in the early Middle Ages. During this period, observations of the Sun were taken either in relation to the zodiac, or to assist in building places 
of worship such as churches and cathedrals. 
In astronomy, the renaissance period started with the work of Nicolaus Copernicus. 
He proposed that planets revolve around the Sun and not around the Earth, as it was believed at the time. 
This model is known as the heliocentric model.[6] His work was later expanded by Johannes Kepler and Galileo Galilei. 
Particularly, Galilei used his new telescope to look at the Sun. 
In 1610, he discovered sunspots on its surface. In the autumn of 1611, Johannes Fabricius wrote the first book on sunspots, De Maculis in Sole Observatis ("On the spots observed in the Sun").
Modern day solar physics is focused towards understanding the many phenomena observed with the help of modern telescopes and satellites. 
Of particular interest are the structure of the solar photosphere, 
the coronal heat problem and sunspots. 

The Solar Physics Division of the American Astronomical Society boasts 555 members (as of May 2007), 
compared to several thousand in the parent organization.[8]
A major thrust of current (2009) effort in the field of solar physics is integrated understanding of the entire Solar System including the Sun and its effects 
throughout interplanetary space within the heliosphere and on planets and planetary atmospheres. Studies of phenomena that affect multiple systems in the heliosphere, 
or that are considered to fit within a heliospheric context, are called heliophysics, a new coinage that entered usage in the early years of the current millennium.
Helios-A and Helios-B are a pair of spacecraft launched in December 1974 and January 1976 from Cape Canaveral, as a joint venture between the German Aerospace Center and NASA. 
Their orbits approach the Sun closer than Mercury. They included instruments to measure the solar wind, 
magnetic fields, cosmic rays, and interplanetary dust. Helios-A continued to transmit data until 1986
"""

In [None]:
from typing import List, Dict

import os
import sys
import logging
# Configure root logging: records go to stdout, prefixed with a timestamp, at INFO level.
logging.basicConfig(stream=sys.stdout, format='%(asctime)-15s %(message)s',
                level=logging.INFO, datefmt=None)
# Named logger used by the extractive summarization functions below.
logger = logging.getLogger("Summarizer")


from operator import attrgetter
from collections import Counter, namedtuple
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

m1 = """
Solar physics is the branch of astrophysics that specializes in the study of the Sun.
It deals with detailed measurements that are possible only for our closest star.It intersects with many disciplines of pure physics, astrophysics, and computer science, 
including fluid dynamics, plasma physics including magnetohydrodynamics, seismology, particle physics, atomic physics, nuclear physics, 
stellar evolution, space physics, spectroscopy, radiative transfer, applied optics, signal processing, computer vision, computational physics, stellar physics and solar astronomy.
Because the Sun is uniquely situated for close-range observing (other stars cannot be resolved with anything like the spatial or temporal resolution that the Sun can), 
there is a split between the related discipline of observational astrophysics (of distant stars) and observational solar physics. The study of solar physics is also important 
as it provides a "physical laboratory" for the study of plasma physics.Babylonians were keeping a record of solar eclipses, 
with the oldest record originating from the ancient city of Ugarit, in modern-day Syria.This record dates to about 1300 BC.
Ancient Chinese astronomers were also observing solar phenomena \n(such as solar eclipses and visible sunspots) with the purpose of keeping track of calendars, 
which were based on lunar and solar cycles.\nUnfortunately, records kept before 720 BC are very vague and offer no useful information.
However, after 720 BC, 37 solar eclipses were noted over the course of 240 years.Astronomical knowledge flourished in the Islamic world during medieval times.
Many observatories were built in cities from Damascus to Baghdad, where detailed astronomical observations were taken.Particularly, a few solar parameters were measured 
and detailed observations of the Sun were taken.
"""
# Record used while ranking sentences: the sentence itself, its position
# (`order`) among the scored sentences, and its score (`rates`).
SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))

def get_significant_words_list(doc: spacy.tokens.doc.Doc) -> List[str]:
    """
    Get the list of words that are important for the text.

    A token is kept when its part-of-speech tag is PROPN, ADJ, NOUN or VERB and
    its text is neither a stop word nor punctuation. Comparisons use the token
    text as-is (no lower-casing).

    ---Params
    - doc: the parsed document whose tokens are inspected.

    ---Returns
    The text of every kept token, in document order, duplicates included.
    """
    pos_tags = {'PROPN', 'ADJ', 'NOUN', 'VERB'}
    # STOP_WORDS is tested directly instead of being copied into a list on every
    # call and scanned linearly for each token.
    # `punctuation` is a str, so `in` is a substring test (an empty token text
    # also counts as punctuation).
    return [
        token.text
        for token in doc
        if token.text not in STOP_WORDS
        and token.text not in punctuation
        and token.pos_ in pos_tags
    ]

def get_frequency_words(words: List[str]) -> Counter:
    """
    Get a counter with the frequency of each word, normalized so that the most
    frequent word has value 1.0.

    ---Params
    - words: the words to count.

    ---Returns
    A Counter mapping each distinct word to count / max_count (floats in (0, 1]).
    An empty input yields an empty Counter instead of raising IndexError.
    """
    counts = Counter(words)
    if not counts:
        # Nothing to normalize; there is no maximum to divide by.
        return counts
    max_freq = max(counts.values())
    return Counter({word: count / max_freq for word, count in counts.items()})
   
def get_sent_strenght(doc: spacy.tokens.doc.Doc, freq_word: Counter) -> Dict:
    """
    Score each sentence by summing the frequencies of the known words it contains.

    ---Params
    - doc: the parsed document; its `sents` are iterated, and each sentence is
      iterated token by token.
    - freq_word: mapping from word text to its (normalized) frequency.

    ---Returns
    A dictionary whose keys are the sentence objects yielded by `doc.sents` and
    whose values are the summed frequencies. Sentences that contain no word
    present in freq_word are left out.
    """
    scores = {}
    for sentence in doc.sents:
        for token in sentence:
            text = token.text
            if text not in freq_word:
                continue
            scores[sentence] = scores.get(sentence, 0) + freq_word[text]
    return scores

def get_extractive_summary(sent_strenght: Dict, n_sents: int = 5):
    """
    Pick the highest-scoring sentences and return their text in original order.

    ---Params
    - sent_strenght: mapping from sentence object (exposing `.text`) to its score;
      the iteration order of the mapping is taken as the sentence order.
    - n_sents: maximum number of sentences to keep.

    ---Returns
    A tuple with the text of the selected sentences, ordered by their position
    in sent_strenght.
    """
    ranked = [
        SentenceInfo(sentence, position, score)
        for position, (sentence, score) in enumerate(sent_strenght.items())
    ]
    # Stable descending sort: among equal scores the earlier sentence wins.
    ranked.sort(key=attrgetter("rates"), reverse=True)
    selected = sorted(ranked[:n_sents], key=attrgetter("order"))
    logger.info(f"Extracted {len(selected)} sentences ...")
    return tuple(info.sentence.text for info in selected)


def extractive_summary_pipeline(doc: str, n_sents: int = 5) -> str:
    """
    Get a final extractive summary of a text.

    The text is parsed with the global `nlp` pipeline, its sentences are scored
    by the normalized frequency of their significant words, and the n_sents
    best sentences are kept in document order. The first sentence of the text
    is always part of the result: if it was not selected it is prepended, so
    the summary may contain up to n_sents + 1 sentences.

    ---Params
    - doc: the text to summarize (str)
    - n_sents: maximum number of top-ranked sentences to keep (int)

    ---Returns
    The selected sentences joined by single spaces. Returns '' when the text
    has no sentences, and only the first sentence when the text has no
    significant words to rank.
    """
    parsed = nlp(doc)
    # Materialize the sentences once; they are needed for the count and the opener.
    sentences = list(parsed.sents)
    logger.info(f"Starting to compute summary from {len(sentences)} sentences ...")
    if not sentences:
        return ''
    start_sentence = sentences[0].text

    words = get_significant_words_list(parsed)
    if not words:
        # Nothing to rank (the frequency step cannot normalize an empty list).
        return start_sentence
    freq_word = get_frequency_words(words)
    sent_strenght = get_sent_strenght(parsed, freq_word)

    summaries = get_extractive_summary(sent_strenght, n_sents=n_sents)
    if start_sentence in summaries:
        return ' '.join(summaries)
    # Join with a space so the opening sentence is not glued to the next one.
    return ' '.join((start_sentence, *summaries))

In [None]:
# Run the extractive pipeline on the sample `message` text, asking for the
# 5 top-ranked sentences, and show the resulting summary string.
summaries = extractive_summary_pipeline(doc=message, n_sents=5)
print(summaries)