In [None]:
from googlesearch import search

import re
import nltk
# Fetch NLTK's 'punkt' resource when this cell runs; presumably needed by the
# sumy Tokenizer used further down -- TODO confirm.
nltk.download('punkt')
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from transformers import BartForConditionalGeneration, BartTokenizerFast

from typing import Optional

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Identifier of the pretrained BART checkpoint used for abstractive summarization.
checkpoint_summary = "facebook/bart-large-cnn"
# Load the tokenizer and the conditional-generation model from that checkpoint;
# both are later wrapped into tokenizer_dict / model_dict and handed to get_summary.
tokenizer = BartTokenizerFast.from_pretrained(checkpoint_summary)
model = BartForConditionalGeneration.from_pretrained(checkpoint_summary)

In [None]:
def get_lsa_extractive_summary(input_str: str, url: bool = True, sentence_count: Optional[int] = 15, language: Optional[str] = "english") -> str:
    """Get an extractive summary using the LSA (Latent Semantic Analysis) algorithm from a URL or from a text.

    --Parameters
     - input_str (str): the http url of the article to parse, or the raw text itself.
     - url (bool): whether input_str is a url (True) or plain text (False).
     - sentence_count (int): the number of sentences to extract.
     - language (str): the language used for the tokenizer, the stemmer and the stop words.

     return (str) the extracted sentences joined by single spaces
    """
    sentence_tokenizer = Tokenizer(language)
    if url:
        parser = HtmlParser.from_url(input_str, sentence_tokenizer)
    else:
        parser = PlaintextParser.from_string(input_str, sentence_tokenizer)

    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    # Convert each selected sentence through its public string conversion
    # instead of reaching into the private `_text` attribute.
    selected_sentences = summarizer(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in selected_sentences)

def get_summary(dict_summarizer_model, dict_tokenizer, text_content):
    """Generate an abstractive summary of a text with a sequence-to-sequence model.

    --Parameters
     - dict_summarizer_model (dict): holds the generation model under 'model' and the
       'max_length' / 'min_length' values forwarded to ``model.generate``.
     - dict_tokenizer (dict): holds the tokenizer under 'tokenizer' and the 'max_length'
       at which the input text is truncated.
     - text_content (str): the text to summarize.

     return (str) the decoded summary, free of special tokens
    """
    tokenizer = dict_tokenizer['tokenizer']
    model = dict_summarizer_model['model']

    inputs = tokenizer(
        text_content,
        max_length=dict_tokenizer['max_length'],
        truncation=True,
        return_tensors="pt",
    )
    outputs = model.generate(
        inputs["input_ids"],
        max_length=dict_summarizer_model['max_length'],
        min_length=dict_summarizer_model['min_length'],
    )

    # Let the tokenizer drop its own special tokens. The previous regex-based
    # cleanup could not match across newlines and only knew about <s> / </s>,
    # so tokens such as <pad> could survive in the returned text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
  

# Generation settings consumed by get_summary: the loaded model plus the
# max_length / min_length bounds it forwards to model.generate.
model_dict = {'model': model, 'max_length': 512, 'min_length': 120}

# Tokenization settings consumed by get_summary: the loaded tokenizer and the
# max_length at which the input text is truncated.
tokenizer_dict = {'tokenizer': tokenizer, 'max_length': 1024}

In [None]:
def get_summaries_from_search(search_str: str, num_results: int = 5, min_length: int = 700) -> "list[tuple[str, str]]":
    """Search the web for a query and return extractive summaries of the hits.

    --Parameters
     - search_str (str): the query passed to the search engine.
     - num_results (int): how many search hits to request and summarize.
     - min_length (int): minimum summary length in characters; shorter summaries are discarded.

     return (list) the (url, extractive summary) pairs whose summary is long enough
    """
    # tbs="qdr:d" is kept from the original search settings
    # (presumably limits hits to the past day -- confirm against the search library).
    urls = list(search(query=search_str, num=num_results, stop=num_results, pause=2, tbs="qdr:d"))
    summaries = [(url, get_lsa_extractive_summary(input_str=url)) for url in urls]
    # Keep only summaries with enough text to be useful downstream.
    return [(url, summary) for url, summary in summaries if len(summary) >= min_length]
    

In [None]:
# Search query (in Italian): calls for funding under 200 thousand euros for tech start-ups.
text = 'bandi sotto i 200 mila euro start-up tecnologiche'
# Collect the search hits into a list so they can be reused by later cells.
results = list(search(query=text, num=5, stop=5, pause=2, tbs="qdr:d"))
results

In [None]:
# Build (url, extractive summary) pairs for every hit and keep only those
# whose summary is at least 700 characters long.
summaries = [
    (url, extract)
    for url, extract in (
        (link, get_lsa_extractive_summary(input_str=link)) for link in results
    )
    if len(extract) >= 700
]
summaries

In [None]:
# Number of search hits whose extractive summary passed the length filter.
len(summaries)

In [None]:
# Extracts longer than 800 characters are condensed with the abstractive model;
# shorter ones are kept unchanged.
abstract_summaries = [
    (url, get_summary(model_dict, tokenizer_dict, text_content=extract))
    if len(extract) > 800
    else (url, extract)
    for url, extract in summaries
]

In [None]:
# Show the final (url, summary) pairs.
abstract_summaries