In [None]:
import json
import os
import time

import numpy as np
import requests
import spacy
import torch
import transformers
from keybert import KeyBERT
from newspaper import Article
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
# transformers.logging.set_verbosity_error()

# Make the notebook display the value of every bare expression in a cell,
# not only the last one (several cells below rely on this, e.g. two `len(...)` lines).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Download and parse the source article; the global `article` is reused by later cells.
url = 'https://en.wikipedia.org/wiki/Dante_Alighieri'

article = Article(url)
article.download()
article.parse()
# Displayed as cell output: length of the extracted article text.
len(article.text)

In [None]:
# Load the tokenizer and the seq2seq model used by the local-inference cells below.
# NOTE(review): the tokenizer is taken from the MNLI checkpoint while the weights come
# from the CNN summarisation checkpoint — presumably both share one vocabulary; confirm.
tokenizer_checkpoint = 'facebook/bart-large-mnli'
model_checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# s = "VDT/Addetti ad AttivitÃ  AttivitÕ di Ufficio Turnisti - DT, à"
# s.encode('utf-8')

In [None]:
def my_nest_sentences(document: str, token_max_length = 1024, tokenizer = None, nlp_pipeline = None):
  """Split ``document`` into chunks of whole sentences that fit a token budget.

  Sentences are taken from ``nlp_pipeline(document).sents`` and packed greedily:
  a sentence is added to the current chunk while the running token count stays
  strictly below ``token_max_length``; otherwise the current chunk is closed and
  the sentence starts a new one. Sentences inside a chunk are joined with a
  single space.

  Args:
    document: Text to split.
    token_max_length: Token budget per chunk.
    tokenizer: Callable returning a mapping with an ``'input_ids'`` entry for a
      string input. When omitted, the ``facebook/bart-large-mnli`` tokenizer is
      loaded (pass one in to avoid reloading it on every call).
    nlp_pipeline: Callable returning an object with a ``sents`` iterable whose
      items expose ``.text``. When omitted, the module-level ``nlp`` is used.

  Returns:
    List of chunk strings; empty when the document contains no sentences.
    A single sentence longer than the budget is returned as its own chunk.
  """
  if tokenizer is None:
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
  if nlp_pipeline is None:
    nlp_pipeline = nlp  # global spaCy pipeline, resolved at call time

  chunks = []
  current = []      # sentences collected for the chunk being built
  current_len = 0   # tokens already counted for `current`
  for sentence in nlp_pipeline(document).sents:
    text = sentence.text.strip()
    if not text:
      continue  # whitespace-only "sentences" carry no content
    # Each count includes whatever special tokens the tokenizer adds per call,
    # so the running sum is, if anything, an over-estimate of the chunk size.
    n_tokens = len(tokenizer(text, truncation=False, padding=False)['input_ids'])
    if current and current_len + n_tokens >= token_max_length:
      chunks.append(' '.join(current))
      current, current_len = [], 0
    current.append(text)
    # The sentence that opens a new chunk is counted towards that chunk's budget.
    current_len += n_tokens
  if current:
    chunks.append(' '.join(current))  # trailing, possibly short, chunk
  return chunks

# Hugging Face Inference API configuration.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. Any token that was previously committed
# in this cell must be treated as leaked and revoked.
_hf_token = os.environ.get("HF_TOKEN", "").strip()
# Without a token the Authorization header is omitted instead of sending a
# malformed "Bearer " value.
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}
API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"

def download_text(url: str):
    """Download the page at ``url``, parse it and return the ``Article`` object."""
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    return fetched

def get_hf_inference_data_input(article_text):
    """Return the JSON request body for the inference API.

    The body carries ``article_text`` under ``inputs`` and turns sampling off
    through ``parameters.do_sample``.
    """
    return json.dumps({
        'inputs': article_text,
        'parameters': {'do_sample': False},
    })


def generate_summary(url: str):
    """Download the article at ``url`` and return its summary text.

    The request/response handling is delegated to ``generate_msummary`` so the
    two entry points cannot drift apart.
    """
    article = download_text(url)
    return generate_msummary(article.text)

def generate_msummary(text: str, timeout: float = 60.0):
    """Summarise ``text`` through the remote inference endpoint at ``API_URL``.

    Args:
        text: Text to summarise.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``summary_text`` field of the first element of the JSON response.

    Raises:
        requests.HTTPError: The endpoint answered with an error status; the
            response body is included in the message.
        RuntimeError: The response was valid JSON but did not have the
            expected ``[{"summary_text": ...}]`` shape.
    """
    data = get_hf_inference_data_input(text)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(API_URL, headers=headers, data=data, timeout=timeout)
    if not response.ok:
        # Surface the body: it carries the service's own error description.
        raise requests.HTTPError(
            f"Inference API returned HTTP {response.status_code}: {response.text}",
            response=response,
        )
    result = response.json()
    try:
        return result[0]['summary_text']
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(f"Unexpected response from {API_URL}: {result!r}") from exc

In [None]:
# spaCy English pipeline; the global `nlp` is what my_nest_sentences uses to split sentences.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the spaCy pipeline on the article and show how many sentences it found.
doc = nlp(article.text)
len(list(doc.sents))
# Tokenise only the first sentence (the loop stops after one iteration);
# `s` holds the tokenizer output as PyTorch tensors for the next cell.
for sentence in doc.sents:
    s = tokenizer(str(sentence), truncation=False, padding=False, return_tensors='pt')
    break

In [None]:
# Forward pass of the locally loaded model on the tokenised first sentence (`s` from the previous cell).
output = model(**s)

In [None]:
# Split the article into sentence-aligned chunks and show how many were produced.
chunks = my_nest_sentences(article.text)
len(chunks)

In [None]:
class ModelInference:
    """Local seq2seq inference wrapper around a tokenizer and a model checkpoint."""

    def __init__(self, tokenizer_checkpoint, model_checkpoint, quantize=False):
        """Load the tokenizer and model.

        Args:
            tokenizer_checkpoint: Name or path passed to ``AutoTokenizer.from_pretrained``.
            model_checkpoint: Name or path passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
            quantize: When true, ``torch.nn.Linear`` layers are dynamically
                quantized to ``torch.qint8``.
        """
        # Both calls change process-wide torch settings, not just this instance.
        torch.set_num_threads(1)
        torch.set_grad_enabled(False)

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

        if quantize:
            model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

        self.model = model

    def predict(self, msg: str, max_input_tokens: int = 1024, **generate_kwargs) -> str:
        """Generate output text for ``msg`` with the wrapped model.

        Args:
            msg: Input text; converted with ``str()`` before tokenisation.
            max_input_tokens: Inputs longer than this many tokens are truncated.
            **generate_kwargs: Extra keyword arguments forwarded to
                ``self.model.generate``.

        Returns:
            The first generated sequence decoded to text, special tokens removed.
        """
        with torch.no_grad():
            inputs = self.tokenizer(
                str(msg),
                max_length=max_input_tokens,
                truncation=True,
                return_tensors='pt',
            )
            generated = self.model.generate(**inputs, **generate_kwargs)
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)


In [None]:
# Re-chunk the article; `chunks` is a shallow copy of the returned list.
nested = my_nest_sentences(article.text)
chunks = list(nested)
len(chunks)

In [None]:
# Summarise every chunk through the remote API, collecting the results in order.
summaries = []
for chunk in chunks:
    summary = generate_msummary(chunk)
    # Pause between requests — presumably to stay under the API rate limit; confirm.
    time.sleep(1)
    print(summary)
    summaries.append(summary)

In [None]:
# Keyword extractor built with KeyBERT's default settings.
keybert = KeyBERT()

In [None]:
# Merge the per-chunk summaries into one text. A space separator keeps the last
# word of one summary from fusing with the first word of the next.
total_summary = ' '.join(summaries)
# Top 5 single-word keywords; MMR with diversity 0.5 is enabled for the selection.
keywords = keybert.extract_keywords(
  total_summary,
  keyphrase_ngram_range=(1, 1),
  stop_words='english',
  use_mmr=True,
  diversity=0.5,
  top_n=5)

In [None]:
# Display the extracted keywords.
keywords

In [None]:
# Zero-shot classification pipeline backed by the BART MNLI checkpoint.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")


In [None]:
# Score the combined summary against each topic label; multi_label=True is passed
# so the labels are treated as a multi-label problem.
candidate_labels = ['literature', 'cooking', 'dancing', 'exploration', 'finance', 'technology', 'science', 'programming']
k = classifier(total_summary, candidate_labels, multi_label=True)
k
# Idea: set the candidate labels to the topics of interest so that summaries can be searched by topic.

In [None]:
# Rule-based token matching: print every span of `doc` whose tokens are exactly
# 'Alcuni', 'versi', 'del', 'Paradiso' in that order.
# NOTE(review): the pattern is Italian while `doc` is built from the English
# Wikipedia article loaded above — it may never match; confirm the intent.
from spacy.matcher import Matcher
pattern = [{"TEXT": 'Alcuni'}, {"TEXT": 'versi'}, {"TEXT": 'del'}, {"TEXT": "Paradiso"}]
matcher = Matcher(nlp.vocab)
matcher.add('SEGNO_PATTERN', [pattern])
matches = matcher(doc)
for match_id,start,end in matches:
    print(doc[start:end])

In [None]:
# Summarise every chunk again, overwriting the earlier `summaries` list.
# NOTE(review): this repeats the earlier summarisation loop without the one-second
# pause between requests and re-sends every chunk to the remote API — confirm it is needed.
summaries = []
for chunk in chunks:
    summary = generate_msummary(chunk)
    print(summary)
    summaries.append(summary)

In [None]:
# Merge the per-chunk summaries into the final text. A space separator keeps the
# last word of one summary from fusing with the first word of the next.
final_summary = ' '.join(summaries)
# Displayed for comparison: original article length vs. summary length.
len(article.text)
len(final_summary)

In [None]:
# Persist the final summary. The encoding is pinned to UTF-8 so non-ASCII
# characters are written identically on every platform; plain write mode is
# enough because the file is never read back through this handle.
with open('new_file.txt', 'w', encoding='utf-8') as f:
    f.write(final_summary)