In [None]:
# Standard library.
import re

# NLTK is imported so the 'punkt' resource can be downloaded up front
# (presumably needed by sumy's Tokenizer for sentence splitting — TODO confirm).
import nltk
nltk.download('punkt')
# sumy: HTML/plain-text parsers plus the LSA summarizer (aliased to `Summarizer`).
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# Hugging Face classes for the summarisation (BART) and sentiment (BERT) models.
from transformers import BartForConditionalGeneration, BartTokenizerFast, BertForSequenceClassification, BertTokenizerFast, pipeline


# Make IPython display the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Rich-display helpers used to render Markdown headings between printed results.
from IPython.display import display, Markdown

In [None]:
# Checkpoint identifiers passed to `from_pretrained` below.
checkpoint_summary = "facebook/bart-large-cnn"
# Alternative summarisation checkpoint, currently disabled:
# checkpoint_summary = "sshleifer/distilbart-xsum-12-6"
checkpoint_sentiment = "yiyanghkust/finbert-tone"
# Summarisation tokenizer and model, both loaded from the same checkpoint.
tokenizer = BartTokenizerFast.from_pretrained(checkpoint_summary)
model = BartForConditionalGeneration.from_pretrained(checkpoint_summary)

# Sentiment classifier configured with three labels, its tokenizer, and a
# "sentiment-analysis" pipeline that wraps the two.
finbert = BertForSequenceClassification.from_pretrained(checkpoint_sentiment, num_labels=3)
tokenizer_sentiment = BertTokenizerFast.from_pretrained(checkpoint_sentiment)
nlp_sentiment = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer_sentiment)

In [None]:
def get_extractive_summary_from_url(url: str, language: str, sentence_count: int) -> str:
  """Fetch a web page and return an extractive summary of its text.

  The page is parsed with sumy's ``HtmlParser``; the LSA summarizer is then
  asked for ``sentence_count`` sentences, which are joined with single spaces.

  Args:
    url: Address of the HTML page to summarise.
    language: Language name handed to sumy's tokenizer, stemmer and
      stop-word list.
    sentence_count: Number of sentences requested from the summarizer.

  Returns:
    The sentences returned by the summarizer, joined into one string.
  """
  parser = HtmlParser.from_url(url, Tokenizer(language))
  summarizer = Summarizer(Stemmer(language))
  summarizer.stop_words = get_stop_words(language)
  selected_sentences = summarizer(parser.document, sentence_count)
  # str() is the public way to obtain a sentence's text; this avoids depending
  # on the private `_text` attribute of sumy's sentence objects.
  return ' '.join(str(sentence) for sentence in selected_sentences)


def get_summary(dict_summarizer_model, dict_tokenizer, text_content):
  """Generate an abstractive summary of ``text_content``.

  Args:
    dict_summarizer_model: Mapping with the keys ``'model'`` (an object
      exposing ``generate``), ``'max_length'`` and ``'min_length'``; the two
      lengths are forwarded to ``generate``.
    dict_tokenizer: Mapping with the keys ``'tokenizer'`` (a callable
      tokenizer that also exposes ``decode``) and ``'max_length'`` (the limit
      the input is truncated to).
    text_content: Text to summarise.

  Returns:
    The first generated sequence decoded to text, with special tokens removed.
  """
  tokenizer = dict_tokenizer['tokenizer']
  model = dict_summarizer_model['model']

  inputs = tokenizer(
      text_content,
      max_length=dict_tokenizer['max_length'],
      truncation=True,
      return_tensors="pt",
  )
  outputs = model.generate(
      inputs["input_ids"],
      max_length=dict_summarizer_model['max_length'],
      min_length=dict_summarizer_model['min_length'],
  )

  # Let the tokenizer drop its own special tokens. Stripping "<s>"/"</s>" by
  # hand misses other special tokens (e.g. padding) and would also delete a
  # literal "<s>" that is part of the generated text.
  return tokenizer.decode(outputs[0], skip_special_tokens=True)
  

# Settings handed to get_summary: the summarisation model plus the length
# bounds forwarded to its generate() call.
model_dict = {
  'model': model,
  'max_length': 512,
  'min_length': 120
}

# Settings handed to get_summary: the tokenizer plus the limit the input text
# is truncated to.
tokenizer_dict = {
  'tokenizer': tokenizer,
  'max_length': 1024
}

LANGUAGE = "english"
SENTENCE_COUNT = 15

url = "https://news.iobanker.com/2022/07/01/bitcoin-will-see-long-bear-market-says-trader-with-btc-price-stuck-at-19k/"

# Step 1: extractive summary of the article at `url`.
extractive_summary_from_url = get_extractive_summary_from_url(url, LANGUAGE, SENTENCE_COUNT)
display(Markdown('### Extractive summary:'))
print(extractive_summary_from_url)
print()
# Step 2: abstractive summary generated from the extractive one.
display(Markdown('### Abstractive summary:'))
abstractive_summary = get_summary(model_dict, tokenizer_dict, extractive_summary_from_url)
print(abstractive_summary)
# Step 3: sentiment of the abstractive summary.
display(Markdown('### Sentiment analysis'))
# The summary length is bounded in the summariser's tokens, not the sentiment
# model's, so truncate to keep a long summary within the classifier's limit.
print(nlp_sentiment(abstractive_summary, truncation=True))

In [None]:
# Extra transformers classes used only by this named-entity-recognition cell
# (within the visible source).
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer and token-classification model, both loaded from the
# "dslim/bert-base-NER" checkpoint.
tokenizer_nem = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model_nem = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
# Build a "ner" pipeline and run it on `abstractive_summary`, which must
# already exist in the kernel from an earlier cell.
nlp = pipeline("ner", model=model_nem, tokenizer=tokenizer_nem)
ner_results = nlp(abstractive_summary)
print(ner_results)

In [None]:
# Show the first detected entity; when nothing was detected, evaluate to None
# instead of raising IndexError.
ner_results[0] if ner_results else None