# Text summarization using libraries

# Preparing the environment

In [None]:
import nltk
import re
nltk.download('punkt')

In [None]:
!pip install goose3

In [None]:
from goose3 import Goose
# Extract the article found at `url` with Goose. Later cells read
# `article.cleaned_text` (the text to summarize) and `article.title`.
g = Goose()
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
article = g.extract(url)

In [None]:
article.cleaned_text

In [None]:
# Split the article into sentences. nltk.sent_tokenize already returns a
# list, so no extra copy through a comprehension is needed.
original_sentences = nltk.sent_tokenize(article.cleaned_text)
original_sentences

In [None]:
from IPython.core.display import HTML
def visualize(title, sentence_list, best_sentences):
  """Show the full text in the notebook with summary sentences highlighted.

  Displays an ``<h1>`` heading built from *title*, then every sentence of
  *sentence_list* in order, separated by spaces. A sentence that also
  appears in *best_sentences* is wrapped in a ``<mark>`` tag.

  Args:
    title: Text shown after "Summary - " in the heading.
    sentence_list: Iterable of sentence strings making up the whole text.
    best_sentences: Iterable of sentence strings to highlight. Matching is
      by exact string equality, so both lists must be normalized alike.

  Returns:
    None. Output is produced through ``display(HTML(...))``.
  """
  # Function-scope import keeps this cell self-contained.
  from html import escape

  # Build the lookup once: constant-time membership instead of scanning
  # the list again for every sentence.
  highlighted = set(best_sentences)

  # Title and sentences are plain text; escape them so characters such as
  # '<' or '&' are rendered literally instead of being parsed as markup.
  display(HTML(f'<h1>Summary - {escape(str(title))}</h1>'))

  parts = []
  for sentence in sentence_list:
    safe = escape(sentence)
    parts.append(f'<mark>{safe}</mark>' if sentence in highlighted else safe)

  display(HTML(' ' + ' '.join(parts) + ' '))

## sumy

- https://pypi.org/project/sumy/

In [None]:
!pip install sumy

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer

In [None]:
# Build a sumy parser from the extracted article text, using the
# 'english' Tokenizer. `parser.document` is what the summarizer consumes.
parser = PlaintextParser.from_string(article.cleaned_text, Tokenizer('english'))

In [None]:
summarizer = LuhnSummarizer()

In [None]:
# Run the Luhn summarizer on the parsed document.
# NOTE(review): the second argument (120) is presumably the number of
# sentences to keep -- confirm against the sumy docs; it may be close to or
# above the article's sentence count, which would highlight nearly all of it.
summary = summarizer(parser.document, 120)

In [None]:
summary

In [None]:
# Convert every summary item to a plain string so visualize() can compare
# it against the strings in original_sentences.
best_sentences = [str(sentence) for sentence in summary]

In [None]:
visualize(article.title, original_sentences, best_sentences)

## pysummarization

- https://pypi.org/project/pysummarization/

In [None]:
!pip install pysummarization

In [None]:
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

In [None]:
# Configure pysummarization: the abstractor uses SimpleTokenizer and the
# delimiters "." and newline (presumably the sentence boundaries -- confirm
# against the pysummarization docs). TopNRankAbstractor is passed to
# summarize() in the next cell.
auto_abstractor = AutoAbstractor()
auto_abstractor.tokenizable_doc = SimpleTokenizer()
auto_abstractor.delimiter_list = [".", "\n"]
abstractable_doc = TopNRankAbstractor()

In [None]:
summary = auto_abstractor.summarize(article.cleaned_text, abstractable_doc)

In [None]:
summary

In [None]:
# Normalize each extracted sentence: collapse every run of whitespace to a
# single space and trim both ends, so the text can be compared against the
# entries of original_sentences in visualize().
best_sentences = [
  re.sub(r'\s+', ' ', sentence).strip()
  for sentence in summary['summarize_result']
]

In [None]:
best_sentences

In [None]:
visualize(article.title, original_sentences, best_sentences)

## BERT

- https://pypi.org/project/bert-extractive-summarizer/

In [None]:
!pip install bert-extractive-summarizer

In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
from transformers import BigBirdTokenizer

In [None]:
from summarizer import Summarizer

In [None]:
# Create the BERT extractive summarizer with its default settings and run
# it on the article text. This rebinds `summarizer` and `summary`, which
# earlier cells used for the sumy and pysummarization results. The result
# is sentence-tokenized with nltk in a later cell.
summarizer = Summarizer()
summary = summarizer(article.cleaned_text)

In [None]:
summary

In [None]:
# Split the summary back into sentences for highlighting.
# nltk.sent_tokenize already returns a list, so it is used directly.
summary_tokenized = nltk.sent_tokenize(summary)
summary_tokenized

In [None]:
visualize(article.title, original_sentences, summary_tokenized)