# Text summarization - Frequency based algorithm

# Preprocessing the texts

In [None]:
import re # regular expression
import nltk # natural language toolkit
import string

In [None]:
# I added the word machine at the end of the last sentence
original_text = """Artificial intelligence is human like intelligence. 
                   It is the study of intelligent artificial agents. 
                   Science and engineering to produce intelligent machines. 
                   Solve problems and have intelligence. 
                   Related to intelligent behavior. 
                   Developing of reasoning machines. 
                   Learn from mistakes and successes. 
                   Artificial intelligence is related to reasoning in everyday situations."""

In [None]:
original_text

In [None]:
original_text = re.sub(r'\s+', ' ', original_text)

In [None]:
original_text

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('stopwords')

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

In [None]:
len(stopwords)

In [None]:
string.punctuation

In [None]:
def preprocess(text):
  """Lowercase ``text`` and strip stopwords and punctuation tokens.

  The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
  found in the module-level ``stopwords`` list, and tokens made up only of
  ``string.punctuation`` characters, are dropped.

  Args:
    text: Raw text to clean.

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Sets give constant-time membership tests; the stopword list would
  # otherwise be scanned linearly for every token.
  stop_words = set(stopwords)
  punctuation = set(string.punctuation)

  # Test character by character: `token in string.punctuation` is a substring
  # check and lets multi-character punctuation tokens (e.g. '...' or the
  # `` / '' quote tokens emitted by the tokenizer) slip through.
  kept_tokens = [
      token
      for token in nltk.word_tokenize(text.lower())
      if token not in stop_words
      and not all(char in punctuation for char in token)
  ]
  return ' '.join(kept_tokens)

In [None]:
formatted_text = preprocess(original_text)
formatted_text

# Word frequency

In [None]:
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
word_frequency

In [None]:
word_frequency['intelligence']

In [None]:
word_frequency.keys()

In [None]:
len(word_frequency.keys())

In [None]:
highest_frequency = max(word_frequency.values())
highest_frequency

In [None]:
# Divide every count by the highest one, so the most frequent word weighs 1.0
for word in word_frequency:
  word_frequency[word] /= highest_frequency

In [None]:
word_frequency

# Sentence tokenization

In [None]:
'Phd John went home. He arrived early.'.split('.')

In [None]:
'Ph.d John went home. He arrived early.'.split('.')

In [None]:
nltk.sent_tokenize('Ph.d John went home. He arrived early.')

In [None]:
sentence_list = nltk.sent_tokenize(original_text)
sentence_list

# Generate the summary (score for sentences)

In [None]:
word_frequency

In [None]:
# Score of a sentence = sum of the normalized frequencies of its (lowercased) tokens
score_sentences = {}
for sentence in sentence_list:
  for word in nltk.word_tokenize(sentence.lower()):
    running_total = score_sentences.get(sentence, 0)
    score_sentences[sentence] = running_total + word_frequency[word]

In [None]:
score_sentences

In [None]:
score_sentences['Solve problems and have intelligence.']

In [None]:
score_sentences.keys()

In [None]:
import heapq
best_sentences = heapq.nlargest(3, score_sentences, key = score_sentences.get)

In [None]:
best_sentences

In [None]:
summary = ' '.join(best_sentences)
summary

In [None]:
original_text

# Visualizing the summary in HTML

In [None]:
from IPython.core.display import HTML

In [None]:
# Header shown above the highlighted text
display(HTML('<h2>Summary</h2>'))

# Rebuild the text sentence by sentence, wrapping the selected ones in <mark>
fragments = []
for sentence in sentence_list:
  if sentence in best_sentences:
    fragment = f"<mark>{sentence}</mark>"
  else:
    fragment = sentence
  fragments.append(' ' + fragment)
text = ''.join(fragments)

display(HTML(text))

# Extracting texts from the Internet

In [None]:
!pip3 install goose3

In [None]:
from goose3 import Goose

In [None]:
g = Goose()
#url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
url = 'https://en.wikipedia.org/wiki/Natural_language_processing'
article = g.extract(url)

In [None]:
article.infos

In [None]:
article.title

In [None]:
article.cleaned_text

In [None]:
len(article.cleaned_text)

In [None]:
formatted_article = preprocess(article.cleaned_text)
formatted_article

In [None]:
len(formatted_article)

In [None]:
def summarize(text, number_of_sentences, percentage=0):
  """Pick the highest-scoring sentences of ``text`` by word frequency.

  Words are counted on the preprocessed text (see ``preprocess``) and the
  counts are divided by the highest count. Each sentence is scored with the
  sum of the normalized frequencies of its words.

  Args:
    text: Raw text to summarize.
    number_of_sentences: Number of sentences to keep when ``percentage`` is
      not greater than zero.
    percentage: Fraction of the sentences to keep (e.g. ``0.5`` for half).
      When greater than zero it takes precedence over
      ``number_of_sentences``. Defaults to ``0`` so the function can be
      called with only ``text`` and ``number_of_sentences``.

  Returns:
    A tuple ``(sentence_list, best_sentences, word_frequency,
    score_sentences)``: every sentence of ``text``, the selected sentences
    ordered from highest to lowest score, the normalized word frequencies,
    and the score computed for each sentence.
  """
  import heapq

  original_text = text
  formatted_text = preprocess(original_text)

  word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
  sentence_list = nltk.sent_tokenize(original_text)

  # No countable words (empty or stopword-only text): nothing can be scored,
  # and max() below would raise on the empty frequency table.
  if not word_frequency:
    return sentence_list, [], word_frequency, {}

  highest_frequency = max(word_frequency.values())
  for word in word_frequency:
    word_frequency[word] /= highest_frequency

  score_sentences = {}
  for sentence in sentence_list:
    # The frequency keys come from lowercased text, so the sentence has to be
    # lowercased as well or capitalized words would never match.
    for word in nltk.word_tokenize(sentence.lower()):
      if word in word_frequency:
        score_sentences[sentence] = score_sentences.get(sentence, 0) + word_frequency[word]

  if percentage > 0:
    summary_size = int(len(sentence_list) * percentage)
  else:
    summary_size = number_of_sentences
  best_sentences = heapq.nlargest(summary_size, score_sentences, key=score_sentences.get)

  return sentence_list, best_sentences, word_frequency, score_sentences

In [None]:
len(sentence_list)

In [None]:
(50 / len(sentence_list)) * 100

In [None]:
sentence_list, best_sentences, word_frequency, score_sentences = summarize(article.cleaned_text, 50)

In [None]:
sentence_list

In [None]:
best_sentences

In [None]:
word_frequency

In [None]:
score_sentences

In [None]:
def visualize(title, sentence_list, best_sentences):
  """Display the full text as HTML with the summary sentences highlighted.

  Args:
    title: Title shown in the ``<h1>`` header after ``Summary - ``.
    sentence_list: All sentences of the text, in reading order.
    best_sentences: Sentences selected for the summary; each one found in
      ``sentence_list`` is wrapped in a ``<mark>`` tag.

  Returns:
    None. The header and the text are rendered through IPython's ``display``.
  """
  import html
  from IPython.display import HTML, display

  # Constant-time membership test instead of scanning the list per sentence
  highlighted = set(best_sentences)

  # Title and sentences may come from scraped pages: escape them so stray
  # '<', '>' or '&' characters are shown as text instead of parsed as markup.
  display(HTML(f'<h1>Summary - {html.escape(str(title))}</h1>'))

  fragments = []
  for sentence in sentence_list:
    safe_sentence = html.escape(sentence)
    if sentence in highlighted:
      fragments.append(f"<mark>{safe_sentence}</mark>")
    else:
      fragments.append(safe_sentence)

  display(HTML(' '.join(fragments)))

In [None]:
visualize(article.title, sentence_list, best_sentences)

# Summarizing multiple texts

In [None]:
article_list = ['https://en.wikipedia.org/wiki/Automatic_summarization',
                'https://en.wikipedia.org/wiki/Natural_language_processing',
                'https://en.wikipedia.org/wiki/Lemmatisation']

In [None]:
# Extract, summarize and display every article listed in article_list.
for url in article_list:
  # A new Goose extractor is created for each URL
  g = Goose()
  article = g.extract(url)
  # percentage=0.5 is greater than zero, so summarize() keeps half of the
  # sentences and the positional 100 (number_of_sentences) is not used.
  sentence_list, best_sentences, _, _ = summarize(article.cleaned_text, 100, percentage=0.5)
  # Render the article with its selected sentences highlighted
  visualize(article.title, sentence_list, best_sentences)