# Text Summarization - Luhn algorithm

- https://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf

# Preparing the environment

In [None]:
import re
import nltk
import string
import heapq

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('stopwords')

In [None]:
original_text = """Artificial intelligence is human like intelligence machines. 
                   It is the study of intelligent artificial agents. 
                   Science and engineering to produce intelligent machines. 
                   Solve problems and have intelligence. 
                   Related to intelligent behavior machines. 
                   Developing of reasoning machines. 
                   Learn from mistakes and successes. 
                   Artificial intelligence is related to reasoning in everyday situations."""
original_text = re.sub(r'\s+', ' ', original_text)
original_text

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

In [None]:
stopwords.append('explanation')

In [None]:
print(stopwords)

In [None]:
def preprocess(text):
  """Lower-case *text* and strip stop words and punctuation tokens.

  The text is tokenized with ``nltk.word_tokenize``; tokens found in the
  notebook-level ``stopwords`` list and tokens made up solely of punctuation
  characters are dropped.

  Args:
    text: Raw text (a sentence or a whole document).

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Sets give constant-time membership tests; the stop-word list is scanned
  # only once per call instead of once per token.
  stopword_set = set(stopwords)
  punctuation_chars = set(string.punctuation)

  kept_tokens = [
      token
      for token in nltk.word_tokenize(text.lower())
      if token not in stopword_set
      # `token in string.punctuation` would be a *substring* test, which lets
      # multi-character punctuation tokens such as '--', '...' or "''" slip
      # through. Checking every character removes them as well.
      and not all(char in punctuation_chars for char in token)
  ]
  return ' '.join(kept_tokens)

In [None]:
formatted_text = preprocess(original_text)
formatted_text

# Function to calculate sentences score

In [None]:
def calculate_sentences_score(sentences, important_words, distance):
  """Score sentences by their densest group of important words (Luhn).

  For every sentence the positions of the important words are collected and
  split into groups: two consecutive positions stay in the same group while
  their difference is smaller than ``distance``. A group scores
  ``(important words in group) ** 2 / (span of the group in tokens)`` and the
  sentence receives the score of its best group.

  Args:
    sentences: Sentences as strings; each one is tokenized with
      ``nltk.word_tokenize``.
    important_words: Words regarded as significant.
    distance: Gap threshold between consecutive important-word positions.

  Returns:
    A list of ``(score, sentence_index)`` tuples where ``sentence_index`` is
    the position of the sentence in ``sentences``. Sentences that contain no
    important word are left out of the list.
  """
  scores = []

  # enumerate() keeps the index tied to the position in `sentences`. A
  # manually incremented counter would fall behind whenever a sentence is
  # skipped below, and every later score would point at the wrong sentence.
  for sentence_index, sentence_text in enumerate(sentences):
    tokens = nltk.word_tokenize(sentence_text)

    # Position of the first occurrence of each token.
    # NOTE: only the first occurrence of an important word is considered,
    # exactly like `list.index` would report it.
    first_position = {}
    for position, token in enumerate(tokens):
      first_position.setdefault(token, position)

    word_index = sorted(
        first_position[word] for word in important_words if word in first_position)

    if not word_index:
      # No important word in this sentence: it gets no score entry.
      continue

    # Split the sorted positions into groups of nearby important words,
    # e.g. [0, 1, 5] with distance 2 -> [[0, 1], [5]].
    groups_list = []
    group = [word_index[0]]
    for previous, current in zip(word_index, word_index[1:]):
      if current - previous < distance:
        group.append(current)
      else:
        groups_list.append(group)
        group = [current]
    groups_list.append(group)

    max_group_score = 0
    for g in groups_list:
      important_words_in_group = len(g)
      # Span from the first to the last important word, both included.
      total_words_in_group = g[-1] - g[0] + 1
      score = 1.0 * important_words_in_group**2 / total_words_in_group
      if score > max_group_score:
        max_group_score = score

    scores.append((max_group_score, sentence_index))

  return scores

In [None]:
test = ['a', 'b', 'c']
test.index('a')

In [None]:
test = [0,1,3,4,6,9]
test[-1], test[0] + 1

In [None]:
3*3

# Function to summarize the texts

In [None]:
def summarize(text, top_n_words, distance, number_of_sentences, percentage = 0):
  """Build an extractive summary of *text*.

  Args:
    text: Document to summarize.
    top_n_words: How many of the most frequent (preprocessed) words count as
      important words.
    distance: Gap threshold forwarded to ``calculate_sentences_score``.
    number_of_sentences: Summary size used when ``percentage`` is not
      positive.
    percentage: When greater than 0, the summary size is this fraction of the
      number of sentences (truncated to an int) and ``number_of_sentences``
      is ignored.

  Returns:
    A tuple ``(original_sentences, best_sentences, sentences_score)``:
    the sentences of the text, the selected sentences ordered from the
    highest ``(score, index)`` pair downwards, and the raw score list.
  """
  original_sentences = list(nltk.sent_tokenize(text))
  formatted_sentences = [preprocess(raw_sentence) for raw_sentence in original_sentences]

  # Word frequencies over the whole preprocessed document.
  all_tokens = []
  for formatted in formatted_sentences:
    all_tokens.extend(nltk.word_tokenize(formatted))
  frequency = nltk.FreqDist(all_tokens)

  keywords = [word for word, _count in frequency.most_common(top_n_words)]
  sentences_score = calculate_sentences_score(formatted_sentences, keywords, distance)

  # A positive percentage takes precedence over the absolute sentence count.
  if percentage > 0:
    summary_size = int(len(formatted_sentences) * percentage)
  else:
    summary_size = number_of_sentences

  ranked = heapq.nlargest(summary_size, sentences_score)
  best_sentences = [original_sentences[index] for _score, index in ranked]
  return original_sentences, best_sentences, sentences_score

In [None]:
original_sentences, best_sentences, sentences_score = summarize(original_text, 5, 2, 3)

In [None]:
original_sentences

In [None]:
best_sentences

In [None]:
sentences_score

In [None]:
from IPython.core.display import HTML

In [None]:
def visualize(title, sentence_list, best_sentences):
  """Display the text in the notebook with the summary sentences highlighted.

  Args:
    title: Heading shown above the text.
    sentence_list: All sentences of the document, in order.
    best_sentences: Sentences that belong to the summary; each of them is
      wrapped in a ``<mark>`` element.
  """
  import html  # local import: only needed for escaping here
  from IPython.display import HTML, display

  highlighted = set(best_sentences)

  # The sentences come from scraped web pages, so characters such as '<' or
  # '&' must be escaped before the text is embedded in HTML.
  fragments = []
  for sentence in sentence_list:
    escaped = html.escape(str(sentence))
    if sentence in highlighted:
      fragments.append(f"<mark>{escaped}</mark>")
    else:
      fragments.append(escaped)
  text = ''.join(' ' + fragment for fragment in fragments)

  display(HTML(f'<h1>Summary - {html.escape(str(title))}</h1>'))
  display(HTML(f""" {text} """))

# Extracting texts from the Internet

In [None]:
!pip install goose3

In [None]:
from goose3 import Goose

In [None]:
g = Goose()
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
article = g.extract(url)

In [None]:
article.cleaned_text

In [None]:
original_sentences, best_sentences, score_sentences = summarize(article.cleaned_text, 300, 10, 120)

In [None]:
len(original_sentences)

In [None]:
(120 / len(original_sentences)) * 100

In [None]:
original_sentences

In [None]:
best_sentences

In [None]:
score_sentences

In [None]:
visualize(article.title, original_sentences,  best_sentences)

# Extracting articles from RSS feeds
- RSS: a web feed that allows users and applications to access the latest updates from websites in a computer-readable format
- Feeds make it possible to keep track of many different websites
- AI feeds: https://blog.feedspot.com/ai_rss_feeds/

In [None]:
!pip install feedparser

In [None]:
import feedparser

In [None]:
from bs4 import BeautifulSoup  # This is used to remove the html codes extracted from the articles
import os
import json # Common format used to send information in the internet

In [None]:
url = 'https://www.aitrends.com/feed/'
feed = feedparser.parse(url)

In [None]:
feed.entries

In [None]:
# Print the title and the first content value of every entry in the parsed feed.
for e in feed.entries:
  print(e.title)   # Title of the feed entry
  print(e.content[0].value) # Value of the entry's first content element; this is the text used for summarization.
  print('-------')

In [None]:
def clean_html(text):
  """Return *text* with its HTML markup removed.

  An empty string is returned unchanged; anything else is parsed with
  BeautifulSoup (``html5lib`` parser) and reduced to its text content.
  """
  if text != '':
    return BeautifulSoup(text, 'html5lib').get_text()
  return ''

In [None]:
e.content[0].value

In [None]:
clean_html(e.content[0].value)

In [None]:
# Collect the title and the plain-text body of every feed entry.
articles = []
for e in feed.entries:
  # `content` is optional in feed entries: feeds that only publish excerpts
  # provide `summary` instead, so fall back to it rather than raising
  # AttributeError on the first such entry.
  articles.append({
      'title': e.title,
      'content': clean_html(e.content[0].value if e.get('content') else e.get('summary', '')),
  })

In [None]:
articles

In [None]:
# Persist the cleaned articles as JSON.
save_file = os.path.join('feed.json')
# A dedicated name for the file handle keeps the parsed `feed` object usable
# afterwards, and the context manager closes the file even if writing fails.
with open(save_file, 'w', encoding='utf-8') as feed_file:
  feed_file.write(json.dumps(articles, indent=1))

In [None]:
# Read the saved articles back; the context manager closes the file handle.
with open('feed.json', encoding='utf-8') as feed_file:
  blog_articles = json.load(feed_file)
blog_articles

# Word cloud

In [None]:
# Merge all article bodies into one string. The space separator keeps the
# last word of one article from being fused with the first word of the next.
feed_content = ' '.join(article['content'] for article in blog_articles)

In [None]:
feed_content

In [None]:
formatted_feed_content = preprocess(feed_content)
formatted_feed_content

In [None]:
len(feed_content), len(formatted_feed_content)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))
plt.axis('off')
plt.imshow(WordCloud().generate(formatted_feed_content));

# Extracting named entities

- Acronyms: https://spacy.io/api/annotation#named-entities

In [None]:
import spacy

In [None]:
# Load the small English pipeline by its package name: the 'en' shortcut is
# no longer accepted by spaCy v3 and later.
nlp = spacy.load('en_core_web_sm')
nlp

In [None]:
document = nlp(formatted_feed_content)

In [None]:
from spacy import displacy
displacy.render(document, style = 'ent', jupyter = True)

In [None]:
# List only the entities labelled 'GPE'.
for entity in document.ents:
  if entity.label_ != 'GPE':
    continue
  print(entity.text, entity.label_)

# Summarizing the articles

In [None]:
# Summarize every downloaded article, show it inline and write it to disk.
# NOTE: `save_summary` is defined in a later cell of this notebook; that cell
# has to be executed before this loop.
for article in blog_articles:
  original_sentences, best_sentences, _ = summarize(
      text=article['content'],
      top_n_words=150,
      distance=10,
      number_of_sentences=5,
      # percentage > 0, so half of the sentences are kept and
      # number_of_sentences is not used.
      percentage=0.5)
  visualize(article['title'], original_sentences, best_sentences)
  save_summary(article['title'], original_sentences, best_sentences)

# Saving the summary

In [None]:
def save_summary(title, original_sentences, best_sentences):
  """Write the document to an HTML file with the summary highlighted.

  The file is created in the current working directory and named after the
  title (``<title>.html``); characters that are not valid in file names are
  replaced by ``_``.

  Args:
    title: Article title; used for the ``<title>`` element and the file name.
    original_sentences: All sentences of the document, in order.
    best_sentences: Sentences that belong to the summary; each of them is
      wrapped in a ``<mark>`` element.
  """
  import html  # local import: only needed for escaping here

  HTML_TEMPLATE = """<html>
    <head>
      <title>{0}</title>
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
    </head>
    <body>{1}</body>

  </html>"""

  highlighted = set(best_sentences)

  # Escape the scraped text so that '<', '>' and '&' cannot break the page.
  fragments = []
  for sentence in original_sentences:
    escaped = html.escape(str(sentence))
    if sentence in highlighted:
      fragments.append(f"<mark>{escaped}</mark>")
    else:
      fragments.append(escaped)
  # Sentences are stored without surrounding whitespace, so join with a space
  # to keep them from running into each other.
  text = ' '.join(fragments)

  # Article titles routinely contain '/', ':' or '?', which are path
  # separators or illegal in file names on common platforms.
  file_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title) + '.html'

  html_file = HTML_TEMPLATE.format(html.escape(title), text)
  # The context manager closes the file even if the write fails.
  with open(file_name, 'wb') as save_file:
    save_file.write(html_file.encode('utf-8'))