# Text Summarization with Frequency based algorithm

In [1]:
import re
import nltk
import string

### Preprocessing the text

In [2]:
original_text = '''Artificial intelligence is human like intelligence. 
                   It is the study of intelligent artificial agents. 
                   Science and engineering to produce intelligent machines. 
                   Solve problems and have intelligence. 
                   Related to intelligent behavior. 
                   Developing of reasoning machines. 
                   Learn from mistakes and successes. 
                   Artificial intelligence is related to reasoning in everyday situations.'''

In [3]:
original_text

'Artificial intelligence is human like intelligence. \n                   It is the study of intelligent artificial agents. \n                   Science and engineering to produce intelligent machines. \n                   Solve problems and have intelligence. \n                   Related to intelligent behavior. \n                   Developing of reasoning machines. \n                   Learn from mistakes and successes. \n                   Artificial intelligence is related to reasoning in everyday situations.'

In [4]:
# To remove the one or more spaces and replace with a single space.
original_text = re.sub(r'\s+',' ',original_text)

In [5]:
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

### Lowercase the letters and remove the stop words(which are not so useful)

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chirag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
len(stopwords)

179

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
def preprocess(text):
    #lowercase the sentences
    formatted_text = text.lower()
    tokens = []
    for token in nltk.word_tokenize(formatted_text):
        tokens.append(token)
    # print(tokens)

    #remove stopwords and remove punctuation
    tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
    formatted_text = ' '.join(element for element in tokens)
    # return tokens
    return formatted_text

In [12]:
formatted_text = preprocess(original_text)

In [13]:
formatted_text

'artificial intelligence human like intelligence study intelligent artificial agents science engineering produce intelligent machines solve problems intelligence related intelligent behavior developing reasoning machines learn mistakes successes artificial intelligence related reasoning everyday situations'

### Word Frequency

In [14]:
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))


In [15]:
word_frequency

FreqDist({'intelligence': 4, 'artificial': 3, 'intelligent': 3, 'machines': 2, 'related': 2, 'reasoning': 2, 'human': 1, 'like': 1, 'study': 1, 'agents': 1, ...})

In [16]:
word_frequency['artificial']

3

In [17]:
word_frequency['intelligence']

4

In [18]:
word_frequency.keys()

dict_keys(['artificial', 'intelligence', 'human', 'like', 'study', 'intelligent', 'agents', 'science', 'engineering', 'produce', 'machines', 'solve', 'problems', 'related', 'behavior', 'developing', 'reasoning', 'learn', 'mistakes', 'successes', 'everyday', 'situations'])

In [19]:
len(word_frequency.keys())

22

In [20]:
highest_frequency = max(word_frequency.values())
highest_frequency

4

In [21]:
for word in word_frequency.keys():
    # print(word)
    word_frequency[word] = (word_frequency[word]/highest_frequency)

In [22]:
word_frequency

FreqDist({'intelligence': 1.0, 'artificial': 0.75, 'intelligent': 0.75, 'machines': 0.5, 'related': 0.5, 'reasoning': 0.5, 'human': 0.25, 'like': 0.25, 'study': 0.25, 'agents': 0.25, ...})

### Sentence Tokenization

In [23]:
'Mr Ibn La-Ahad went home. He arrived early.'.split('.')

['Mr Ibn La-Ahad went home', ' He arrived early', '']

In [24]:
'Mr. Ibn La-Ahad went home. He arrived early.'.split('.')

['Mr', ' Ibn La-Ahad went home', ' He arrived early', '']

In [25]:
nltk.sent_tokenize('Mr Ibn La-Ahad went home. He arrived early.')

['Mr Ibn La-Ahad went home.', 'He arrived early.']

In [26]:
sentence_list = nltk.sent_tokenize(original_text)

In [27]:
sentence_list

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

In [28]:
word_frequency

FreqDist({'intelligence': 1.0, 'artificial': 0.75, 'intelligent': 0.75, 'machines': 0.5, 'related': 0.5, 'reasoning': 0.5, 'human': 0.25, 'like': 0.25, 'study': 0.25, 'agents': 0.25, ...})

In [31]:
score_sentences = {}
for sentence in sentence_list:
    # print(sentence)
    for word in nltk.word_tokenize(sentence.lower()):
        # print(word)

        if sentence not in score_sentences.keys():
            score_sentences[sentence] = word_frequency[word]
        else:
            score_sentences[sentence] += word_frequency[word]


In [32]:
score_sentences

{'Artificial intelligence is human like intelligence.': 3.25,
 'It is the study of intelligent artificial agents.': 2.0,
 'Science and engineering to produce intelligent machines.': 2.0,
 'Solve problems and have intelligence.': 1.5,
 'Related to intelligent behavior.': 1.5,
 'Developing of reasoning machines.': 1.25,
 'Learn from mistakes and successes.': 0.75,
 'Artificial intelligence is related to reasoning in everyday situations.': 3.25}

In [33]:
score_sentences['Artificial intelligence is related to reasoning in everyday situations.']

3.25

In [34]:
score_sentences.keys()

dict_keys(['Artificial intelligence is human like intelligence.', 'It is the study of intelligent artificial agents.', 'Science and engineering to produce intelligent machines.', 'Solve problems and have intelligence.', 'Related to intelligent behavior.', 'Developing of reasoning machines.', 'Learn from mistakes and successes.', 'Artificial intelligence is related to reasoning in everyday situations.'])

In [35]:
import heapq
best_sentences = heapq.nlargest(3,score_sentences,key = score_sentences.get)

In [36]:
best_sentences

['Artificial intelligence is human like intelligence.',
 'Artificial intelligence is related to reasoning in everyday situations.',
 'It is the study of intelligent artificial agents.']

In [37]:
summary = ' '.join(best_sentences)
summary

'Artificial intelligence is human like intelligence. Artificial intelligence is related to reasoning in everyday situations. It is the study of intelligent artificial agents.'

In [38]:
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

### Visualizing the summary in HTML

In [39]:
from IPython.core.display import HTML

In [47]:
text = ''
display(HTML(f'<h1>Summary</h1>'))
for sentence in sentence_list:
    # print(sentence)
    # text += sentence
    if sentence in best_sentences:
        text += ' ' + sentence.replace(sentence, f"<mark>{sentence}</mark>")
    else:
        text += ' ' + sentence
display(HTML(f'''{text}'''))

### Extracting texts from Internet

In [48]:
from goose3 import Goose

In [50]:
g = Goose()
url = 'https://en.wikipedia.org/wiki/Baldwin_IV_of_Jerusalem'

article = g.extract(url)

In [51]:
article.infos

{'meta': {'description': '',
  'lang': 'en',
  'keywords': '',
  'favicon': '/static/apple-touch/wikipedia.png',
  'canonical': 'https://en.wikipedia.org/wiki/Baldwin_IV_of_Jerusalem',
  'encoding': 'UTF-8'},
 'image': None,
 'domain': 'en.wikipedia.org',
 'title': 'Baldwin IV of Jerusalem - Wikipedia',
 'cleaned_text': 'Baldwin IV (French: Baudouin; 1161–1185), called the Leper King, was King of Jerusalem from 1174 until his death in 1185. He was admired by his contemporaries and later historians for his willpower and dedication to the Latin kingdom in the face of debilitating leprosy, which eventually left him blind and unable to use either his hands or his feet. Choosing competent advisers, Baldwin ruled a thriving realm and succeeded in protecting it from the Ayyubid Muslim ruler Saladin.\n\nBaldwin developed the first symptoms of his leprosy as a child but was only diagnosed after his accession on the death of his father, King Amalric. Thereafter his hands and face became increasi

In [52]:
article.title

'Baldwin IV of Jerusalem - Wikipedia'

In [53]:
article.cleaned_text

'Baldwin IV (French: Baudouin; 1161–1185), called the Leper King, was King of Jerusalem from 1174 until his death in 1185. He was admired by his contemporaries and later historians for his willpower and dedication to the Latin kingdom in the face of debilitating leprosy, which eventually left him blind and unable to use either his hands or his feet. Choosing competent advisers, Baldwin ruled a thriving realm and succeeded in protecting it from the Ayyubid Muslim ruler Saladin.\n\nBaldwin developed the first symptoms of his leprosy as a child but was only diagnosed after his accession on the death of his father, King Amalric. Thereafter his hands and face became increasingly disfigured. Count Raymond III of Tripoli ruled the kingdom in Baldwin\'s name until the king reached the age of majority in 1176. As soon as he assumed government, Baldwin planned an invasion of Egypt, which fell through due to his vassals\' uncooperativeness. Saladin in turn attacked Baldwin\'s kingdom in 1177, but

In [54]:
len(article.cleaned_text)

29316

In [55]:
formatted_article = preprocess(article.cleaned_text)
formatted_article

"baldwin iv french baudouin 1161–1185 called leper king king jerusalem 1174 death 1185. admired contemporaries later historians willpower dedication latin kingdom face debilitating leprosy eventually left blind unable use either hands feet choosing competent advisers baldwin ruled thriving realm succeeded protecting ayyubid muslim ruler saladin baldwin developed first symptoms leprosy child diagnosed accession death father king amalric thereafter hands face became increasingly disfigured count raymond iii tripoli ruled kingdom baldwin 's name king reached age majority 1176. soon assumed government baldwin planned invasion egypt fell due vassals uncooperativeness saladin turn attacked baldwin 's kingdom 1177 king nobleman raynald châtillon repelled montgisard earning baldwin fame young king mastered horse riding despite gradually losing sensation extremities able fight battles last years leprosy precluded baldwin marrying hoped abdicate sister sibylla married william montferrat 1176 wil

In [56]:
len(formatted_article)

20901

In [57]:
def summarize(text, number_of_best_sentences, percentage = 0):
  original_text = text
  formatted_text = preprocess(original_text)

  word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
  highest_frequency = max(word_frequency.values())
  for word in word_frequency.keys():
    word_frequency[word] = (word_frequency[word] / highest_frequency)
  sentence_list = nltk.sent_tokenize(original_text)
  
  score_sentences = {}
  for sentence in sentence_list:
    for word in nltk.word_tokenize(sentence):
      if word in word_frequency.keys():
        if sentence not in score_sentences.keys():
          score_sentences[sentence] = word_frequency[word]
        else:
          score_sentences[sentence] += word_frequency[word]

  import heapq
  if percentage > 0:
    best_sentences = heapq.nlargest(int(len(sentence_list) * percentage), score_sentences, key=score_sentences.get)
  else:
    best_sentences = heapq.nlargest(number_of_best_sentences, score_sentences, key=score_sentences.get)

  return sentence_list, best_sentences, word_frequency, score_sentences

In [67]:
sentence_list , best_sentences, word_frequency, score_sentences = summarize(article.cleaned_text,100)

In [68]:
len(sentence_list)

253

In [69]:
# If we select 120 sentences then our summary will contain thirty-percent of the text
(120/len(sentence_list)) * 100

47.43083003952569

In [70]:
sentence_list

['Baldwin IV (French: Baudouin; 1161–1185), called the Leper King, was King of Jerusalem from 1174 until his death in 1185.',
 'He was admired by his contemporaries and later historians for his willpower and dedication to the Latin kingdom in the face of debilitating leprosy, which eventually left him blind and unable to use either his hands or his feet.',
 'Choosing competent advisers, Baldwin ruled a thriving realm and succeeded in protecting it from the Ayyubid Muslim ruler Saladin.',
 'Baldwin developed the first symptoms of his leprosy as a child but was only diagnosed after his accession on the death of his father, King Amalric.',
 'Thereafter his hands and face became increasingly disfigured.',
 "Count Raymond III of Tripoli ruled the kingdom in Baldwin's name until the king reached the age of majority in 1176.",
 "As soon as he assumed government, Baldwin planned an invasion of Egypt, which fell through due to his vassals' uncooperativeness.",
 "Saladin in turn attacked Baldwin

In [71]:
best_sentences

["Christian defeat at Hattin two years after Baldwin's death marred the king's legacy, with historians tracing fatal discord to Baldwin's reign.",
 "The proposal of the king's mother that Sibylla's five-year-old son, Baldwin, be made co-king was accepted, and the boy was crowned on 20 November.",
 "Saladin's nephew Farrukh Shah was sent to investigate the king's movement but suddenly ran into him, and a skirmish followed.",
 "In the winter of 1177–78, the king's widowed sister, Sibylla, gave birth to a son, Baldwin, named after the king.",
 "William of Montferrat married Baldwin's sister, Sibylla, in November 1176 despite misgivings of the nobles who no longer trusted his cousin Emperor Frederick's ability to aid the kingdom.",
 'He began contemplating her marriage to Duke Hugh III of Burgundy, and wrote to the king of France: "To be deprived of the use of one\'s limbs is of little help to one in carrying out the work of government ...',
 "He immediately dismissed Guy from regency and 

In [72]:
word_frequency

FreqDist({'baldwin': 1.0, "'s": 0.6204379562043796, 'king': 0.4744525547445255, 'saladin': 0.25547445255474455, 'jerusalem': 0.22627737226277372, 'guy': 0.21897810218978103, 'kingdom': 0.2116788321167883, 'sibylla': 0.1897810218978102, 'raymond': 0.16058394160583941, 'amalric': 0.11678832116788321, ...})

In [73]:
score_sentences

{'Baldwin IV (French: Baudouin; 1161–1185), called the Leper King, was King of Jerusalem from 1174 until his death in 1185.': 0.0948905109489051,
 'He was admired by his contemporaries and later historians for his willpower and dedication to the Latin kingdom in the face of debilitating leprosy, which eventually left him blind and unable to use either his hands or his feet.': 0.5985401459854014,
 'Choosing competent advisers, Baldwin ruled a thriving realm and succeeded in protecting it from the Ayyubid Muslim ruler Saladin.': 0.14598540145985403,
 'Baldwin developed the first symptoms of his leprosy as a child but was only diagnosed after his accession on the death of his father, King Amalric.': 0.29927007299270075,
 'Thereafter his hands and face became increasingly disfigured.': 0.1532846715328467,
 "Count Raymond III of Tripoli ruled the kingdom in Baldwin's name until the king reached the age of majority in 1176.": 1.467153284671533,
 "As soon as he assumed government, Baldwin pla

In [74]:
def visualize(title, sentence_list, best_sentences):
  from IPython.core.display import HTML
  text = ''

  display(HTML(f'<h1>Summary - {title}</h1>'))
  for sentence in sentence_list:
    if sentence in best_sentences:
      text += ' ' + str(sentence).replace(sentence, f"<mark>{sentence}</mark>")
    else:
      text += ' ' + sentence
  display(HTML(f""" {text} """))

In [75]:
visualize(article.title,sentence_list,best_sentences)

### Summarizing multiple texts

In [76]:
article_list = ['https://en.wikipedia.org/wiki/Balian_of_Ibelin',
                'https://en.wikipedia.org/wiki/Baldwin_IV_of_Jerusalem',
                'https://en.wikipedia.org/wiki/Saladin']

In [82]:
for url in article_list:
    # print(url)
    g = Goose()
    article = g.extract(url)
    sentence_list, best_sentences, _, _ = summarize(article.cleaned_text, 100,percentage = 0.5)
    # print(len(sentence_list),len(best_sentences))
    visualize(article.title,sentence_list,best_sentences)