# Text Summarization

This experiment shows how text can be summarized by choosing the most important sentences from the text. 

In [4]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
from nltk.corpus import stopwords
stop = stopwords.words('english')

from bs4 import BeautifulSoup
from urllib.request import urlopen

from gensim.models import Phrases
from gensim.models.phrases import Phraser

import os
from collections import Counter
import string
punctuations = list(string.punctuation)

punctuations.extend(['”', '–', '``', "''"])
stop = stop + punctuations

In [None]:
# Import necessary libraries
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Open the URL and read the HTML content
url = urlopen('http://news.sky.com/story/snap-election-to-be-held-in-march-after-northern-ireland-government-collapses-10731488')
soup = BeautifulSoup(url.read().decode('utf8'), "lxml")

# Extract the text from the <p> tags
text = '\n\n'.join(map(lambda p: p.text, soup.find_all('p')))

# Select the text starting from a specific point in the article
text = text[text.find('An early election'):]

# Extract the title of the article
title = soup.find('h1').text.strip()

# Print the title, separator, and text
print(title, '\n', '_' * 60, '\n', text)

## Calculating the Similarity between Sentences

In [43]:
def intersection(sent1, sent2):
    """
    Calculate the intersection score between two sentences.
    """
    s1 = sent1.split(' ')
    s2 = sent2.split(' ')

    intersection = [i for i in s1 if i in s2]
    
    # Normalization
    return len(intersection) / ((len(s1) + len(s2)) / 2)

In [44]:
sentences = sent_tokenize(text)
matrix = [[intersection(sentences[i], sentences[j]) for i in range(0,len(sentences))] for j in range(0,len(sentences))]
matrix[:2]

[[1.0,
  0.40816326530612246,
  0.1568627450980392,
  0.08695652173913043,
  0.0,
  0.10256410256410256,
  0.15384615384615385,
  0.10285714285714286,
  0.1111111111111111,
  0.1875,
  0.3018867924528302,
  0.12121212121212122,
  0.0,
  0.13793103448275862,
  0.08888888888888889,
  0.17857142857142858,
  0.10256410256410256,
  0.34782608695652173,
  0.4,
  0.0],
 [0.24489795918367346,
  1.0,
  0.10714285714285714,
  0.11764705882352941,
  0.0,
  0.09090909090909091,
  0.17543859649122806,
  0.022222222222222223,
  0.1016949152542373,
  0.21621621621621623,
  0.20689655172413793,
  0.21052631578947367,
  0.0,
  0.19047619047619047,
  0.0,
  0.19672131147540983,
  0.09090909090909091,
  0.3137254901960784,
  0.24,
  0.0]]

In [45]:
scores = {sentences[i]: sum(matrix[i]) for i in range(len(matrix))}
scores

{'An early election will be held in Northern Ireland on 2 March after the collapse of its government, it has been announced.': 3.988741497650454,
 'Northern Ireland Secretary James Brokenshire said the devolved Northern Ireland Assembly will sit for the last time on 25 January, before it is dissolved the following day.': 3.525423866813436,
 'The break-up of the power-sharing government comes amid a dispute between Sinn Fein and the DUP over a botched renewable energy scheme that could have cost the taxpayer £500m.': 4.232533141096488,
 'The "cash for ash" scandal prompted the resignation of deputy first minister Martin McGuinness, who called for DUP first minister Arlene Foster to quit.': 3.5893899865889365,
 'She refused, calling Mr McGuinness\' actions "not principled" and "purely political".': 1.4450041056799579,
 'On Monday afternoon, Sinn Fein announced it would not replace Mr McGuinness - triggering the snap election.': 3.2601549390922244,
 'Despite a last-ditch attempt by Theres

In [46]:
sents = sorted(scores, key=scores.__getitem__, reverse=True)[:5]
sents

['He added that the collapse of the power-sharing government was the "greatest challenge to face the Northern Ireland peace process in a decade".',
 'The break-up of the power-sharing government comes amid a dispute between Sinn Fein and the DUP over a botched renewable energy scheme that could have cost the taxpayer £500m.',
 'He said: "This is essential for the operation of devolved government.',
 'Please use Chrome browser for a more accessible video player\n\n\n\nSky News Ireland Correspondent David Blevins said the relationship between Sinn Fein and the DUP had been "slowly breaking down for a period of months".',
 'An early election will be held in Northern Ireland on 2 March after the collapse of its government, it has been announced.']

In [47]:
tuples = [(i, text.find(i)) for i in sents]
sorted_tuples = sorted(tuples, key=lambda x: x[0])

best_sents = [i[0] for i in sorted_tuples]
best_sents

['An early election will be held in Northern Ireland on 2 March after the collapse of its government, it has been announced.',
 'He added that the collapse of the power-sharing government was the "greatest challenge to face the Northern Ireland peace process in a decade".',
 'He said: "This is essential for the operation of devolved government.',
 'Please use Chrome browser for a more accessible video player\n\n\n\nSky News Ireland Correspondent David Blevins said the relationship between Sinn Fein and the DUP had been "slowly breaking down for a period of months".',
 'The break-up of the power-sharing government comes amid a dispute between Sinn Fein and the DUP over a botched renewable energy scheme that could have cost the taxpayer £500m.']

In [48]:
def intersection(sent1, sent2):
    """
    Calculate the intersection score between two sentences.
    """
    s1 = sent1.split(' ')
    s2 = sent2.split(' ')
    intersection = [i for i in s1 if i in s2]
    return len(intersection) / ((len(s1) + len(s2)) / 2)

def get_summary(text, limit=3):
    """
    Generate a summary of the given text.
    """
    sentences = sent_tokenize(text)
    matrix = [[intersection(sentences[i], sentences[j]) for i in range(0,len(sentences))] for j in range(0,len(sentences))]
    scores = {sentences[i]: sum(matrix[i]) for i in range(len(matrix))}
    sents = sorted(scores, key=scores.__getitem__, reverse=True)[:limit]
    best_sents = [i[0] for i in sorted([(i, text.find(i)) for i in sents], key=lambda x: x[0])]
    return best_sents

def summarize(text, limit=3):
    """
    Summarize the given text and print the result.
    """
    summary = get_summary(text, limit)
    print(title)
    print()
    print(' '.join(summary))

In [49]:
summarize(text,5)

Snap election to be held in March after Northern Ireland government collapses

An early election will be held in Northern Ireland on 2 March after the collapse of its government, it has been announced. He added that the collapse of the power-sharing government was the "greatest challenge to face the Northern Ireland peace process in a decade". He said: "This is essential for the operation of devolved government. Please use Chrome browser for a more accessible video player



Sky News Ireland Correspondent David Blevins said the relationship between Sinn Fein and the DUP had been "slowly breaking down for a period of months". The break-up of the power-sharing government comes amid a dispute between Sinn Fein and the DUP over a botched renewable energy scheme that could have cost the taxpayer £500m.


## Calculating the Frequencies of Words

In [66]:
from collections import defaultdict
from heapq import nlargest

def score_sentences(sentences, words, stopwords):
    word_frequencies = defaultdict(int)
    sentence_scores = defaultdict(int)

    for word in words:
        word_frequencies[word] += 1

    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word not in stopwords:
                sentence_scores[sentence] += word_frequencies[word]

    return sentence_scores

def get_summary(text, limit=3):
    sentences = sent_tokenize(text)
    stopwords_list = set(stopwords.words("english"))

    words = []
    for sentence in sentences:
        words += [word.lower() for word in word_tokenize(sentence) if word.isalpha() and word.lower() not in stopwords_list]

    sentence_scores = score_sentences(sentences, words, stopwords_list)
    summary_sentences = nlargest(limit, sentence_scores, key=sentence_scores.get)
    summary = [sentence for sentence in sentences if sentence in summary_sentences]

    return summary

def summarize(text, limit=3):
    summary = get_summary(text, limit)
    print('\n'.join(summary))

In [65]:
summarize(text, 5)

Northern Ireland Secretary James Brokenshire said the devolved Northern Ireland Assembly will sit for the last time on 25 January, before it is dissolved the following day.
Ben Wallace 'considering resigning' as defence secretary in expected autumn reshuffle
                


                  Westminster Accounts: Number 10 says MPs should 'focus on serving constituencies' after Sky News reveal second jobs earnings
                


                  Politics latest: Downing Street responds to latest Westminster Accounts revelations - as Tory warns against 'smearing' MPs
                

He said: "We have had scandal after scandal, allegations of corruption need to be investigated properly and the people responsible need to be held to account."
Mrs Foster, who presided over the controversial renewable energy scheme as enterprise minister, claimed Sinn Fein "did not like the election result last May and are therefore looking to have another go".
Please use Chrome browser for a more 