In [1]:
import re
import urllib.request
import bs4 as BeautifulSoup
import nltk
import heapq
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\efecank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\efecank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
stop_words = stopwords.words('english')

In [4]:
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Automatic_summarization')

In [5]:
article = scraped_data.read()

In [6]:
article_parsed = BeautifulSoup.BeautifulSoup(article,'lxml')

In [7]:
paragraphs = article_parsed.find_all('p')

In [8]:
article_content = ''
for p in paragraphs:
    article_content += p.text

In [9]:
article_content = re.sub(r'\[[0-9]*\]', ' ', article_content)
article_content = re.sub(r'\s+', ' ', article_content)

In [10]:
formatted_article_content = re.sub('[^a-zA-Z]', ' ', article_content )
formatted_article_content = re.sub(r'\s+', ' ', formatted_article_content)

In [11]:
words = word_tokenize(formatted_article_content)

In [12]:
word_frequencies = {}

for word in words:
    word = word.lower()
    if word not in stop_words:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [13]:
word_frequencies

{'automatic': 14,
 'summarization': 76,
 'process': 7,
 'shortening': 1,
 'set': 19,
 'data': 7,
 'computationally': 2,
 'create': 8,
 'subset': 3,
 'summary': 28,
 'represents': 2,
 'important': 13,
 'relevant': 5,
 'information': 16,
 'within': 3,
 'original': 15,
 'content': 13,
 'artificial': 1,
 'intelligence': 1,
 'algorithms': 12,
 'commonly': 1,
 'developed': 4,
 'employed': 2,
 'achieve': 3,
 'specialized': 1,
 'different': 10,
 'types': 2,
 'text': 50,
 'usually': 2,
 'implemented': 1,
 'natural': 6,
 'language': 6,
 'processing': 7,
 'methods': 13,
 'designed': 2,
 'locate': 1,
 'informative': 2,
 'sentences': 31,
 'given': 8,
 'document': 40,
 'hand': 2,
 'visual': 2,
 'summarized': 1,
 'using': 16,
 'computer': 2,
 'vision': 1,
 'image': 7,
 'subject': 1,
 'ongoing': 1,
 'research': 5,
 'existing': 4,
 'approaches': 6,
 'typically': 5,
 'attempt': 1,
 'display': 1,
 'representative': 4,
 'images': 8,
 'collection': 7,
 'generate': 4,
 'video': 12,
 'includes': 2,
 'entire'

In [14]:
maximum_frequency = max(word_frequencies.values())

for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

In [15]:
maximum_frequency

76

In [16]:
sentence_list = sent_tokenize(article_content)

In [17]:
sentence_scores = {}

for sent in sentence_list:
    for word, frequency in word_frequencies.items():
        if word in sent.lower():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] = frequency
            else:
                sentence_scores[sent] += frequency

In [18]:
sentence_scores

{'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.': 3.6052631578947345,
 'Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.': 1.3815789473684208,
 'Text summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.': 3.986842105263156,
 'On the other hand, visual content can be summarized using computer vision algorithms.': 1.263157894736842,
 'Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.': 3.0263157894736823,
 'Video summarization algorithms identify and extr

In [19]:
avg_score = sum([sentence_scores[sent] for sent in sentence_scores]) / len(sentence_scores)

In [20]:
avg_score

2.05751034890597

In [27]:
summary = []
for sent in sentence_list:
    if (sent in sentence_scores) and (sentence_scores[sent] > (1.2 * avg_score)):
        summary.append(sent)

In [28]:
final = " ".join(summary)

In [29]:
final

'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content. Text summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document. Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection. Video summarization algorithms identify and extract from the original video content the most important frames (key-frames), and/or the most important video segments (key-shots), normally in a temporally ordered fashion. Video summaries simply retain a carefully selected subset of the original video frames and, therefore, are not identical to the output of video synopsis algorithm

In [30]:
len(final)

14525

In [31]:
len(formatted_article_content)

32311