In [4]:
import re
import urllib.request
import bs4 as BeautifulSoup
import nltk
import heapq
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\efecank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\efecank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
stop_words = stopwords.words('english')

In [7]:
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')

In [8]:
article = scraped_data.read()

In [9]:
article_parsed = BeautifulSoup.BeautifulSoup(article,'lxml')

In [10]:
paragraphs = article_parsed.find_all('p')

In [11]:
article_content = ''
for p in paragraphs:
    article_content += p.text

In [12]:
article_content = re.sub(r'\[[0-9]*\]', ' ', article_content)
article_content = re.sub(r'\s+', ' ', article_content)

In [13]:
formatted_article_content = re.sub('[^a-zA-Z]', ' ', article_content )
formatted_article_content = re.sub(r'\s+', ' ', formatted_article_content)

In [14]:
words = word_tokenize(formatted_article_content)

In [15]:
word_frequencies = {}

for word in words:
    word = word.lower()
    if word not in stop_words:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [16]:
word_frequencies

{'python': 168,
 'high': 3,
 'level': 2,
 'general': 2,
 'purpose': 1,
 'programming': 23,
 'language': 32,
 'design': 6,
 'philosophy': 5,
 'emphasizes': 1,
 'code': 16,
 'readability': 2,
 'use': 16,
 'significant': 1,
 'indentation': 6,
 'dynamically': 3,
 'typed': 5,
 'garbage': 4,
 'collected': 1,
 'supports': 4,
 'multiple': 3,
 'paradigms': 2,
 'including': 15,
 'structured': 2,
 'particularly': 3,
 'procedural': 1,
 'object': 10,
 'oriented': 6,
 'functional': 4,
 'often': 8,
 'described': 1,
 'batteries': 2,
 'included': 1,
 'due': 5,
 'comprehensive': 1,
 'standard': 15,
 'library': 12,
 'guido': 4,
 'van': 7,
 'rossum': 7,
 'began': 2,
 'working': 1,
 'late': 3,
 'successor': 2,
 'abc': 3,
 'first': 4,
 'released': 9,
 'major': 5,
 'revision': 1,
 'completely': 1,
 'backward': 1,
 'compatible': 1,
 'earlier': 1,
 'versions': 8,
 'last': 1,
 'release': 5,
 'consistently': 2,
 'ranks': 1,
 'one': 9,
 'popular': 4,
 'languages': 18,
 'gained': 1,
 'widespread': 1,
 'machine': 3

In [17]:
maximum_frequency = max(word_frequencies.values())

for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

In [18]:
maximum_frequency

168

In [19]:
sentence_list = sent_tokenize(article_content)

In [20]:
sentence_scores = {}

for sent in sentence_list:
    for word, frequency in word_frequencies.items():
        if word in sent.lower():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] = frequency
            else:
                sentence_scores[sent] += frequency

In [21]:
sentence_scores

{' Python is a high-level, general-purpose programming language.': 1.5535714285714284,
 'Its design philosophy emphasizes code readability with the use of significant indentation.': 0.6785714285714284,
 'Python is dynamically typed and garbage-collected.': 1.5238095238095233,
 'It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming.': 1.0238095238095235,
 'It is often described as a "batteries included" language due to its comprehensive standard library.': 0.869047619047619,
 'Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.': 2.0238095238095233,
 'Python 2.0 was released in 2000.': 1.1607142857142856,
 'Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions.': 1.5476190476190472,
 'Python 2.7.18, released in 2020, was the last release of Python 2.': 1.

In [22]:
avg_score = sum([sentence_scores[sent] for sent in sentence_scores]) / len(sentence_scores)

In [23]:
avg_score

1.3385819845179445

In [31]:
summary = []
for sent in sentence_list:
    if (sent in sentence_scores) and (sentence_scores[sent] > (1.5 * avg_score)):
        summary.append(sent)

In [32]:
final = " ".join(summary)

In [33]:
final

'Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. Python consistently ranks as one of the most popular programming languages, and has gained widespread use in the machine learning community. Python was invented in the late 1980s by Guido van Rossum at Centrum Wiskunde & Informatica (CWI) in the Netherlands as a successor to the ABC programming language, which was inspired by SETL, capable of exception handling and interfacing with the Amoeba operating system. Python 2.0 was released on 16 October 2000, with many major new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. In 2021 (and again twice in 2022), security updates were expedited, since all Python versions were insecure (including 2.7 ) because of security issues leading to possible remote code execution and web-cache poisoning. Python 3.12 adds syntax (and in fa

In [34]:
len(final)

6883

In [35]:
len(formatted_article_content)

26151