#               **Wikipedia Article Summarizer** _(Python project based on NLP techniques)_

#                                      Text Summarization with NLTK
*(NLTK, or Natural Language Toolkit, is a Python package that you can use for NLP.)*

 *Since I like F-Secure and would like to attend their trainings, I used them as my search topic: with this simple Wikipedia article summarizer I practise NLP and Python while also learning more about F-Secure, its history, culture, etc.*

In [3]:
# Install these as requirements if you need. You may also try "pip3 install beautifulsoup4" if "pip" encounters errors.
!pip install beautifulsoup4
!pip install lxml



### Scraping Wikipedia Article

In [4]:
import bs4 as bs
import urllib.request
import re

# In a notebook you would normally hard-code the URL and edit it when needed.
# Here the URL is asked for interactively so that different articles can be tried
# to see where the summary works well and where it is only "so so".
# Provide the full article address, e.g. https://en.wikipedia.org/wiki/F-Secure
userLink = input("Which Wikipedia article would you want me to summarize: ").strip()

# Fetch the raw page; the `with` block closes the HTTP response even if reading fails.
with urllib.request.urlopen(userLink) as raw_data:
    document = raw_data.read()

parsed_document = bs.BeautifulSoup(document, 'lxml')

# Only the text of <p> elements is used as article content.
article_paras = parsed_document.find_all('p')

# Join all paragraph texts in one pass instead of growing a string inside a loop.
scrapped_data = "".join(para.text for para in article_paras)

Which Wikipedia article would you want me to summarize: https://en.wikipedia.org/wiki/List_of_2023_box_office_number-one_films_in_the_United_States


In [5]:
# Preview at most the first 1500 characters of the scraped text.
print(scrapped_data[:1500])


This is a list of films which ranked number one at the weekend box office for the year 2023.[1]
Highest-grossing films of 2023 by Calendar Gross[54]




### Text Cleaning

In [6]:
# Swap bracketed numeric markers such as "[12]" (the pattern also matches "[]")
# for a space, then collapse every run of whitespace into a single space.
scrapped_data = re.sub(
    r'\s+',
    ' ',
    re.sub(r'\[[0-9]*\]', ' ', scrapped_data),
)

In [7]:
# Derive a letters-only copy of the text: every character outside a-z / A-Z
# becomes a space, and the resulting whitespace runs are squeezed to one space.
# `scrapped_data` itself is left unchanged.
letters_only = re.sub('[^a-zA-Z]', ' ', scrapped_data)
formatted_text = re.sub(r'\s+', ' ', letters_only)

In [8]:
!pip install nltk



### Finding Word Frequencies

In [9]:
import nltk #if you don't have it, then>> python3 -m pip install nltk
# sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' resource while older ones use 'punkt', so fetch both to keep
# this cell working whichever NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split the cleaned article text (punctuation still present) into sentences.
all_sentences = nltk.sent_tokenize(scrapped_data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
# Stop words are very common words that we want to ignore, so they are filtered
# out of the text before counting.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# Set membership is O(1); `stopwords` itself stays a list for later inspection.
stopword_set = set(stopwords)

# Count how often each remaining word occurs.
# The text is lowercased first because the stop-word list is lowercase and the
# sentence-scoring step looks words up in lowercase; without this, capitalised
# stop words ("This", "The") would be counted and capitalised words would never
# contribute to any sentence score.
word_freq = {}
for word in nltk.word_tokenize(formatted_text.lower()):
    if word not in stopword_set:
        word_freq[word] = word_freq.get(word, 0) + 1

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
# Scale every count by the highest count so that weights fall in (0, 1].
# `default=1` keeps the cell from raising ValueError when no words were counted
# (e.g. the page had no usable paragraph text); the loop is then simply skipped.
max_freq = max(word_freq.values(), default=1)

for word in word_freq:
    word_freq[word] = word_freq[word] / max_freq

### Finding Sentence Scores

In [12]:
# Sentences with this many space-separated words or more are left out of the ranking.
MAX_SENTENCE_WORDS = 25

# Score each sentence as the sum of the weights of its tokens found in `word_freq`.
# A sentence gets an entry only if at least one of its tokens has a weight.
sentence_scores = {}
for sentence in all_sentences:
    # The length test does not depend on the token, so evaluate it once per sentence.
    if len(sentence.split(' ')) >= MAX_SENTENCE_WORDS:
        continue
    for token in nltk.word_tokenize(sentence.lower()):
        if token in word_freq:
            sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[token]

### Printing Summaries

In [13]:
import heapq
# Keep the five sentences with the highest scores (highest first) and
# print them joined by single spaces as the summary.
selected_sentences = heapq.nlargest(
    5,
    sentence_scores,
    key=lambda candidate: sentence_scores.get(candidate),
)
text_summary = ' '.join(selected_sentences)

print(text_summary)

 This is a list of films which ranked number one at the weekend box office for the year 2023. Highest-grossing films of 2023 by Calendar Gross


In [14]:
# Only the last expression of a cell is displayed, so the bare `selected_sentences`
# and `stopwords` lines that used to precede this one had no effect and were dropped.
scrapped_data

' This is a list of films which ranked number one at the weekend box office for the year 2023. Highest-grossing films of 2023 by Calendar Gross '

In [16]:
# Display the English stop-word list used to filter the word counts.
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [17]:
# Display the sentences chosen for the summary as a list.
selected_sentences


[' This is a list of films which ranked number one at the weekend box office for the year 2023.',
 'Highest-grossing films of 2023 by Calendar Gross']