In [10]:
from bs4 import BeautifulSoup
import requests

In [11]:
# Text Summarization with Word Frequencies

In [12]:
# 1.1 - web-scraping technique

def get_page_content(url, timeout=10):
    """Download a page and return the lower-cased text of its paragraphs.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block forever on an unresponsive host.

    Returns
    -------
    str
        Text of every ``<p>`` element found inside the element whose id is
        ``"content"``, concatenated in document order and lower-cased.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly instead of summarising the body of an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "lxml")
    container = soup.find(id="content")
    if container is None:
        # No element with id="content" on this page: fall back to the whole
        # document rather than crashing with AttributeError on None.
        container = soup

    return "".join(p.text for p in container.find_all("p")).lower()

# Page whose text is summarised by the rest of the notebook.
URL = "https://en.wikipedia.org/wiki/Natural_language_processing"
# Lower-cased paragraph text of the page; reused by the later cells.
content = get_page_content(URL)
# Bare last expression so the notebook displays the scraped text.
content

'natural language processing (nlp) is an interdisciplinary subfield of computer science and linguistics. it is primarily concerned with giving computers the ability to support and manipulate speech. it involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. the goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. the technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.\nchallenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.\nnatural language processing has its roots in the 1950s. already in 1950, alan turing published an article titled "computing machinery and 

In [23]:
# 1.2 Preprocess text data (tokenize, remove stop words, remove punct.)

from nltk import wordpunct_tokenize
import string
from nltk.corpus import stopwords

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` dropped."""
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char)
    return "".join(kept)

def get_clean_words(text, extra_stop_words=("e", "g")):
    """Tokenize ``text`` and drop punctuation and stop words.

    Parameters
    ----------
    text : str
        Text to tokenize.
    extra_stop_words : iterable of str, optional
        Tokens removed in addition to NLTK's English stop-word list. The
        default ``("e", "g")`` discards the fragments left after "e.g." is
        split and stripped of punctuation.

    Returns
    -------
    list of str
        Tokens in their original order, with punctuation characters removed
        from each token, and with empty tokens and stop words filtered out.
    """
    # Build the lookup once as a set; membership is then O(1) per token
    # instead of re-concatenating and scanning a list for every word.
    stop_words = set(stopwords.words("english"))
    stop_words.update(extra_stop_words)

    # Strip punctuation exactly once per token.
    stripped = (remove_punctuation(token) for token in wordpunct_tokenize(text))
    return [word for word in stripped if word != "" and word not in stop_words]

# Cleaned token list for the whole page, followed by a preview of its first 20 entries.
words = get_clean_words(content)
print(words[:20])

['natural', 'language', 'processing', 'nlp', 'interdisciplinary', 'subfield', 'computer', 'science', 'linguistics', 'primarily', 'concerned', 'giving', 'computers', 'ability', 'support', 'manipulate', 'speech', 'involves', 'processing', 'natural']


In [25]:
# 1.3 Calculate Word Frequencies

from nltk.probability import FreqDist

def calculate_words_frequency(words):
    """Count the words and scale every count by the highest one.

    Parameters
    ----------
    words : iterable of str
        Tokens to count.

    Returns
    -------
    FreqDist
        Maps each distinct word to ``count / max_count``, so the most frequent
        word has weight 1.0. Empty when ``words`` is empty.
    """
    fdist_words = FreqDist(words)
    if not fdist_words:
        # Nothing to normalise; most_common(1)[0] would raise IndexError here.
        return fdist_words

    max_count = max(fdist_words.values())
    # Iterate over a snapshot of the keys while the values are being replaced.
    for word in list(fdist_words):
        fdist_words[word] = fdist_words[word] / max_count
    return fdist_words

# Normalised word weights; score_sentence reads this module-level table.
words_frequency = calculate_words_frequency(words)
# Preview the 20 highest-weighted words.
print(words_frequency.most_common(20))
    

[('language', 1.0), ('natural', 0.6956521739130435), ('nlp', 0.6521739130434783), ('cognitive', 0.5652173913043478), ('processing', 0.5217391304347826), ('linguistics', 0.391304347826087), ('based', 0.391304347826087), ('tasks', 0.391304347826087), ('approach', 0.391304347826087), ('statistical', 0.34782608695652173), ('neural', 0.30434782608695654), ('machine', 0.30434782608695654), ('learning', 0.2608695652173913), ('approaches', 0.2608695652173913), ('rules', 0.2608695652173913), ('speech', 0.21739130434782608), ('rule', 0.21739130434782608), ('understanding', 0.17391304347826086), ('intelligence', 0.17391304347826086), ('symbolic', 0.17391304347826086)]


In [28]:
# 1.4 Score the sentences

from nltk import sent_tokenize

def score_sentence(sentence):
    """Sum the weights of the sentence's tokens that appear in ``words_frequency``.

    The sentence is lower-cased and tokenized first; tokens missing from the
    module-level ``words_frequency`` table contribute nothing.
    """
    tokens = wordpunct_tokenize(sentence.lower())
    return sum(
        words_frequency[token] for token in tokens if token in words_frequency
    )

sentences = sent_tokenize(content)

# Map every sentence to its score; a repeated sentence keeps a single entry.
sent_dict = {}
for sentence in sentences:
    score = score_sentence(sentence)
    sent_dict[sentence] = score

# Re-create the mapping ordered from the highest score to the lowest.
sent_dict = dict(sorted(sent_dict.items(), key=lambda pair: pair[1], reverse=True))
sent_dict



{'as an example, george lakoff offers a methodology to build natural language processing (nlp) algorithms through the perspective of cognitive science, along with the findings of cognitive linguistics,[47] with two defining aspects:\nties with cognitive linguistics are part of the historical heritage of nlp, but they have been less frequently addressed since the statistical turn during the 1990s.': 7.913043478260872,
 'machine learning approaches, which include both statistical and neural networks, on the other hand, have many advantages over the symbolic approach: \nalthough rule-based systems for manipulating symbols were still in use in 2020, they have become mostly obsolete with the advance of llms in 2023. \nbefore that they were commonly used:\nin the late 1980s and mid-1990s, the statistical approach ended a period of ai winter, which was caused by the inefficiencies of the rule-based approaches.': 6.5652173913043494,
 'challenges in natural language processing frequently involv

In [34]:
# 1.5 Build a summary based on sentence count, word count, and percentage

def build_summary_based_on_sentence_count(sentence_count):
    """Join the first ``sentence_count`` keys of ``sent_dict`` with newlines.

    Sentences are taken in the iteration order of the module-level
    ``sent_dict``. Returns an empty string when ``sentence_count`` is zero or
    negative, and all sentences when it exceeds the number available.
    """
    summary = ""
    for position, sentence in enumerate(sent_dict):
        if position >= sentence_count:
            break
        summary = sentence if summary == "" else summary + "\n" + sentence
    return summary

def build_summary_based_on_word_count(word_count):
    """Build a summary containing at most ``word_count`` tokens.

    Sentences are consumed in the iteration order of the module-level
    ``sent_dict`` and tokenized with ``wordpunct_tokenize``, so punctuation
    marks count as tokens. Tokens of one sentence are joined with single
    spaces and sentences are separated by newlines; the last sentence is cut
    short once the budget is reached.

    Parameters
    ----------
    word_count : int or float
        Token budget. A fractional budget is effectively rounded up, because
        tokens are added while the running count is below it.

    Returns
    -------
    str
        The summary text. If the budget exceeds the number of tokens
        available, every sentence is returned instead of raising IndexError.
    """
    lines = []
    taken = 0
    for sentence in sent_dict:
        if taken >= word_count:
            break
        kept = []
        for token in wordpunct_tokenize(sentence):
            if taken >= word_count:
                break
            kept.append(token)
            taken += 1
        # A sentence that yields no tokens must not produce an empty line.
        if kept:
            lines.append(" ".join(kept))
    return "\n".join(lines)

def build_summary_based_on_percentage(percentage):
    """Build a summary sized as a percentage of the cleaned word list.

    Parameters
    ----------
    percentage : int or float
        Share, in percent, of ``len(words)`` (the module-level cleaned token
        list) to use as the token budget of the summary.

    Returns
    -------
    str
        Whatever ``build_summary_based_on_word_count`` returns for that budget.
    """
    total_words = len(words)
    # Multiply before dividing: (percentage / 100) * total_words can land just
    # above an exact integer (0.07 * 100 == 7.000000000000001), and the
    # word-count builder then emits one token more than requested.
    word_count = total_words * percentage / 100
    return build_summary_based_on_word_count(word_count)

# Demo 1: the three first sentences of sent_dict.
print("summary based on sentence count")
print(build_summary_based_on_sentence_count(3))

# Demo 2: a summary limited to 100 tokens.
print("\n\n\nsummary based on word count")
print(build_summary_based_on_word_count(100))

# Demo 3: a token budget equal to 15% of len(words).
print("\n\n\nsummary based on percentage")
print(build_summary_based_on_percentage(15))

summary based on sentence count
as an example, george lakoff offers a methodology to build natural language processing (nlp) algorithms through the perspective of cognitive science, along with the findings of cognitive linguistics,[47] with two defining aspects:
ties with cognitive linguistics are part of the historical heritage of nlp, but they have been less frequently addressed since the statistical turn during the 1990s.
machine learning approaches, which include both statistical and neural networks, on the other hand, have many advantages over the symbolic approach: 
although rule-based systems for manipulating symbols were still in use in 2020, they have become mostly obsolete with the advance of llms in 2023. 
before that they were commonly used:
in the late 1980s and mid-1990s, the statistical approach ended a period of ai winter, which was caused by the inefficiencies of the rule-based approaches.
challenges in natural language processing frequently involve speech recognition,