Preprocessing

In [None]:
%pip install nltk
%pip install lxml
%pip install nltk beautifulsoup4 requests

In [42]:
# Import necessary libraries for web scraping, text processing, and summarization
import bs4 as bs
import urllib.request
import re
import nltk
import heapq
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [None]:
# Register the per-user NLTK data directory, but only once: an unconditional
# append would add a duplicate entry every time this cell is re-run.
# NOTE(review): this is a machine-specific Windows path; adjust or remove it
# when running the notebook on another machine.
nltk_data_dir = r"C:/Users/MSI-NB/AppData/Roaming/nltk_data"
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download necessary NLTK resources (tokenizers and stopwords)
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Make sure the data directory is on the search path after the downloads,
# without adding it again when it is already registered (re-appending the
# same directory only produces duplicate entries in the list printed below).
nltk_data_dir = r"C:/Users/MSI-NB/AppData/Roaming/nltk_data"
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Check the NLTK data paths to confirm correct setup
print(nltk.data.path)

In [45]:
# Scrape the article from Wikipedia using the URL
url = 'https://en.wikipedia.org/wiki/Turkey'

# Identify this notebook explicitly: Wikimedia's User-Agent policy allows the
# servers to reject requests that only carry a generic library User-Agent.
request = urllib.request.Request(
    url,
    headers={'User-Agent': 'WikipediaSummarizerNotebook/1.0 (educational text-summarization demo)'},
)

# The context manager closes the HTTP connection once the body has been read;
# the timeout keeps the cell from hanging indefinitely on a stalled connection.
with urllib.request.urlopen(request, timeout=30) as scraped_data:
    # Read the HTML content and decode it as UTF-8
    article = scraped_data.read().decode('utf-8')

In [None]:
# Parse the HTML content using BeautifulSoup with the 'lxml' parser.
# `article` is already a decoded str, so `from_encoding` is not passed: that
# option only applies to byte input, and BeautifulSoup ignores it (with a
# warning) when the markup is text.
parsed_article = bs.BeautifulSoup(article, 'lxml')

In [47]:
# Collect every <p> element of the parsed page
paragraphs = parsed_article.find_all(name='p')

In [48]:
# Concatenate the text of every paragraph into one string (no separator is inserted)
article_text = "".join(paragraph.text for paragraph in paragraphs)

In [49]:
# Replace bracketed reference markers such as [1] or [2] with a space,
# then squeeze every run of whitespace into a single space
article_text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', article_text))

# Letters-only variant: every character outside the listed alphabet (digits,
# punctuation, symbols) becomes a space, and whitespace runs are squeezed again
formatted_article_text = re.sub(
    r'\s+', ' ', re.sub('[^a-zA-ZçğıöşüÇĞİÖŞÜ]', ' ', article_text)
)

Converting Text To Sentences

In [None]:
# Converting Text to Sentences
from nltk.tokenize import sent_tokenize

# Manually add NLTK data path for tokenizers
nltk.data.path.append('C:/Users/MSI-NB/AppData/Roaming/nltk_data/tokenizers')

# Split into sentences at '.', '!' or '?' followed by one or more spaces.
# The split must run on `article_text`: `formatted_article_text` has had every
# punctuation mark replaced by a space, so it contains no sentence boundaries
# and splitting it would return the whole article as one single "sentence".
# Empty fragments (e.g. caused by trailing whitespace) are discarded.
sentence_list = [
    sentence
    for sentence in re.split(r'(?<=[.!?]) +', article_text.strip())
    if sentence
]

# Display the first 5 sentences to check the split
print(sentence_list[:5])  # Print the first 5 sentences

Find Weighted Frequency of Occurrence

In [51]:
# Fetch NLTK's English stopword list and keep it as a set for membership tests
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [52]:
# Count how often each non-stopword occurs across all sentences.
# Words are extracted with the same letter class used when cleaning the text,
# so a word with punctuation attached ("Turkey," or "republic.") is still
# counted instead of being rejected as non-alphabetic.
word_frequencies = {}

for sentence in sentence_list:
    for word in re.findall(r'[a-zA-ZçğıöşüÇĞİÖŞÜ]+', sentence):
        word = word.lower()  # Convert to lowercase for consistency
        if word not in stop_words:  # Exclude stopwords
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

In [53]:
# Find the highest count among all words. An empty dictionary means no usable
# words were extracted, so fail with an explanatory message rather than the
# opaque error max() raises on an empty sequence (same exception type).
if not word_frequencies:
    raise ValueError("No words were found in the article text; check the scraping and cleaning steps.")
max_frequency = max(word_frequencies.values())

In [54]:
# Rescale every count by the largest count, so each frequency is expressed
# relative to the most common word
word_frequencies = {
    term: count / max_frequency
    for term, count in word_frequencies.items()
}

Calculating Sentence Scores

In [55]:
# Score each sentence as the sum of the weighted frequencies of its words.
# Words are extracted with the notebook's letter class rather than a plain
# whitespace split, so a word with punctuation attached ("Turkey,") still
# matches its entry in `word_frequencies`. Words without an entry
# (e.g. stopwords) contribute 0.
sentence_scores = {}

for sentence in sentence_list:
    tokens = re.findall(r'[a-zA-ZçğıöşüÇĞİÖŞÜ]+', sentence)
    sentence_scores[sentence] = sum(
        word_frequencies.get(token.lower(), 0) for token in tokens
    )

Getting the Summary

In [56]:
# Pick the 7 best-scoring sentences, ordered from highest to lowest score
summary_sentences = heapq.nlargest(
    7,
    sentence_scores,
    key=lambda candidate: sentence_scores[candidate],
)

In [None]:
# Build the summary text from the selected sentences, separated by single spaces
summary = ' '.join(summary_sentences)

# Sentence and character counts of the text before and after summarization
original_sentence_count, original_character_count = len(sentence_list), len(article_text)
summary_sentence_count, summary_character_count = len(summary_sentences), len(summary)

# Report the summary first, then the statistics (one item per line)
print("Summary:\n", summary, sep="\n")

print(
    "\nText Statistics:",
    f"Number of sentences before summarization: {original_sentence_count}",
    f"Number of characters before summarization: {original_character_count}",
    f"Number of sentences in the summary: {summary_sentence_count}",
    f"Number of characters in the summary: {summary_character_count}",
    sep="\n",
)