Web Data ETL Pipeline: Using Beautiful Soup and NLTK to extract article text from a web page, cleaning and processing the text, calculating word frequencies and generating a sort data frame.

In [1]:
#imports
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ellieballard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#extract text from any online article

class WebScraper:
    def __init__(self, url):
        self.url = url

    def extract_article_text(self):
        response = requests.get(self.url)
        html_content = response.content
        soup = BeautifulSoup(html_content, "html.parser")
        article_text = soup.get_text()
        return article_text

In [3]:
#clean and preprocess the data to store the frequency of each word
class TextProcessor:
    def __init__(self, nltk_stopwords):
        self.nltk_stopwords = nltk_stopwords

    def tokenize_and_clean(self, text):
        words = text.split()
        filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in self.nltk_stopwords]
        return filtered_words

In [4]:
#Define a class for the entire ETL process
class ETLPipeline:
    def __init__(self, url):
        self.url = url
        self.nltk_stopwords = set(stopwords.words("english"))

    def run(self):
        scraper = WebScraper(self.url)
        article_text = scraper.extract_article_text()

        processor = TextProcessor(self.nltk_stopwords)
        filtered_words = processor.tokenize_and_clean(article_text)

        word_freq = Counter(filtered_words)
        df = pd.DataFrame(word_freq.items(), columns=["Words", "Frequencies"])
        df = df.sort_values(by="Frequencies", ascending=False)
        return df

In [5]:
if __name__ == "__main__":
    article_url = "https://www.bbc.com/news/uk-wales-66580020"
    pipeline = ETLPipeline(article_url)
    result_df = pipeline.run()
    print(result_df.head())

        Words  Frequencies
0        swan            6
4     traffic            6
177  reported            3
58       bird            3
187    wagner            3
