In [5]:
import os
from pathlib import Path
import json
import spacy
from spacy_download import load_spacy

# load_spacy fetches the model first when it isn't installed yet, then loads it
spacy_model_name = "en_core_web_lg"
nlp = load_spacy(spacy_model_name)

In [6]:
# Process the news articles retrieved from the crawler: read every JSON file
# under each source folder of `news_directory`, keep English articles that
# have text, strip known site boilerplate, drop whitespace-only tokens, and
# collect the cleaned text in `news_content`.
# NOTE(review): absolute, machine-specific path -- consider moving it to a
# configurable location so the notebook runs on other machines.
news_directory = '/Users/chrisperumal/news-please-repo/data/2024/04/17'
news_content = []

# Boilerplate removed from each article (first occurrence only, in this order).
boilerplate_snippets = (
    "Editorial Note: We earn a commission from partner links on Forbes Advisor. Commissions do not affect our editors' opinions or evaluations.",
    "You might be using an unsupported or outdated browser. To get the best possible experience please use the latest version of Chrome, Firefox, Safari, or Microsoft Edge to view this website.",
    "Our editors are committed to bringing you unbiased ratings and information. Our editorial content is not influenced by advertisers. We use data-driven methodologies to evaluate financial products and companies, so all are measured equally. You can read more about our editorial guidelines and the credit card methodology for the ratings below.",
)

# Sorted so articles are visited in the same order on every run;
# os.listdir() makes no ordering guarantee.
news_sources = sorted(os.listdir(news_directory))

for source in news_sources:
    news_path = Path(news_directory, source)
    # sorted for the same reason as above: a stable, reproducible file order
    json_content = sorted(f for f in news_path.glob('*.json') if f.is_file())

    for json_path in json_content:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # skip articles that have no text data or are not in English
        if data['maintext'] is None or data['language'] != 'en':
            continue

        main_text = data['maintext']
        for snippet in boilerplate_snippets:
            main_text = main_text.replace(snippet, "", 1)

        # Nothing but whitespace left once the boilerplate is gone: skip it,
        # otherwise an empty string would end up in news_content.
        if not main_text.strip():
            continue

        doc = nlp(main_text)
        # Drop whitespace-only tokens (newlines, tabs, runs of spaces) and
        # re-join with single spaces. Punctuation and stop words are kept.
        tokens = [token.text for token in doc if not token.is_space]
        cleaned_text = ' '.join(tokens)
        news_content.append(cleaned_text)


In [None]:
# Drop duplicate articles while keeping the first-seen order. dict keys are
# unique and insertion-ordered, so the resulting list is the same on every
# run; going through a set would shuffle the articles differently per kernel
# session. The `remove_duplicates` name is kept in case other cells use it.
remove_duplicates = dict.fromkeys(news_content)
news_content = list(remove_duplicates)

In [8]:
output_filename = "article_content.json"

# Persist the cleaned articles as a JSON list so the processing above
# doesn't have to be repeated on every session.
with open(output_filename, mode='w', encoding='utf-8') as json_out:
    json_out.write(json.dumps(news_content, indent=4))