In [3]:
import json
import re
from collections import Counter
from bs4 import BeautifulSoup

# read texts.json and parse it into text_data
# (later code iterates it with .items(), so the top level must be a JSON object)
with open('texts.json', mode='r', encoding='utf-8') as source:
    text_data = json.loads(source.read())

# clean and split text into words
def extract_words(text):
    """Return the lowercase ASCII words (two or more letters) found in *text*.

    HTML markup is stripped with BeautifulSoup before matching. Because the
    pattern requires a word boundary on both sides, a run of letters that
    directly touches a digit or an underscore is not matched at all.

    Args:
        text: HTML or plain-text string. Falsy input (``None`` or ``""``)
            yields an empty list.

    Returns:
        A list of words in document order, duplicates included.
    """
    # nothing to parse; also avoids handing None to the HTML parser
    if not text:
        return []
    # remove HTML tags; the separator keeps text from adjacent elements
    # (e.g. "<p>one</p><p>two</p>") from being fused into "onetwo"
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text(separator=' ')
    # find words, excluding numbers and single letter words
    return re.findall(r'\b[a-zA-Z]{2,}\b', clean_text.lower())

# running tally of every word across all texts
word_counter = Counter()

# only the values of text_data are needed; the keys play no part in counting
for page_text in text_data.values():
    word_counter.update(extract_words(page_text))

# get the common words: every (word, count) pair, highest count first
most_common_words = word_counter.most_common()

# same data as a word -> count mapping, kept in ranked order
word_frequency = dict(most_common_words)

# write the mapping to text_common_words.json
with open('text_common_words.json', 'w', encoding='utf-8') as out:
    json.dump(word_frequency, out, ensure_ascii=False, indent=4)

# save the word data to HTML as a bulleted list, one item per word
with open('text_common_words.html', 'w', encoding='utf-8') as html_file:
    html_file.write('<html><body>\n')
    html_file.write('<h1>Word Frequency</h1>\n')
    html_file.write('<ul>\n')
    # items are written in the order of most_common_words
    html_file.writelines(
        f'<li>{word}: {count}</li>\n' for word, count in most_common_words
    )
    html_file.write('</ul>\n')
    html_file.write('</body></html>\n')

# the message names the files this script actually writes
print("Word frequency data saved to text_common_words.json and text_common_words.html.")


Word frequency data saved to word_frequency.json and word_frequency.html.
