Word Cloud - with POS tags

In [2]:
# Required Libraries (try again with spacy)
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import json
from general_functions.translation import translate_to_spanish

# Constants
# Per-language stop-word sets taken from the NLTK stopwords corpus, plus a
# combined set holding every word from both languages.
STOP_WORDS_EN = set(stopwords.words('english'))
STOP_WORDS_ES = set(stopwords.words('spanish'))
STOP_WORDS = {*STOP_WORDS_EN, *STOP_WORDS_ES}

# POS tag -> general category mapping.
# Tags are grouped by the category they collapse into; the flat lookup
# table used by the rest of the notebook is derived from these groups.
_TAGS_BY_CATEGORY = (
    ('noun', ('NN', 'NNS', 'NNP', 'NNPS')),
    ('verb', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
    ('adjective', ('JJ', 'JJR', 'JJS')),
    ('adverb', ('RB', 'RBR', 'RBS')),
    ('preposition', ('IN',)),
    ('determiner', ('DT',)),
    ('pronoun', ('PRP', 'PRP$', 'WP', 'WP$')),
    ('modal', ('MD',)),
    ('conjunction', ('CC',)),
    ('number', ('CD',)),
    ('existential', ('EX',)),
    ('foreign', ('FW',)),
    ('possessive', ('POS',)),
    ('predeterminer', ('PDT',)),
    ('particle', ('RP',)),
    ('symbol', ('SYM',)),
    ('to', ('TO',)),
    ('interjection', ('UH',)),
    ('wh-adverb', ('WRB',)),
)

POS_MAP = {
    tag: category
    for category, tags in _TAGS_BY_CATEGORY
    for tag in tags
}

def get_pos(word):
    """Return the general part-of-speech category of a single word.

    The word is tagged in isolation (as a one-token list) with ``pos_tag``
    and the resulting tag is translated through ``POS_MAP``.

    Args:
        word: The token to classify.

    Returns:
        The category string from ``POS_MAP``, or ``'unknown'`` when the
        tag has no entry in the mapping.
    """
    tagged_tokens = pos_tag([word])
    tag_label = tagged_tokens[0][1]  # each entry is (token, tag)
    if tag_label in POS_MAP:
        return POS_MAP[tag_label]
    return 'unknown'

def tokenize_and_filter(text, language='english'):
    """Tokenize ``text`` and keep only lowercase, alphabetic, non-stop words.

    Args:
        text: The raw text to tokenize.
        language: ``'english'`` selects the English stop-word set; any
            other value selects the Spanish set.

    Returns:
        A list of lowercased tokens, in document order, that consist only
        of alphabetic characters and are not stop words.
    """
    stop_words = STOP_WORDS_EN if language == 'english' else STOP_WORDS_ES

    kept = []
    for token in nltk.word_tokenize(text):
        if not token.isalpha():
            continue
        # Lowercase BEFORE the stop-word test: the stop-word sets are
        # lowercase, so checking the raw token would let capitalised stop
        # words (e.g. sentence-initial "The") slip through.
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return kept

def word_frequencies(words):
    """Count how many times each word occurs.

    Args:
        words: An iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each distinct word to its count.
    """
    tally = Counter()
    tally.update(words)
    return tally

def generate_wordcloud_from_text(text, language='english'):
    """Build a word cloud from raw text.

    The text is tokenized and filtered with ``tokenize_and_filter``, the
    remaining words are counted with ``word_frequencies``, and the counts
    are handed to ``WordCloud.generate_from_frequencies``.

    Args:
        text: The raw text to visualise.
        language: ``'english'`` selects the English stop-word set; any
            other value selects the Spanish set.

    Returns:
        Whatever ``generate_from_frequencies`` returns (the populated
        word-cloud object that the notebook passes to ``plt.imshow``).
    """
    stop_words = STOP_WORDS_EN if language == 'english' else STOP_WORDS_ES
    frequencies = word_frequencies(tokenize_and_filter(text, language))

    cloud = WordCloud(
        stopwords=stop_words,
        background_color='white',
        max_words=100,
        width=800,
        height=400,
    )
    return cloud.generate_from_frequencies(frequencies)

def save_to_json(data, output_path):
    """Write word-frequency data to a JSON file as a list of records.

    Each ``(word, count)`` pair in ``data`` becomes one object with keys
    ``'word'``, ``'translation'`` (from ``translate_to_spanish``),
    ``'value'`` (the count) and ``'category'`` (from ``get_pos``).

    Args:
        data: A mapping of word -> count (e.g. a ``Counter``).
        output_path: Destination path of the JSON file; it is overwritten.
    """
    records = [
        {
            'word': word,
            'translation': translate_to_spanish(word),
            'value': count,
            'category': get_pos(word)
        }
        for word, count in data.items()
    ]
    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps accented characters (common in the
        # Spanish translations) as real UTF-8 text instead of \uXXXX
        # escapes; the file is opened as UTF-8, and parsers read both
        # forms identically.
        json.dump(records, f, ensure_ascii=False)

# Main Script
# Source text to analyse and destination of the JSON word-frequency export.
input_path = "../data/lady_tiger.txt"
output_path = "../data/wcloud.json"

# Read the whole file and turn every newline into a space.
with open(input_path, 'r', encoding='utf-8') as file:
    text_data = file.read().replace('\n', ' ')

# No language argument is passed, so the default ('english') is used.
wordcloud = generate_wordcloud_from_text(text_data)

# Display the word cloud

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Save word frequencies in JSON
# NOTE: this tokenizes the text a second time; generate_wordcloud_from_text
# already computed the same frequencies internally but does not return them.
word_freq_data = word_frequencies(tokenize_and_filter(text_data))
# save_to_json also translates each word and tags its POS category.
save_to_json(word_freq_data, output_path)
print(f"Word cloud data saved to {output_path}")


Word cloud data saved to ../data/wcloud.json


In [None]:
from general_functions.chart_data.wordcloud_data import generate_wordcloud_data

# Same input and output paths as the cell above.
input_path = "../data/lady_tiger.txt"
output_path = "../data/wcloud.json"

# Delegates to the project helper imported at the top of this cell.
# NOTE(review): presumably the packaged version of the pipeline above, with
# 50 as a cap on the number of words -- TODO confirm against the helper.
generate_wordcloud_data(input_path, output_path, 50)