Word Cloud - for one chapter or the whole text

In [3]:
from general_functions.chart_data.wordcloud_data import generate_wordcloud_data


# Source text and destination JSON for the word-cloud data (relative paths).
input_path = "../data/perrault.txt"
output_path = "../data/wcloud_perrault.json"

# Third argument is the number of words to keep; the captured output of this
# cell reports "Getting top 50 words...".
generate_wordcloud_data(input_path, output_path, 50)

Reading input file...
Tokenizing and filtering stopwords...
Getting top 50 words...
Translate words and format output process started...
Processed 10 words...
Processed 20 words...
Processed 30 words...
Processed 40 words...
Processed 50 words...
Writing to output file...
Word cloud data saved to ../data/wcloud_perrault.json


In [5]:
import json
import nltk
from nltk.corpus import stopwords
import string

def get_top_words(input_path: str, output_path: str, top_n: int) -> None:
    """Count the most frequent non-stop-words in a text file and save them as JSON.

    The text is lower-cased and split on whitespace. ASCII punctuation
    (``string.punctuation``) is removed from every token, and tokens found in
    NLTK's English stop-word list are discarded.

    Args:
        input_path: Path of the UTF-8 text file to analyse.
        output_path: Path of the JSON file to write (overwritten if present).
        top_n: Number of most frequent words to keep.

    Returns:
        None. The result is written to ``output_path`` as a JSON object mapping
        each word to its count, ordered from most to least frequent; ties keep
        the order in which the words first appear in the text.
    """
    stop_words = set(stopwords.words('english'))
    # Built once: the table does not depend on the word being processed.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Explicit encoding so the result does not depend on the platform default.
    with open(input_path, "r", encoding="utf-8") as f:
        text = f.read().lower()

    word_freq = {}
    for raw in text.split():
        # The stop-word list stores contractions with their apostrophe
        # ("don't", "you're"). Test the token with only its surrounding
        # punctuation trimmed, otherwise "don't" becomes "dont" below and
        # slips past the filter.
        if raw.strip(string.punctuation) in stop_words:
            continue
        word = raw.translate(strip_punctuation)
        # `word` is empty when the token consisted of punctuation only.
        if word and word not in stop_words:
            word_freq[word] = word_freq.get(word, 0) + 1

    # sorted() is stable, so equally frequent words keep first-seen order.
    ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
    top_words_dict = dict(ranked[:top_n])

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(top_words_dict, f, indent=4)


# Inputs for the top-words extraction. Note that `input_path` and `output_path`
# rebind the names already used by the word-cloud cell above.
input_path = "../data/beaumont.txt"
output_path = "../data/top_words_beaumont.json"
# Number of most frequent words to keep.
top_n = 20

# Writes the word -> count mapping to `output_path`; nothing is returned.
get_top_words(input_path, output_path, top_n)

## Sentiment analysis

In [2]:
#Sentiment chart 

#Required libraries 
import pandas as pd
import spacy
import json

# Constants
# Lexicon file; despite the .csv extension, load_lexicon() parses it as tab-separated.
lexicon_path = "../data/NRC-VAD-Lexicon.csv"
# Story text to analyse.
story_path = "../data/beaumont.txt"
# Destination of the per-chunk sentiment scores (JSON).
output = "../data/sentiment_beaumont.json"


# Load the English model for SpaCy; used for tokenizing the story into sentences/words.
# `nlp` is a module-level global read by the scoring functions below.
nlp = spacy.load("en_core_web_sm")

# Loads the lexicon file into a pandas DataFrame
def load_lexicon(file_path):
    """Parse the tab-separated lexicon at ``file_path`` and return it as a DataFrame."""
    lexicon_frame = pd.read_csv(file_path, sep="\t")
    return lexicon_frame

# Scores a sentence from the valence and arousal values of its words in the lexicon
# Final score for the sentence is the average of all word scores
# Words that are not alphanumeric or are missing from the lexicon are skipped.
def get_negative_valence_arousal_score(sentence, lexicon_df):
    """Return the mean "negative valence + arousal" score of a sentence.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp``.
    Each alphanumeric token found in the lexicon contributes
    ``(1 - Valence) + Arousal``; the sentence score is the mean of those
    contributions.

    Args:
        sentence: Sentence text to score.
        lexicon_df: DataFrame with ``Word``, ``Valence`` and ``Arousal``
            columns. When a word occurs in several rows, the first is used.

    Returns:
        The mean score, or ``None`` when no token of the sentence was found
        in the lexicon.
    """
    doc = nlp(sentence)
    scores = []
    for token in doc:
        word = token.text
        if not word.isalnum():
            continue
        word_data = lexicon_df.loc[lexicon_df['Word'] == word]
        if word_data.empty and word != word.lower():
            # Capitalised tokens (e.g. the first word of a sentence) never
            # match a lowercase lexicon entry verbatim; retry in lowercase so
            # they are not silently dropped. Exact matches keep precedence.
            word_data = lexicon_df.loc[lexicon_df['Word'] == word.lower()]
        if word_data.empty:
            continue
        valence = word_data['Valence'].values[0]
        arousal = word_data['Arousal'].values[0]
        # Invert valence so that more negative words score higher.
        # NOTE(review): presumably lexicon values lie in [0, 1] - confirm.
        negative_valence = 1 - valence
        scores.append(negative_valence + arousal)
    if scores:
        return sum(scores) / len(scores)
    return None

# Reads the story from file; splits it into chunks of `chunk_size` words (500 by default); returns a list of chunks
def read_and_split_story(file_path, chunk_size=500):
    """Read a text file and split it into consecutive fixed-size word chunks.

    Args:
        file_path: Path of the UTF-8 text file to read.
        chunk_size: Number of whitespace-separated words per chunk. Defaults
            to 500, the value that was previously hard-coded.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. The last chunk may be shorter; an empty file yields
        an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        words = file.read().split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Analyzes each chunk in the story. Tokenizes each chunk into sentences
# Computes sentiment for each sentence
# Result for each chunk is the average sentiment score of its sentences
def analyze_paragraphs(chunks, lexicon_df, chunk_size=500):
    """Compute the average sentence score of every chunk of the story.

    Each chunk is split into sentences with the module-level spaCy pipeline
    ``nlp``, and every sentence is scored with
    ``get_negative_valence_arousal_score``. Sentences with no score (``None``)
    are ignored.

    Args:
        chunks: Iterable of chunk texts, in story order.
        lexicon_df: Lexicon DataFrame forwarded to the sentence scorer.
        chunk_size: Number of words per chunk, used only to turn a chunk index
            into the word offset used as its key. Must match the size used to
            create ``chunks``. Defaults to 500, the value that was previously
            hard-coded.

    Returns:
        A dict mapping the starting word offset of each chunk
        (``index * chunk_size``) to the mean score of its sentences. Chunks in
        which no sentence could be scored are printed and left out.
    """
    chunk_scores = {}
    for index, chunk in enumerate(chunks):
        sentence_scores = []
        for sent in nlp(chunk).sents:
            score = get_negative_valence_arousal_score(sent.text, lexicon_df)
            # Compare with None explicitly: a score of 0 is a valid result.
            if score is not None:
                sentence_scores.append(score)
        if not sentence_scores:
            # Nothing in this chunk matched the lexicon; show it for inspection.
            print(chunk)
            continue
        chunk_scores[index * chunk_size] = sum(sentence_scores) / len(sentence_scores)
    return chunk_scores

# Serialises a Python dictionary to a JSON file
def write_to_json(data, output_path):
    """Write ``data`` to ``output_path`` as JSON, replacing any existing file."""
    with open(output_path, "w") as destination:
        json.dump(obj=data, fp=destination)

# Main script: load the lexicon, chunk the story, score every chunk, save the scores

# Load lexicon data
lexicon_df = load_lexicon(lexicon_path)

# Split the story into consecutive chunks of up to 500 words.
# Despite the variable name, these are fixed-size word chunks, not true paragraphs.
paragraphs = read_and_split_story(story_path)

# Compute the average sentence score of each chunk, keyed by the chunk's starting word offset
results = analyze_paragraphs(paragraphs, lexicon_df)


# Save results to a JSON file (the integer offsets become string keys in JSON)
write_to_json(results, output)
