In [None]:
# Get Data
from general_functions.file_operations import read_txt_file
# Base directory holding the project's data files (relative path).
data_folder = "../data/"
# Full path of the chapter text file to load.
text_file_location = f"{data_folder}extracted_chapters/Chapter_1.txt"
# Read the chapter and echo it so the loaded content can be inspected.
text = read_txt_file(text_file_location)
print(text)

Run the BookNLP pipeline (entity, quote, supersense, event and coref stages) to create the output files for the book.

In [None]:
## BookNLP Stuff
from general_functions.booknlp import run_booknlp

# Constants
# Input file to process
input_file = "../data/dahl-boy-4.txt"
# Output directory to store resulting files in
output = "../data/booknlp_output/boy/"
# Files within this directory will be named ${book_id}.entities, ${book_id}.tokens, etc.
# Bound to `book_id` rather than `id` so the builtin id() is not shadowed
# for every cell that runs afterwards.
book_id = "boy"
# Comma-separated names of the pipeline stages passed through to run_booknlp.
pipeline = "entity,quote,supersense,event,coref"

# Call functions
run_booknlp(
    input_file_location=input_file,
    output_directory=output,
    book_id=book_id,
    pipeline=pipeline,
)


In [None]:
#Suspense chart 

#Required libraries 
import pandas as pd
import spacy
import json

# Constants
# Lexicon file; read as tab-separated text and expected to provide the
# Word / Valence / Arousal columns used by the scoring function below.
lexicon_path = "../data/NRC-VAD-Lexicon.csv"
# Plain-text story to analyze (split into paragraphs on blank lines).
story_path = "../data/dahl-boy-4.txt"
# Destination JSON file for the per-paragraph results.
# NOTE(review): `output` is also assigned in the BookNLP cell (there it is a
# directory); running this cell rebinds it — confirm nothing relies on the old value.
output = "../data/suspense.json"


# Load the English spaCy model `en_core_web_sm` once; the functions below use
# this shared `nlp` object to split paragraphs into sentences and sentences into tokens.
nlp = spacy.load("en_core_web_sm")

# Reads the lexicon data; returns it as dataframe
def load_lexicon(file_path):
    """Read the tab-separated lexicon file at *file_path* into a DataFrame."""
    lexicon_frame = pd.read_csv(file_path, sep="\t")
    return lexicon_frame

# Reads the story from file; splits it into paragraphs; returns a list of paragraphs
def read_and_split_story(file_path):
    """Read the story at *file_path* and return its non-empty paragraphs.

    Paragraphs are the chunks separated by a blank line (two consecutive
    newlines). Each chunk is stripped of surrounding whitespace, and chunks
    that are empty after stripping are dropped, so stray spaces or trailing
    newlines never produce an empty paragraph.

    The file is decoded as UTF-8 so the result does not depend on the
    platform's default encoding.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    # Strip first, then filter: filtering on the raw chunk would let
    # whitespace-only chunks through as "" entries.
    stripped_chunks = (chunk.strip() for chunk in text.split("\n\n"))
    return [chunk for chunk in stripped_chunks if chunk]

# Calculates a score for each word in a sentence based on valence and arousal values from lexicon
# Final score for the sentence is the average of all word scores
# If the word doesn't exist in lexicon or isn't alphanumeric, it's skipped.
def get_negative_valence_arousal_score(sentence, lexicon_df):
    """Return the mean of (1 - valence) + arousal over the lexicon words in *sentence*.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp``.
    Tokens that are not purely alphanumeric, or that have no row in
    ``lexicon_df`` (matched on its 'Word' column), are ignored. Each token is
    looked up exactly as written first and, if that misses, in lowercase, so
    a capitalized word (e.g. at the start of a sentence) can still match a
    lowercase lexicon entry.

    Returns None when no token in the sentence could be scored.
    """
    doc = nlp(sentence)
    scores = []
    for token in doc:
        word = token.text
        if not word.isalnum():
            continue
        word_data = lexicon_df.loc[lexicon_df['Word'] == word]
        if word_data.empty:
            # Fall back to the lowercase form only when the exact form is
            # absent, so any word that already matched keeps the same row.
            lowered = word.lower()
            if lowered != word:
                word_data = lexicon_df.loc[lexicon_df['Word'] == lowered]
        if word_data.empty:
            continue
        # If several rows share the same word, the first one is used.
        valence = word_data['Valence'].values[0]
        arousal = word_data['Arousal'].values[0]
        negative_valence = 1 - valence
        scores.append(negative_valence + arousal)
    if not scores:
        return None
    return sum(scores) / len(scores)

# Analyzes each paragraph in the story. Tokenizes each paragraph into sentences
# Computes sentiment for each sentence
# Result for each paragraph is the average sentiment score of its sentences 
# + sentence with the highest score
def analyze_paragraphs(paragraphs, lexicon_df):
    """Score every paragraph and pick out its highest-scoring sentence.

    Each paragraph is split into sentences with the module-level spaCy
    pipeline ``nlp`` and every sentence is scored with
    ``get_negative_valence_arousal_score``. Sentences for which that
    function returns None are left out of the average.

    Returns a dict mapping the 1-based paragraph number to
    ``[average_score, max_sentence_text]``. Paragraphs in which no sentence
    could be scored are printed and omitted, so the keys may have gaps.
    """
    para_scores = {}
    for index, para in enumerate(paragraphs, start=1):
        sentence_scores = []
        max_sent = ""
        max_score = None
        for sent in nlp(para).sents:
            score = get_negative_valence_arousal_score(sent.text, lexicon_df)
            if score is None:
                continue
            sentence_scores.append(score)
            # Explicit None checks rather than truthiness: a valid score of
            # 0.0 is falsy but must still be eligible as the maximum.
            if max_score is None or score > max_score:
                max_score = score
                max_sent = sent.text
        if not sentence_scores:
            # No sentence in this paragraph produced a score; show the
            # paragraph and leave it out of the results.
            print(para)
            continue
        average_score = sum(sentence_scores) / len(sentence_scores)
        para_scores[index] = [average_score, max_sent]
    return para_scores

# Saves a python dictionary as JSON file
def write_to_json(data, output_path):
    """Serialize *data* as JSON into the file at *output_path* (overwriting it)."""
    with open(output_path, mode="w") as handle:
        json.dump(obj=data, fp=handle)

# Main script

# Step 1: read the lexicon into a dataframe
lexicon_df = load_lexicon(file_path=lexicon_path)

# Step 2: split the story into its paragraphs
paragraphs = read_and_split_story(file_path=story_path)

# Step 3: score every paragraph against the lexicon
results = analyze_paragraphs(paragraphs=paragraphs, lexicon_df=lexicon_df)

# Step 4: persist the per-paragraph results as JSON
write_to_json(data=results, output_path=output)
