In [14]:
import pandas as pd
import json
import re
from general_functions.file_operations import write_json_file

# Read the csv file
df = pd.read_csv('../data/christmas_carol.csv')

# Convert the dataframe to json
data = json.loads(df.to_json(orient='records'))

def remove_bracket_content(text):
    # This will remove content within any type of brackets, including nested brackets
    return re.sub(r'\[[^\]]*\]', '', text)


only_dialogues = []
only_dialogues_without_sound_descriptions = []
for i in range(len(data)):
    # print(data[i])
    if data[i]["type"] == "dialogue":
        only_dialogues.append(data[i])
        data[i]["text"] = remove_bracket_content(data[i]["text"])
        only_dialogues_without_sound_descriptions.append(data[i])
        

# Write the json file
write_json_file('../data/christmas_carol_only_dialogues.json', only_dialogues)
write_json_file('../data/christmas_carol_only_dialogues_without_sound_descriptions.json', only_dialogues_without_sound_descriptions)


Successfully written to file.
Successfully written to file.


In [15]:
import json
from collections import Counter
import re
from general_functions.translation import translate_to_spanish
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from general_functions.file_operations import write_json_file


stop_words = set(stopwords.words('english'))

# POS Tagging to general category mapping
POS_MAP = {
    'NN': 'noun', 'NNS': 'noun', 'NNP': 'noun', 'NNPS': 'noun',
    'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb', 'VBN': 'verb', 'VBP': 'verb', 'VBZ': 'verb',
    'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
    'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb',
    'IN': 'preposition',
    'DT': 'determiner',
    'PRP': 'pronoun', 'PRP$': 'pronoun', 'WP': 'pronoun', 'WP$': 'pronoun',
    'MD': 'modal',
    'CC': 'conjunction',
    'CD': 'number',
    'EX': 'existential',
    'FW': 'foreign',
    'POS': 'possessive',
    'PDT': 'predeterminer',
    'RP': 'particle',
    'SYM': 'symbol',
    'TO': 'to',
    'UH': 'interjection',
    'WRB': 'wh-adverb'
}

def get_pos(word):
    tag = pos_tag([word])[0][1]
    return POS_MAP.get(tag, 'unknown')


def count_words(text):
    words = re.findall(r'\w+', text.lower())
    return Counter(words)

def find_scene(scenes, scene_name):
    for scene in scenes:
        if scene['scene'] == scene_name:
            return scene
    return None

def find_act(acts, act_name):
    for act in acts:
        if act['act'] == act_name:
            return act
    return None

def process_dialogues(dialogues, categories, top_n):
    acts = []
    translations = {}  # Store previously translated words

    for i, dialogue in enumerate(dialogues):
        print(f"Processing dialogue {i+1} of {len(dialogues)}...")

        act_name = dialogue['act']
        scene_name = dialogue['scene']
        text = dialogue['text']

        act = find_act(acts, act_name)
        if not act:
            act = {'act': act_name, 'scenes': []}
            acts.append(act)

        scene = find_scene(act['scenes'], scene_name)
        if not scene:
            scene = {'scene': scene_name, 'words': []}
            act['scenes'].append(scene)

        word_tokens = word_tokenize(text.lower())
        filtered_text = [word for word in word_tokens if word not in stop_words]
        
        for word in filtered_text:
            pos = get_pos(word)
            if pos in categories:
                word_counts = count_words(word)
                for word, count in word_counts.items():
                    word_entry = next((entry for entry in scene['words'] if entry['word'] == word), None)
                    if word_entry:
                        word_entry['value'] += count
                        if text not in word_entry['utterances']:
                            word_entry['utterances'].append(text)
                    else:
                        scene['words'].append({'word': word, 'value': count, 'utterances': [text], 'category': pos})

        # Sort the words in the scene based on their count and keep only the top 'n'
        scene['words'] = sorted(scene['words'], key=lambda x: x['value'], reverse=True)[:top_n]

        # Translate the top 'n' words
        for word_entry in scene['words']:
            word = word_entry['word']
            if word not in translations:
                print(f"Translating word '{word}'...")
                translations[word] = translate_to_spanish(word)
            word_entry['translation'] = translations[word]

    print("Processing completed.")
    return acts

with open('../data/christmas_carol_only_dialogues_without_sound_descriptions.json') as f:
    dialogues = json.load(f)
    dialogues_of_scrooge = [dialogue for dialogue in dialogues if dialogue['character'] == 'Scrooge']
    categories =["noun", "adjective", "adverb", "verb", "pronoun"]
    result = process_dialogues(dialogues_of_scrooge, categories, 100)
    write_json_file('../data/christmas_carol_scrooge_top_words.json', result)

Processing dialogue 1 of 182...
Translating word 'owe'...
Translating word 'money'...
Translating word 'collect'...
Translating word 'jailed'...
Translating word 'due'...
Processing dialogue 2 of 182...
Translating word 'cratchit'...
Translating word 'coal'...
Translating word 'acting'...
Translating word 'cold'...
Translating word 'next'...
Translating word 'asking'...
Translating word 'replenish'...
Translating word 'box'...
Translating word 'well'...
Translating word 'save'...
Translating word 'breath'...
Translating word 'prepared'...
Translating word 'find'...
Translating word 'employ'...
Translating word 'elsewhere'...
Processing dialogue 3 of 182...
Translating word 'bah'...
Translating word 'humbug'...
Processing dialogue 4 of 182...
Translating word 'merry'...
Translating word 'christmas'...
Translating word 'right'...
Translating word 'reason'...
Translating word 'poor'...
Translating word 'enough'...
Processing dialogue 5 of 182...
Processing dialogue 6 of 182...
Translating