In [None]:
%pip install spacy
%pip install textblob
%pip install spacytextblob
!python -m spacy download en_core_web_sm
!python -m textblob.download_corpora
%pip install ipywidgets

## import packages

In [364]:
import os
import glob
import spacy
from textblob import TextBlob
from scipy.integrate import odeint
from tqdm.notebook import tqdm
import numpy as np
import logging
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import matplotlib.pyplot as plt
import math
from transformers import BertTokenizer, BertModel
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity


# Load the English tokenizer, POS tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

In [365]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [204]:
def parse_xml_to_dataframe(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # Extracting play title
    play_title = root.find('TITLE').text

    # Extracting speaker and speech
    data = []
    for speech in root.iter('SPEECH'):
        speaker = speech.find('SPEAKER').text
        lines = [line.text for line in speech.findall('LINE') if line.text]
        speech_text = ' '.join(lines)
        data.append({'title':play_title,'speaker': speaker, 'speech': speech_text})

    # Creating DataFrame
    df = pd.DataFrame(data, columns=['title','speaker', 'speech'])
    return df


In [64]:
# Get all XML files in the zip folder
zip_file_path = 'data/corpora/shaks200.zip'
zip_file_name = os.path.basename(zip_file_path)
zip_file_name_no_ext = os.path.splitext(zip_file_name)[0]
zip_file_dir = os.path.dirname(zip_file_path)
zip_file_extracted_dir = os.path.join(zip_file_dir, zip_file_name_no_ext)
zip_file_extracted_dir = Path(zip_file_extracted_dir)

# Extract the zip file if it hasn't been extracted already
if not zip_file_extracted_dir.exists():
    import zipfile
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(zip_file_extracted_dir)

# Get all XML files in the extracted folder
xml_files = glob.glob(f'{zip_file_extracted_dir}/**/*.xml', recursive=True)
logging.info(f"Found {len(xml_files)} XML files")
# Parse each XML file into a DataFrame
dfs = []
for xml_file in tqdm(xml_files, desc="Parsing XML files"):
    df = parse_xml_to_dataframe(xml_file)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)
df


Parsing XML files:   0%|          | 0/37 [00:00<?, ?it/s]

Unnamed: 0,title,speaker,speech
0,The Second Part of Henry the Sixth,SUFFOLK,As by your high imperial majesty I had in char...
1,The Second Part of Henry the Sixth,KING HENRY VI,"Suffolk, arise. Welcome, Queen Margaret: I can..."
2,The Second Part of Henry the Sixth,QUEEN MARGARET,"Great King of England and my gracious lord, Th..."
3,The Second Part of Henry the Sixth,KING HENRY VI,"Her sight did ravish; but her grace in speech,..."
4,The Second Part of Henry the Sixth,ALL,happiness!
...,...,...,...
31023,The Tragedy of Antony and Cleopatra,First Guard,"O Caesar, This Charmian lived but now; she sto..."
31024,The Tragedy of Antony and Cleopatra,OCTAVIUS CAESAR,O noble weakness! If they had swallow'd poison...
31025,The Tragedy of Antony and Cleopatra,DOLABELLA,"Here, on her breast, There is a vent of blood ..."
31026,The Tragedy of Antony and Cleopatra,First Guard,This is an aspic's trail: and these fig-leaves...


In [25]:
# For testing purposes, use a subset of the data
df = df.head(10)

In [65]:
# process each speech using spaCy
df['speech_processed'] = df['speech'].apply(lambda x: nlp(x))
df.head() # Displaying the first few rows of the DataFrame

Unnamed: 0,title,speaker,speech,speech_processed
0,The Second Part of Henry the Sixth,SUFFOLK,As by your high imperial majesty I had in char...,"(As, by, your, high, imperial, majesty, I, had..."
1,The Second Part of Henry the Sixth,KING HENRY VI,"Suffolk, arise. Welcome, Queen Margaret: I can...","(Suffolk, ,, arise, ., Welcome, ,, Queen, Marg..."
2,The Second Part of Henry the Sixth,QUEEN MARGARET,"Great King of England and my gracious lord, Th...","(Great, King, of, England, and, my, gracious, ..."
3,The Second Part of Henry the Sixth,KING HENRY VI,"Her sight did ravish; but her grace in speech,...","(Her, sight, did, ravish, ;, but, her, grace, ..."
4,The Second Part of Henry the Sixth,ALL,happiness!,"(happiness, !)"


In [187]:

def process_speeches(input_df, senticnet_path,):
    # Caricare SenticNet data
    senticnet_data = pd.read_csv(senticnet_path, delimiter="\t")
    categories = ['INTROSPECTION', 'TEMPER', 'ATTITUDE', 'SENSITIVITY']

    # Preparare il DataFrame per i risultati
    results = []
    

    for _, row in tqdm(input_df.iterrows(), total=input_df.shape[0]):
        speech_processed = row['speech_processed']
        speaker = row['speaker']
        speech = row['speech']
        title = row['title']

        # Accumulators for each category and emotion
        accumulators = {}
        polaritylist = []
        for sent in speech_processed.sents:
            for token in sent:
                token_text = token.text.lower()
                matching_row = senticnet_data[senticnet_data['CONCEPT'] == token_text]
                logging.debug(f"Token: {token_text}, matching row: {matching_row}")

                if not matching_row.empty:
                    # Find max and min categories
                    max_category = matching_row[categories].astype(float).idxmax(axis=1).iloc[0]
                    min_category = matching_row[categories].astype(float).idxmin(axis=1).iloc[0]
                    
                    primary_emotion = matching_row['PRIMARY EMOTION'].iloc[0]
                    secondary_emotion = matching_row['SECONDAY EMOTION'].iloc[0]
    
                    if pd.isna(primary_emotion):
                        max_emotion = max_category
                    else:
                        max_emotion = f"{max_category}{matching_row['PRIMARY EMOTION'].iloc[0]}"
    
                    if pd.isna(secondary_emotion):
                        min_emotion = min_category
                    else:
                        min_emotion = f"{min_category}{matching_row['SECONDAY EMOTION'].iloc[0]}"

                    if max_emotion not in accumulators:
                        accumulators[max_emotion] = []
                    if min_emotion not in accumulators:
                        accumulators[min_emotion] = []

                    accumulators[max_emotion].append(matching_row[max_category].iloc[0])
                    if max_emotion != min_emotion:
                        accumulators[min_emotion].append(matching_row[min_category].iloc[0])
                        
                    polarity = matching_row["POLARITY INTENSITY"].astype(float).iloc[0]
                    logging.debug(f"Token: {token_text}, polarity: {polarity}")
                    polaritylist.append(polarity)
                else:
                    logging.debug(f"Token: {token_text}, no matching row found")
                    
                   

        # Calculate averages for each emotion category
        emotion_avg = {emotion: sum(values) / len(values) if values else 0 for emotion, values in accumulators.items()}
        
        # Calculate average polarity for the speech
        polarity_avg = {"POLARITY": sum(polaritylist) / len(polaritylist) if polaritylist else 0}
        
        
        
        result_row = {'title':title,"speaker": speaker, "speech": speech, **emotion_avg, **polarity_avg}
        results.append(result_row)
    # drop columns with all 0
    results = pd.DataFrame(results).fillna(0)
    results = results.loc[:, (results != 0).any(axis=0)]
    
    # Verify that directory exists otherwise create it
    Path("results/speeches_processed.csv").parent.mkdir(parents=True, exist_ok=True)
    results.to_csv('results/speeches_processed.csv', index=False)
    return results
   

In [67]:
processed_df = process_speeches(df, 'data/senticnet/senticnet.tsv')
processed_df

  0%|          | 0/31028 [00:00<?, ?it/s]

Unnamed: 0,title,speaker,speech,TEMPER#serenity,INTROSPECTION#acceptance,ATTITUDE#delight,INTROSPECTION,INTROSPECTION#ecstasy,INTROSPECTION#terror,SENSITIVITY#loathing,...,INTROSPECTION#sadness,ATTITUDE#responsiveness,ATTITUDE#contentment,SENSITIVITY#terror,SENSITIVITY#annoyance,ATTITUDE#dislike,INTROSPECTION#serenity,ATTITUDE#disgust,SENSITIVITY#fear,TEMPER#contentment
0,The Second Part of Henry the Sixth,SUFFOLK,As by your high imperial majesty I had in char...,0.26750,0.0000,0.934000,-0.027417,0.913000,0.000000,-0.965,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Second Part of Henry the Sixth,KING HENRY VI,"Suffolk, arise. Welcome, Queen Margaret: I can...",0.25000,0.0000,0.000000,0.000000,0.899000,0.000000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The Second Part of Henry the Sixth,QUEEN MARGARET,"Great King of England and my gracious lord, Th...",0.25725,0.0000,0.914500,0.000000,0.944200,0.000000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,The Second Part of Henry the Sixth,KING HENRY VI,"Her sight did ravish; but her grace in speech,...",0.00000,0.0000,0.931000,-0.266667,0.927333,0.000000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,The Second Part of Henry the Sixth,ALL,happiness!,0.00000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31023,The Tragedy of Antony and Cleopatra,First Guard,"O Caesar, This Charmian lived but now; she sto...",0.19600,0.0000,0.999000,-0.514667,0.934333,0.000000,-0.849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31024,The Tragedy of Antony and Cleopatra,OCTAVIUS CAESAR,O noble weakness! If they had swallow'd poison...,0.26000,0.0000,0.933000,-0.415500,0.858000,0.000000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31025,The Tragedy of Antony and Cleopatra,DOLABELLA,"Here, on her breast, There is a vent of blood ...",0.00000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31026,The Tragedy of Antony and Cleopatra,First Guard,This is an aspic's trail: and these fig-leaves...,0.00000,0.0000,0.000000,0.000000,0.886000,0.000000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate the mean sentiment value for each subcategory aggregated by main category

In [316]:

# Creating a DataFrame
df_sample = processed_df.copy()

# Extracting columns that contain emotion categories
emotion_columns = [col for col in df_sample.columns if '#' in col]

# Grouping columns by their main category
category_groups = {}
for col in emotion_columns:
    category, _ = col.split('#')
    if category not in category_groups:
        category_groups[category] = []
    category_groups[category].append(col)

# Calculating mean values for each category
for category, subcategories in category_groups.items():
    df_sample[category] = df_sample[subcategories].mean(axis=1)

# Dropping the subcategory columns
df_sample = df_sample.drop(columns=emotion_columns)

df_sample


Unnamed: 0,title,speaker,speech,INTROSPECTION,TEMPER,POLARITY,SENSITIVITY,ATTITUDE
0,The Second Part of Henry the Sixth,SUFFOLK,As by your high imperial majesty I had in char...,0.044187,0.016719,0.424476,-0.039231,0.116844
1,The Second Part of Henry the Sixth,KING HENRY VI,"Suffolk, arise. Welcome, Queen Margaret: I can...",0.065687,0.075938,0.657238,0.110154,0.061750
2,The Second Part of Henry the Sixth,QUEEN MARGARET,"Great King of England and my gracious lord, Th...",0.058887,0.016078,0.487429,0.052577,0.147469
3,The Second Part of Henry the Sixth,KING HENRY VI,"Her sight did ravish; but her grace in speech,...",0.062708,0.000000,0.296400,0.000000,0.099375
4,The Second Part of Henry the Sixth,ALL,happiness!,0.041188,0.000000,0.659000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
31023,The Tragedy of Antony and Cleopatra,First Guard,"O Caesar, This Charmian lived but now; she sto...",0.040740,0.012250,0.038909,-0.065308,0.062437
31024,The Tragedy of Antony and Cleopatra,OCTAVIUS CAESAR,O noble weakness! If they had swallow'd poison...,0.053625,0.016250,0.106375,0.000000,0.099500
31025,The Tragedy of Antony and Cleopatra,DOLABELLA,"Here, on her breast, There is a vent of blood ...",0.000000,0.000000,-0.834000,0.000000,0.000000
31026,The Tragedy of Antony and Cleopatra,First Guard,This is an aspic's trail: and these fig-leaves...,0.055375,0.000000,0.010167,0.073615,0.000000


In [372]:
df_sample.columns.difference(['title','speaker', 'speech', 'POLARITY'])

Index(['ATTITUDE', 'INTROSPECTION', 'SENSITIVITY', 'TEMPER'], dtype='object')

## Navier-Stokes Sentiment Flow

In [205]:

def navier_stokes_sentiment_flow(rho_sent:float, p_sent: float, nu_sent, g_context, s):
    """
Calculate the sentiment flow based on the Navier-Stokes equation.

The Navier-Stokes equation is a differential equation that describes the flow of fluid substances like liquids and gases. This function applies the same principles to model the flow of sentiment in a text.

Parameters:
rho_sent (float): The sentiment density, representing the amount of sentiment per unit of text.
p_sent (float): The sentiment pressure, representing the intensity of the sentiment.
nu_sent (float): The sentiment viscosity, representing the resistance to flow of the sentiment.
g_context (float): The external contextual force, representing the influence of the surrounding context on the sentiment.
s (float): The current sentiment state.

Returns:
float: The sentiment flow.
"""
    if rho_sent == 0:
        pressure_term = np.zeros_like(s)  # or some other appropriate handling
    else:
        pressure_term = -1 / rho_sent * np.gradient(p_sent)
        
    grad_s = np.gradient(s)
    laplacian_s = np.gradient(grad_s)
    
    if np.any(np.isnan(s)) or np.any(np.isinf(s)) or np.any(np.isnan(grad_s)) or np.any(np.isinf(grad_s)):
        logging.warning("Warning: NaN or inf detected in s or grad_s. Skipping this iteration.")
        return None  # Return None to indicate an issue

    convective_term = s * grad_s
    #pressure_term = -1 / rho_sent * np.gradient(p_sent)
    viscous_term = nu_sent * laplacian_s
    
    rhs = convective_term + pressure_term + viscous_term + g_context
    
    # Clamping: Prevent values from going to infinity
    np.clip(rhs, -1e10, 1e10, out=rhs)
    
    if np.any(np.isnan(rhs)) or np.any(np.isinf(rhs)):
        logging.warning("Warning: NaN or inf detected in rhs. Skipping this iteration.")
        return None  # Return None to indicate an issue
    
    logging.debug(f"Sentiment flow: {rhs}")
    
    return rhs

In [206]:
import numpy as np

def calculate_sentiment_density(sentiment_scores):
    """Calculate the density of sentiment from a list or array of sentiment scores."""
    return np.sum(np.abs(sentiment_scores))

def calculate_sentiment_pressure(score, keywords, text):
    """Calculate the sentiment pressure of a single sentiment score."""
    pressure = 0
    if any(keyword.lower() in text.lower() for keyword in keywords):
        pressure += score
    return pressure

def calculate_sentiment_viscosity(sentiment_scores):
    """Calculate the viscosity of sentiment from a list or array of sentiment scores."""
    return np.std(sentiment_scores)

def calculate_external_contextual_force(polarity):
    """
    Calculate the external contextual force for a given review using its POLARITY value.
    The POLARITY value directly represents the external contextual force.

    Args:
        polarity (float): The POLARITY value of the review.

    Returns:
        float: The external contextual force for the review.
    """
    return polarity


# Assuming the navier_stokes_sentiment_flow and other calculation functions are defined elsewhere

def differential_equation_old(s, t, rho_sent, p_sent, nu_sent, g_context):
    # Check the sizes of s and p_sent
    if len(s) != len(p_sent):
        raise ValueError("Mismatch in dimensions of s and p_sent")

    # Use the navier_stokes_sentiment_flow function to compute the derivative
    dsdt = navier_stokes_sentiment_flow(rho_sent, p_sent, nu_sent, g_context, s)

    # Ensure dsdt is the same size as s
    if len(dsdt) != len(s):
        raise ValueError("Mismatch in dimensions of dsdt and s")

    return dsdt


In [263]:
def differential_equation(s, t, speech_info):
    rho_sent, p_sent, nu_sent, g_context = speech_info
    # Compute the sentiment flow using the updated parameters
    dsdt = navier_stokes_sentiment_flow(rho_sent, p_sent, nu_sent, g_context, s)

    if dsdt is None:
        raise ValueError("Invalid sentiment flow calculation")

    return dsdt


In [264]:
negative_words = [
    "abhorrent", "abrasive", "abrupt", "absurd", "abusive", "accidental", 
    "accusatory", "acerbic", "adverse", "aggressive", "alarmist", "alienating", 
    "angry", "annoyed", "annoying", "anxious", "apathetic", "appalling", 
    "arrogant", "ashamed", "awful", "awkward", "bad", "belligerent", 
    "bitter", "bizarre", "blame", "boring", "bothersome", "burdensome", 
    "callous", "chaotic", "clumsy", "coarse", "cold", "complacent", 
    "complaint", "complicated", "concerned", "confused", "contemptible", 
    "cruel", "crushing", "cynical", "damaging", "dangerous", "dark", 
    "deadly", "deceitful", "deceptive", "degrading", "dejected", "delinquent", 
    "deplorable", "depressed", "depressing", "desolate", "desperate", 
    "destructive", "detached", "detrimental", "devastating", "difficult", 
    "disappointing", "disastrous", "disdainful", "disgraceful", "disgusting", 
    "dishonest", "disillusioned", "dismissive", "displeased", "disruptive", 
    "dissatisfied", "distasteful", "distraught", "distressed", "disturbed", 
    "doubtful", "dreary", "dull", "dysfunctional", "embarrassing", "enraged", 
    "envy", "erratic", "evil", "excessive", "exhausting", "fearful", 
    "fearsome", "flawed", "foolish", "frantic", "frightening", "frustrating", 
    "futile", "gloomy", "grave", "greedy", "grim", "guilty", "harmful", 
    "hateful", "haunting", "hazardous", "helpless", "hopeless", "hostile"
]
positive_words = [
    "admirable", "adorable", "adventurous", "agreeable", "amazing", "ambitious", 
    "amiable", "amusing", "appealing", "appreciative", "articulate", "artistic", 
    "astonishing", "astute", "attentive", "attractive", "auspicious", "authentic", 
    "awesome", "beautiful", "beneficial", "blissful", "bountiful", "brave", 
    "bright", "brilliant", "bubbly", "calm", "capable", "captivating", "careful", 
    "charismatic", "charming", "cheerful", "cherished", "clever", "comforting", 
    "compassionate", "competent", "confident", "congenial", "considerate", 
    "content", "convivial", "courageous", "courteous", "creative", "cute", 
    "dazzling", "decent", "dedicated", "delightful", "dependable", "determined", 
    "diligent", "diplomatic", "dynamic", "eager", "earnest", "easygoing", 
    "ebullient", "educated", "effective", "efficient", "elegant", "eloquent", 
    "empathetic", "enchanting", "encouraging", "energetic", "engaging", 
    "enjoyable", "entertaining", "enthusiastic", "excellent", "exceptional", 
    "exciting", "exemplary", "exquisite", "extraordinary", "exuberant", 
    "fabulous", "fair", "faithful", "fantastic", "fascinating", "fearless", 
    "fine", "flourishing", "focused", "forgiving", "fortunate", "friendly", 
    "fun", "funny", "generous", "genial", "gentle", "genuine", "gifted", 
    "glorious", "good", "gracious", "grateful", "great", "happy", "harmonious", 
    "helpful", "hilarious", "honest", "honorable", "hopeful", "hospitable", 
    "humorous", "idealistic", "illustrious", "imaginative", "impressive", 
    "incredible", "independent", "industrious", "innovative", "insightful", 
    "inspiring", "intelligent", "intuitive", "inventive", "joyful", "jubilant", 
    "keen", "kind", "knowledgeable", "laudable", "lively", "lovable", "lovely", 
    "loving", "loyal", "lucky", "magnificent", "marvelous", "masterful", 
    "meticulous", "mindful", "miraculous", "modest", "motivated", "optimistic", 
    "outstanding", "passionate", "patient", "peaceful", "perfect", "persevering", 
    "persistent", "philanthropic", "playful", "pleasant", "pleasing", "poised", 
    "polished", "popular", "positive", "powerful", "praiseworthy", "precious", 
    "precise", "preeminent", "prestigious", "productive", "professional", 
    "proficient", "profound", "prolific", "prominent", "prosperous", "protective", 
    "proud", "prudent", "punctual", "purposeful", "qualified", "quintessential", 
    "radiant", "rational", "realistic", "reassuring", "receptive", "remarkable", 
    "resilient", "resolute", "resourceful", "respectable", "respectful", 
    "resplendent", "responsible", "responsive", "revered", "rewarding", "rich", 
    "righteous", "robust", "romantic", "sagacious", "satisfying", "savvy", 
    "scholarly", "scrupulous", "self-assured", "self-reliant", "sensible", 
    "sensitive", "serene", "sharp", "shining", "sincere", "skillful", "smart", 
    "smiling", "smooth", "sociable", "solid",
    "sophisticated", "spirited", "splendid", "steadfast", "stimulating",
    "stupendous", "stunning", "stupendous", "sturdy", "stylish", "suave",
    "sublime", "successful", "succinct", "super", "superb", "supportive",
    "surprising", "sustained", "sweet", "talented", "tenacious", "terrific",
    "thankful", "thoughtful", "thriving", "timely", "tireless", "tolerant",
    "top", "tranquil", "trusting", "truthful", "ultimate", "unbiased",
    "uncommon", "understanding", "unequaled", "unflappable", "unique"]

keywords_example = positive_words + negative_words

In [366]:
def remove_duplicates(all_results):
    unique_results = []
    seen_speeches = set()

    for sim_result, current_speech in all_results:
        # Check if the speech is not in the seen set
        if current_speech not in seen_speeches:
            seen_speeches.add(current_speech)
            unique_results.append((sim_result, current_speech))
        # If the speech is already seen, it's a duplicate and won't be added

    return unique_results


def calculate_navier_stocker(data: pd.DataFrame) -> dict:
    """
    Calculate the Navier-Stokes sentiment flow for each speaker in a dataset.

    This function applies the principles of fluid dynamics, specifically the Navier-Stokes equations,
    to model the flow of sentiment in speeches. It processes each speaker's speeches sequentially,
    updating the sentiment analysis based on the evolving context and sentiment parameters.

    Args:
        data (DataFrame): A pandas DataFrame containing the speeches and sentiment data. 
            The DataFrame should have columns titled 'title', 'speaker', 'speech', and 'POLARITY', 
            along with other sentiment-related columns.

    Returns:
        dict: A dictionary where each key is a play title and each value is a list of dictionaries.
            Each dictionary in the list contains 'speaker', 'speech', and 'simulation' keys. 
            'speaker' is the name of the speaker, 'speech' is a list of speeches, and 'simulation' 
            is a numpy array with the results of the sentiment simulation for each speech.

    Note:
        This function assumes that the sentiment data is already preprocessed and available 
        in the DataFrame. The 'POLARITY' column is used to calculate the external contextual force. 
        The Navier-Stokes equations are adapted to sentiment analysis, considering factors like 
        sentiment density, pressure, viscosity, and external context.
    """
    
    all_s = {}
    sentiment_columns = data.columns.difference(['title','speaker', 'speech', 'POLARITY'])
    
    for speaker in tqdm(data['speaker'].unique(), desc="Processing speakers"):
        speaker_data = data[data['speaker'] == speaker]
        title = speaker_data.iloc[0]['title']
        logging.info(f"Processing speaker: {speaker}, number of speeches: {len(speaker_data)}")

        # Initialize variables for the first speech
        initial_speaker = speaker_data.iloc[0]
        s0 = initial_speaker[sentiment_columns].apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy()
        s0_g_context = calculate_external_contextual_force(initial_speaker['POLARITY'])
        initial_speech = initial_speaker['speech']
        current_time = 0
        all_results = []
        unique_all_results = []
        for i in range(1, len(speaker_data)):
            # Get the next speech
            current_speech_data = speaker_data.iloc[i]
            if i == 1:
                # Use initial contextual force for the first iteration
                g_context = s0_g_context
                current_speech = initial_speech
            else:
                # Update contextual force for subsequent speeches
                g_context = calculate_external_contextual_force(current_speech_data['POLARITY'])
                current_speech = current_speech_data['speech']


            # Define time array for this chunk
            t = np.array([current_time, current_time + 1])
            # Prepare arguments for the differential equation
            speech_info = (
                calculate_sentiment_density(s0), 
                np.array([calculate_sentiment_pressure(score, keywords_example, current_speech) for score in s0]), 
                calculate_sentiment_viscosity(s0), 
                g_context
            )
            # Solve ODE for this chunk with the current speech
            s = odeint(differential_equation, s0, t, args=(speech_info,))

            for sim_result in s.tolist():
                all_results.append((sim_result, current_speech))
            # Example usage
            unique_all_results = remove_duplicates(all_results)

            # Update initial conditions for the next chunk
            s0 = s[-1]
            # Update time
            current_time += 1
        
        # Separare i risultati della simulazione dai discorsi
        simulation_results, speeches = zip(*unique_all_results) if unique_all_results else ([], [])
        simulation_results = np.vstack(simulation_results) if simulation_results else np.array([])

        # Combine all chunks
        #combined_result = np.vstack(all_results) if all_results else s0[None, :]
        # Modifica per salvare i discorsi insieme ai loro risultati di simulazione
        if title not in all_s:
            all_s[title] = []
        all_s[title].append({
            'speaker': speaker,
            'speech': speeches,  # Salva i discorsi
            'simulation': simulation_results,  # Salva i risultati della simulazione
        })

    return all_s




# Example usage
#all_s = calculate_navier_stocker(df_sample[df_sample['title'] == 'The Tragedy of Romeo and Juliet'])
all_s = calculate_navier_stocker(df_sample)

Processing speakers:   0%|          | 0/981 [00:00<?, ?it/s]



In [377]:
all_s['The Tragedy of Romeo and Juliet'][0]['simulation']

array([[ 6.60312500e-02,  2.34501488e-02, -3.46153846e-04,
         1.36458333e-02],
       [ 4.00505579e-01,  2.69056427e-01, -5.14876495e-03,
        -1.65245654e-01],
       [ 5.94363101e-01,  5.31441712e-01,  3.18488491e-01,
         1.33007149e-01],
       [ 8.08952527e-01,  7.55779204e-01,  5.86786695e-01,
         4.31671607e-01],
       [ 9.18405999e-01,  8.51772381e-01,  6.99254505e-01,
         5.71689478e-01]])

In [367]:
# Save the simulations to a CSV file
def save_all_s_to_csv(all_s, filename):
    # Create a list to hold the flattened data
    flattened_data = []

    # Iterate over each title and its speakers
    for title, speakers_data in all_s.items():
        for speaker_data in speakers_data:
            # Flatten the structure
            for i, simulation_value in enumerate(speaker_data['simulation']):
                flattened_data.append({
                    'Title': title,
                    'Speaker': speaker_data['speaker'],
                    'Speech': speaker_data['speech'][i],
                    'Time': i,
                    'Simulation': simulation_value
                })

    # Convert to a DataFrame
    df = pd.DataFrame(flattened_data)
    
    # Save to CSV
    df.to_csv(filename, index=False)

# Example usage
save_all_s_to_csv(all_s, 'simulations.csv')


In [368]:
# Save simulations to a pickle file
import pickle

all_s_file = 'simulations.pkl'
with open(all_s_file, 'wb') as f:
    pickle.dump(all_s, f)


In [285]:
# Read the simulations from the pickle file
import pickle

all_s_file = 'simulations.pkl'
with open(all_s_file, 'rb') as f:
    all_s = pickle.load(f)
    

## Plot the sentiment flow for each speaker

In [402]:
import numpy as np
import matplotlib.pyplot as plt
import math
from pathlib import Path

def normalize_data(data):
    min_val = np.min(data)
    max_val = np.max(data)
    # Center data around 0 and scale to range from -1 to 1
    normalized = 2 * ((data - min_val) / (max_val - min_val)) - 1 if max_val > min_val else -np.ones_like(data)
    #normalized = 2 * (data - min_val) / (max_val - min_val) - 1

    return normalized

def plot_speaker_simulations(all_s):
    for title, speakers_data in all_s.items():
        filtered_speakers_data = [d for d in speakers_data if d['simulation'].size]

        num_speakers = len(filtered_speakers_data)
        num_cols = 3
        num_rows = math.ceil(num_speakers / num_cols)

        fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
        if num_rows * num_cols > 1:
            axes = axes.flatten()
        else:
            axes = [axes]

        subplot_index = 0

        for speaker_data in filtered_speakers_data:
            ax = axes[subplot_index]
            subplot_index += 1

            simulations = speaker_data['simulation']
            #print(speaker_data["speaker"], simulations)
            
            x_axis = range(1, simulations.shape[0] + 1)

            simulation_labels = ['ATTITUDE', 'INTROSPECTION', 'SENSITIVITY', 'TEMPER']
            for i, label in enumerate(simulation_labels):
                normalized_sim_step = normalize_data(simulations[:, i])
                ax.plot(x_axis, normalized_sim_step, label=label)
            
            # Plot only the ONE dimension
            #attitude_data = simulations[:, 0]  # Assuming ATTITUDE is the first column
            #normalized_attitude_data = normalize_data(attitude_data)
            #ax.bar(x_axis, attitude_data, label='ATTITUDE')

            
            ax.set_title(f'{speaker_data["speaker"]}')
            ax.set_xlabel('Simulation Step')
            ax.set_ylabel('Normalized Simulation Value')
            ax.legend(title="Simulation Components")

            ax.set_xticks([x for x in x_axis if x == 1 or x % 5 == 0])
            ax.set_xticklabels([x for x in x_axis if x == 1 or x % 5 == 0])

        for j in range(subplot_index, len(axes)):
            axes[j].axis('off')

        fig.suptitle(title)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        Path("results/sentiment_flow_plots").mkdir(parents=True, exist_ok=True)
        plt.savefig(f'results/sentiment_flow_plots/{title}.png')
        logging.info(f"Saved plot for {title}")
        plt.close()

# Example usage with your input data (assuming all_s is defined as per your input)
plot_speaker_simulations(all_s)

## plot the mean of sentiment flow for each speaker

In [107]:

def normalize_data(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized = (data - min_val) / (max_val - min_val) if max_val != min_val else data
    return normalized

def plot_speaker_simulations(all_s):
    for title, speakers_data in all_s.items():
        num_speakers = len(speakers_data)
        num_cols = 3
        num_rows = math.ceil(num_speakers / num_cols)

        fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
        if num_rows * num_cols > 1:
            axes = axes.flatten()
        else:
            axes = [axes]

        for i, speaker_data in enumerate(speakers_data):
            if i >= len(axes):
                break

            ax = axes[i]

            speeches = speaker_data['speech']
            simulations = speaker_data['simulation']

            if not speeches or not any(len(sim) > 0 for sim in simulations):
                ax.text(0.5, 0.5, 'No Data', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
                ax.set_title(f'{speaker_data["speaker"]}')
                continue

            # Normalize and plot every second simulation
            normalized_simulation=[]
            for j in range(0, len(simulations),2):
                normalized_simulation.append(normalize_data(simulations[j]))
            mean_normalized_simulation= np.mean(normalized_simulation, axis=0)
            ax.plot(mean_normalized_simulation)

            ax.set_title(f'{speaker_data["speaker"]}')
            ax.set_xlabel('Text Segment')
            ax.set_ylabel('Normalized Sentiment Value')

            # Set x-ticks to correspond to the number of speech segments
            #ax.set_xticks(range(len(speeches)))

        fig.suptitle(title)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        Path("results/sentiment_flow_plots_mean").mkdir(parents=True, exist_ok=True)
        plt.savefig(f'results/sentiment_flow_plots_mean/{title}.png')
        logging.info(f"Saved plot for {title}")
        plt.close()

# Example usage with your input data (assuming all_s is defined as per your input)
plot_speaker_simulations(all_s)


## Choose the simulation that shows the greatest variation or most interesting dynamics in feelings. 

In [102]:


def plot_speaker_simulations(all_s):
    for title, speakers_data in all_s.items():
        # Calculate standard deviation for each speaker and find the one with the highest
        max_std = float('inf')
        #max_std = 0
        most_variable_speaker = None
        for speaker_data in speakers_data:
            simulation = speaker_data['simulation']
            std_dev = np.std(simulation)
            if std_dev > max_std:
                max_std = std_dev
                most_variable_speaker = speaker_data
            else:
                logging.warning(f"Speaker {speaker_data['speaker']} has std dev {std_dev}. Skipping.")

        # Proceed only if a speaker is found
        if most_variable_speaker:
            # Plot the simulation data of the speaker with the greatest variation
            plt.figure(figsize=(10, 6))
            plt.plot(most_variable_speaker['simulation'])
            plt.title(f'{most_variable_speaker["speaker"]} in {title}')
            plt.xlabel('Time or Text Segment')
            plt.ylabel('Sentiment Value')
            plt.grid(True)

            # Save the plot
            Path("results/sentiment_flow_plots_most_variable_speaker").mkdir(parents=True, exist_ok=True)
            plt.savefig(f'results/sentiment_flow_plots_most_variable_speaker/{title}_{most_variable_speaker["speaker"]}.png')
            logging.info(f"Saved plot for {most_variable_speaker['speaker']} in {title}")
            plt.close()

# Example usage
plot_speaker_simulations(all_s)




In [145]:
for title, speakers_data in all_s.items():
    for speaker_data in speakers_data:
        if speaker_data['speaker'] == 'ROMEO':
            print(speaker_data['speaker'])
            print(f"Lunghezza speech: {len(speaker_data['speech'])}")
            print(f"Lunghezza simulation: {len(speaker_data['simulation'])}")

ROMEO
Lunghezza speech: 163
Lunghezza simulation: 324


##  Get the speech with the highest sentiment value

In [110]:

# Iterate over each title
for title, speakers_data in all_s.items():
    # Calculate the mean sentiment value for each speaker
    mean_sentiment_values = []
    for speaker_data in speakers_data:
        simulation = np.array(speaker_data['simulation'])
        mean_sentiment_value = np.mean(simulation)
        mean_sentiment_values.append(mean_sentiment_value)

    # Find the speaker with the highest mean sentiment value
    max_mean_sentiment_value = max(mean_sentiment_values)
    max_mean_sentiment_value_index = mean_sentiment_values.index(max_mean_sentiment_value)
    speaker_with_highest_sentiment_value = speakers_data[max_mean_sentiment_value_index]

    # Get the speech of the speaker with the highest sentiment value
    speech_with_highest_sentiment_value = speaker_with_highest_sentiment_value['speech']

    # Save the speech to a text file
    Path("results/speech_with_highest_sentiment_value").mkdir(parents=True, exist_ok=True)
    with open(f'results/speech_with_highest_sentiment_value/{title}.txt', 'w') as f:
        f.write(speech_with_highest_sentiment_value)
    logging.info(f"Saved speech with highest sentiment value for {title}")
    

TypeError: write() argument must be str, not list