In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
character_metadata = pd.read_csv('character.metadata.tsv', sep='\t', names=[
    'movie_id', 'freebase_id', 'release_date', 'character_name', 'actor_dob', 'actor_gender', 
    'actor_height', 'actor_ethnicity', 'actor_name', 'actor_age', 'character_actor_map_id', 
    'character_freebase_id', 'actor_freebase_id'
])
character_metadata.head()

Unnamed: 0,movie_id,freebase_id,release_date,character_name,actor_dob,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age,character_actor_map_id,character_freebase_id,actor_freebase_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [3]:
plot_summaries = pd.read_csv('plot_summaries.txt', sep='\t', names=['movie_id', 'plot_summary'])
plot_summaries.head()

Unnamed: 0,movie_id,plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [4]:
name_clusters = pd.read_csv('name.clusters.txt', sep='\t', header=None)
name_clusters.head()

Unnamed: 0,0,1
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn


In [5]:
# Merge plot summaries and character metadata on movie_id
merged_data = pd.merge(plot_summaries, character_metadata, on='movie_id')
merged_data

Unnamed: 0,movie_id,plot_summary,freebase_id,release_date,character_name,actor_dob,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age,character_actor_map_id,character_freebase_id,actor_freebase_id
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,1990-09-07,,,,,,Natalia Koliakanova,,/m/0gby7pd,,/m/0gby7pj
1,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,1990-09-07,,1951-04-14,M,,,Pyotr Mamonov,39.0,/m/07lld1w,,/m/06trhc
2,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,1990-09-07,,1919-10-08,M,,/m/0x67,Hal Singer,70.0,/m/0gc0hbm,,/m/01n4sp6
3,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,1990-09-07,,1926-10-26,,,,Vladimir Kashpur,63.0,/m/0gc3tz0,,/m/08087zv
4,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,1990-09-07,,,,,,Pyotr Zaychenko,,/m/0gcjqgq,,/m/0clzzrg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308480,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,1971-03-12,,1920-01-09,M,,/m/0g96wd,Clive Dunn,51.0,/m/0jwx5f,,/m/01vct06
308481,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,1971-03-12,,1897-03-25,M,,,John Laurie,,/m/0jwx5l,,/m/057hy_
308482,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,1971-03-12,,1896-01-07,M,,,Arnold Ridley,,/m/0jwx5x,,/m/02t7zg
308483,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,1971-03-12,,1946-02-16,M,1.77,,Ian Lavender,25.0,/m/0jwx61,,/m/04xs2l


In [73]:
# Filter rows where 'character_name' is not NaN
non_nan_characters = merged_data[~merged_data['character_name'].isna()]

# Display the filtered rows
print(non_nan_characters)


        movie_id                                       plot_summary  \
6       31186339  The nation of Panem consists of a wealthy Capi...   
7       31186339  The nation of Panem consists of a wealthy Capi...   
8       31186339  The nation of Panem consists of a wealthy Capi...   
9       31186339  The nation of Panem consists of a wealthy Capi...   
10      31186339  The nation of Panem consists of a wealthy Capi...   
...          ...                                                ...   
308470   8628195  Abdur Rehman Khan , a middle-aged dry fruit se...   
308471   8628195  Abdur Rehman Khan , a middle-aged dry fruit se...   
308474   8628195  Abdur Rehman Khan , a middle-aged dry fruit se...   
308475   8628195  Abdur Rehman Khan , a middle-aged dry fruit se...   
308476   8628195  Abdur Rehman Khan , a middle-aged dry fruit se...   

       freebase_id release_date    character_name   actor_dob actor_gender  \
6       /m/0gkz15s   2012-03-12           Foxface         NaN        

In [6]:
merged_data.dtypes

movie_id                    int64
plot_summary               object
freebase_id                object
release_date               object
character_name             object
actor_dob                  object
actor_gender               object
actor_height              float64
actor_ethnicity            object
actor_name                 object
actor_age                 float64
character_actor_map_id     object
character_freebase_id      object
actor_freebase_id          object
dtype: object

In [7]:
import re

def regex_tokenize(text):
    # Regex pattern to split at sentence boundaries
    sentences = re.split(r'(?<=[.!?]) +', text)
    return [sentence.strip() for sentence in sentences if sentence]

merged_data['sentences'] = merged_data['plot_summary'].apply(regex_tokenize)
merged_data['sentences']

0         [Shlykov, a hard-working taxi driver and Lyosh...
1         [Shlykov, a hard-working taxi driver and Lyosh...
2         [Shlykov, a hard-working taxi driver and Lyosh...
3         [Shlykov, a hard-working taxi driver and Lyosh...
4         [Shlykov, a hard-working taxi driver and Lyosh...
                                ...                        
308480    [1940 - Operation Dynamo has just taken place....
308481    [1940 - Operation Dynamo has just taken place....
308482    [1940 - Operation Dynamo has just taken place....
308483    [1940 - Operation Dynamo has just taken place....
308484    [1940 - Operation Dynamo has just taken place....
Name: sentences, Length: 308485, dtype: object

In [32]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import pipeline
from collections import defaultdict

In [9]:
# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
#def character_sentiments(row):
   # sentences = row['sentences']
 ##   character_name = row['character_name']
    
    # Find sentences mentioning the character's name
   # character_sentences = [s for s in sentences if re.search(rf'\b{re.escape(character_name)}\b', s)]
    
    # Perform sentiment analysis on each character-specific sentence
    #sentiment_scores = [sid.polarity_scores(sentence)['compound'] for sentence in character_sentences]
    #return sentiment_scores

In [16]:
# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def analyze_sentiment_vader(sentence):
    score = analyzer.polarity_scores(sentence)
    return score

In [17]:
def analyze_sentiment_textblob(sentence):
    analysis = TextBlob(sentence)
    return analysis.sentiment.polarity

In [18]:
trying_data = merged_data.drop(['release_date', 'actor_dob', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_age'], axis=1)
trying_data

Unnamed: 0,movie_id,plot_summary,freebase_id,character_name,actor_name,character_actor_map_id,character_freebase_id,actor_freebase_id,sentences
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Natalia Koliakanova,/m/0gby7pd,,/m/0gby7pj,"[Shlykov, a hard-working taxi driver and Lyosh..."
1,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Pyotr Mamonov,/m/07lld1w,,/m/06trhc,"[Shlykov, a hard-working taxi driver and Lyosh..."
2,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Hal Singer,/m/0gc0hbm,,/m/01n4sp6,"[Shlykov, a hard-working taxi driver and Lyosh..."
3,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Vladimir Kashpur,/m/0gc3tz0,,/m/08087zv,"[Shlykov, a hard-working taxi driver and Lyosh..."
4,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Pyotr Zaychenko,/m/0gcjqgq,,/m/0clzzrg,"[Shlykov, a hard-working taxi driver and Lyosh..."
...,...,...,...,...,...,...,...,...,...
308480,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,,Clive Dunn,/m/0jwx5f,,/m/01vct06,[1940 - Operation Dynamo has just taken place....
308481,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,,John Laurie,/m/0jwx5l,,/m/057hy_,[1940 - Operation Dynamo has just taken place....
308482,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,,Arnold Ridley,/m/0jwx5x,,/m/02t7zg,[1940 - Operation Dynamo has just taken place....
308483,6040782,1940 - Operation Dynamo has just taken place. ...,/m/0fm00m,,Ian Lavender,/m/0jwx61,,/m/04xs2l,[1940 - Operation Dynamo has just taken place....


In [44]:
# Define the size of each chunk
chunk_size = 100000  # You can adjust this value based on your requirements

# Split the dataframe into chunks
data_chunks = [trying_data.iloc[i:i + chunk_size] for i in range(0, trying_data.shape[0], chunk_size)]

# Determine the number of chunks
num_chunks = len(data_chunks)
print(f"Total number of chunks: {num_chunks}")

# Example: Access the first chunk
first_chunk = data_chunks[0]
second_chunk = data_chunks[1]
third_chunk = data_chunks[2]
fourth_chunk = data_chunks[3]
print(first_chunk)

Total number of chunks: 4
       movie_id                                       plot_summary  \
0      23890098  Shlykov, a hard-working taxi driver and Lyosha...   
1      23890098  Shlykov, a hard-working taxi driver and Lyosha...   
2      23890098  Shlykov, a hard-working taxi driver and Lyosha...   
3      23890098  Shlykov, a hard-working taxi driver and Lyosha...   
4      23890098  Shlykov, a hard-working taxi driver and Lyosha...   
...         ...                                                ...   
99995  32012357  The film is about a group of girls growing up ...   
99996  32012357  The film is about a group of girls growing up ...   
99997  32012357  The film is about a group of girls growing up ...   
99998  10653373  When young history professor Russ is called up...   
99999  10653373  When young history professor Russ is called up...   

      freebase_id character_name           actor_name character_actor_map_id  \
0      /m/076w2lb            NaN  Natalia Koliakanova

In [19]:
trying_data_small = trying_data[:30]
trying_data_small

Unnamed: 0,movie_id,plot_summary,freebase_id,character_name,actor_name,character_actor_map_id,character_freebase_id,actor_freebase_id,sentences
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Natalia Koliakanova,/m/0gby7pd,,/m/0gby7pj,"[Shlykov, a hard-working taxi driver and Lyosh..."
1,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Pyotr Mamonov,/m/07lld1w,,/m/06trhc,"[Shlykov, a hard-working taxi driver and Lyosh..."
2,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Hal Singer,/m/0gc0hbm,,/m/01n4sp6,"[Shlykov, a hard-working taxi driver and Lyosh..."
3,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Vladimir Kashpur,/m/0gc3tz0,,/m/08087zv,"[Shlykov, a hard-working taxi driver and Lyosh..."
4,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Pyotr Zaychenko,/m/0gcjqgq,,/m/0clzzrg,"[Shlykov, a hard-working taxi driver and Lyosh..."
5,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,,Elena Saphonova,/m/0gckhgw,,/m/0gcfmwr,"[Shlykov, a hard-working taxi driver and Lyosh..."
6,31186339,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,Foxface,Jacqueline Emerson,/m/0gwc39w,/m/0gwc39z,/m/0gwc3b5,[The nation of Panem consists of a wealthy Cap...
7,31186339,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,Katniss Everdeen,Jennifer Lawrence,/m/0gw7kv0,/m/0c01vfc,/m/02x0dzw,[The nation of Panem consists of a wealthy Cap...
8,31186339,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,Peeta Mellark,Josh Hutcherson,/m/0gw7kvp,/m/0c03gdc,/m/08wjf4,[The nation of Panem consists of a wealthy Cap...
9,31186339,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,Effie Trinket,Elizabeth Banks,/m/0gw7kw6,/m/0gwc3bf,/m/0716t2,[The nation of Panem consists of a wealthy Cap...


In [61]:
# Step 1: Define a function to classify characters as villains or non-villains based on the plot summary
def classify_characters(plot_summary):
    villain_keywords = [
    'evil', 'enemy', 'villain', 'antagonist', 'dark', 'opponent', 'criminal', 'ruthless',
    'tyrant', 'corrupt', 'merciless', 'wicked', 'mastermind', 'rival', 'betray', 'sinister', 
    'murderer', 'usurper', 'oppressor', 'rebel', 'outlaw', 'destructive', 'vengeful', 'deceitful',
    'manipulative', 'schemer', 'traitor', 'nemesis', 'invader', 'malicious', 'dictator', 'threat',
    'conqueror', 'warlord', 'fearsome', 'notorious', 'fiend'
]

    tokens = plot_summary.lower().split()  # Simple tokenization to avoid downloading punkt

    for keyword in villain_keywords:
        if keyword in tokens:
            return 'villain'
    return 'not_villain'

# Apply classification to the merged dataframe
merged_data['character_role'] = merged_data['plot_summary'].apply(classify_characters)
count = 0
for role in merged_data['character_role']:
    if role == 'villain':
        count += 1
count

67849

In [67]:
# Step 2: Use Sentiment Analysis to determine if a villainous character is portrayed sympathetically
#def sentiment_analysis(plot_summary):
  #  analysis = TextBlob(plot_summary)
    # Return the polarity: Positive values indicate sympathy, negative values indicate otherwise
   # return analysis.sentiment.polarity

# Apply sentiment analysis to the plot summaries
#merged_data['sentiment_score'] = merged_data['plot_summary'].apply(sentiment_analysis)

# Step 3: Filter for sympathetic villains
# Sympathetic villains are those classified as 'villain' and having a positive sentiment score
#sympathetic_villains = merged_data[(merged_data['character_role'] == 'villain') & (merged_data['sentiment_score'] > 0)]
#sympathetic_villains

In [66]:
#from sklearn.feature_extraction.text import TfidfVectorizer
# Step 4: Extract important keywords/phrases from sympathetic villains' plot summaries
#def extract_keywords(plot_summaries, top_n=5):
    # Use TF-IDF to extract important keywords
 #   vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
  #  X = vectorizer.fit_transform(plot_summaries)
   # keywords = vectorizer.get_feature_names_out()
    #return keywords

# Extract keywords from the sympathetic villains' plot summaries
#sympathetic_villain_summaries = sympathetic_villains['plot_summary'].tolist()
#keywords = extract_keywords(sympathetic_villain_summaries)

# Display the sympathetic villains and keywords
#print(sympathetic_villains[['character_name', 'plot_summary', 'sentiment_score']])
#print("Sympathetic Villains Data:")

#print("\nExtracted Keywords indicating sympathy themes:")
#print(keywords)

In [None]:
# VADER ANALYSIS
# Assume 'sentences' is a list of sentences from a tokenized plot summary
sentiments = []

for sentence in merged_data['sentences']:
    score = analyze_sentiment_vader(sentence)  # or another method
    sentiments.append(score)

In [21]:
sentiments #vader

[{'neg': 0.083, 'neu': 0.832, 'pos': 0.084, 'compound': 0.0083},
 {'neg': 0.083, 'neu': 0.832, 'pos': 0.084, 'compound': 0.0083},
 {'neg': 0.083, 'neu': 0.832, 'pos': 0.084, 'compound': 0.0083},
 {'neg': 0.083, 'neu': 0.832, 'pos': 0.084, 'compound': 0.0083},
 {'neg': 0.083, 'neu': 0.832, 'pos': 0.084, 'compound': 0.0083},
 {'neg': 0.083, 'neu': 0.832, 'pos': 0.084, 'compound': 0.0083},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 'neu': 0.802, 'pos': 0.076, 'compound': -0.9884},
 {'neg': 0.122, 

In [68]:
# TRANSFORMER ANALYSIS
# Initialize the sentiment-analysis pipeline
sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

In [23]:
# Function to analyze sentiment
def analyze_sentiment_transformer(sentence):
    result = sentiment_model(sentence)
    return result  # [{'label': 'POSITIVE' or 'NEGATIVE', 'score': ...}]

In [51]:
# Assume 'sentences' is a list of sentences from a tokenized plot summary
sentiments2 = []

for sentence in trying_data_small['sentences']:
    score = analyze_sentiment_transformer(sentence)  # or another method
    sentiments2.append(score)


In [69]:
sentiments2 # transformer

[[{'label': 'POSITIVE', 'score': 0.9164183735847473}],
 [{'label': 'POSITIVE', 'score': 0.9164183735847473}],
 [{'label': 'POSITIVE', 'score': 0.9164183735847473}],
 [{'label': 'POSITIVE', 'score': 0.9164183735847473}],
 [{'label': 'POSITIVE', 'score': 0.9164183735847473}],
 [{'label': 'POSITIVE', 'score': 0.9164183735847473}],
 [{'label': 'NEGATIVE', 'score': 0.8172034621238708},
  {'label': 'NEGATIVE', 'score': 0.9695325493812561},
  {'label': 'POSITIVE', 'score': 0.9998169541358948},
  {'label': 'POSITIVE', 'score': 0.996258020401001},
  {'label': 'POSITIVE', 'score': 0.9925754070281982},
  {'label': 'POSITIVE', 'score': 0.9831526279449463},
  {'label': 'NEGATIVE', 'score': 0.9349101781845093},
  {'label': 'NEGATIVE', 'score': 0.9779826402664185},
  {'label': 'POSITIVE', 'score': 0.998878538608551},
  {'label': 'NEGATIVE', 'score': 0.9990096092224121},
  {'label': 'NEGATIVE', 'score': 0.9767866730690002},
  {'label': 'NEGATIVE', 'score': 0.9994340538978577},
  {'label': 'NEGATIVE', 

In [31]:
# SENTIMENTS FOR EACH CHARACTER USING TRANSFORMER
# Dictionary to store sentiment results for each character
character_sentiments_transformer = {}

for character in trying_data_small['character_name']:
    if pd.notna(character):
        character_sentiments_transformer[character] = []  # Initialize a list for each character
        # Loop through each sentence
        for sentence_list in trying_data_small['sentences']:
            for sentence in sentence_list:
                if character in sentence:  # Check if the character's name is in the sentence
                    # Perform sentiment analysis on the sentence
                    sentiment_result = sentiment_model(sentence)
                    # Append the result to the character's sentiment list
                    character_sentiments_transformer[character].append({
                            'sentence': sentence,
                            'sentiment': sentiment_result[0]['label'],
                            'score': sentiment_result[0]['score']
                        })
# Display results
for character, sentiments in character_sentiments_transformer.items():
    print(f"\nSentiments for {character}:")
    for entry in sentiments:
        print(f"Sentence: {entry['sentence']}")
        print(f"Sentiment: {entry['sentiment']}, Score: {entry['score']}")


Sentiments for Foxface:
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl fr

In [27]:
# SENTIMENTS FOR EACH CHARACTER USING VADER
# Dictionary to store sentiment results for each character
character_sentiments_vader = {}

for character in trying_data_small['character_name']:
    if pd.notna(character):
        character_sentiments_vader[character] = []  # Initialize a list for each character
        # Loop through each sentence
        for sentence_list in trying_data_small['sentences']:
            for sentence in sentence_list:
                if character in sentence:  # Check if the character's name is in the sentence
                    # Perform sentiment analysis on the sentence
                    sentiment_result = sentiment_model(sentence)
                    # Append the result to the character's sentiment list
                    character_sentiments_vader[character].append({
                            'sentence': sentence,
                            'sentiment': sentiment_result[0]['label'],
                            'score': sentiment_result[0]['score']
                        })
# Display results
for character, sentiments in character_sentiments_vader.items():
    print(f"\nSentiments for {character}:")
    for entry in sentiments:
        print(f"Sentence: {entry['sentence']}")
        print(f"Sentiment: {entry['sentiment']}, Score: {entry['score']}")


Sentiments for Foxface:
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous.
Sentiment: NEGATIVE, Score: 0.9991311430931091
Sentence: Foxface, the girl fr

In [33]:
# Dictionary to store overall sentiment scores for each character
character_overall_sentiments = {}

# Loop through each character and their associated sentiments
for character, sentiments in character_sentiments_transformer.items():
    # Dictionary to aggregate sentiment scores for the character
    sentiment_aggregator = defaultdict(list)
    
    # Collect scores for each sentiment type
    for entry in sentiments:
        sentiment_aggregator[entry['sentiment']].append(entry['score'])
    
    # Calculate average score for each sentiment type
    character_overall_sentiments[character] = {
        sentiment: np.mean(scores) for sentiment, scores in sentiment_aggregator.items()
    }

# Display overall sentiment for each character
for character, overall_sentiment in character_overall_sentiments.items():
    print(f"\nOverall Sentiment for {character}:")
    for sentiment, avg_score in overall_sentiment.items():
        print(f"{sentiment.capitalize()} Sentiment: Average Score = {avg_score:.2f}")


Overall Sentiment for Foxface:
Negative Sentiment: Average Score = 1.00

Overall Sentiment for Katniss Everdeen:

Overall Sentiment for Peeta Mellark:
Positive Sentiment: Average Score = 0.98

Overall Sentiment for Effie Trinket:

Overall Sentiment for Gale Hawthorne:

Overall Sentiment for Haymitch Abernathy:
Negative Sentiment: Average Score = 0.93

Overall Sentiment for Clove:
Negative Sentiment: Average Score = 0.97

Overall Sentiment for Caesar Flickerman:
Positive Sentiment: Average Score = 1.00

Overall Sentiment for Primrose Everdeen:
Positive Sentiment: Average Score = 1.00

Overall Sentiment for President Snow:
Negative Sentiment: Average Score = 0.99
Positive Sentiment: Average Score = 0.87

Overall Sentiment for Cato:
Negative Sentiment: Average Score = 1.00
Positive Sentiment: Average Score = 0.92

Overall Sentiment for Cinna:

Overall Sentiment for Seneca Crane:
Negative Sentiment: Average Score = 0.99

Overall Sentiment for Rue:
Positive Sentiment: Average Score = 0.93


In [55]:
# Dictionary to store sympathy scores for each character
character_sympathy_scores_alternative = {}

# Loop through each character and their associated sentiments
for character, sentiments in character_sentiments_transformer.items():
    # Initialize variables to track positive and negative sentiment count and scores
    positive_scores = []
    negative_scores = []
    neutral_scores = []

    # Process each sentiment entry for the character
    for entry in sentiments:
        sentiment = entry['sentiment']
        score = entry['score']

        # Track sentiment scores
        if sentiment == "POSITIVE":
            positive_scores.append(score)
        elif sentiment == "NEGATIVE":
            negative_scores.append(score)
        elif sentiment == "NEUTRALl":
            neutral_scores.append(score)

    # Calculate counts and averages
    num_positive = len(positive_scores)
    num_negative = len(negative_scores)
    num_neutral = len(neutral_scores)

    avg_positive = np.mean(positive_scores) if positive_scores else 0
    avg_negative = np.mean(negative_scores) if negative_scores else 0
    avg_neutral = np.mean(neutral_scores) if neutral_scores else 0

    # Calculate sentiment ratio and consistency
    sentiment_ratio = (avg_positive / avg_negative) if avg_negative > 0 else avg_positive
    sentiment_consistency = (num_positive / (num_positive + num_negative)) if (num_positive + num_negative) > 0 else 0

    # Determine sympathy score based on thresholds and consistency
    if sentiment_ratio > 1.5 and sentiment_consistency > 0.7:
        sympathy_label = "Highly Sympathetic"
    elif sentiment_ratio > 1.0 and sentiment_consistency > 0.5:
        sympathy_label = "Moderately Sympathetic"
    elif sentiment_ratio < 1.0:
        sympathy_label = "Mixed or Negative Sentiment"
    else:
        sympathy_label = "Neutral or Indeterminate"

    # Store the result for each character
    character_sympathy_scores_alternative[character] = {
        "avg_positive": avg_positive,
        "avg_negative": avg_negative,
        "num_positive": num_positive,
        "num_negative": num_negative,
        "sentiment_ratio": sentiment_ratio,
        "sentiment_consistency": sentiment_consistency,
        "sympathy_label": sympathy_label
    }

# Display results
for character, scores in character_sympathy_scores_alternative.items():
    print(f"\nSympathy Analysis for {character}:")
    print(f"Average Positive Score: {scores['avg_positive']:.2f}")
    print(f"Average Negative Score: {scores['avg_negative']:.2f}")
    print(f"Number of Positive Sentences: {scores['num_positive']}")
    print(f"Number of Negative Sentences: {scores['num_negative']}")
    print(f"Sentiment Ratio (Positive/Negative): {scores['sentiment_ratio']:.2f}")
    print(f"Sentiment Consistency: {scores['sentiment_consistency']:.2f}")
    print(f"Sympathy Label: {scores['sympathy_label']}")



Sympathy Analysis for Foxface:
Average Positive Score: 0.00
Average Negative Score: 1.00
Number of Positive Sentences: 0
Number of Negative Sentences: 21
Sentiment Ratio (Positive/Negative): 0.00
Sentiment Consistency: 0.00
Sympathy Label: Mixed or Negative Sentiment

Sympathy Analysis for Katniss Everdeen:
Average Positive Score: 0.00
Average Negative Score: 0.00
Number of Positive Sentences: 0
Number of Negative Sentences: 0
Sentiment Ratio (Positive/Negative): 0.00
Sentiment Consistency: 0.00
Sympathy Label: Mixed or Negative Sentiment

Sympathy Analysis for Peeta Mellark:
Average Positive Score: 0.98
Average Negative Score: 0.00
Number of Positive Sentences: 21
Number of Negative Sentences: 0
Sentiment Ratio (Positive/Negative): 0.98
Sentiment Consistency: 1.00
Sympathy Label: Mixed or Negative Sentiment

Sympathy Analysis for Effie Trinket:
Average Positive Score: 0.00
Average Negative Score: 0.00
Number of Positive Sentences: 0
Number of Negative Sentences: 0
Sentiment Ratio (Po