In [None]:
# Colab-only step: `files.upload()` lets the user pick local files; per the
# recorded output below, each one is then saved into the runtime's working
# directory. The call's return value is kept in `uploaded`.
# NOTE(review): the recorded output shows three of the four workbooks being
# saved with a " (1)" suffix (presumably because files with the original names
# were already present). The later `pd.read_excel` calls use the unsuffixed
# names, so they would read those earlier copies rather than this upload --
# confirm that this is intended.
from google.colab import files
uploaded = files.upload()

Saving DKR_scenes.xlsx to DKR_scenes.xlsx
Saving Spider-Man Script.xlsx to Spider-Man Script (1).xlsx
Saving Thor Script.xlsx to Thor Script (1).xlsx
Saving Wonder Woman.xlsx to Wonder Woman (1).xlsx


In [None]:

import pandas as pd
import re
from textblob import TextBlob
from nltk.util import ngrams
from collections import Counter


In [None]:

# Define keyword list and functions
# Lower-case terms whose presence in a scene's text marks it as a candidate
# fight scene (consulted by refine_fight_scene_detection_v2).
fight_keywords = [
    "fight", "battle", "combat", "duel", "clash",
    "attack", "war", "conflict", "strike", "hit",
    "punch", "kick", "defend", "assault", "charge",
    "wound", "injure", "kill", "defeat",
]

def simple_tokenizer(text):
    """Split ``text`` into lower-case word tokens.

    The text is lower-cased first; every maximal run of word characters
    (letters, digits, underscore) then becomes one token, and anything else
    (whitespace, punctuation) acts as a separator.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The tokens in order of appearance; empty for empty input.
    """
    lowered = text.lower()
    return [token.group() for token in re.finditer(r'\w+', lowered)]

def extract_character_names_v2(script_df):
    """Collect candidate character names from a screenplay dataframe.

    This is a heuristic: a candidate is a run of ASCII upper-case letters,
    optionally several words separated by spaces or tabs on the same line.
    Upper-case scene-heading words can therefore be picked up as well.

    Args:
        script_df: DataFrame holding scene text in a column named "script"
            (preferred) or "Script". Cells that are not strings (for example
            NaN from empty spreadsheet cells) are skipped.

    Returns:
        set[str]: Whitespace-stripped candidates that either contain more than
        one word or, for a single word, are longer than three characters.

    Raises:
        KeyError: If the dataframe has neither a "script" nor a "Script" column.
    """
    script_column = "script" if "script" in script_df.columns else "Script"
    # Only spaces/tabs may join the words of a name. Allowing any whitespace
    # would glue a cue to the capitalised first word of the following line
    # (e.g. "DIANA\nI"), producing bogus multi-word "names".
    pattern = re.compile(r'\b([A-Z][A-Z \t]+)\b')
    character_names = set()
    for scene_text in script_df[script_column]:
        if not isinstance(scene_text, str):
            # Empty cells are read as NaN (a float); there is nothing to scan.
            continue
        for match in pattern.findall(scene_text):
            # Strip before measuring so trailing blanks captured by the
            # pattern cannot push a short word like "NO" over the length limit.
            name = match.strip()
            if len(name.split()) > 1 or len(name) > 3:
                character_names.add(name)
    return character_names

def refine_fight_scene_detection_v2(script_df, character_names,
                                    negative_threshold=-0.2, positive_threshold=0.5):
    """Flag scenes that look like fight scenes.

    A scene is kept when all of the following hold:
      * a word in it starts with one of the module-level ``fight_keywords``;
      * more than one distinct character name occurs in it as a whole name;
      * its TextBlob sentiment polarity is below ``negative_threshold`` or
        above ``positive_threshold``.
    All matching is done on the lower-cased scene text.

    Args:
        script_df: DataFrame holding scene text in a column named "script"
            (preferred) or "Script". Non-string cells (e.g. NaN from empty
            spreadsheet cells) are skipped.
        character_names: Iterable of names to look for; compared
            case-insensitively, blanks ignored, duplicates counted once.
        negative_threshold: Polarity below which a scene is kept.
        positive_threshold: Polarity above which a scene is kept.
            NOTE(review): keeping strongly positive scenes is carried over
            from the original thresholds -- confirm it is intended.

    Returns:
        list[tuple]: ``(row index, polarity, first 100 characters + "...")``
        for each kept scene, in dataframe order.

    Raises:
        KeyError: If the dataframe has neither a "script" nor a "Script" column.
    """
    script_column = "script" if "script" in script_df.columns else "Script"

    # Anchor keywords at the start of a word: inflections such as "fighting"
    # or "attacked" still match, but unrelated words that merely contain a
    # keyword ("toward" -> "war", "white" -> "hit") no longer do.
    keyword_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(keyword) for keyword in fight_keywords) + r')'
    )
    # Built once, outside the scene loop. Lookarounds (rather than \b) keep the
    # whole-name requirement correct even if a name starts or ends with
    # punctuation; e.g. "thor" must not match inside "author".
    unique_names = {name.lower() for name in character_names if name.strip()}
    name_patterns = [
        re.compile(r'(?<!\w)' + re.escape(name) + r'(?!\w)')
        for name in unique_names
    ]

    refined_fight_scenes = []
    for idx, raw_text in script_df[script_column].items():
        if not isinstance(raw_text, str):
            # Empty cells are read as NaN (a float) and have no text to analyse.
            continue
        scene_text = raw_text.lower()
        if not keyword_pattern.search(scene_text):
            continue
        character_mentions = sum(
            1 for name_pattern in name_patterns if name_pattern.search(scene_text)
        )
        if character_mentions <= 1:
            continue
        sentiment_polarity = TextBlob(scene_text).sentiment.polarity
        if sentiment_polarity < negative_threshold or sentiment_polarity > positive_threshold:
            refined_fight_scenes.append((idx, sentiment_polarity, raw_text[:100] + "..."))
    return refined_fight_scenes

# Read each screenplay workbook into its own dataframe.
# NOTE(review): the upload log recorded earlier in this notebook shows some of
# these workbooks being saved with a " (1)" suffix; the unsuffixed names used
# here would then refer to previously existing copies -- confirm.
wonder_woman_df, dkr_df, spiderman_df, thor_df = (
    pd.read_excel(workbook_name)
    for workbook_name in (
        "Wonder Woman.xlsx",
        "DKR_scenes.xlsx",
        "Spider-Man Script.xlsx",
        "Thor Script.xlsx",
    )
)

# Per film: gather candidate character names ...
(
    ww_character_names,
    dkr_character_names,
    spiderman_character_names,
    thor_character_names,
) = map(
    extract_character_names_v2,
    (wonder_woman_df, dkr_df, spiderman_df, thor_df),
)

# ... then use them to flag the scenes that look like fights.
(
    ww_refined_fight_scenes,
    dkr_refined_fight_scenes,
    spiderman_refined_fight_scenes,
    thor_refined_fight_scenes,
) = map(
    refine_fight_scene_detection_v2,
    (wonder_woman_df, dkr_df, spiderman_df, thor_df),
    (ww_character_names, dkr_character_names, spiderman_character_names, thor_character_names),
)

# Summarise: film title -> number of flagged scenes.
fight_scene_counts = {
    movie_title: len(detected_scenes)
    for movie_title, detected_scenes in (
        ("Wonder Woman", ww_refined_fight_scenes),
        ("Dark Knight Rises", dkr_refined_fight_scenes),
        ("Spider-Man", spiderman_refined_fight_scenes),
        ("Thor", thor_refined_fight_scenes),
    )
}

fight_scene_counts


{'Wonder Woman': 4, 'Dark Knight Rises': 2, 'Spider-Man': 2, 'Thor': 5}