In [1]:
import numpy as np
import json
import os
import pandas as pd

from pathlib import Path

In [2]:
# Verbs that mark a character as dead when the character is the *patient*
# (the one the action is done to). Grouped alphabetically for easy scanning.
death_verbs_patient = {
    "annihilate", "assassinate",
    "behead",
    "crucify",
    "decapitate", "destroy", "drown",
    "eradicate", "euthanize", "execute", "exterminate",
    "hang",
    "kill",
    "lynch",
    "massacre", "murder",
    "obliterate",
    "poison",
    "sacrifice", "slay",
    "terminate",
}

# Verbs that mark a character as dead when the character is the *agent*
# (the one performing the action).
# NOTE(review): "sacrifice" is listed in both sets.
# NOTE(review): "commit" is presumably meant for "commit suicide" but will
# match any use of the verb — confirm this is intended.
# NOTE(review): the multi-word entries only match if a bag stores the whole
# phrase as a single verb string — TODO confirm against the bag format.
death_verbs_agent = {
    "bleed out",
    "commit",
    "die",
    "expire",
    "pass away", "perish",
    "sacrifice", "succumb", "surrender life",
}

def count_character_deaths(characters_bags, patient_verbs=None, agent_verbs=None):
    """Flag which characters die, based on the verbs attached to them.

    A character counts as dead when its bag contains at least one
    ``('patient verb', v)`` pair with ``v`` in ``patient_verbs`` or one
    ``('agent verb', v)`` pair with ``v`` in ``agent_verbs``. Pairs with any
    other verb type are ignored.

    Parameters
    ----------
    characters_bags : iterable of mapping
        One entry per character. Each entry must provide a ``'name'`` key and
        a ``'bag'`` key; the bag is an iterable of ``(verb_type, verb)`` pairs.
    patient_verbs : container of str, optional
        Death verbs checked against ``'patient verb'`` pairs. Defaults to the
        notebook-level ``death_verbs_patient``.
    agent_verbs : container of str, optional
        Death verbs checked against ``'agent verb'`` pairs. Defaults to the
        notebook-level ``death_verbs_agent``.

    Returns
    -------
    tuple[int, dict]
        ``(death_count, character_death_flags)`` where ``death_count`` is the
        number of entries flagged as dead and ``character_death_flags`` maps
        each character name to ``1`` (dead) or ``0`` (not dead).

    Notes
    -----
    Flags are keyed by name, so if two entries share a name the later entry
    overwrites the earlier flag, while ``death_count`` still counts every
    entry separately.
    """
    # Resolve the defaults at call time so that re-running the lexicon cell
    # is picked up without redefining this function.
    if patient_verbs is None:
        patient_verbs = death_verbs_patient
    if agent_verbs is None:
        agent_verbs = death_verbs_agent

    death_count = 0
    character_death_flags = {}

    for character in characters_bags:
        char_name = character['name']
        char_bag = character['bag']

        # any() stops at the first matching pair, like the original break.
        died = any(
            (verb_type == 'patient verb' and verb in patient_verbs)
            or (verb_type == 'agent verb' and verb in agent_verbs)
            for verb_type, verb in char_bag
        )

        character_death_flags[char_name] = 1 if died else 0
        if died:
            death_count += 1

    return death_count, character_death_flags

In [3]:
# Directory holding one JSON file of character bags per movie.
data_path = Path('../data/processed')

# movie_id -> {'num_deaths', 'num_characters', 'character_deaths'}
records = {}

# Sort the listing so the insertion order of `records` does not depend on
# the filesystem's directory order.
for movie in sorted(os.listdir(data_path)):
    # Use a context manager so each file handle is closed right away instead
    # of being left open for the garbage collector.
    with open(data_path / movie, encoding='utf-8') as bag_file:
        character_bags = json.load(bag_file)

    # The id is the text after the last '_' in the file name, up to the
    # first '.' that follows it.
    movie_id = movie.split('_')[-1].split('.')[0]

    num_deaths, character_deaths = count_character_deaths(character_bags)

    records[movie_id] = {'num_deaths': num_deaths,
                         'num_characters': len(character_bags),
                         'character_deaths': character_deaths}

In [4]:
# Extract the per-movie counts once instead of rebuilding the same list in
# every print. This also keeps subscripts out of the f-strings, where reusing
# the enclosing quote character is a SyntaxError before Python 3.12.
movie_stats = list(records.values())
characters_per_movie = [stats['num_characters'] for stats in movie_stats]
deaths_per_movie = [stats['num_deaths'] for stats in movie_stats]

print(f'Number of movies = {len(records)}')
print(f'Number of characters = {np.sum(characters_per_movie)}')
print(f'Number of deaths = {np.sum(deaths_per_movie)}\n')

print(f'Average number of characters per movie = {np.mean(characters_per_movie).round(2)}')
print(f'Average number of deaths per movie = {np.mean(deaths_per_movie).round(2)}')

# Mean over movies of deaths / characters. The label says "percentage" but
# the value is a plain ratio (not multiplied by 100).
# A movie with zero characters raises ZeroDivisionError here.
death_ratio_per_movie = [stats['num_deaths'] / stats['num_characters'] for stats in movie_stats]
print(f'Average percentage of dead characters = {np.mean(death_ratio_per_movie).round(2)}')

Number of movies = 18990
Number of characters = 57949
Number of deaths = 7232

Average number of characters per movie = 3.05
Average number of deaths per movie = 0.38
Average percentage of dead characters = 0.11
