In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%config InlineBackend
%load_ext autoreload
%autoreload 2

InlineBackend(InlineBackendConfig) options
----------------------------------------
InlineBackend.close_figures=<Bool>
    Close all figures at the end of each cell.
            When True, ensures that each cell starts with no active figures, but it
            also means that one must keep track of references in order to edit or
            redraw figures in subsequent cells. This mode is ideal for the notebook,
            where residual plots from other cells might be surprising.
            When False, one must call figure() to create new figures. This means
            that gcf() and getfigs() can reference figures created in other cells,
            and the active figure can continue to be edited with pylab/pyplot
            methods that reference the current active figure. This mode facilitates
            iterative editing of figures, and behaves most consistently with
            other matplotlib backends, but figure barriers between cells must
            be explicit.
    Cu

In [4]:
# Column names applied to the character-metadata TSV, in file column order.
header = [
    'wiki_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_date_of_birth',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age_at_release',
    'freebase_character_actor_map_id',
    'freebase_character_id',
    'freebase_actor_id',
]
# Tab-separated input; passing `names=` makes pandas treat the first line as data.
df = pd.read_csv('../dataset/character.metadata.tsv', names=header, sep='\t')
df.head()

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_release_date,character_name,actor_date_of_birth,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age_at_release,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [5]:
import xml.etree.ElementTree as ET
import gzip
import xmltodict

# Identifier of the movie whose CoreNLP plot-summary file is analysed below.
movie_id = '23270459'
# Folder holding one gzip-compressed CoreNLP XML file per movie id.
directory = '../dataset/corenlp_plot_summaries/'
# Only the path is built here; the file itself is read by a later cell.
file_path = f'{directory}{movie_id}.xml.gz'

In [6]:
def extract_corenlp_xml(filename):
    """Read a gzip-compressed XML file and return it as nested dictionaries.

    Args:
        filename: Path to the ``.xml.gz`` file to read.

    Returns:
        The structure produced by ``xmltodict.parse`` for the document.
    """
    # The context manager closes the gzip handle once parsing is done
    # (previously the handle was left open until garbage collection).
    with gzip.open(filename) as compressed:
        root = ET.parse(compressed).getroot()
    # Serialise the parsed tree back to bytes so xmltodict can convert it.
    xmlstr = ET.tostring(root, encoding='utf8', method='xml')
    return xmltodict.parse(xmlstr)

In [7]:
# Parse the CoreNLP annotation at `file_path` into nested dictionaries.
file = extract_corenlp_xml(file_path)

In [None]:
# Display the parsed document (sentences with per-token word/lemma/POS/NER fields).
file

{'root': {'document': {'sentences': {'sentence': [{'@id': '1',
      'tokens': {'token': [{'@id': '1',
         'word': 'Former',
         'lemma': 'former',
         'CharacterOffsetBegin': '1',
         'CharacterOffsetEnd': '7',
         'POS': 'JJ',
         'NER': 'O'},
        {'@id': '2',
         'word': 'dream',
         'lemma': 'dream',
         'CharacterOffsetBegin': '8',
         'CharacterOffsetEnd': '13',
         'POS': 'NN',
         'NER': 'O'},
        {'@id': '3',
         'word': 'architect',
         'lemma': 'architect',
         'CharacterOffsetBegin': '14',
         'CharacterOffsetEnd': '23',
         'POS': 'NN',
         'NER': 'O'},
        {'@id': '4',
         'word': 'Dominick',
         'lemma': 'Dominick',
         'CharacterOffsetBegin': '24',
         'CharacterOffsetEnd': '32',
         'POS': 'NNP',
         'NER': 'PERSON'},
        {'@id': '5',
         'word': '``',
         'lemma': '``',
         'CharacterOffsetBegin': '33',
         'Charac

In [9]:
# Character name -> info dict; the info dicts start empty and are filled in by later cells.
characters = {}

sentences = file['root']['document']['sentences']['sentence']
if isinstance(sentences, dict):
    # xmltodict returns a bare dict (not a list) when there is only one child element.
    sentences = [sentences]

for sentence in sentences:
    sentence_tokens = (sentence.get('tokens') or {}).get('token', [])
    if isinstance(sentence_tokens, dict):
        # Same single-child case: a one-token sentence.
        sentence_tokens = [sentence_tokens]

    # Every token tagged by NER as a PERSON is treated as a character name.
    for tok in sentence_tokens:
        if isinstance(tok, dict) and tok.get('NER') == 'PERSON':
            characters.setdefault(tok['word'], {})
characters

{'Dominick': {},
 'Cobb': {},
 'Arthur': {},
 'Saito': {},
 'Mallorie': {},
 'Maurice': {},
 'Fischer': {},
 'Robert': {},
 'Yusuf': {},
 'Eames': {},
 'Peter': {},
 'Browning': {},
 'Mal': {}}

In [20]:
# Attach the mentions of each coreference chain to the character the chain refers to.

mentioned_characters = []
mentioned_non_characters = []
character_mention_count = {}


def _as_list(node):
    """Return `node` as a list (xmltodict yields a bare dict for a single child)."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


document = file['root']['document']
sentences = _as_list(document['sentences']['sentence'])
chains = _as_list((document.get('coreference') or {}).get('coreference'))

# Start from a clean slate so re-running this cell does not append duplicate mentions.
for info in characters.values():
    info.pop('mentions', None)

for coreference in chains:
    mentions = _as_list(coreference['mention'])

    # Resolve the chain's representative mention up front instead of relying on it
    # being the first entry; otherwise `token` could be stale from a previous chain.
    representative = next(
        (m for m in mentions if m.get('@representative') == 'true'), None)
    if representative is None:
        continue

    # Sentence and head indices in the XML are 1-based.
    sentence = sentences[int(representative['sentence']) - 1]
    token = _as_list(sentence['tokens']['token'])[int(representative['head']) - 1]['word']

    # Only chains whose representative head word is a known character are kept.
    if token not in characters:
        continue

    characters[token].setdefault('mentions', []).extend(mentions)

characters

{'Dominick': {},
 'Cobb': {'mentions': [{'@representative': 'true',
    'sentence': '1',
    'start': '1',
    'end': '9',
    'head': '8'},
   {'sentence': '1', 'start': '8', 'end': '9', 'head': '8'},
   {'sentence': '5', 'start': '1', 'end': '3', 'head': '1'},
   {'sentence': '6', 'start': '6', 'end': '23', 'head': '10'},
   {'sentence': '6', 'start': '6', 'end': '11', 'head': '10'},
   {'sentence': '6', 'start': '10', 'end': '11', 'head': '10'},
   {'sentence': '6', 'start': '12', 'end': '14', 'head': '12'},
   {'sentence': '7', 'start': '6', 'end': '8', 'head': '6'},
   {'sentence': '9', 'start': '2', 'end': '3', 'head': '2'},
   {'sentence': '9', 'start': '8', 'end': '9', 'head': '8'},
   {'sentence': '9', 'start': '16', 'end': '17', 'head': '16'},
   {'sentence': '9', 'start': '19', 'end': '20', 'head': '19'},
   {'sentence': '9', 'start': '26', 'end': '27', 'head': '26'},
   {'sentence': '10', 'start': '1', 'end': '2', 'head': '1'},
   {'sentence': '10', 'start': '7', 'end': '8'

In [26]:
# Mentions per character: 1 when no coreference mentions were attached,
# otherwise the number of attached mentions plus one.
# NOTE(review): the representative mention is itself stored in 'mentions',
# so the "+ 1" may count it twice — confirm the intended definition.
character_mention_count = {
    name: 1 if info.get('mentions') is None else len(info['mentions']) + 1
    for name, info in characters.items()
}
character_mention_count

{'Dominick': 1,
 'Cobb': 88,
 'Arthur': 14,
 'Saito': 46,
 'Mallorie': 1,
 'Maurice': 1,
 'Fischer': 48,
 'Robert': 1,
 'Yusuf': 18,
 'Eames': 22,
 'Peter': 1,
 'Browning': 10,
 'Mal': 26}

Create [sentence][token] -> character (optional) map

In [12]:
# sentence number -> {head token number -> character name}, built from the
# mentions attached to each character.
mention_inverse_map = {}

for name, info in characters.items():
    attached = info.get('mentions')
    if attached is None:
        continue
    for entry in attached:
        sentence_no = int(entry['sentence'])
        # A later mention with the same (sentence, head) overwrites the earlier one.
        mention_inverse_map.setdefault(sentence_no, {})[int(entry['head'])] = name


def get_mentioned_character(sentence_id, token_id):
    """Look up which character, if any, is mentioned at a token position.

    Args:
        sentence_id: Integer sentence number used as key in `mention_inverse_map`.
        token_id: Integer token number (the mention head) within that sentence.

    Returns:
        The character name stored for that position, or None when the sentence
        or the token has no entry.
    """
    heads = mention_inverse_map.get(sentence_id)
    return None if heads is None else heads.get(token_id)


# Spot-check the lookup at two token positions of sentence 1, then show the full map.
print(get_mentioned_character(1, 8))
print(get_mentioned_character(1, 9))
mention_inverse_map

Cobb
None


{1: {8: 'Cobb', 12: 'Arthur', 42: 'Saito'},
 5: {1: 'Cobb'},
 6: {10: 'Cobb', 12: 'Cobb'},
 7: {6: 'Cobb', 9: 'Arthur', 1: 'Saito', 17: 'Saito'},
 9: {2: 'Cobb', 8: 'Cobb', 16: 'Cobb', 19: 'Cobb', 26: 'Cobb', 5: 'Saito'},
 10: {1: 'Cobb',
  7: 'Cobb',
  53: 'Arthur',
  18: 'Yusuf',
  21: 'Yusuf',
  10: 'Eames',
  13: 'Eames',
  16: 'Eames'},
 11: {10: 'Cobb', 12: 'Cobb', 1: 'Saito', 5: 'Saito'},
 12: {27: 'Cobb', 4: 'Fischer', 9: 'Fischer', 25: 'Fischer'},
 23: {1: 'Cobb', 6: 'Cobb', 13: 'Mal'},
 24: {25: 'Cobb', 32: 'Cobb', 11: 'Mal', 14: 'Mal', 34: 'Mal'},
 25: {1: 'Cobb', 8: 'Cobb', 17: 'Cobb'},
 26: {9: 'Cobb', 1: 'Saito', 5: 'Saito', 19: 'Fischer', 13: 'Mal'},
 31: {1: 'Cobb', 10: 'Saito', 8: 'Fischer', 13: 'Eames', 16: 'Eames'},
 32: {1: 'Cobb', 3: 'Cobb', 12: 'Cobb', 6: 'Mal'},
 33: {1: 'Cobb',
  6: 'Cobb',
  10: 'Mal',
  16: 'Mal',
  28: 'Mal',
  67: 'Mal',
  74: 'Mal'},
 34: {3: 'Cobb', 7: 'Mal'},
 35: {2: 'Cobb', 5: 'Cobb', 17: 'Saito'},
 37: {7: 'Cobb', 35: 'Arthur', 9: 'Sai

In [13]:
# Inspect one basic-dependency entry of the first sentence to see its structure.
file['root']['document']['sentences']['sentence'][0]['basic-dependencies']['dep'][2]

{'@type': 'nn',
 'governor': {'@idx': '8', '#text': 'Cobb'},
 'dependent': {'@idx': '3', '#text': 'architect'}}

In [14]:
def print_dep(sentence, character, other, is_dependent):
    """Print one character dependency followed by the words of its sentence.

    Args:
        sentence: Sentence dict whose ``['tokens']['token']`` entries each have a 'word'.
        character: Name of the character taking part in the dependency.
        other: The word on the other side of the dependency.
        is_dependent: True when the character is the dependent side of the relation,
            False when it is the governor.
    """
    if is_dependent:
        summary = f"{character} (dependant) -> {other} (governor)"
    else:
        summary = f"{character} (governor) <- {other} (dependant)"
    print(summary)

    # Each word is followed by a single space, then the line is closed with a blank line.
    words = ''.join(f"{token['word']} " for token in sentence['tokens']['token'])
    print(words, end='')
    print("\n")


def add_dep(dependencies, sentence, dep, type, character, other, is_dependent):
    """Record `other` under ``dependencies[character][type][role]``.

    Args:
        dependencies: Nested dict updated in place (character -> type -> role -> list).
        sentence: Sentence the dependency came from (not used for the bookkeeping).
        dep: The raw dependency entry (not used for the bookkeeping).
        type: Dependency type label used as the second-level key.
        character: Character name used as the first-level key.
        other: Word appended to the role list.
        is_dependent: True stores under 'dependant', False under 'governor'.
    """
    if is_dependent:
        role = 'dependant'
    else:
        role = 'governor'
    by_type = dependencies.setdefault(character, {})
    by_role = by_type.setdefault(type, {})
    by_role.setdefault(role, []).append(other)
    # Debug aid: print_dep(sentence, character, other, is_dependent)

In [15]:
# Dependency representation to read from each sentence.
dep_alg = 'collapsed-ccprocessed-dependencies'

types = set()       # every dependency type seen in the document
dependencies = {}   # character -> type -> role -> [lemmas], filled through add_dep

# Single pass over the document: the previous version rescanned every sentence once
# per dependency type and, being driven by a set, produced dict keys in an order that
# could change between runs. The collected lists are the same; only key order differs.
for sentence in file['root']['document']['sentences']['sentence']:
    if sentence.get(dep_alg) is None:
        continue

    deps = sentence[dep_alg]['dep']
    if isinstance(deps, dict):
        # xmltodict returns a bare dict when the sentence has a single <dep> child.
        deps = [deps]

    sentence_id = int(sentence['@id'])
    tokens = sentence['tokens']['token']

    for dep in deps:
        t = dep['@type']
        types.add(t)
        dependent_idx = int(dep['dependent']['@idx'])
        governor_idx = int(dep['governor']['@idx'])

        # Case 1: the character is the dependent; record the governor's lemma.
        character = get_mentioned_character(sentence_id, dependent_idx)
        if character is not None:
            if governor_idx < 1:
                # Index 0 is not a real token (presumably the ROOT pseudo-node — confirm);
                # without this guard `tokens[-1]` would silently pick the last token.
                continue
            other = tokens[governor_idx - 1]['lemma']
            if other in characters:
                # NOTE(review): as before, this also skips the governor-side check below.
                continue
            add_dep(dependencies, sentence, dep, t, character, other, True)

        # Case 2: the character is the governor; record the dependent's lemma.
        character = get_mentioned_character(sentence_id, governor_idx)
        if character is not None:
            other = tokens[dependent_idx - 1]['lemma']
            if other in characters:
                continue
            add_dep(dependencies, sentence, dep, t, character, other, False)

In [16]:
# Show the collected structure: character -> dependency type -> role -> list of lemmas.
dependencies

{'Browning': {'prep_of': {'dependant': ['appearance']},
  'appos': {'dependant': ['godfather']},
  'poss': {'dependant': ['subconscious']},
  'agent': {'dependant': ['orchestrate']}},
 'Mal': {'prep_of': {'dependant': ['projection', 'projection']},
  'nsubj': {'dependant': ['convinced',
    'dream',
    'commit',
    'leave',
    'continue']},
  'poss': {'dependant': ['death', 'suicide', 'mind']},
  'prep_with': {'dependant': ['year']},
  'dobj': {'dependant': ['wake', 'shoot']},
  'rcmod': {'governor': ['try']}},
 'Cobb': {'appos': {'governor': ['wife']},
  'nsubj': {'dependant': ['sell',
    'succeed',
    'return',
    'accept',
    'assemble',
    'succeed',
    'reveal',
    'spend',
    'do',
    'flee',
    'leave',
    'enter',
    'confront',
    'stay',
    'refuse',
    'confess',
    'responsible',
    'attain',
    'choose',
    'find',
    'pass']},
  'prep_to': {'dependant': ['due']},
  'prep_against': {'dependant': ['clear']},
  'poss': {'dependant': ['totem',
    'wife

Important dependencies:
- __agent__ - An agent performing the action in a passive sentence, typically following "by," like "Browning (dependant) -> orchestrate (governor)" in "Kidnapping was orchestrated by Browning"
- __amod__ - Adjectival modifier. An adjective modifying a noun, like "Saito (governor) <- japanese (dependant)" in "Japanese businessman Saito"
- __dobj__ - Direct object. A noun phrase receiving the action of a transitive verb, like "Fischer (dependant) -> abduct (governor)" in "The team abducts Fischer"
- __nn__ - Noun compound modifier. A noun modifying another noun, like "Saito (governor) <- businessman (dependant)" in "Japanese businessman Saito"
- __nsubj__ - Nominal subject. The main noun or pronoun that the sentence is about, like "Cobb (dependant) -> confront (governor)" in "Cobb confronts his projection"
- __nsubjpass__ - Passive nominal subject. The subject in a passive sentence, like "Arthur (dependant) -> force (governor)" in "Arthur is forced to improvise" 
- __poss__ - Possession modifier. A possessive noun or pronoun, like "Arthur (dependant) -> dream (governor)" in "They are sedated into Arthur 's dream."

Parsing dependencies to extract:
 - __actions_taken__ - agent, nsubj
 - __actions_received__ - dobj, nsubjpass
 - __possessions__ - poss
 - __descriptions__ - amod, nn

In [17]:
# (dependency type, character's role) -> summary category it contributes to.
CATEGORY_BY_RELATION = {
    ('agent', 'dependant'): 'actions_taken',
    ('nsubj', 'dependant'): 'actions_taken',
    ('dobj', 'dependant'): 'actions_received',
    ('nsubjpass', 'dependant'): 'actions_received',
    ('poss', 'dependant'): 'possessions',
    ('amod', 'governor'): 'descriptions',
    ('nn', 'governor'): 'descriptions',
}

for character in dependencies:
    by_type = dependencies[character]

    # This cell replaces each entry with a {category: list} dict. If every value is
    # already a list, the entry was summarised by an earlier run of this cell, so it
    # is left untouched instead of failing on the changed shape.
    if by_type and all(isinstance(value, list) for value in by_type.values()):
        continue

    summary = {
        'actions_taken': [],
        'actions_received': [],
        'possessions': [],
        'descriptions': [],
    }
    # `dep_type` (not `type`) so the builtin stays usable in the notebook namespace;
    # an earlier cell calls type(...) and would break on re-run otherwise.
    for dep_type, by_role in by_type.items():
        for role, others in by_role.items():
            category = CATEGORY_BY_RELATION.get((dep_type, role))
            if category is not None:
                summary[category].extend(others)
    dependencies[character] = summary
dependencies

{'Browning': {'actions_taken': ['orchestrate'],
  'actions_received': [],
  'possessions': ['subconscious'],
  'descriptions': []},
 'Mal': {'actions_taken': ['convinced',
   'dream',
   'commit',
   'leave',
   'continue'],
  'actions_received': ['wake', 'shoot'],
  'possessions': ['death', 'suicide', 'mind'],
  'descriptions': []},
 'Cobb': {'actions_taken': ['sell',
   'succeed',
   'return',
   'accept',
   'assemble',
   'succeed',
   'reveal',
   'spend',
   'do',
   'flee',
   'leave',
   'enter',
   'confront',
   'stay',
   'refuse',
   'confess',
   'responsible',
   'attain',
   'choose',
   'find',
   'pass'],
  'actions_received': ['incriminate'],
  'possessions': ['totem',
   'wife',
   'influence',
   'child',
   'team',
   'team',
   'child',
   'father-in-law',
   'projection',
   'projection',
   'confession'],
  'descriptions': ['former', 'dream', 'architect', 'Dom']},
 'Yusuf': {'actions_taken': ['drive', 'initiate'],
  'actions_received': [],
  'possessions': ['dre

In [18]:
# Save the per-character summary as <output_dir>/<movie_id>.json
import json
import os

# NOTE(review): inputs are read from '../dataset/...' but this writes under
# 'dataset/...' relative to the working directory — confirm this is intended.
output_dir = 'dataset/characters'

# exist_ok=True replaces the separate exists() check and keeps the cell re-runnable.
os.makedirs(output_dir, exist_ok=True)

with open(f'{output_dir}/{movie_id}.json', 'w') as fp:
    json.dump(dependencies, fp)

__Character Frequency Analysis:__
Count how often each character's name appears in the summary. More frequent mentions usually indicate a more central role in the narrative.

__Sentiment Analysis:__
Apply sentiment analysis to the sentences or phrases associated with each character. This can help you understand the narrative's emotional context around each character, indicating their role in the story's tone and themes.

__Network Analysis__
Create a network graph where nodes represent characters, and edges represent interactions or relationships between them. Analyzing this network can reveal central characters, isolated ones, or key relationships.

Extract character relationships from plot summaries by building a network of the dependencies that link characters. The network can include every dependency type or be restricted to a specific one (for example, a character as the target of an action or as the source of an action).

__Role Identification__
Identify the verbs and actions associated with each character. For instance, a character frequently associated with verbs like 'lead', 'decide', or 'solve' might be a protagonist, while one associated with 'help' or 'support' might be a sidekick.

__Topic Modeling__
Use topic modeling to identify themes or topics in the movie summary and then see how each character is associated with these topics. This could provide insight into thematic significance.

__Plot Point Association__
Identify key plot points and see which characters are most involved in these. Characters central to major plot points are often more important to the narrative.