In [28]:
import os 
import tarfile
import gzip
import urllib.request
import xml.etree.ElementTree as ET
from nltk.tree import Tree
import pandas as pd

In [5]:
# Root folder of the project data; every other path is derived from it.
DATA_PATH = "data"
# Folder holding the CoreNLP archive and everything extracted from it.
CORE_NLP_PATH = f"{DATA_PATH}/corenlp"
# Folder with the gzipped CoreNLP summaries (one file per movie).
CORE_NLP_GZ = f"{CORE_NLP_PATH}/corenlp_plot_summaries"
# Folder receiving the decompressed xml summaries.
CORE_NLP_XML = f"{CORE_NLP_PATH}/corenlp_plot_summaries_xml"

In [11]:
def load_corenlp_data(downloaded = True, tar_path = (CORE_NLP_PATH + "/corenlp_plot_summaries.tar")):
    """
    Extract the CoreNLP plot summaries archive and decompress every summary to a plain xml file.

    The archive is always extracted into CORE_NLP_PATH. If the CORE_NLP_XML folder does not exist yet,
    it is created and every ".gz" file of CORE_NLP_GZ is decompressed into it under the same name
    without the ".gz" suffix. If CORE_NLP_XML already exists, the decompression step is skipped.
    param downloaded: boolean, True if the tar file is already on disk, False if it must be fetched from the web
    param tar_path: path to the local tar file, only used when downloaded is True
    """

    def extract(tar):
        # Use the "data" extraction filter when this Python version provides it, so that archive
        # members cannot write outside CORE_NLP_PATH (the archive may come straight from the web).
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=CORE_NLP_PATH, filter="data")
        else:
            tar.extractall(path=CORE_NLP_PATH)

    if not downloaded:
        coreNLP_filename = 'http://www.cs.cmu.edu/~ark/personas/data/corenlp_plot_summaries.tar'
        # An HTTP response cannot seek, so the archive has to be read in stream mode ("r|*");
        # plain "r" needs tell()/seek() on the file object and fails on a response.
        with urllib.request.urlopen(coreNLP_filename) as response:
            with tarfile.open(fileobj=response, mode="r|*") as tar:
                extract(tar)
    else:
        with tarfile.open(tar_path, mode="r") as tar:
            extract(tar)

    if not os.path.exists(CORE_NLP_XML):
        os.mkdir(CORE_NLP_XML)
        gz_suffix = ".gz"
        for filename in os.listdir(CORE_NLP_GZ):
            gz_path = os.path.join(CORE_NLP_GZ, filename)
            # Skip sub-folders and stray files that are not gzip archives
            if not (os.path.isfile(gz_path) and filename.endswith(gz_suffix)):
                continue
            # Store the decompressed content under the same name without ".gz"
            xml_path = os.path.join(CORE_NLP_XML, filename[:-len(gz_suffix)])
            with gzip.open(gz_path, 'rb') as f_in, open(xml_path, 'wb') as f_out:
                f_out.write(f_in.read())

In [12]:
# Run with the default arguments: read the tar file already on disk and unpack the summaries
load_corenlp_data()

In [24]:
def get_movie_id(movie_xml):
    """
    Extract the movie id from the path of a movie xml file.
    The id is the part of the file name located before its first dot.
    param movie_xml: path to the xml file ('/' separated)
    return: movie id (string)
    """
    # Keep what follows the last '/', i.e. the file name
    file_name = movie_xml.rpartition('/')[2]
    # Keep what precedes the first '.', i.e. the id
    return file_name.partition('.')[0]

def get_tree(movie_xml, data_path = CORE_NLP_XML):
    """
    Parse the xml file of a movie.
    param movie_xml: name of the xml file, relative to data_path
    param data_path: path to the folder containing the xml files
    return: parsed xml tree
    """
    return ET.parse(os.path.join(data_path, movie_xml))

def get_sentences(tree):
    """
    Collect the text of every "parse" element of the tree, in document order.
    param tree: xml tree
    return: list of parsed sentences (string)
    """
    return [parse_node.text for parse_node in tree.iter("parse")]

def print_tree(sentence):
    """
    Pretty-print the parse tree of a sentence.
    param sentence: parsed sentence in bracketed notation (string)
    """
    Tree.fromstring(sentence).pretty_print()

In [26]:
def get_characters(tree):
    """
    Get all of the character mentions in a movie from the xml tree.
    A mention is a run of consecutive words whose NER tag is PERSON; the words of a run are joined
    with a single space. A run still open when the document ends is returned as well.
    param tree: xml tree
    return characters: list of character mentions, in document order (repeated mentions are kept)
    """
    characters = []
    current_word = ''
    name_parts = []  # words of the PERSON run currently being read
    for child in tree.iter():
        if child.tag == 'word':
            # Remember the word: its NER tag comes in a later element
            current_word = child.text or ''
        elif child.tag == 'NER':
            if child.text == 'PERSON':
                # Start or continue the character
                name_parts.append(current_word)
            elif name_parts:
                # End the character
                characters.append(' '.join(name_parts))
                name_parts = []
    # Flush a character that runs up to the very last token of the document
    if name_parts:
        characters.append(' '.join(name_parts))
    return characters

def get_full_name(string, characters):
    '''
    Find the longest name of a given character in a list of character names.
    A candidate matches when it contains every space-separated word of string; among the matches,
    the one with the most words is returned (the first one in case of a tie).
    param string: character name (partial or full)
    param characters: list of character names
    return full_name: longest name of character found in characters, or string itself if no name matches
    '''
    # Built once: the words of the searched name do not change during the loop
    names = set(string.split(' '))
    # Fall back on the given name so the function always has something to return
    full_name = string
    max_length = 0
    for character in characters:
        char_names = character.split(' ')
        if names <= set(char_names) and len(char_names) > max_length:
            max_length = len(char_names)
            full_name = character
    return full_name

In [27]:
def get_mentions(movie_xml, data_path = CORE_NLP_XML):
    """
    Count how many times each character of a movie is mentioned in the plot summary.
    Every mention is counted under the longest matching name found among the mentions.
    param movie_xml: name of the xml file, relative to data_path
    param data_path: path to the folder containing the xml files
    return character_mentions: dictionary mapping character full names to their number of mentions
    """

    mentions = get_characters(get_tree(movie_xml, data_path))

    # Tally the mentions per full name
    character_mentions = {}
    for mention in mentions:
        key = get_full_name(mention, mentions)
        character_mentions[key] = character_mentions.get(key, 0) + 1
    return character_mentions

def sort_by_mention(movie_xml, data_path = CORE_NLP_XML):
    """
    Rank the characters of a movie by decreasing number of mentions in the plot summary.
    param movie_xml: name of the xml file, relative to data_path
    param data_path: path to the folder containing the xml files
    return sorted_mentions: list of (character, number of mentions) tuples, most mentioned first
    """

    sorted_mentions = list(get_mentions(movie_xml, data_path).items())
    # Sort on the count, i.e. the second element of each pair
    sorted_mentions.sort(key=lambda pair: pair[1], reverse=True)
    return sorted_mentions


# the main character is defined as the most mentioned character in the plot summary
# this definition might not be sufficient
def get_main_character(movie_xml, data_path = CORE_NLP_XML):
    """
    Get the main character of a movie from the xml file, i.e. the first entry of the
    characters ranked by number of mentions.
    param movie_xml: name of the xml file, relative to data_path
    param data_path: path to the folder containing the xml files
    return: main character full name (string), or None if no character is found in the summary
    """

    sorted_mentions = sort_by_mention(movie_xml, data_path)
    # A summary without any PERSON entity yields an empty ranking
    if not sorted_mentions:
        return None
    return sorted_mentions[0][0]

In [32]:
# read the tab-separated, preprocessed CMU characters table and preview 5 random rows
characters_metadata = pd.read_csv(
    'CMU_Characters_PreProcessed.tsv',
    sep='\t',
)
characters_metadata.sample(5)

Unnamed: 0,WikiMovieID,FreebaseMovieID,ReleaseDate,CharacterName,YoB,Gender,Height,Ethnicity,ActorName,Age,FreebaseActorMapID,FreebaseCharacterID,FreebaseActorID,ReleaseYear
213211,1898116,/m/064ndc,2003-10-24,Johnny,1978.0,M,1.83,,Riley Smith,25.0,/m/02tb52_,/m/0cgg1f2,/m/083r2k,2003.0
255695,30146066,/m/0g55p5n,2011-09-10,Linda Fentress,1966.0,F,1.68,/m/07hwkr,Robin Wright,45.0,/m/0gvwrzs,/m/0h25l4t,/m/06jzh,2011.0
135048,482383,/m/02fttd,1983-02-18,Plainclothesman,1951.0,M,1.85,,Tony Devon,31.0,/m/0h170rs,/m/0h170jp,/m/0gbx55t,1983.0
171783,1570765,/m/05c8mm,2004-06-15,Phil,,M,,,Sean Andrews,,/m/0l4_tyq,/m/0l4_zv6,/m/0gcgh_8,2004.0
193534,6613889,/m/0gdt8z,1992-01-01,,1947.0,F,,,Liz Torres,44.0,/m/0cfzhg4,,/m/04wx7d,1992.0


In [34]:
# read the tab-separated, preprocessed CMU movies table and preview 5 random rows
movies_metadata = pd.read_csv(
    'movies_preprocessed.tsv',
    sep='\t',
)
movies_metadata.sample(5)

Unnamed: 0,WikiMovieID,FreebaseMovieID,MovieName,ReleaseDate,BORevenue,Runtime,Languages,Countries,MovieGenre,ReleaseYear
9784,9765282,/m/02prh_8,Dreamland,2007-02-27,,77.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{""/m/06n90"": ""Science Fiction"", ""/m/03npn"": ""H...",2007.0
36646,9505276,/m/02ph0nw,Viejo smoking,1937,,105.0,{'/m/06nm1': 'Spanish Language'},{'/m/0jgd': 'Argentina'},"{""/m/04t36"": ""Musical"", ""/m/07s9rl0"": ""Drama"",...",1937.0
34889,23290050,/m/0gkt0kq,Mayumi,1990-06-09,,110.0,{'/m/02hwhyv': 'Korean Language'},{'/m/06qd3': 'South Korea'},"{""/m/01jfsb"": ""Thriller"", ""/m/07s9rl0"": ""Drama""}",1990.0
44079,6231202,/m/0fx_dp,Malta Story,1953,,103.0,{'/m/02h40lc': 'English Language'},{'/m/07ssc': 'United Kingdom'},"{""/m/068d7h"": ""Romantic drama"", ""/m/02l7c8"": ""...",1953.0
50518,23233139,/m/065z5fm,A Blood Pledge,2009-06-18,3392086.0,88.0,{'/m/02hwhyv': 'Korean Language'},{'/m/06qd3': 'South Korea'},"{""/m/03npn"": ""Horror"", ""/m/07s9rl0"": ""Drama"", ...",2009.0


In [37]:
# collect the Wikipedia movie ids of every movie with a character named Harry Potter
is_harry_potter = characters_metadata['CharacterName'] == 'Harry Potter'
test_ids = characters_metadata.loc[is_harry_potter, 'WikiMovieID'].values
test_ids

array([  858575,   667372,   670407, 31941988,  9834441,   667368,
         667371,   667361])

In [40]:
# print the detected main character of each test movie
# (the loop variable is not called `id`, which would overwrite the builtin for the rest of the notebook)
for movie_id in test_ids:
    movie_xml = str(movie_id) + '.xml'
    print(get_main_character(movie_xml))

Harry Potter
Harry Potter
Harry Potter
Harry Potter
Harry
Harry
Harry Potter
Harry Potter
