In [17]:
import ast
import xml.etree.ElementTree as ET

import pandas as pd

In [18]:
# Base folders, relative to the notebook's working directory.
DATA_FOLDER           = "../data/"
PROCESSED_DATA_FOLDER = "../processed_data/"
CORENLP_DATA_FOLDER   = "../corenlp_plot_summaries/"

# Raw input files read from DATA_FOLDER.
CHARACTER_DATASET     = f"{DATA_FOLDER}character.metadata.tsv"
MOVIE_DATASET         = f"{DATA_FOLDER}movie.metadata.tsv"
NAME_CLUSTER_DATASET  = f"{DATA_FOLDER}name.clusters.txt"
PLOT_DATASET          = f"{DATA_FOLDER}plot_summaries.txt"
TVTROPES_DATASET      = f"{DATA_FOLDER}tvtropes.clusters.txt"

# Cleaned CSV files written to PROCESSED_DATA_FOLDER.
PROCESSED_CHARACTER   = f"{PROCESSED_DATA_FOLDER}character_metadata.csv"
PROCESSED_MOVIE       = f"{PROCESSED_DATA_FOLDER}movie_metadata.csv"
PROCESSED_NAME        = f"{PROCESSED_DATA_FOLDER}name_clusters.csv"
PROCESSED_PLOT        = f"{PROCESSED_DATA_FOLDER}plot_summaries.csv"
PROCESSED_TVTROPES    = f"{PROCESSED_DATA_FOLDER}tvtropes_clusters.csv"

### Pre-processing the data

First, we load the raw files from the `data` folder into pandas DataFrames so that we can set the desired column names and convert column types where needed.
Then we save the results to the `processed_data` folder in .csv format.

For `movie_metadata` and `character_metadata`, we convert the date columns to the datetime format.

In [19]:
# The movie file is read without a header row, so its columns arrive numbered
# 0..8; map each position to a descriptive name.
movie_metadata = (
    pd.read_csv(MOVIE_DATASET, sep='\t', header=None)
    .rename(columns=dict(enumerate([
        'wikipedia_movie_id',
        'freebase_movie_id',
        'movie_name',
        'movie_release_date',
        'office_revenue',
        'runtime',
        'languages',
        'countries',
        'genres',
    ])))
)

# errors='coerce' turns every value that cannot be parsed into NaT instead of raising.
# NOTE(review): if this column mixes date formats (e.g. year-only values next to
# full dates), pandas >= 2.0 infers the format from the first value and coerces
# the others to NaT - confirm against the data and the installed pandas version.
movie_metadata['movie_release_date'] = pd.to_datetime(movie_metadata['movie_release_date'], errors='coerce')

# Vectorised year extraction (replaces a per-row Python lambda); NaT rows yield NaN.
movie_metadata['movie_release_year'] = movie_metadata['movie_release_date'].dt.year

In [20]:
# The character file is read without a header row: name the 13 positional
# columns, then convert both date columns in the same chain.
# Values that cannot be parsed as dates become NaT (errors='coerce');
# birth dates are additionally converted to UTC (utc=True).
character_metadata = (
    pd.read_csv(CHARACTER_DATASET, sep='\t', header=None)
    .rename(columns=dict(enumerate([
        'wikipedia_movie_id',
        'freebase_movie_id',
        'movie_release_date',
        'character_name',
        'actor_birth_date',
        'actor_gender',
        'actor_height',
        'actor_ethnicity',
        'actor_name',
        'actor_age',
        'freebase_map_id',
        'freebase_character_id',
        'freebase_actor_id',
    ])))
    .assign(
        movie_release_date=lambda frame: pd.to_datetime(frame['movie_release_date'], errors='coerce'),
        actor_birth_date=lambda frame: pd.to_datetime(frame['actor_birth_date'], utc=True, errors='coerce'),
    )
)

In [21]:
# Two headerless tab-separated columns: name them movie_id and plot.
plot_summaries = (
    pd.read_csv(PLOT_DATASET, sep='\t', header=None)
    .rename(columns=dict(enumerate(['movie_id', 'plot'])))
)

In [22]:
# Two headerless tab-separated columns: name them character_name and freebase_map_id.
name_clusters = (
    pd.read_csv(NAME_CLUSTER_DATASET, sep='\t', header=None)
    .rename(columns=dict(enumerate(['character_name', 'freebase_map_id'])))
)

For `tvtropes_cluster`, we first load the raw data into a dataframe.
Then we build the final dataframe, in which each value of the dictionary is placed in its corresponding column.

In [23]:
# Raw layout: column 0 holds the character type, column 1 holds a dict literal
# stored as text.
tvtropes_cluster_raw = pd.read_csv(TVTROPES_DATASET, sep='\t', header=None)

# ast.literal_eval parses the text into a dict without executing arbitrary code.
# It requires `import ast` in the notebook's import cell.
tvtropes_cluster_raw[1] = tvtropes_cluster_raw[1].map(ast.literal_eval)

# Output column -> key of the parsed dict. A missing key raises KeyError.
# `key=key` binds the current key for each lambda at definition time.
tvtropes_cluster = pd.DataFrame({
    'character_type': tvtropes_cluster_raw[0],
    **{
        column: tvtropes_cluster_raw[1].map(lambda record, key=key: record[key])
        for column, key in {
            'character_name': 'char',
            'movie_name':     'movie',
            'movie_id':       'id',
            'actor_name':     'actor',
        }.items()
    },
})

In [24]:
# Write every cleaned frame to its CSV destination, leaving out the row index.
for frame, destination in (
    (character_metadata, PROCESSED_CHARACTER),
    (movie_metadata,     PROCESSED_MOVIE),
    (plot_summaries,     PROCESSED_PLOT),
    (name_clusters,      PROCESSED_NAME),
    (tvtropes_cluster,   PROCESSED_TVTROPES),
):
    frame.to_csv(destination, index=False)

### NLP pre-processing

In [25]:
# Dictionary of English words, each tagged with its natural gender.
# Source: https://github.com/ecmonsen/gendered_words
# The JSON file is fetched over the network every time this cell runs.

url = 'https://raw.githubusercontent.com/ecmonsen/gendered_words/master/gendered_words.json'
tagged_words = pd.read_json(path_or_buf=url)

In [26]:
# Words tagged 'm' and words tagged 'f', as sets for fast membership tests.
WORDS_M = set(tagged_words.loc[tagged_words['gender'] == 'm', 'word'])
WORDS_F = set(tagged_words.loc[tagged_words['gender'] == 'f', 'word'])

In [27]:
def get_names_m_f(movie_id):
    """Return the lower-cased character-name tokens unique to each actor gender.

    Looks up the rows of the global ``character_metadata`` frame whose
    ``wikipedia_movie_id`` equals ``movie_id``, ignores rows where either
    ``character_name`` or ``actor_gender`` is missing, and splits every
    character name into whitespace-separated, lower-cased tokens.

    Parameters
    ----------
    movie_id
        Value compared against ``character_metadata['wikipedia_movie_id']``.

    Returns
    -------
    tuple[set[str], set[str]]
        ``(names_m, names_f)``: tokens from characters whose ``actor_gender``
        is ``'M'`` and ``'F'`` respectively. Tokens that occur for both
        genders are removed from both sets, since they cannot identify a
        gender. Either set may be empty.
    """
    cast = character_metadata.loc[
        character_metadata['wikipedia_movie_id'] == movie_id,
        ['character_name', 'actor_gender'],
    ].dropna()

    def tokens_for(gender):
        # split() with no argument drops empty tokens, so a missing gender or
        # repeated spaces never put '' into the result (split(' ') would).
        names = cast.loc[cast['actor_gender'] == gender, 'character_name']
        return {token for name in names for token in name.lower().split()}

    names_m = tokens_for('M')
    names_f = tokens_for('F')
    shared = names_m & names_f

    return names_m - shared, names_f - shared

In [28]:
# Dependency relation type -> prefix of the output key it contributes to.
_RELATION_CATEGORY = {
    'nsubj':  'actions by',
    'dobj':   'actions on',
    'iobj':   'actions on',
    'nmod':   'descriptions of',
    'amod':   'descriptions of',
    'nummod': 'descriptions of',
    'appos':  'descriptions of',
}


def analyze_movie(movie_id, root):
    """Collect words linked to male / female mentions in a parsed plot summary.

    For every ``<dep>`` element under ``root``, the lower-cased text of its
    second child is tested against the movie's gender-specific character-name
    tokens (from ``get_names_m_f``) and the global ``WORDS_M`` / ``WORDS_F``
    sets. The male test runs first. On a match, the text of the first child is
    appended to the list selected by the element's ``type`` attribute:

    * ``nsubj``                              -> ``'actions by ...'``
    * ``dobj``, ``iobj``                     -> ``'actions on ...'``
    * ``nmod``, ``amod``, ``nummod``, ``appos`` -> ``'descriptions of ...'``

    Parameters
    ----------
    movie_id
        Identifier forwarded to ``get_names_m_f``.
    root
        Root XML element of the parsed summary. ``None`` or the sentinel
        string ``'FileNotFoundError'`` means no parse is available.

    Returns
    -------
    dict[str, list]
        Six lists keyed ``'actions by men'``, ``'actions by women'``,
        ``'actions on men'``, ``'actions on women'``, ``'descriptions of men'``
        and ``'descriptions of women'``; all empty when no parse is available.
    """
    names_m, names_f = get_names_m_f(movie_id)

    new_row = {
        'actions by men'        : [],
        'actions by women'      : [],
        'actions on men'        : [],
        'actions on women'      : [],
        'descriptions of men'   : [],
        'descriptions of women' : [],
    }

    # An Element never compares equal to a string, so this only matches the sentinel.
    if root is None or root == 'FileNotFoundError':
        return new_row

    # NOTE(review): the children are presumably <governor> (index 0) and
    # <dependent> (index 1) as in CoreNLP XML output - confirm. If so, for
    # modifier relations such as amod the matched word is the modifier and the
    # stored word is the head it modifies; verify that this is the intent.
    for dep in root.iter('dep'):
        # Skip malformed elements instead of raising IndexError / AttributeError.
        if len(dep) < 2 or dep[0].text is None or dep[1].text is None:
            continue

        subject_name = dep[1].text.lower()
        action       = dep[0].text

        if subject_name in names_m or subject_name in WORDS_M:
            group = 'men'
        elif subject_name in names_f or subject_name in WORDS_F:
            group = 'women'
        else:
            continue

        # Relation types outside the table (or a missing `type`) are ignored.
        category = _RELATION_CATEGORY.get(dep.get('type'))
        if category is not None:
            new_row[f'{category} {group}'].append(action)

    return new_row

In [29]:
# Distinct Wikipedia movie ids found in the movie metadata.
movie_ids = set(movie_metadata['wikipedia_movie_id'].tolist())

In [30]:
def parse_nlp(id) -> 'ET.Element | str':
    """Parse the XML file ``CORENLP_DATA_FOLDER + str(id) + '.xml'``.

    Parameters
    ----------
    id
        Identifier used to build the file name (converted with ``str``).

    Returns
    -------
    xml.etree.ElementTree.Element or str
        The root element of the parsed document, or the sentinel string
        ``'FileNotFoundError'`` when no file exists for ``id``. Callers
        compare against that sentinel, so it is kept as is.
    """
    xml_path = CORENLP_DATA_FOLDER + str(id) + '.xml'
    try:
        return ET.parse(xml_path).getroot()
    # It is quite common for an id to have no corresponding XML file.
    except FileNotFoundError:
        return 'FileNotFoundError'

In [32]:
# Slow cell: takes approximately 20min to run.
# Analyse every movie that has a parsed summary; ids without an XML file are skipped.
nlp_dict = {}
for movie_id in movie_ids:
    root = parse_nlp(movie_id)
    if root == 'FileNotFoundError':
        continue
    nlp_dict[movie_id] = analyze_movie(movie_id, root)

# One row per analysed movie, indexed by movie id, then saved as CSV.
movie_nlp = pd.DataFrame.from_dict(nlp_dict, orient='index').rename_axis('movie_id')
movie_nlp.to_csv(PROCESSED_DATA_FOLDER + 'movie_nlp.csv')