In [252]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
pd.options.mode.chained_assignment = None

movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv',sep='\t')

movie_metadata.columns = ['1. Wikipedia movie ID',
                          '2. Freebase movie ID',
                          '3. Movie name',
                          '4. Movie release date',
                          '5. Movie box office revenue',
                          '6. Movie runtime',
                          '7. Movie languages (Freebase ID:name tuples)',
                          '8. Movie countries (Freebase ID:name tuples)',
                          '9. Movie genres (Freebase ID:name tuples)']

character_metadata = pd.read_csv('MovieSummaries/character.metadata.tsv',sep='\t')

character_metadata.columns = ['1. Wikipedia movie ID',
                              '2. Freebase movie ID',
                              '3. Movie release date',
                              '4. Character name',
                              '5. Actor date of birth',
                              '6. Actor gender',
                              '7. Actor height (in meters)',
                              '8. Actor ethnicity (Freebase ID)',
                              '9. Actor name',
                              '10. Actor age at movie release',
                              '11. Freebase character/actor map ID',
                              '12. Freebase character ID',
                              '13. Freebase actor ID']

movie_metadata_bechdel = pd.read_csv("CMU_bechdel_added.csv")
print(movie_metadata_bechdel.shape)
movie_metadata_bechdel = movie_metadata_bechdel.drop("Unnamed: 0", axis=1)


character_metadata_bechdel = character_metadata.copy(deep = True)
print("Size before:", character_metadata_bechdel.shape)
character_metadata_bechdel = character_metadata_bechdel[character_metadata_bechdel['2. Freebase movie ID'].isin(movie_metadata_bechdel["2. Freebase movie ID"].to_numpy())]
print("Size after:", character_metadata_bechdel.shape)

(6521, 11)
Size before: (450668, 13)
Size after: (72458, 13)


In [194]:
print(movie_metadata_bechdel.shape)
movie_metadata_bechdel = movie_metadata_bechdel[movie_metadata_bechdel['2. Freebase movie ID'].isin(character_metadata_bechdel["2. Freebase movie ID"].to_numpy())]
print(movie_metadata_bechdel.shape)

(6521, 10)
(6202, 10)


In [195]:
plot_summaries=pd.read_csv('MovieSummaries/plot_summaries.txt', sep='\t', header=None, names=['id', 'plot_summary'])
plot_summaries_bechdel = plot_summaries[plot_summaries['id'].isin(character_metadata_bechdel['1. Wikipedia movie ID'].to_numpy())]

plot_summaries.head()
print(plot_summaries.shape)
print(plot_summaries_bechdel.shape)

(42303, 2)
(5194, 2)


In [281]:
movie_idx = 255

def calculate_actor_mention_score(movie_idx):
    movie_summary = plot_summaries_bechdel.iloc[movie_idx]["plot_summary"]
    
    #Tokenize the movie summary
    tokens = tokenizer.tokenize(movie_summary)
    tokens = [x.lower() for x in tokens]
    tokens_freq = pd.Series(tokens).value_counts(sort=True)
    
    #align movie dataset & character dataset
    movie_id = plot_summaries_bechdel.iloc[movie_idx]["id"]
    character_list = character_metadata_bechdel[character_metadata_bechdel['1. Wikipedia movie ID'] == movie_id][['4. Character name','6. Actor gender']]
    
    """
    character_list_processed = character_list.copy()
    character_list_processed = character_list_processed.dropna()
    
    #Lowercase character names
    character_list_processed["4. Character name"] = character_list_processed["4. Character name"].str.lower()

    #Split name and surname to two columns
    character_list_processed[["4. Character name", "4.5. Character surname"]] = character_list_processed["4. Character name"].str.split(' ', n=1, expand=True)

    #Stack two columns as rows as if surnames are different characters
    character_list_stacked = character_list_processed[["4. Character name", "4.5. Character surname"]].stack()

    #Duplicate and stack gender column so that dataframe matches
    character_gender_stacked = pd.concat([character_list_processed["6. Actor gender"],character_list_processed["6. Actor gender"]])
    character_gender_stacked = pd.concat([pd.Series(character_list_stacked.to_numpy()), pd.Series(character_gender_stacked.to_numpy())], axis = 1)
    character_gender_stacked_idx = character_gender_stacked.set_index(0)
    
    #Take the intersection between the token's frequency and movie cast
    tokens_intersection = tokens_freq[character_gender_stacked_idx.index.intersection(tokens_freq.index)]
    character_mention_freq = character_gender_stacked[character_gender_stacked[0].isin(character_gender_stacked_idx.index.intersection(tokens_freq.index))]

    #Add the number of character mentions in summary to the character meta-dataset
    character_mention_freq["no_mention"] = tokens_intersection.values#pd.DataFrame({'4. Character name':character_mention_freq[0], 'no_mention':character_mention_freq.values})
    character_mention_freq.columns = ["character_name", "gender", "no_mention"]
    character_list_final = character_mention_freq
    #print(character_list_final)
    """
    
    
    character_list_processed = character_list.copy()
    character_list_processed = character_list_processed.dropna()

    #Lowercase character names
    character_list_processed["4. Character name"] = character_list_processed["4. Character name"].str.lower()

    #Split full name and only get the first name
    character_list_processed["4. Character name"] = character_list_processed["4. Character name"].str.split(' ').str[0]

    character_list_processed = character_list_processed.drop(character_list_processed[character_list_processed["4. Character name"] == "the"].index)

    character_gender_stacked = character_list_processed.drop_duplicates(subset='4. Character name', keep=False)
    character_gender_stacked_idx = character_gender_stacked.set_index("4. Character name")

    #Take the intersection between the token's frequency and movie cast
    tokens_intersection = tokens_freq[character_gender_stacked_idx.index.intersection(tokens_freq.index)]
    character_gender_stacked = character_gender_stacked.drop_duplicates()
    character_mention_freq = character_gender_stacked[character_gender_stacked["4. Character name"].isin(tokens_intersection.index)]#.drop_duplicates()

    #Add the number of character mentions in summary to the character meta-dataset
    character_mention_freq["no_mention"] = tokens_intersection.values#pd.DataFrame({'4. Character name':character_mention_freq[0], 'no_mention':character_mention_freq.values})
    character_mention_freq.columns = ["character_name", "gender", "no_mention"]
    character_list_final = character_mention_freq
    #print(character_list_final)
    
    #Group by gender and calculate total number of mentions by gender
    character_list_freq_added = character_list_final.groupby(['gender']).sum()
    if len(character_list_freq_added['no_mention'].index) != 0:
        if character_list_freq_added['no_mention'].shape[0] == 2:
            female_mention, male_mention = character_list_freq_added['no_mention'][0], character_list_freq_added['no_mention'][1] #groupby is alphabethic, index 0 = F
            mention_ratio = female_mention/(female_mention + male_mention)
        elif character_list_freq_added['no_mention'].index[0] == "M":
            mention_ratio = 0.
        elif character_list_freq_added['no_mention'].index[0] == "F":
            mention_ratio = 1.
        else:
            mention_ratio = np.nan
    else:
        mention_ratio = np.nan

    actor_mention_score = round(mention_ratio, 4)
    
    return actor_mention_score, movie_id


In [283]:
import time
beginning = time.time()
for a in range(1,1000):
    #print(a)
    _,_ = calculate_actor_mention_score(a)

print(time.time() - beginning)

5.853267192840576


In [270]:
movie_idx = 26
sample_text = plot_summaries_bechdel.iloc[movie_idx]["plot_summary"]
print(sample_text)
tokens = tokenizer.tokenize(sample_text)
tokens = [x.lower() for x in tokens]
tokens_freq = pd.Series(tokens).value_counts(sort=True)
print(tokens)
print(tokens_freq)
first_movie_id = plot_summaries_bechdel.iloc[movie_idx]["id"]
character_list = character_metadata_bechdel[character_metadata_bechdel['1. Wikipedia movie ID'] == first_movie_id][['4. Character name','6. Actor gender']]
print(character_list)

Pompous paleontologist Rick Marshall  has a low-level job at the La Brea Tar Pits, three years after a disastrous interview with Matt Lauer of Today became a viral video and ruined his career. Doctoral candidate student Holly Cantrell  tells him that his controversial theories combining time warps and paleontology inspired her. She shows him a fossil with an imprint of a cigarette lighter that he recognizes as his own along with a crystal made into a necklace that gives off strong tachyon energy. She convinces him to finish his tachyon amplifier and come help her on a seemingly routine expedition to the cave where Holly found the fossil, which is in the middle of nowhere. With cave gift shop owner Will Stanton ([[Danny McBride  they raft into the cave, where Marshall has detected high levels of tachyons. He activates the tachyon amplifier, triggering an earthquake that opens a time warp into which the raft falls. The group finds themselves in a desert, filled with various items from ma

In [280]:
character_list_processed = character_list.copy()
character_list_processed = character_list_processed.dropna()
#Lowercase character names
character_list_processed["4. Character name"] = character_list_processed["4. Character name"].str.lower()
#print(character_list_processed)
#Split name and surname to two columns
#character_list_processed["4. Character name"] = character_list_processed["4. Character name"].str.split('-').str[0] #corner case
character_list_processed["4. Character name"] = character_list_processed["4. Character name"].str.split(' ').str[0]


character_list_processed = character_list_processed.drop(character_list_processed[character_list_processed["4. Character name"] == "the"].index)
character_gender_stacked = character_list_processed.drop_duplicates(subset='4. Character name', keep=False)

print(character_list_processed)
print(character_gender_stacked)
#Stack two columns as rows as if surnames are different characters
#character_list_stacked = character_list_processed[["4. Character name", "4.5. Character surname"]].stack()

#Duplicate and stack gender column so that dataframe matches
#character_gender_stacked = pd.concat([character_list_processed["6. Actor gender"],character_list_processed["6. Actor gender"]])
#character_gender_stacked = pd.concat([pd.Series(character_list_stacked.to_numpy()), pd.Series(character_gender_stacked.to_numpy())], axis = 1)
character_gender_stacked_idx = character_gender_stacked.set_index("4. Character name")
#print(character_gender_stacked)
#print(tokens_freq)
#print(character_gender_stacked)
#print(tokens_freq.index)
#print(character_gender_stacked_idx.index.intersection(tokens_freq.index))
#print(tokens_freq[character_gender_stacked_idx.index.intersection(tokens_freq.index)])


tokens_intersection = tokens_freq[character_gender_stacked_idx.index.intersection(tokens_freq.index)]
character_gender_stacked = character_gender_stacked.drop_duplicates()
print(tokens_intersection)
print(character_gender_stacked)
#Get the intersection between the tokens and characters dataframe
character_mention_freq = character_gender_stacked[character_gender_stacked["4. Character name"].isin(tokens_intersection.index)]#.drop_duplicates()
print(tokens_intersection)
print("heree")
print(tokens_intersection.values)
print(character_mention_freq)
character_mention_freq["no_mention"] = tokens_intersection.values#pd.DataFrame({'4. Character name':character_mention_freq[0], 'no_mention':character_mention_freq.values})
character_mention_freq.columns = ["character_name", "gender", "no_mention"]
character_list_final = character_mention_freq
print(character_list_final)

#character_list1 = character_list[character_list["4. Character name"].isin(tokens_freq.index.intersection(character_list["4. Character name"]).to_numpy())]
#print(character_list1.sort_values(by=["4. Character name"]))


       4. Character name 6. Actor gender
262483          teenager               M
262484               dr.               M
262485             holly               F
262486              will               M
262487            cha-ka               M
262488              enik               M
262490               tar               M
262491               tar               F
262492           teacher               M
262493          teenager               M
262494         astronaut               M
262496             ernie               M
262497             barry               M
       4. Character name 6. Actor gender
262484               dr.               M
262485             holly               F
262486              will               M
262487            cha-ka               M
262488              enik               M
262492           teacher               M
262494         astronaut               M
262496             ernie               M
262497             barry               M
holly    7
will 

In [267]:
#print(character_list)
#character_list_freq_added = pd.merge(character_list, character_mention_freq, on="4. Character name", how="left")
#print(character_list_freq_added)

character_list_freq_added = character_list_final.groupby(['gender']).sum() 

if len(character_list_freq_added['no_mention'].index) != 0:
    if character_list_freq_added['no_mention'].shape[0] == 2:
        female_mention, male_mention = character_list_freq_added['no_mention'][0], character_list_freq_added['no_mention'][1] #groupby is alphabethic, index 0 = F
        mention_ratio = female_mention/(female_mention + male_mention)
    elif character_list_freq_added['no_mention'].index[0] == "M":
        mention_ratio = 0.
    elif character_list_freq_added['no_mention'].index[0] == "F":
        mention_ratio = 1.
    else:
        mention_ratio = np.nan
else:
    mention_ratio = np.nan
    
print(mention_ratio)

nan
