In [1]:
import pandas as pd
import numpy as np

movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv',sep='\t')

movie_metadata.columns = ['1. Wikipedia movie ID',
                          '2. Freebase movie ID',
                          '3. Movie name',
                          '4. Movie release date',
                          '5. Movie box office revenue',
                          '6. Movie runtime',
                          '7. Movie languages (Freebase ID:name tuples)',
                          '8. Movie countries (Freebase ID:name tuples)',
                          '9. Movie genres (Freebase ID:name tuples)']

character_metadata = pd.read_csv('MovieSummaries/character.metadata.tsv',sep='\t')

character_metadata.columns = ['1. Wikipedia movie ID',
                              '2. Freebase movie ID',
                              '3. Movie release date',
                              '4. Character name',
                              '5. Actor date of birth',
                              '6. Actor gender',
                              '7. Actor height (in meters)',
                              '8. Actor ethnicity (Freebase ID)',
                              '9. Actor name',
                              '10. Actor age at movie release',
                              '11. Freebase character/actor map ID',
                              '12. Freebase character ID',
                              '13. Freebase actor ID']

movie_metadata_bechdel = pd.read_csv("CMU_bechdel_added.csv")
print(movie_metadata_bechdel.shape)
movie_metadata_bechdel = movie_metadata_bechdel.drop("Unnamed: 0", axis=1)


character_metadata_bechdel = character_metadata.copy(deep = True)
print("Size before:", character_metadata_bechdel.shape)
character_metadata_bechdel = character_metadata_bechdel[character_metadata_bechdel['2. Freebase movie ID'].isin(movie_metadata_bechdel["2. Freebase movie ID"].to_numpy())]
print("Size after:", character_metadata_bechdel.shape)

(6521, 11)
Size before: (450668, 13)
Size after: (72458, 13)


In [2]:
print(movie_metadata_bechdel.shape)
movie_metadata_bechdel = movie_metadata_bechdel[movie_metadata_bechdel['2. Freebase movie ID'].isin(character_metadata_bechdel["2. Freebase movie ID"].to_numpy())]
print(movie_metadata_bechdel.shape)

(6521, 10)
(6202, 10)


In [3]:
plot_summaries=pd.read_csv('MovieSummaries/plot_summaries.txt', sep='\t', header=None, names=['id', 'plot_summary'])
plot_summaries_bechdel = plot_summaries[plot_summaries['id'].isin(character_metadata_bechdel['1. Wikipedia movie ID'].to_numpy())]

plot_summaries.head()
print(plot_summaries.shape)
print(plot_summaries_bechdel.shape)

(42303, 2)
(5194, 2)


In [68]:
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
tokenizer = RegexpTokenizer(r'\w+')
tokens_list=[]
for i, row in tqdm(plot_summaries.iterrows(), total=plot_summaries.shape[0]):
    tokens = tokenizer.tokenize(row[1])
    print(tokens)
    tokens = [x.lower() for x in tokens]
    print(tokens)
    tokens_list.append(tokens)
    print(tokens_list)
    input()

  0%|                                                                                        | 0/42303 [00:00<?, ?it/s]

['Shlykov', 'a', 'hard', 'working', 'taxi', 'driver', 'and', 'Lyosha', 'a', 'saxophonist', 'develop', 'a', 'bizarre', 'love', 'hate', 'relationship', 'and', 'despite', 'their', 'prejudices', 'realize', 'they', 'aren', 't', 'so', 'different', 'after', 'all']
['shlykov', 'a', 'hard', 'working', 'taxi', 'driver', 'and', 'lyosha', 'a', 'saxophonist', 'develop', 'a', 'bizarre', 'love', 'hate', 'relationship', 'and', 'despite', 'their', 'prejudices', 'realize', 'they', 'aren', 't', 'so', 'different', 'after', 'all']
[['shlykov', 'a', 'hard', 'working', 'taxi', 'driver', 'and', 'lyosha', 'a', 'saxophonist', 'develop', 'a', 'bizarre', 'love', 'hate', 'relationship', 'and', 'despite', 'their', 'prejudices', 'realize', 'they', 'aren', 't', 'so', 'different', 'after', 'all']]


  0%|                                                                                        | 0/42303 [02:20<?, ?it/s]


KeyboardInterrupt: Interrupted by user

In [190]:
movie_idx = 255
sample_text = plot_summaries_bechdel.iloc[movie_idx]["plot_summary"]
print(sample_text)
tokens = tokenizer.tokenize(sample_text)
tokens = [x.lower() for x in tokens]
tokens_freq = pd.Series(tokens).value_counts(sort=True)
print(tokens)
print(tokens_freq)
first_movie_id = plot_summaries_bechdel.iloc[movie_idx]["id"]
character_list = character_metadata_bechdel[character_metadata_bechdel['1. Wikipedia movie ID'] == first_movie_id][['4. Character name','6. Actor gender']]
print(character_list)



20-year-old Will Hunting  of South Boston has a genius-level intellect but chooses to work as a janitor at the Massachusetts Institute of Technology and spend his free time with his friends Chuckie Sullivan , Billy McBride  and Morgan O'Mally . When Fields Medal-winning combinatorialist Professor Gerald Lambeau  posts a difficult problem taken from algebraic graph theory as a challenge for his graduate students to solve, Will solves the problem quickly but anonymously. Lambeau posts a much more difficult problem and chances upon Will solving it, but Will runs off. Will meets Skylar , a British student about to graduate from Harvard University and pursue a graduate degree at Stanford University School of Medicine in California. Will is faced with incarceration after assaulting a man who had bullied him as a child. Lambeau arranges for Will to forgo jail time if he agrees to study mathematics under Lambeau's supervision and to see a therapist. Will agrees, but treats his first few therap

In [191]:
character_list_processed = character_list.copy()
character_list_processed = character_list_processed.dropna()
#Lowercase character names
character_list_processed["4. Character name"] = character_list_processed["4. Character name"].str.lower()

#Split name and surname to two columns
character_list_processed[["4. Character name", "4.5. Character surname"]] = character_list_processed["4. Character name"].str.split(' ', n=1, expand=True)

#Stack two columns as rows as if surnames are different characters
character_list_stacked = character_list_processed[["4. Character name", "4.5. Character surname"]].stack()

#Duplicate and stack gender column so that dataframe matches
character_gender_stacked = pd.concat([character_list_processed["6. Actor gender"],character_list_processed["6. Actor gender"]])
character_gender_stacked = pd.concat([pd.Series(character_list_stacked.to_numpy()), pd.Series(character_gender_stacked.to_numpy())], axis = 1)
character_gender_stacked_idx = character_gender_stacked.set_index(0)
#print(character_gender_stacked)
#print(tokens_freq)
#print(character_gender_stacked)
#print(tokens_freq.index)
#print(character_gender_stacked_idx.index.intersection(tokens_freq.index))
#print(tokens_freq[character_gender_stacked_idx.index.intersection(tokens_freq.index)])
tokens_intersection = tokens_freq[character_gender_stacked_idx.index.intersection(tokens_freq.index)]
#Get the intersection between the tokens and characters dataframe
character_mention_freq = character_gender_stacked[character_gender_stacked[0].isin(character_gender_stacked_idx.index.intersection(tokens_freq.index))]

character_mention_freq["no_mention"] = tokens_intersection.values#pd.DataFrame({'4. Character name':character_mention_freq[0], 'no_mention':character_mention_freq.values})
character_mention_freq.columns = ["character_name", "gender", "no_mention"]
character_list_final = character_mention_freq
print(character_list_final)

#character_list1 = character_list[character_list["4. Character name"].isin(tokens_freq.index.intersection(character_list["4. Character name"]).to_numpy())]
#print(character_list1.sort_values(by=["4. Character name"]))


   character_name gender  no_mention
2            sean      M          12
3         maguire      M           1
4            will      M          27
5         hunting      F           1
6         chuckie      M           3
7        sullivan      M           1
10         skylar      F           5
11         morgan      M           1
13          billy      M           1
14        mcbride      M           1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  character_mention_freq["no_mention"] = tokens_intersection.values#pd.DataFrame({'4. Character name':character_mention_freq[0], 'no_mention':character_mention_freq.values})


In [192]:
#print(character_list)
#character_list_freq_added = pd.merge(character_list, character_mention_freq, on="4. Character name", how="left")
#print(character_list_freq_added)

character_list_freq_added = character_list_final.groupby(['gender']).sum() 
if character_list_freq_added['no_mention'].shape[0] == 2:
    female_mention, male_mention = character_list_freq_added['no_mention'][0], character_list_freq_added['no_mention'][1] #groupby is alphabethic, index 0 = F
    mention_ratio = female_mention/(female_mention + male_mention)
elif character_list_freq_added['no_mention'].index[0] == "M":
    mention_ratio = 0.
elif character_list_freq_added['no_mention'].index[0] == "F":
    mention_ratio = 1.
else:
    mention_ratio = np.nan
    
print(mention_ratio)

0.11320754716981132
