In [3]:
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import nltk


The first step we are going to follow is to extract the character names to then be able to change them to the gender of the character. For now we just take the whole character dataset and replace it in the summaries. However this can lead to error for example in the summary used as example the word different  is counted as a male character and is replaced by he while in this context it's not a male character.

In [5]:
# character name
# use the data of character name & gender
df = pd.read_csv('data/character.metadata.tsv', sep='\t', header=None)
name_list = df.iloc[:, [3]].dropna()
df_name_gender = df.iloc[:, [3, 5]].dropna()
df_name_gender.columns = ['Name','Gender']
df_female_name = df_name_gender[df_name_gender['Gender']=='F']['Name']
df_male_name = df_name_gender[df_name_gender['Gender']=='M']['Name']
name_list=name_list.squeeze().tolist()
name_dict = set()

# build a name dict
for item in list(name_list):
    names = item.split()
    for name in names:
        # remove the names like 'Alice's father' that can have a negative impact on the result
            name_dict.add(name.lower())

cleaned_list = [re.sub('[^a-zA-Z]', '', s) for s in name_dict]
with open('data/names from movies.txt', 'w', encoding='utf-8') as file:
    for item in cleaned_list:
        file.write(item + '\n')

In [7]:
# create a set to collect the names of female and male characters
female_name_dict = set()
male_name_dict = set()

# build a name dict
for item in df_female_name:
    names = item.split()
    for name in names:
        # remove the names like 'Alice's father' that can have a negative impact on the result
        if not name.lower().endswith("'s"):
            female_name_dict.add(name.lower())
for item in df_male_name:
    names = item.split()
    for name in names:
        if not name.lower().endswith("'s"):
            male_name_dict.add(name.lower())

#some characters have the same family name but different gender
print(f'length of female name dict before:{len(female_name_dict)}')
intersection_set = female_name_dict & male_name_dict 
female_name_dict = female_name_dict - intersection_set
male_name_dict = male_name_dict - intersection_set
print(f'length of female name dict after:{len(female_name_dict)}')

length of female name dict before:26452
length of female name dict after:15520


In [44]:
# data preprocessing
# Set the stopwords
stop_words = set(stopwords.words('english'))

# Define a list of words representing females/males
#from https://www.merriam-webster.com/thesaurus/
female_words = set(['woman', 'girl', 'lady', 'female', 'daughter', 'mother', 'sister','girlfriend','maid','virgin','damsel','madame','senorita','princess','queen','her'])
female_names_words = female_words | female_name_dict
male_words = set(['man', 'boy', 'gentleman', 'male', 'son', 'father', 'brother','boyfriend','beau','husband','dude','lad','prince','king','him'])
male_names_words = male_words | male_name_dict


def preprocess_text(text):
    # lower the words and remove punctuation
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # tokenize, replace words representing females with 'she' and males with 'he', and remove stopwords.
    return ['she' if word in female_names_words else 'he' if word in male_names_words else word for word in text.split() if word not in stop_words]

documents = []


with open('data/plot_summaries.txt', 'r', encoding='utf-8') as file:
    for line_number,line in enumerate(file):
        _, text = line.split('\t', 1)
        processed_text = preprocess_text(text)
        documents.append(processed_text)
        if line_number< 1:
            print(text)

Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.



In [45]:
documents[0]

['shlykov',
 'hardworking',
 'taxi',
 'driver',
 'lyosha',
 'saxophonist',
 'develop',
 'he',
 'lovehate',
 'relationship',
 'despite',
 'prejudices',
 'realize',
 'arent',
 'he']

In [9]:
# load the data adjectives.txt (extracted from the text in the extract_adj.ipynb)
adjectives = []

with open('data/adjectives.txt', 'r', encoding='utf-8') as file:
    for line in file:
        adjectives.append(line.strip())

In [17]:
# train the word2vec model using the skip-gram architecture
model = Word2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)

# save the model
model.save("word2vec.model")

In [18]:
# word to vector
model = Word2Vec.load("word2vec.model")
adjectives = [word for word in adjectives if word in model.wv.key_to_index]
adj_vectors = [model.wv[word] for word in adjectives]
vector_she = model.wv['she']
vector_he = model.wv['he']

In [20]:
female_reference_vector = vector_she.reshape(1, -1)
female_similarities = [cosine_similarity(female_reference_vector, vec.reshape(1, -1))[0][0] for vec in adj_vectors]

In [21]:
male_reference_vector = vector_he.reshape(1, -1)
male_similarities = [cosine_similarity(male_reference_vector, vec.reshape(1, -1))[0][0] for vec in adj_vectors]

In [22]:
similaritie = pd.DataFrame({'Female':female_similarities,'Male':male_similarities})
similaritie.index=adjectives
print(similaritie)

                Female      Male
metallic      0.050491  0.122352
plague        0.036093  0.102109
humanitarian  0.024778  0.100234
undeterred    0.344810  0.341719
nicer         0.439288  0.292362
...                ...       ...
whimper       0.009873  0.052732
faraway       0.107892  0.137066
calligraphy   0.082331  0.084284
recreational  0.115089  0.113648
biased       -0.012671  0.025984

[13252 rows x 2 columns]


In [23]:
top_20_female_adj = similaritie['Female'].sort_values(ascending=False).head(100).index
top_20_male_adj = similaritie['Male'].sort_values(ascending=False).head(100).index
print(top_20_female_adj)
print(top_20_male_adj)

Index(['longer', 'overjoyed', 'anymore', 'slept', 'relieved', 'shed',
       'meantime', 'worried', 'nevertheless', 'distressed', 'broke', 'sorry',
       'forgotten', 'sadly', 'somebody', 'reconnect', 'misunderstanding',
       'apologizes', 'pleasurable', 'ruined', 'thinks', 'gidget', 'reconcile',
       'none', 'confused', 'though', 'embarrassed', 'belong', 'want',
       'changed', 'thrilled', 'ashamed', 'stunned', 'loved', 'wont', 'pleased',
       'calm', 'neglected', 'depressed', 'somehow', 'mistake', 'touch',
       'genuinely', 'sees', 'tired', 'bothered', 'comfortable', 'missed',
       'surprise', 'neither', 'hug', 'excited', 'suitable', 'excitement',
       'suddenly', 'worst', 'leave', 'knowing', 'afterwards', 'bother',
       'devastated', 'almost', 'amazed', 'expected', 'élizabeth', 'pretending',
       'unwell', 'already', 'staying', 'properly', 'happier', 'cant',
       'otherwise', 'pretend', 'clearly', 'closer', 'hate', 'terribly',
       'prettier', 'dejected', 'sob