In [2]:
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import nltk

In [3]:
# character name
# use the data of character name & gender
df = pd.read_csv('data/character.metadata.tsv', sep='\t', header=None)
name_list = df.iloc[:, [3]].dropna()
df_name_gender = df.iloc[:, [3, 5]].dropna()
df_name_gender.columns = ['Name','Gender']
df_female_name = df_name_gender[df_name_gender['Gender']=='F']['Name']
df_male_name = df_name_gender[df_name_gender['Gender']=='M']['Name']
name_list=name_list.squeeze().tolist()
name_dict = set()

# build a name dict
for item in list(name_list):
    names = item.split()
    for name in names:
        # remove the names like 'Alice's father' that can have a negative impact on the result
            name_dict.add(name.lower())

cleaned_list = [re.sub('[^a-zA-Z]', '', s) for s in name_dict]
with open('data/names from movies.txt', 'w', encoding='utf-8') as file:
    for item in cleaned_list:
        file.write(item + '\n')

In [4]:
# create a set to collect the names of female and male characters
female_name_dict = set()
male_name_dict = set()

# build a name dict
for item in df_female_name:
    names = item.split()
    for name in names:
        # remove the names like 'Alice's father' that can have a negative impact on the result
        if not name.lower().endswith("'s"):
            female_name_dict.add(name.lower())
for item in df_male_name:
    names = item.split()
    for name in names:
        if not name.lower().endswith("'s"):
            male_name_dict.add(name.lower())

#some characters have the same family name but different gender
print(f'length of female name dict before:{len(female_name_dict)}')
intersection_set = female_name_dict & male_name_dict 
female_name_dict = female_name_dict - intersection_set
male_name_dict = male_name_dict - intersection_set
print(f'length of female name dict after:{len(female_name_dict)}')

length of female name dict before:26452
length of female name dict after:15520


In [53]:
# data preprocessing
# Set the stopwords
stop_words = set(stopwords.words('english'))

# Define a list of words representing females/males
female_words = set(['woman', 'girl', 'lady', 'female', 'daughter', 'mother', 'sister','girlfriend','maid','virgin','damsel','madame','senorita','princess','queen','her'])
female_names_words = female_words | female_name_dict
male_words = set(['man', 'boy', 'gentleman', 'male', 'son', 'father', 'brother','boyfriend','beau','husband','dude','lad','prince','king','him'])
male_names_words = male_words | male_name_dict

#from https://www.merriam-webster.com/thesaurus/

def preprocess_text(text):
    # lower the words and remove punctuation
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # tokenize, replace words representing females with 'she' and males with 'he', and remove stopwords.
    return ['she' if word in female_names_words else 'he' if word in male_names_words else word for word in text.split() if word not in stop_words]

documents = []

with open('data/plot_summaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        _, text = line.split('\t', 1)
        processed_text = preprocess_text(text)
        documents.append(processed_text)

In [19]:
# load the data adjectives.txt (extracted from the text)
adjectives = []

with open('data/adjectives.txt', 'r', encoding='utf-8') as file:
    for line in file:
        adjectives.append(line.strip())

In [20]:
print(len(adjectives))

13322


In [106]:
# train the word2vec model
model = Word2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)

# save the model
model.save("word2vec.model")

In [21]:
# word to vector
model = Word2Vec.load("word2vec.model")
adjectives = [word for word in adjectives if word in model.wv.key_to_index]
adj_vectors = [model.wv[word] for word in adjectives]
vector_she = model.wv['she']
vector_he = model.wv['he']

In [14]:
# example: brave
similarity = cosine_similarity(vector_she.reshape(1,-1), model.wv['brave'].reshape(1,-1))
print(f'similarity between female and brave: {similarity}')
similarity = cosine_similarity(vector_he.reshape(1,-1), model.wv['brave'].reshape(1,-1))
print(f'similarity between male and brave: {similarity}')

similarity between female and brave: [[0.22432798]]
similarity between male and brave: [[0.3258356]]


In [22]:
female_reference_vector = vector_she.reshape(1, -1)
female_similarities = [cosine_similarity(female_reference_vector, vec.reshape(1, -1))[0][0] for vec in adj_vectors]

In [23]:
male_reference_vector = vector_he.reshape(1, -1)
male_similarities = [cosine_similarity(male_reference_vector, vec.reshape(1, -1))[0][0] for vec in adj_vectors]

In [24]:
similaritie = pd.DataFrame({'Female':female_similarities,'Male':male_similarities})
similaritie.index=adjectives
print(similaritie)

                Female      Male
metallic     -0.048140 -0.082806
plague       -0.003679  0.050497
humanitarian  0.044830  0.094047
undeterred    0.186797  0.177037
nicer         0.408380  0.356255
...                ...       ...
whimper       0.072018  0.114505
faraway       0.134394  0.108107
calligraphy   0.082819  0.008272
recreational -0.041819 -0.113353
biased       -0.049455  0.019377

[13273 rows x 2 columns]


In [25]:
top_20_female_adj = similaritie['Female'].sort_values(ascending=False).head(100).index
top_20_male_adj = similaritie['Male'].sort_values(ascending=False).head(100).index
print(top_20_female_adj)
print(top_20_male_adj)

Index(['vogish', 'sickly', 'infertile', 'neglected', 'orphaned', 'unmarried',
       'flighty', 'unendurable', 'healthy', 'caring', 'irreducible',
       'destitute', 'forlorn', 'unhappy', 'swaddled', 'morbidly', 'cared',
       'inhibited', 'autistic', 'unloved', 'unloving', 'livelier',
       'submissive', 'demure', 'genuinely', 'disinherited', 'promiscuous',
       'distressed', 'bedside', 'businesswoman', 'invalid', 'betrothed',
       'sublet', 'impregnated', 'agnès', 'pubescent', 'cervical', 'hasidic',
       'remarried', 'penniless', 'seduced', 'neglectful', 'incestual',
       'hateful', 'ladies', 'yearnful', 'overjoyed', 'grown', 'flattered',
       'spiritful', 'illiterate', 'frédéric', 'nicer', 'dutiful', 'fond',
       'parentless', 'virtuous', 'unpolished', 'remarry', 'unwanted',
       'noblewoman', 'unwell', 'precocious', 'overbearing', 'belated',
       'dissolute', 'accommodating', 'immdiately', 'bargirl', 'incestous',
       'unattractive', 'unfaithful', 'affectionate