In [1]:
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import spacy

In [2]:
# Character metadata: load the raw TSV (no header row, so columns are labelled 0..n-1).
df = pd.read_csv('data/character.metadata.tsv', sep='\t', header=None)

# Keep only the character-name (column 3) and gender (column 5) fields,
# dropping rows where either one is missing.
df_name_gender = (
    df.iloc[:, [3, 5]]
    .dropna()
    .rename(columns={3: 'Name', 5: 'Gender'})
)
print(df_name_gender.head(5))

                         Name Gender
0                    Akooshay      F
1  Lieutenant Melanie Ballard      F
2         Desolation Williams      M
3          Sgt Jericho Butler      M
4             Bashira Kincaid      F


In [3]:
# Build the set of every lower-cased, whitespace-separated token that occurs
# in any character name (column 3 of the metadata table).
name_list = df.iloc[:, [3]].dropna().squeeze().tolist()
name_dict = {
    token.lower()
    for full_name in name_list
    for token in full_name.split()
}

In [4]:
# English vocabulary specific to a certain gender.
# Each file is split on every non-letter character and the remaining
# pieces are stored lower-cased.

# female_nouns.txt contains common words that are only used to refer to females, like 'mother'
female_words = set()
with open('data/female_nouns.txt', 'r') as file:
    for line in file:
        female_words.update(
            piece.lower() for piece in re.split('[^a-zA-Z]', line) if piece
        )

# male_nouns.txt contains common words that are only used to refer to males, like 'uncle'
male_words = set()
with open('data/male_nouns.txt', 'r') as file:
    for line in file:
        male_words.update(
            piece.lower() for piece in re.split('[^a-zA-Z]', line) if piece
        )

In [5]:
# Define a list of words representing females/males

# Gendered common nouns ('mother', 'uncle', ...) must not be treated as names.
# The union is built once here instead of once per name token.
gendered_nouns = female_words | male_words


def _collect_name_tokens(full_names, excluded_words):
    """Return the set of lower-cased tokens found in ``full_names``.

    Args:
        full_names: iterable of character-name strings.
        excluded_words: set of lower-cased words that must not be kept.

    Returns:
        A set of lower-cased whitespace-separated tokens, skipping tokens
        that are in ``excluded_words`` and tokens ending in "'s"
        (names like "Alice's father" would otherwise pollute the result).
    """
    tokens = set()
    for full_name in full_names:
        for raw_token in full_name.split():
            token = raw_token.lower()
            if token in excluded_words or token.endswith("'s"):
                continue
            tokens.add(token)
    return tokens


# females
df_female_name = df_name_gender[df_name_gender['Gender']=='F']['Name']
female_name_dict = _collect_name_tokens(df_female_name, gendered_nouns)
# Strip every non-letter character; this runs after the filter above, so a
# token carrying punctuation is only reduced to bare letters at this point.
female_cleaned_list = [re.sub('[^a-zA-Z]', '', s) for s in female_name_dict]


# males
df_male_name = df_name_gender[df_name_gender['Gender']=='M']['Name']
male_name_dict = _collect_name_tokens(df_male_name, gendered_nouns)
male_cleaned_list = [re.sub('[^a-zA-Z]', '', s) for s in male_name_dict]

In [6]:
# Some characters share a family name but have different genders:
# drop every token that was seen for both genders.
female_name_dict_cleaned = set(female_cleaned_list)
male_name_dict_cleaned = set(male_cleaned_list)

intersection_set = male_name_dict_cleaned.intersection(female_name_dict_cleaned)
male_name_dict = male_name_dict_cleaned.difference(intersection_set)
female_name_dict = female_name_dict_cleaned.difference(intersection_set)

# Same letter-only cleaning for the full name vocabulary.
cleaned_list = [re.sub('[^a-zA-Z]', '', s) for s in name_dict]
# First count: length of the cleaned list (still a list at this point).
print(len(cleaned_list))
cleaned_list = set(cleaned_list).difference(intersection_set)
# Second count: size of the set after removing the shared tokens.
print(len(cleaned_list))

68422
53838


In [7]:
# code for storing the names
# NOTE: everything below is wrapped in a triple-quoted string, so running this
# cell writes nothing; the cell only evaluates to (and displays) that string.
# 'data/female_name_dict1.txt' and 'data/male_name_dict1.txt' are read back by
# a later cell, so unless those files already exist on disk this code has to
# be enabled once to create them.
'''with open('data/names from movies.txt', 'w', encoding='utf-8') as file:
    for item in cleaned_list:
        file.write(item + '\n')
with open('data/female_name_dict1.txt', 'w', encoding='utf-8') as file:
    for name in female_name_dict:
        file.write(name + '\n')
with open('data/male_name_dict1.txt', 'w', encoding='utf-8') as file:
    for name in male_name_dict:
        file.write(name + '\n')'''

"with open('data/names from movies.txt', 'w', encoding='utf-8') as file:\n    for item in cleaned_list:\n        file.write(item + '\n')\nwith open('data/female_name_dict1.txt', 'w', encoding='utf-8') as file:\n    for name in female_name_dict:\n        file.write(name + '\n')\nwith open('data/male_name_dict1.txt', 'w', encoding='utf-8') as file:\n    for name in male_name_dict:\n        file.write(name + '\n')"

In [8]:
# data preprocessing
# English stop words from NLTK
stop_words = set(stopwords.words('english'))

# Words that stand for a female / male: gendered nouns plus gendered name tokens
female_names_words = female_words | female_name_dict
male_names_words = male_words | male_name_dict

# Common adjectives (one per line, stored stripped and lower-cased) must not be
# treated as names, so they are removed from both gender sets.
with open('data/common_adjs.txt', 'r') as file:
    adjs_set = {line.strip().lower() for line in file}
female_names_words -= adjs_set
male_names_words -= adjs_set

# Tokenize a summary, map gendered words to 'she' / 'he', and drop stop words.
def preprocess_text(text):
    """Turn raw text into a list of normalized tokens.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted, and the result is split on whitespace.
    Tokens found in ``stop_words`` are dropped (this check is made on the
    token itself, before any replacement). Remaining tokens found in
    ``female_names_words`` become 'she', tokens found in ``male_names_words``
    become 'he', and all other tokens are kept unchanged.

    Args:
        text: the string to process.

    Returns:
        list of str tokens.
    """
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    tokens = []
    for token in stripped.split():
        if token in stop_words:
            continue
        if token in female_names_words:
            tokens.append('she')
        elif token in male_names_words:
            tokens.append('he')
        else:
            tokens.append(token)
    return tokens

# Apply the function to the summaries.
# Each line is split at its first tab; only the part after the tab is processed.
documents = []
with open('data/plot_summaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        _leading_field, summary = line.split('\t', 1)
        documents.append(preprocess_text(summary))

In [13]:
# code to extract adjs(the outcome is stored in data/adjectives.txt, so you can skip this)
#
# Reload the gendered name sets from disk (one name per line, whitespace-stripped).
with open('data/female_name_dict1.txt', 'r', encoding='utf-8') as file:
    female_name_dict = {line.strip() for line in file}
with open('data/male_name_dict1.txt', 'r', encoding='utf-8') as file:
    male_name_dict = {line.strip() for line in file}

#
# data preprocessing, very time-consuming

# Words representing females/males. These assignments replace the sets built
# earlier, and common_adjs.txt entries are not subtracted here.
female_names_words = female_words | female_name_dict
male_names_words = male_words | male_name_dict
# Everything that must never be reported as an adjective.
remove_dict = stop_words | female_names_words | male_names_words
#from https://www.merriam-webster.com/thesaurus/

def preprocess_text_adj(text):
    """Extract the adjectives of ``text`` as lower-cased words.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    A token is kept when its tag starts with 'JJ', the token is purely
    alphabetic, and its lower-cased form is not in ``remove_dict``.

    Args:
        text: the string to process.

    Returns:
        list of lower-cased adjective strings, in order of appearance.
    """
    kept = []
    for token, tag in pos_tag(word_tokenize(text)):
        if not tag.startswith('JJ'):
            continue
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in remove_dict:
            kept.append(lowered)
    return kept


# Collect every adjective occurrence from every line of the summaries file
# into one flat list.
documents_adj = []
with open('data/plot_summaries.txt', 'r', encoding='utf-8') as file:
    for line in file:
        documents_adj.extend(preprocess_text_adj(line))

# Count occurrences in a single pass. Calling list.count() once per
# vocabulary word rescans the whole list each time.
word_counts = pd.Series(documents_adj, dtype='object').value_counts().to_dict()
vocabulary = list(word_counts)
word_count_df = pd.DataFrame(list(word_counts.items()), columns=['Word', 'Count'])
# only use the words that appear at least twice
documents_adj = list(word_count_df[word_count_df['Count'] > 1]['Word'])

In [14]:
# remove some words in case of incorrect classification

# file from https://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/
# Drop every entry of the female name list (lower-cased, stripped).
with open('data/female name.txt', 'r', encoding='utf-8') as file:
    lines = [line.lower().strip() for line in file]
documents_adj = set(documents_adj).difference(lines)

# Same for the male name list.
with open('data/male name.txt', 'r', encoding='utf-8') as file:
    lines = [line.lower().strip() for line in file]
documents_adj = set(documents_adj).difference(lines)


lines = []
'''with open('data/names from movies.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line=line.lower()
        lines.append(line.strip())
documents = set(documents)-(set(lines) & set(documents))'''

# This count is printed before the hand-picked words below are removed.
print(len(documents_adj))
# Hand-picked words to exclude from the adjective list.
other_common_words = {
    'child', 'orphan', 'baby', 'girls', 'lover', 'mute', 'bride',
    'housekeeper', 'cousin', 'neighbour', 'parent', 'sibling', 'pregnant',
    'housewife', 'relative', 'servant', 'waif', 'grandson', 'stranger',
    'courtesan', 'nurse', 'stepsister', 'grandchildren', 'relatives',
    'uncles', 'orphans', 'childbirth', 'scoundrel', 'foreigner',
    'stepfamily', 'tuberculosis', 'mallaya', 'triplet', 'outcast',
    'neighbours', 'huanhuan', 'lakshmiammal', 'jaipal', 'himal',
    'pasarian', 'leukemia',
}
documents_adj = set(documents_adj).difference(other_common_words)

8671


In [15]:
# Save the adjective list, one word per line.
with open('data/adjectives.txt', 'w', encoding='utf-8') as file:
    file.writelines(item + '\n' for item in documents_adj)

In [16]:
# Load adjectives.txt (extracted from the text): one whitespace-stripped entry per line.
with open('data/adjectives.txt', 'r', encoding='utf-8') as file:
    adjectives = [line.strip() for line in file]

In [17]:
# Train the word2vec model on the tokenized summaries.
# No seed is passed and four worker threads are used.
# NOTE(review): results may therefore vary between runs - confirm against the gensim docs.
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=documents, **w2v_params)

# Persist the trained model next to the notebook.
model.save("word2vec.model")

In [18]:
# Apply the model: reload it from disk and look up the vectors that are needed.
model = Word2Vec.load("word2vec.model")
word_vectors = model.wv

# Keep only adjectives that are in the model vocabulary, then fetch their vectors.
adjectives = [adj for adj in adjectives if adj in word_vectors.key_to_index]
adj_vectors = [word_vectors[adj] for adj in adjectives]

# Reference vectors for the two gender tokens.
vector_she = word_vectors['she']
vector_he = word_vectors['he']

In [21]:
# Example: cosine similarity of 'lovely' to each gender reference vector.
lovely_vector = model.wv['lovely'].reshape(1, -1)

similarity = cosine_similarity(vector_she.reshape(1, -1), lovely_vector)
print(f'similarity between female and lovely: {similarity}')

similarity = cosine_similarity(vector_he.reshape(1, -1), lovely_vector)
print(f'similarity between male and lovely: {similarity}')

similarity between female and lovely: [[0.42599806]]
similarity between male and lovely: [[0.15806998]]


In [20]:
# Cosine similarity of every adjective vector to the 'she' and 'he' vectors.
female_reference_vector = vector_she.reshape(1, -1)
male_reference_vector = vector_he.reshape(1, -1)

if adj_vectors:
    # Stack the adjective vectors once and let cosine_similarity handle all
    # rows in a single call, instead of one 1x1 call per adjective.
    adj_matrix = np.vstack(adj_vectors)
    female_similarities = list(cosine_similarity(female_reference_vector, adj_matrix)[0])
    male_similarities = list(cosine_similarity(male_reference_vector, adj_matrix)[0])
else:
    # np.vstack rejects an empty list; no adjectives means no similarities.
    female_similarities = []
    male_similarities = []

In [22]:
# One row per adjective, one column per gender reference.
similaritie = pd.DataFrame(
    {'Female': female_similarities, 'Male': male_similarities},
    index=adjectives,
)
print(similaritie)

                Female      Male
botched       0.115474  0.226528
labyrinthine  0.115607  0.125282
cruellest     0.184536  0.224918
sisterly      0.181152  0.139859
returning     0.310180  0.337617
...                ...       ...
impersonal    0.026452 -0.018584
cybertronian  0.123443  0.196504
turncoat      0.057478  0.170331
mumble        0.117950  0.115268
misfit        0.116231  0.064116

[8646 rows x 2 columns]


In [23]:
# Female-minus-male similarity: positive values lean towards 'she', negative towards 'he'.
similarity_diff = similaritie['Female'] - similaritie['Male']

# The variable names say "top 20", but 100 entries are selected at each end.
female_leaning = similarity_diff.sort_values(ascending=False)
top_20_female_adj = female_leaning.head(100).index
print(top_20_female_adj)

male_leaning = similarity_diff.sort_values(ascending=True)
top_20_male_adj = male_leaning.head(100).index
print(top_20_male_adj)

Index(['beautiful', 'loving', 'lovely', 'shy', 'somewhat', 'unhappy',
       'charming', 'intimate', 'passionate', 'fiancé', 'older', 'disturbed',
       'behaviour', 'fond', 'fiancée', 'neighbor', 'worrisome', 'shared',
       'lonely', 'neglected', 'affair', 'deeply', 'promiscuous', 'smitten',
       'emotional', 'younger', 'émilie', 'attracted', 'caring', 'sweetheart',
       'twin', 'divorced', 'jealousy', 'sexual', 'rude', 'precocious',
       'uncomfortable', 'grown', 'seduced', 'paternal', 'genuinely',
       'flirtatious', 'uterine', 'overbearing', 'loved', 'intrigued',
       'reserved', 'introverted', 'cared', 'socialite', 'extramarital',
       'domineering', 'infatuated', 'outgoing', 'cheerful', 'distressed',
       'agreeable', 'lifestyle', 'unmarried', 'overprotective', 'delighted',
       'engaged', 'insecure', 'possessive', 'thirtyish', 'neglectful',
       'teenage', 'imbalanced', 'spoiled', 'courtship', 'unfaithful',
       'incestous', 'desire', 'deaf', 'joyless', 'a