# Setting up

In [13]:
import little_mallet_wrapper as lmw
import pandas as pd
import ast
from collections import Counter
import numpy as np
from nltk.corpus import stopwords
import json

%matplotlib inline

path_to_mallet = '~/mallet/bin/mallet'
path_to_proj = '../'
title_to_train = "ovid_sent"
INPUT = "../topic-modeling/input/" + title_to_train + ".tsv"
output_directory_path = path_to_proj + 'topic-modeling/output/' + title_to_train

In [14]:
# Preparing training data ---- 
train_df = pd.read_csv(INPUT, sep='\t')
stop=stopwords.words('english')
training_data = [lmw.process_string(t, remove_stop_words=True, stop_words=stop) for t in train_df['text'].tolist()]
training_data = [d for d in training_data if d.strip()] 

# Analysis

### Examining topics

In [15]:
# Explore results for different number of topics
num_topics = 10

In [16]:
# Examining the top 10 keywords in each topic ---- 
topic_keys = lmw.load_topic_keys(output_directory_path + '/mallet.topic_keys.' + str(num_topics))
top_10 = []
top_10_list = {}
for i, t in enumerate(topic_keys):
  top_10_list[t[0]] = t[:10]
  top_10.append(' '.join(t[:10]))
naming = {top_10[i].split()[0]: top_10[i] for i in range(len(top_10))}
naming

{'gods': 'gods said take power long many tell let time goddess',
 'tree': 'tree white changed wings branches hair form gold leaves water',
 'earth': 'earth air sky fire light sun clouds stars chariot world',
 'father': 'father mother said love daughter words son see girl god',
 'body': 'body arms back hand blood hair like hands face spear',
 'sacred': 'sacred wine incense altar rites horns gods king temple altars',
 'sea': 'sea waters waves nymphs river water ocean wind deep land',
 'son': 'son city king war father walls achilles also troy people',
 'wild': 'wild fields fierce birds cattle blood bull sight woods bird',
 'times': 'times three old nine four age seven lived black years'}

### Inferring gender for characters in each document

In [17]:
# Pull list of characters from booknlp output ----
gender_info = pd.read_csv('../booknlp_output/gender_info.csv')
gender_info

Unnamed: 0.1,Unnamed: 0,index,name,names,gender,total_agent,total_object,perc_agent,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,1,499,Jupiter,"['Jupiter', 'Ammon', 'mighty Jupiter', 'Jupite...",he/him/his,76,21,0.783505,,,,,
1,2,605,Jove,"['Jove', 'Hector', 'Ajax', 'mighty Jove', 'bra...",he/him/his,71,20,0.780220,,,,,
2,3,1482,the god,"['the god', 'The god']",he/him/his,99,12,0.891892,,,,,
3,5,519,Phoebus,"['Phoebus', 'Apollo', 'Phoebus Apollo']",he/him/his,69,8,0.896104,,,,,
4,6,1835,the boy,"['the boy', 'The boy']",he/him/his,69,25,0.734043,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
927,924,9112,the natives,['the natives'],they/them/their,1,0,1.000000,,,,,
928,925,5510,the women of Scythia,['the women of Scythia'],they/them/their,0,1,0.000000,,,,,
929,927,1428,Romans,"['Romans', 'the Romans']",they/them/their,1,0,1.000000,,,,,
930,928,5649,The terrified crowd,['The terrified crowd'],they/them/their,2,1,0.666667,,,,,


In [18]:
# Create a dictionary of named characters and referential gender ---- 
all_names = {}
list_name = list(gender_info['names'])
list_gender = list(gender_info['gender'])
for i in range(len(gender_info)): 
  name = ast.literal_eval(list_name[i])
  for n in name: 
    all_names[n] = list_gender[i]

In [19]:
# Assigning gender to each document ---- 
docs = pd.read_csv(INPUT, sep='\t')
gender = []
for sent in docs['text']: 
  gender_sent = []
  for name in all_names.keys(): 
    if name in sent:
      gender_sent.append(all_names[name])
  gender.append(gender_sent)
docs['gender'] = gender
docs

Unnamed: 0.1,Unnamed: 0,title,text,gender
0,0,metamorphoses,I want to speak about bodies changed into new ...,[]
1,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]"
2,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]"
3,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]"
4,4,metamorphoses,"Though there was land and sea and air, it was...",[]
...,...,...,...,...
4893,4893,metamorphoses,Each is a father and a master,[he/him/his]
4894,4894,metamorphoses,"You gods, the friends of Aeneas, to whom fire...","[he/him/his, he/him/his, he/him/his, he/him/hi..."
4895,4895,metamorphoses,"And now the work is done, that Jupiter’s anger...","[he/him/his, she/her]"
4896,4896,metamorphoses,"Let that day, that only has power over my bod...",[]


In [20]:
# Assigning gender label to each sentence ----
gender = list(docs['gender'])
labels, masc, fem, neutral = [], [], [], []
for item in gender: 
  if len(item) == 1: 
    labels.append(item[0])
    if 'he/him/his' in item:
      masc.append(1)
      fem.append(0)
      neutral.append(0)
    elif 'she/her' in item:
      fem.append(1)
      masc.append(0)
      neutral.append(0)
    else: 
      neutral.append(1)
      masc.append(0)
      fem.append(0)
  elif len(item) == 0: 
    labels.append('none')
    masc.append(0)
    fem.append(0)
    neutral.append(0)
  else: 
    counter = dict(Counter(item))
    counter_perc = {k: v / len(item) for k, v in counter.items()} 
    #count = sorted(counter.items(), key=lambda x:x[1], reverse=True)[0][0]
    labels.append("mixed")
    if 'he/him/his' in item:
      masc.append(counter_perc['he/him/his'])
      fem.append(0)
      neutral.append(0)
    elif 'she/her' in item:
      fem.append(counter_perc['she/her'])
      masc.append(0)
      neutral.append(0)
    else: 
      neutral.append(counter_perc['they/them/their'])
      masc.append(0)
      fem.append(0)
docs['major'] = labels
docs['fem'] = fem
docs['masc'] = masc
docs['neutral'] = neutral
docs

Unnamed: 0.1,Unnamed: 0,title,text,gender,major,fem,masc,neutral
0,0,metamorphoses,I want to speak about bodies changed into new ...,[],none,0.000000,0.000000,0.0
1,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]",mixed,0.666667,0.000000,0.0
2,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]",mixed,0.750000,0.000000,0.0
3,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]",mixed,1.000000,0.000000,0.0
4,4,metamorphoses,"Though there was land and sea and air, it was...",[],none,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...
4893,4893,metamorphoses,Each is a father and a master,[he/him/his],he/him/his,0.000000,1.000000,0.0
4894,4894,metamorphoses,"You gods, the friends of Aeneas, to whom fire...","[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.727273,0.0
4895,4895,metamorphoses,"And now the work is done, that Jupiter’s anger...","[he/him/his, she/her]",mixed,0.000000,0.500000,0.0
4896,4896,metamorphoses,"Let that day, that only has power over my bod...",[],none,0.000000,0.000000,0.0


In [21]:
# Get top topics for each document ----
topic_distributions = lmw.load_topic_distributions(output_directory_path + '/mallet.topic_distributions.' + str(num_topics))
top = []
topic_dict = {}
for topic in naming.keys(): 
  topic_dict[topic] = []
for doc in topic_distributions: 
  ind_prob = {i: p for i, p in enumerate(doc)}
  top_1 = sorted(ind_prob.items(), key=lambda x:x[1], reverse=True)[0]
  top.append(list(naming.keys())[top_1[0]])
  for i in range(len(topic_dict)):
    topic_dict[list(naming.keys())[i]].append(doc[i])
docs_subset = docs.copy().iloc[:len(topic_distributions), :]        
docs_subset['Top Topics'] = top
for i in range(len(topic_dict)):
  docs_subset[list(naming.keys())[i]] = topic_dict[list(naming.keys())[i]]
docs_subset.head()

Unnamed: 0.1,Unnamed: 0,title,text,gender,major,fem,masc,neutral,Top Topics,gods,tree,earth,father,body,sacred,sea,son,wild,times
0,0,metamorphoses,I want to speak about bodies changed into new ...,[],none,0.000000,0.000000,0.0,tree,0.031505,0.392655,0.018144,0.188885,0.170580,0.013044,0.019380,0.146231,0.011614,0.007962
1,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]",mixed,0.666667,0.000000,0.0,gods,0.779512,0.125963,0.008539,0.030068,0.021453,0.006139,0.009120,0.009995,0.005466,0.003747
2,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]",mixed,0.750000,0.000000,0.0,earth,0.009335,0.005238,0.783141,0.130041,0.050544,0.003865,0.005742,0.006293,0.003441,0.002359
3,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]",mixed,1.000000,0.000000,0.0,earth,0.048155,0.005439,0.736340,0.019660,0.052488,0.119396,0.005963,0.006535,0.003574,0.002450
4,4,metamorphoses,"Though there was land and sea and air, it was...",[],none,0.000000,0.000000,0.0,earth,0.021004,0.178445,0.678738,0.042596,0.030392,0.008696,0.012920,0.014159,0.007743,0.005308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4861,4861,metamorphoses,Is it a greater thing to have conquered the s...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.571429,0.0,father,0.063007,0.035351,0.036286,0.627720,0.091168,0.026087,0.038757,0.042474,0.023227,0.015923
4862,4862,metamorphoses,"When Venus, the golden mother of Aeneas, saw t...","[he/him/his, he/him/his, he/him/his, she/her, ...",mixed,0.000000,0.600000,0.0,son,0.232490,0.002481,0.072722,0.184405,0.006398,0.247443,0.002720,0.248593,0.001630,0.001118
4863,4863,metamorphoses,Will I be the only one always to be troubled ...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.555556,0.0,sacred,0.250149,0.010878,0.088086,0.270080,0.028054,0.315709,0.011926,0.013070,0.007147,0.004900
4864,4864,metamorphoses,"Prevent them, I beg you, thwart this attempt,...",[they/them/their],they/them/their,0.000000,0.000000,1.0,father,0.291774,0.007857,0.174727,0.306168,0.131370,0.005798,0.008613,0.009439,0.005162,0.059093


In [22]:
# Retaining only documents with at least one gendered character ----
docs_filtered = docs_subset[docs_subset['gender'].str.len() > 0].reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,title,text,gender,major,fem,masc,neutral,Top Topics,gods,tree,earth,father,body,sacred,sea,son,wild,times
0,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]",mixed,0.666667,0.000000,0.0,gods,0.779512,0.125963,0.008539,0.030068,0.021453,0.006139,0.009120,0.009995,0.005466,0.003747
1,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]",mixed,0.750000,0.000000,0.0,earth,0.009335,0.005238,0.783141,0.130041,0.050544,0.003865,0.005742,0.006293,0.003441,0.002359
2,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]",mixed,1.000000,0.000000,0.0,earth,0.048155,0.005439,0.736340,0.019660,0.052488,0.119396,0.005963,0.006535,0.003574,0.002450
3,5,metamorphoses,"Nothing retained its shape, one thing obstruc...","[he/him/his, she/her, she/her]",mixed,0.000000,0.333333,0.0,body,0.171157,0.165334,0.165531,0.079533,0.334977,0.005492,0.008160,0.061573,0.004890,0.003353
4,6,metamorphoses,This conflict was ended by a god and a greater...,[he/him/his],he/him/his,0.000000,1.000000,0.0,earth,0.171157,0.007443,0.744464,0.026903,0.019195,0.005492,0.008160,0.008943,0.004890,0.003353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,4861,metamorphoses,Is it a greater thing to have conquered the s...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.571429,0.0,father,0.063007,0.035351,0.036286,0.627720,0.091168,0.026087,0.038757,0.042474,0.023227,0.015923
3318,4862,metamorphoses,"When Venus, the golden mother of Aeneas, saw t...","[he/him/his, he/him/his, he/him/his, she/her, ...",mixed,0.000000,0.600000,0.0,son,0.232490,0.002481,0.072722,0.184405,0.006398,0.247443,0.002720,0.248593,0.001630,0.001118
3319,4863,metamorphoses,Will I be the only one always to be troubled ...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.555556,0.0,sacred,0.250149,0.010878,0.088086,0.270080,0.028054,0.315709,0.011926,0.013070,0.007147,0.004900
3320,4864,metamorphoses,"Prevent them, I beg you, thwart this attempt,...",[they/them/their],they/them/their,0.000000,0.000000,1.0,father,0.291774,0.007857,0.174727,0.306168,0.131370,0.005798,0.008613,0.009439,0.005162,0.059093


In [28]:
# Get top topics for each gender ----
fem_major = docs_filtered[docs_filtered['major'] == 'she/her']
doc_id_fem = fem_major['Unnamed: 0'].tolist()
top_fem = docs_subset.iloc[doc_id_fem, :]
fem = {}
for topic in naming.keys(): 
  fem[topic] = np.mean(top_fem[topic])
fem
fem_topics = fem_major['Top Topics'].tolist()
fem_counter = dict(Counter(fem_topics))
fem_top = sorted(fem_counter.items(), key=lambda x:x[1], reverse=True)[:3]
fem_major.head()

{'gods': 0.11137287141829871,
 'tree': 0.07872110255220995,
 'earth': 0.07242164375806788,
 'father': 0.27399045886954876,
 'body': 0.1967171219927189,
 'sacred': 0.04639507281095399,
 'sea': 0.07576785529166279,
 'son': 0.07457038805167827,
 'wild': 0.037069376243311886,
 'times': 0.03297410901154939}

In [29]:
# Get top topics for each gender ----
masc_major = docs_filtered[docs_filtered['major'] == 'he/him/his']
doc_id_masc = masc_major['Unnamed: 0'].tolist()
top_masc = docs_subset.iloc[doc_id_masc, :]
masc = {}
for topic in naming.keys(): 
  masc[topic] = np.mean(top_masc[topic])
masc
#masc_topics = masc_major['Top Topics'].tolist()
#masc_counter = dict(Counter(masc_topics))
#masc_top = sorted(masc_counter.items(), key=lambda x:x[1], reverse=True)[:5]
#masc_top

{'gods': 0.11365672740422528,
 'tree': 0.07074380738470698,
 'earth': 0.06370525262070303,
 'father': 0.25295006546320437,
 'body': 0.18811163897634445,
 'sacred': 0.06169029040415073,
 'sea': 0.07869083506672898,
 'son': 0.09636711765851419,
 'wild': 0.048200563536068906,
 'times': 0.025883701485353863}

In [35]:
# Examining delta T(t) ----
for item in masc: 
   print(item, fem[item] - masc[item])

gods -0.002283855985926564
tree 0.007977295167502965
earth 0.008716391137364846
father 0.021040393406344393
body 0.008605483016374454
sacred -0.015295217593196742
sea -0.002922979775066198
son -0.021796729606835916
wild -0.01113118729275702
times 0.007090407526195525


In [33]:
# Get top topics for each gender ----
neu_major = docs_filtered[docs_filtered['major'] == 'they/them/their'].reset_index(drop=True)
neu_topics = neu_major['Top Topics'].tolist()
neu_counter = dict(Counter(neu_topics))
neu_top = sorted(neu_counter.items(), key=lambda x:x[1], reverse=True)[:5]
neu_top

[('father', 119), ('body', 71), ('gods', 43), ('sea', 39), ('earth', 33)]

In [34]:
# Get top topics for each gender (mixed) ----
mixed_major = docs_filtered[docs_filtered['major'] == 'mixed'].reset_index(drop=True)
mixed_topics = mixed_major['Top Topics'].tolist()
mixed_counter = dict(Counter(mixed_topics))
mixed_top = sorted(mixed_counter.items(), key=lambda x:x[1], reverse=True)[:5]
mixed_top

[('father', 652), ('body', 433), ('gods', 254), ('son', 199), ('sea', 133)]

In [35]:
# Probability for each topics between gender groups ---- 
prob_dict = {}
for topic in topic_dict: 
  prob_dict[topic] = np.mean(masc_major[topic]) - np.mean(fem_major[topic])
prob_dict

{'gods': 0.002283855985926564,
 'tree': -0.007977295167502965,
 'earth': -0.008716391137364846,
 'father': -0.021040393406344393,
 'body': -0.008605483016374454,
 'sacred': 0.015295217593196742,
 'sea': 0.002922979775066198,
 'son': 0.021796729606835916,
 'wild': 0.01113118729275702,
 'times': -0.007090407526195525}

### Verify gender-topic association with WEAT 

In [36]:
fem_top[0][0]

'father'

In [37]:
everything = {"Ovid": dict()}
everything['Ovid']['method'] = 'weat_single'
everything['Ovid']["attributes"] = "Females attributes vs Male attributes"
everything['Ovid']["targets"] = "Topic 1"
everything['Ovid']["X_key"] = "Topic 1"
everything['Ovid']["A_key"] = "Female attributes"
everything['Ovid']["B_key"] = "Male attributes"
everything['Ovid']['Female attributes'] = ["female","woman","girl","sister","she","her","hers","daughter"]
everything['Ovid']['Male attributes'] = ["male","man","boy","brother","he","him","his","son"]
everything['Ovid']['Topic 1'] = top_10_list[fem_top[0][0]]


json_object = json.dumps(everything, indent = 4)
with open("./weat/ovid.json", "w") as outfile:
  outfile.write(json_object)