# Setting up

In [90]:
import little_mallet_wrapper as lmw
import pandas as pd
import ast
from collections import Counter
import numpy as np

%matplotlib inline

path_to_mallet = '~/mallet/bin/mallet'
path_to_proj = '../'
title_to_train = "ovid_sent"
INPUT = "../topic-modeling/input/" + title_to_train + ".tsv"
output_directory_path = path_to_proj + 'topic-modeling/output/' + title_to_train

In [91]:
train_df = pd.read_csv(INPUT, sep='\t')
training_data = [lmw.process_string(t) for t in train_df['text'].tolist()]
training_data = [d for d in training_data if d.strip()]

# Analysis

### Examining topics

In [92]:
# Explore results for different number of topics
num_topics = 10

In [93]:
# Examining the top 10 keywords in each topic ---- 
topic_keys = lmw.load_topic_keys(output_directory_path + '/mallet.topic_keys.' + str(num_topics))
top_10 = []
for i, t in enumerate(topic_keys):
  print(i, '\t', ' '.join(t[:10]))
  top_10.append(' '.join(t[:10]))
  

0 	 old age youth woman lived follows flower steps woodland years
1 	 times three blood earth water fire without cut food herbs
2 	 hand spear back wound one left blood right threw sword
3 	 earth sky light air clouds sun chariot stars horses heavens
4 	 tree gold golden branches oak purple flowers leaves hair white
5 	 body arms hair hands like saw face head turned eyes
6 	 one would said could gods father though words love let
7 	 son city father king war walls name house troy great
8 	 incense rites bacchus sacred palace gods women altars god altar
9 	 sea waters waves river wild water nymphs woods deep wind


In [94]:
# Handlabeling the topics ---- 
topic_labels = ['aging', 'time-nature', 'fight', 'nature', 'golden-tree', 'body-part', 'night', 'royal-family', 'sacred', 'sea']
naming = {topic_labels[i]: top_10[i] for i in range(len(top_10))}
naming

{'aging': 'old age youth woman lived follows flower steps woodland years',
 'time-nature': 'times three blood earth water fire without cut food herbs',
 'fight': 'hand spear back wound one left blood right threw sword',
 'nature': 'earth sky light air clouds sun chariot stars horses heavens',
 'golden-tree': 'tree gold golden branches oak purple flowers leaves hair white',
 'body-part': 'body arms hair hands like saw face head turned eyes',
 'night': 'one would said could gods father though words love let',
 'royal-family': 'son city father king war walls name house troy great',
 'sacred': 'incense rites bacchus sacred palace gods women altars god altar',
 'sea': 'sea waters waves river wild water nymphs woods deep wind'}

### Inferring gender for characters in each document

In [95]:
# Pull list of characters from booknlp output ----
gender_info = pd.read_csv('../booknlp_output/gender_info.csv')
gender_info

Unnamed: 0.1,Unnamed: 0,index,name,names,gender,total_agent,total_object,perc_agent
0,1,499,Jupiter,"['Jupiter', 'Ammon', 'mighty Jupiter', 'Jupite...",he/him/his,76,21,0.783505
1,2,605,Jove,"['Jove', 'Hector', 'Ajax', 'mighty Jove', 'bra...",he/him/his,71,20,0.780220
2,3,1482,the god,"['the god', 'The god']",he/him/his,99,12,0.891892
3,5,519,Phoebus,"['Phoebus', 'Apollo', 'Phoebus Apollo']",he/him/his,69,8,0.896104
4,6,1835,the boy,"['the boy', 'The boy']",he/him/his,69,25,0.734043
...,...,...,...,...,...,...,...,...
927,924,9112,the natives,['the natives'],they/them/their,1,0,1.000000
928,925,5510,the women of Scythia,['the women of Scythia'],they/them/their,0,1,0.000000
929,927,1428,Romans,"['Romans', 'the Romans']",they/them/their,1,0,1.000000
930,928,5649,The terrified crowd,['The terrified crowd'],they/them/their,2,1,0.666667


In [96]:
# Create a dictionary of named characters and referential gender ---- 
all_names = {}
list_name = list(gender_info['names'])
list_gender = list(gender_info['gender'])
for i in range(len(gender_info)): 
  name = ast.literal_eval(list_name[i])
  for n in name: 
    all_names[n] = list_gender[i]

In [97]:
# Assigning gender to each document ---- 
docs = pd.read_csv(INPUT, sep='\t')
gender = []
for sent in docs['text']: 
  gender_sent = []
  for name in all_names.keys(): 
    if name in sent:
      gender_sent.append(all_names[name])
  gender.append(gender_sent)
docs['gender'] = gender
docs

Unnamed: 0.1,Unnamed: 0,title,text,gender
0,0,metamorphoses,I want to speak about bodies changed into new ...,[]
1,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]"
2,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]"
3,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]"
4,4,metamorphoses,"Though there was land and sea and air, it was...",[]
...,...,...,...,...
4893,4893,metamorphoses,Each is a father and a master,[he/him/his]
4894,4894,metamorphoses,"You gods, the friends of Aeneas, to whom fire...","[he/him/his, he/him/his, he/him/his, he/him/hi..."
4895,4895,metamorphoses,"And now the work is done, that Jupiter’s anger...","[he/him/his, she/her]"
4896,4896,metamorphoses,"Let that day, that only has power over my bod...",[]


In [98]:
# Assigning gender label to each sentence ----
gender = list(docs['gender'])
labels, masc, fem, neutral = [], [], [], []
for item in gender: 
  if len(item) == 1: 
    labels.append(item[0])
    if 'he/him/his' in item:
      masc.append(1)
      fem.append(0)
      neutral.append(0)
    elif 'she/her' in item:
      fem.append(1)
      masc.append(0)
      neutral.append(0)
    else: 
      neutral.append(1)
      masc.append(0)
      fem.append(0)
  elif len(item) == 0: 
    labels.append('none')
    masc.append(0)
    fem.append(0)
    neutral.append(0)
  else: 
    counter = dict(Counter(item))
    counter_perc = {k: v / len(item) for k, v in counter.items()} 
    #count = sorted(counter.items(), key=lambda x:x[1], reverse=True)[0][0]
    labels.append("mixed")
    if 'he/him/his' in item:
      masc.append(counter_perc['he/him/his'])
      fem.append(0)
      neutral.append(0)
    elif 'she/her' in item:
      fem.append(counter_perc['she/her'])
      masc.append(0)
      neutral.append(0)
    else: 
      neutral.append(counter_perc['they/them/their'])
      masc.append(0)
      fem.append(0)
docs['major'] = labels
docs['fem'] = fem
docs['masc'] = masc
docs['neutral'] = neutral
docs

Unnamed: 0.1,Unnamed: 0,title,text,gender,major,fem,masc,neutral
0,0,metamorphoses,I want to speak about bodies changed into new ...,[],none,0.000000,0.000000,0.0
1,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]",mixed,0.666667,0.000000,0.0
2,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]",mixed,0.750000,0.000000,0.0
3,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]",mixed,1.000000,0.000000,0.0
4,4,metamorphoses,"Though there was land and sea and air, it was...",[],none,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...
4893,4893,metamorphoses,Each is a father and a master,[he/him/his],he/him/his,0.000000,1.000000,0.0
4894,4894,metamorphoses,"You gods, the friends of Aeneas, to whom fire...","[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.727273,0.0
4895,4895,metamorphoses,"And now the work is done, that Jupiter’s anger...","[he/him/his, she/her]",mixed,0.000000,0.500000,0.0
4896,4896,metamorphoses,"Let that day, that only has power over my bod...",[],none,0.000000,0.000000,0.0


In [99]:
# Get top topics for each document ----
topic_distributions = lmw.load_topic_distributions(output_directory_path + '/mallet.topic_distributions.' + str(num_topics))
top = []
topic_dict = {'aging': [], 'time-nature': [], 'fight': [], 'nature': [], 'golden-tree': [], 'body-part': [], 'night': [], 'royal-family': [], 'sacred': [], 'sea': []}
for doc in topic_distributions: 
  ind_prob = {i: p for i, p in enumerate(doc)}
  top_1 = sorted(ind_prob.items(), key=lambda x:x[1], reverse=True)[0]
  top.append(list(naming.keys())[top_1[0]])
  for i in range(len(topic_dict)):
    topic_dict[list(naming.keys())[i]].append(doc[i])
docs_subset = docs.copy().iloc[:len(topic_distributions), :]        # PLACEHOLDER FOR MISMATCH ERROR
docs_subset['Top Topics'] = top
for i in range(len(topic_dict)):
  docs_subset[list(naming.keys())[i]] = topic_dict[list(naming.keys())[i]]
docs_subset

Unnamed: 0.1,Unnamed: 0,title,text,gender,major,fem,masc,neutral,Top Topics,aging,time-nature,fight,nature,golden-tree,body-part,night,royal-family,sacred,sea
0,0,metamorphoses,I want to speak about bodies changed into new ...,[],none,0.000000,0.000000,0.0,body-part,0.006349,0.013643,0.021533,0.015360,0.009141,0.666995,0.209023,0.022394,0.012027,0.023536
1,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]",mixed,0.666667,0.000000,0.0,night,0.002979,0.006402,0.010104,0.007208,0.299147,0.018134,0.628829,0.010509,0.005644,0.011044
2,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]",mixed,0.750000,0.000000,0.0,nature,0.144882,0.075421,0.006129,0.433597,0.002602,0.010999,0.309874,0.006374,0.003423,0.006699
3,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]",mixed,1.000000,0.000000,0.0,nature,0.001946,0.004182,0.006601,0.582580,0.002802,0.242995,0.064077,0.045390,0.003687,0.045740
4,4,metamorphoses,"Though there was land and sea and air, it was...",[],none,0.000000,0.000000,0.0,nature,0.003899,0.008378,0.013224,0.549667,0.005613,0.023733,0.205541,0.013753,0.007386,0.168806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4861,4861,metamorphoses,Is it a greater thing to have conquered the s...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.571429,0.0,royal-family,0.012767,0.027432,0.043297,0.030886,0.018380,0.077707,0.167603,0.550419,0.024183,0.047325
4862,4862,metamorphoses,"When Venus, the golden mother of Aeneas, saw t...","[he/him/his, he/him/his, he/him/his, she/her, ...",mixed,0.000000,0.600000,0.0,sacred,0.000887,0.001906,0.003008,0.002146,0.001277,0.005399,0.380342,0.213813,0.387934,0.003288
4863,4863,metamorphoses,Will I be the only one always to be troubled ...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.555556,0.0,night,0.003899,0.085554,0.090400,0.163786,0.005613,0.023733,0.514246,0.090929,0.007386,0.014454
4864,4864,metamorphoses,"Prevent them, I beg you, thwart this attempt,...",[they/them/their],they/them/their,0.000000,0.000000,1.0,night,0.002814,0.006045,0.009542,0.229557,0.004050,0.072812,0.649498,0.009924,0.005329,0.010429


In [100]:
# Retaining only documents with at least one gendered character ----
docs_filtered = docs_subset[docs_subset['gender'].str.len() > 0].reset_index(drop=True)
docs_filtered

Unnamed: 0.1,Unnamed: 0,title,text,gender,major,fem,masc,neutral,Top Topics,aging,time-nature,fight,nature,golden-tree,body-part,night,royal-family,sacred,sea
0,1,metamorphoses,"You, gods, since you are the ones who alter t...","[she/her, she/her, they/them/their]",mixed,0.666667,0.000000,0.0,night,0.002979,0.006402,0.010104,0.007208,0.299147,0.018134,0.628829,0.010509,0.005644,0.011044
1,2,metamorphoses,Before there was earth or sea or the sky that ...,"[she/her, she/her, she/her, they/them/their]",mixed,0.750000,0.000000,0.0,nature,0.144882,0.075421,0.006129,0.433597,0.002602,0.010999,0.309874,0.006374,0.003423,0.006699
2,3,metamorphoses,"There was no Titan yet, shining his light on ...","[she/her, she/her]",mixed,1.000000,0.000000,0.0,nature,0.001946,0.004182,0.006601,0.582580,0.002802,0.242995,0.064077,0.045390,0.003687,0.045740
3,5,metamorphoses,"Nothing retained its shape, one thing obstruc...","[he/him/his, she/her, she/her]",mixed,0.000000,0.333333,0.0,body-part,0.002411,0.005180,0.055892,0.148980,0.003471,0.396401,0.365660,0.008503,0.004566,0.008936
4,6,metamorphoses,This conflict was ended by a god and a greater...,[he/him/his],he/him/his,0.000000,1.000000,0.0,nature,0.002665,0.005727,0.009038,0.692197,0.003837,0.016221,0.193237,0.062150,0.005048,0.009879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,4861,metamorphoses,Is it a greater thing to have conquered the s...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.571429,0.0,royal-family,0.012767,0.027432,0.043297,0.030886,0.018380,0.077707,0.167603,0.550419,0.024183,0.047325
3318,4862,metamorphoses,"When Venus, the golden mother of Aeneas, saw t...","[he/him/his, he/him/his, he/him/his, she/her, ...",mixed,0.000000,0.600000,0.0,sacred,0.000887,0.001906,0.003008,0.002146,0.001277,0.005399,0.380342,0.213813,0.387934,0.003288
3319,4863,metamorphoses,Will I be the only one always to be troubled ...,"[he/him/his, he/him/his, he/him/his, he/him/hi...",mixed,0.000000,0.555556,0.0,night,0.003899,0.085554,0.090400,0.163786,0.005613,0.023733,0.514246,0.090929,0.007386,0.014454
3320,4864,metamorphoses,"Prevent them, I beg you, thwart this attempt,...",[they/them/their],they/them/their,0.000000,0.000000,1.0,night,0.002814,0.006045,0.009542,0.229557,0.004050,0.072812,0.649498,0.009924,0.005329,0.010429


In [101]:
# Get top topics for each gender ----
fem_major = docs_filtered[docs_filtered['major'] == 'she/her'].reset_index(drop=True)
fem_topics = fem_major['Top Topics'].tolist()
fem_counter = dict(Counter(fem_topics))
fem_top = sorted(fem_counter.items(), key=lambda x:x[1], reverse=True)[:3]
fem_top

[('night', 140), ('body-part', 54), ('sea', 31)]

In [102]:
# Get top topics for each gender ----
masc_major = docs_filtered[docs_filtered['major'] == 'he/him/his'].reset_index(drop=True)
masc_topics = masc_major['Top Topics'].tolist()
masc_counter = dict(Counter(masc_topics))
masc_top = sorted(masc_counter.items(), key=lambda x:x[1], reverse=True)[:3]
masc_top

[('night', 180), ('body-part', 57), ('royal-family', 41)]

In [103]:
# Get top topics for each gender ----
neu_major = docs_filtered[docs_filtered['major'] == 'they/them/their'].reset_index(drop=True)
neu_topics = neu_major['Top Topics'].tolist()
neu_counter = dict(Counter(neu_topics))
neu_top = sorted(neu_counter.items(), key=lambda x:x[1], reverse=True)[:3]
neu_top

[('night', 148), ('body-part', 57), ('sea', 55)]

In [104]:
# Get top topics for each gender (mixed) ----
mixed_major = docs_filtered[docs_filtered['major'] == 'mixed'].reset_index(drop=True)
mixed_topics = mixed_major['Top Topics'].tolist()
mixed_counter = dict(Counter(mixed_topics))
mixed_top = sorted(mixed_counter.items(), key=lambda x:x[1], reverse=True)[:3]
mixed_top

[('night', 891), ('body-part', 302), ('royal-family', 204)]

In [105]:
# Probability for each topics between gender groups ---- 
prob_dict = {}
for topic in topic_dict: 
  prob_dict[topic] = np.mean(masc_major[topic]) - np.mean(fem_major[topic])
prob_dict

{'aging': 0.00856638384136121,
 'time-nature': -4.032076599506246e-05,
 'fight': 0.013999533362520994,
 'nature': -0.004884253100708155,
 'golden-tree': -0.008962161434659353,
 'body-part': -0.0144914762521543,
 'night': -0.0072964465369699805,
 'royal-family': 0.017429196117334914,
 'sacred': -0.00034989204688937875,
 'sea': -0.00397056318384055}