This notebook explores the output of BookNLP to measure differences in gender representation (both in the number of major characters and in the actions that men and women carry out) in 94 Pulitzer Prize nominees.

In [None]:
import operator
from collections import Counter
import math
from os import path
import json

In [None]:
def read_metadata(filename):
    """Load a tab-separated metadata file into a dict keyed by book id.

    Each line supplies, in order: id, date, author, title, author gender.
    The date column is converted to int; any further columns are ignored.
    Returns {id: (date, author, title, author_gender)}.
    """
    records = {}
    with open(filename) as handle:
        for raw_line in handle:
            fields = raw_line.rstrip().split("\t")
            book_id = fields[0]
            records[book_id] = (int(fields[1]), fields[2], fields[3], fields[4])
    return records

In [None]:
# Per-book metadata keyed by book id: (date, author, title, author_gender).
metadata=read_metadata("../data/pulitzer_metadata.txt")

In [None]:
def get_num_characters_by_gender(data, minimum_number_of_mentions=5):

    """ Count the characters of each gender who are mentioned by proper name
    at least minimum_number_of_mentions times.

    data is one book's parsed output; each entry of data["characters"] must
    provide "NNPcount" (proper-name mention count) and "g" (gender code, where
    1 is counted as a woman and 2 as a man; any other code is ignored).

    Returns a tuple (f, m): the number of women and the number of men. """

    f=m=0
    for character in data["characters"]:
        proper_name_count=character["NNPcount"]
        gender=character["g"]
        if proper_name_count >= minimum_number_of_mentions:
            if gender == 1:
                f+=1
            elif gender == 2:
                # Each qualifying character counts exactly once.
                m+=1
    return f, m

In [None]:
def get_gendered_actions(alldata, category):

    """ Get the counts of terms that men and women characters are associated with
    across all books, according to category.
    category options are: agent, patient, mod, poss

    alldata maps a book id to that book's parsed output; every gendered character
    must provide a list under the category key whose items carry the term under "w".

    Returns a tuple (m_counts, f_counts) of Counter objects. Characters whose
    gender code is neither 1 (women) nor 2 (men) are ignored. """

    m_counts=Counter()
    f_counts=Counter()

    # Explicit code -> counter map, so an unexpected gender code can never be
    # tallied into whichever counter the previous character happened to use.
    counters_by_gender={1: f_counts, 2: m_counts}

    for data in alldata.values():
        for character in data["characters"]:
            counter=counters_by_gender.get(character["g"])
            if counter is None:
                continue
            counter.update(term["w"] for term in character[category])

    return m_counts, f_counts

In [None]:
def get_top_characters_by_gender(data, n=5):

    """ Count women and men among the n characters with the highest proper-name
    mention counts.

    Only characters with a gender code of at least 1 are ranked. Ranking is by
    (NNPcount, gender) in descending order. Returns (f, m), where f counts
    gender code 1 and m counts gender code 2 within the top n. """

    # Keyed by character id so a repeated id keeps only its latest entry.
    ranked_pool={}
    for character in data["characters"]:
        mentions=character["NNPcount"]
        gender_code=character["g"]
        if gender_code >= 1:
            ranked_pool[character["id"]]=(mentions, gender_code)

    ordered=sorted(ranked_pool.values(), reverse=True)
    top_genders=[gender_code for _, gender_code in ordered[:n]]

    return top_genders.count(1), top_genders.count(2)

In [None]:
def calculate_differences(maleCounter, femaleCounter, display=25):

    """ Function that takes two Counter objects as inputs and prints out a ranked list of terms
    more characteristic of the first counter than the second, followed by the terms more
    characteristic of the second.  Here we'll use log-odds with an uninformative prior
    (from Monroe et al 2008, "Fightin Words", eqn. 22) as our metric.

    maleCounter, femaleCounter: Counter objects mapping a term to its count.
    display: how many terms to print at each end of the ranking.

    Prints the results; returns None.
    """

    # Union of both vocabularies (first counter's terms first).
    vocab=dict(maleCounter)
    vocab.update(dict(femaleCounter))
    maleSum=sum(maleCounter.values())
    # Each counter is normalised by its own total.
    femaleSum=sum(femaleCounter.values())

    ranks={}
    alpha=0.01  # smoothing pseudo-count added to every term
    alphaV=len(vocab)*alpha

    for word in vocab:

        male_log_odds=math.log( (maleCounter[word] + alpha) / (maleSum+alphaV-maleCounter[word]-alpha) )
        female_log_odds=math.log( (femaleCounter[word] + alpha) / (femaleSum+alphaV-femaleCounter[word]-alpha) )
        variance=1./(maleCounter[word] + alpha) + 1./(femaleCounter[word] + alpha)

        # z-score: log-odds ratio scaled by its estimated standard deviation
        ranks[word]=(male_log_odds-female_log_odds)/math.sqrt(variance)

    sorted_x = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)

    print("Most male:")
    for k,v in sorted_x[:display]:
        print("%.3f\t%s" % (v,k))

    print("\nMost female:")
    # Reverse first, then slice: sorted_x[-display:] would return the whole
    # list when display is 0.
    for k,v in sorted_x[::-1][:display]:
        print("%.3f\t%s" % (v,k))

In [None]:
def read_data(data_folder):
    """Load the parsed JSON output for every book listed in the global metadata.

    Looks for "<id>.book" inside data_folder for each metadata id and silently
    skips ids whose file does not exist. Prints how many books were read and
    returns {id: parsed JSON}.
    """
    books = {}

    for idd in metadata:
        book_path = path.join(data_folder, "%s.book" % idd)
        if not path.exists(book_path):
            continue
        with open(book_path) as handle:
            books[idd] = json.load(handle)

    print(f"Read {len(books)} books")
    return books

In [None]:
# Parsed BookNLP output keyed by book id; ids without a .book file are absent.
data=read_data("../data/pulitzer_booknlp")

Let's use that data to compare how many men and women characters (mentioned by name at least 10 times) there are in books written by men and women authors. 

In [None]:
# Average, per author gender, the share of women among characters mentioned
# by proper name at least 10 times.
all_author_M=0.
all_author_F=0.
M_n=0
F_n=0

# Sorting on the metadata tuple orders books by date, then author, then title.
sorted_x = sorted(metadata.items(), key=operator.itemgetter(1), reverse=False)

for idd, (date, author, title, author_gender) in sorted_x:
    # read_data() skips books without a .book file, so not every id is in data.
    if idd not in data:
        continue

    f, m=get_num_characters_by_gender(data[idd], minimum_number_of_mentions=10)
    # Books with no qualifying characters have no ratio and are left out of the averages.
    if f+m > 0:
        if author_gender == "M":
            all_author_M+=float(f)/(f+m)
            M_n+=1
        else:
            all_author_F+=float(f)/(f+m)
            F_n+=1

    print(f"{f}\t{m}\t{author_gender}\t{date}\t{author}\t{title}")

# Report nan instead of raising ZeroDivisionError when a group has no books.
pct_F=100*all_author_F/F_n if F_n else float("nan")
pct_M=100*all_author_M/M_n if M_n else float("nan")

print(f"\nRatio of major women characters by author gender:\n")
print(f"Books written by women: {pct_F:.1f}% of characters are women (n={F_n})")
print(f"Books written by men:   {pct_M:.1f}% of characters are women (n={M_n})")

Now let's focus on just the major characters -- the 5 characters mentioned most frequently by proper name. Is there a meaningful difference in how often men and women authors write major characters who are women?

In [None]:
# Average, per author gender, the share of women among each book's
# 5 most frequently named characters.
all_author_M=0.
all_author_F=0.
M_n=0
F_n=0

# Sorting on the metadata tuple orders books by date, then author, then title.
sorted_x = sorted(metadata.items(), key=operator.itemgetter(1), reverse=False)

for idd, (date, author, title, author_gender) in sorted_x:
    # read_data() skips books without a .book file, so not every id is in data.
    if idd not in data:
        continue

    f, m=get_top_characters_by_gender(data[idd], n=5)
    # A book with no gendered characters has no ratio; skip it rather than divide by zero.
    if f+m > 0:
        if author_gender == "M":
            all_author_M+=float(f)/(f+m)
            M_n+=1
        else:
            all_author_F+=float(f)/(f+m)
            F_n+=1

    print(f"{f}\t{m}\t{author_gender}\t{date}\t{author}\t{title}")

# Report nan instead of raising ZeroDivisionError when a group has no books.
pct_F=100*all_author_F/F_n if F_n else float("nan")
pct_M=100*all_author_M/M_n if M_n else float("nan")

print(f"\nRatio of major women characters by author gender:\n")
print(f"Books written by women: {pct_F:.1f}% of major characters are women (n={F_n})")
print(f"Books written by men:   {pct_M:.1f}% of major characters are women (n={M_n})")

Now let's see what men and women *do* as characters in these novels.  Explore this for other categories as well -- e.g., to see what men and women *possess*, switch "agent" with "poss".

In [None]:
# Tally the "agent" terms of men and women characters across all loaded books.
m_counts, f_counts=get_gendered_actions(data, "agent")

In [None]:
# Print the 25 terms most characteristic of each gender.
calculate_differences(m_counts, f_counts, display=25)

How would you break apart these characteristic actions by author gender?