In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline

import glob
import nltk
from string import punctuation

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


DATA_FOLDER = 'dataset/wikispeedia_paths-and-graph/'

ARTICLES_DATASET = DATA_FOLDER+"articles.tsv"
LINKS_DATASET = DATA_FOLDER+"links.tsv"
CATEGORIES_DATASET = DATA_FOLDER+"categories.tsv"
PATHS_FINISHED_DATASET = DATA_FOLDER+"paths_finished.tsv"
PATHS_UNFINISHED_DATASET = DATA_FOLDER+"paths_unfinished.tsv"

categories = pd.read_csv(CATEGORIES_DATASET, sep='\t', header=11, names = ["Name", "Category"])
articles = pd.read_csv(ARTICLES_DATASET, sep='\t', header=10, names = ["Name"])
links = pd.read_csv(LINKS_DATASET, header=10,sep='\t', names=["Outgoing", "Incoming"])
paths = pd.read_csv(PATHS_FINISHED_DATASET, header=15,sep='\t', names=["ip", "timestampe", "duration", "path", "rating"])

In [3]:
listtt= tokenizer.tokenize('Hello.  This is a test.  It works!')
[type(i) for i in listtt]

[str, str, str]

In [4]:
split= categories['Category'].str.split(".", expand=True)
categories = pd.concat([categories["Name"], split], axis=1)

# Check if all values in 0 are subject, then discard column 
if (categories[0] == categories[0][0]).all():
    categories = categories.drop(columns=0)

categories.columns = ["Name", "Subject", "SubCat1", "SubCat2"]

Analyze the articles on their use of female words.

In [5]:
#Two lists  of words that are used when a man or woman is present, based on Danielle Sucher's https://github.com/DanielleSucher/Jailbreak-the-Patriarchy
male_words=set(['guy','spokesman','chairman',"men's",'men','him',"he's",'his','boy','boyfriend','boyfriends',
                'boys','brother','brothers','dad','dads','dude','father','fathers','fiance','gentleman','gentlemen',
                'god','grandfather','grandpa','grandson','groom','he','himself','husband','husbands','king','male','man',
                'mr','nephew','nephews','priest','prince','son','sons','uncle','uncles','waiter','widower','widowers'])
female_words=set(['heroine','spokeswoman','chairwoman',"women's",'actress','women',"she's",'her','aunt','aunts','bride',
                'daughter','daughters','female','fiancee','girl','girlfriend','girlfriends','girls','goddess',
                'granddaughter','grandma','grandmother','herself','ladies','lady','lady','mom','moms','mother',
                'mothers','mrs','ms','niece','nieces','priestess','princess','queens','she','sister','sisters',
                'waitress','widow','widows','wife','wives','woman'])


In [6]:
article_list=glob.glob('data/plaintext_articles/*.txt')

sexes=['male','female','none','both']
sentence_counter={sex:0 for sex in sexes}
word_counter={sex:0 for sex in sexes}
word_freq={sex:{} for sex in sexes}
proper_nouns={}

def gender_the_sentence(sentence_words):
    mw_length=len(male_words.intersection(sentence_words))
    fw_length=len(female_words.intersection(sentence_words))

    if mw_length>0 and fw_length==0:
        gender='male'
    elif mw_length==0 and fw_length>0: 
        gender='female'
    elif mw_length>0 and fw_length>0: 
        gender='both'
    else:
        gender='none'
    return gender

def is_it_proper(word):
        if word[0]==word[0].upper():
            case='upper'
        else:
            case='lower'
        
        word_lower=word.lower()
        try:
            proper_nouns[word_lower][case] = proper_nouns[word_lower].get(case,0)+1
        except Exception as e:
            #This is triggered when the word hasn't been seen yet
            proper_nouns[word_lower]= {case:1}

def increment_gender(sentence_words,gender):
    sentence_counter[gender]+=1
    word_counter[gender]+=len(sentence_words)
    for word in sentence_words:
        word_freq[gender][word]=word_freq[gender].get(word,0)+1

In [7]:
for file_name in article_list:
    #Open the file
    text=str(open(file_name,'rb').read())
    #Split into sentences
    sentences=tokenizer.tokenize(text)
    
    for sentence in sentences:
        #word tokenize and strip punctuation
            sentence_words=sentence.split()
            sentence_words=[w.strip(punctuation) for w in sentence_words 
                            if len(w.strip(punctuation))>0]
            
            #figure out how often each word is capitalized
            [is_it_proper(word) for word in sentence_words[1:]]

            #lower case it
            sentence_words=set([w.lower() for w in sentence_words])
            
            #Figure out if there are gendered words in the sentence by computing the length of the intersection of the sets
            gender=gender_the_sentence(sentence_words)

            #Increment some counters
            increment_gender(sentence_words,gender)

In [8]:
proper_nouns=set([word for word in proper_nouns if  
                  proper_nouns[word].get('upper',0) / 
                  (proper_nouns[word].get('upper',0) + 
                   proper_nouns[word].get('lower',0))>.50])

common_words=set([w for w in sorted (word_freq['female'],
                                     key=word_freq['female'].get,reverse=True)[:1000]]+[w for w in sorted (word_freq['male'],key=word_freq['male'].get,reverse=True)[:1000]])

common_words=list(common_words-male_words-female_words-proper_nouns)
male_percent={word:(word_freq['male'].get(word,0) / word_counter['male']) 
              / (word_freq['female'].get(word,0) / word_counter['female']+word_freq['male'].get(word,0)/word_counter['male']) for word in common_words}

In [9]:
print('%.1f%% gendered' % (100*(sentence_counter['male']+sentence_counter['female'])/
                           (sentence_counter['male']+sentence_counter['female']+sentence_counter['both']+sentence_counter['none'])))
print(f"{sentence_counter['male']} sentences about men.")
print(f"{sentence_counter['female']} sentences about women.")
print(f"{(sentence_counter['male']/sentence_counter['female']):.1f} sentences about men for each sentence about women.")

ZeroDivisionError: division by zero

In [None]:
outfile_name='gender.tsv'
tsv_outfile=open(outfile_name,'wb')
header=bytes('percent_male\tmale_count\tfemalecount\tword\n', encoding='utf-8')
tsv_outfile.write(header)
for word in common_words:
    row = bytes("%.2f\t%01d\t%01d\t%s\n" % (100*male_percent[word],word_freq['male'].get(word,0),word_freq['female'].get(word,0),word), encoding='utf-8')
    tsv_outfile.write(row)
tsv_outfile.close()

In [None]:
gender_data = pd.DataFrame(data = [[100*male_percent[word],
                                    word_freq['male'].get(word,0),
                                    word_freq['female'].get(word,0),
                                    word] for word in common_words], 
                                    columns = ["percent_male", "male_count", "female_count", "word"])

In [None]:
gender_data["percent_female"] = 100-gender_data["percent_male"]
gender_data.sort_values(by="percent_male", ascending=False).head(20)


Unnamed: 0,percent_male,male_count,female_count,word,percent_female
774,84.47586,327,9,him\n,15.52414
541,80.787823,365,13,command,19.212177
700,80.355741,437,16,defeated,19.644259
442,79.902191,292,11,successor,20.097809
334,79.832246,2273,86,he\n,20.167754
525,79.500677,492,19,philosophy,20.499323
896,78.820913,3628,146,his\n,21.179087
3,78.50039,1414,58,army,21.49961
786,77.186666,497,22,troops,22.813334
317,75.559952,289,14,commander,24.440048


In [None]:
gender_data.sort_values(by="percent_female", ascending=False).head(20)

Unnamed: 0,percent_male,male_count,female_count,word,percent_female
666,2.69864,15,81,pregnant,97.30136
185,3.425631,54,228,she\n,96.574369
347,5.386954,138,363,her\n,94.613046
361,6.966717,30,60,eggs,93.033283
135,7.731961,47,84,women\'s,92.268039
75,8.794587,47,73,rice,91.205413
794,9.884697,52,71,baby,90.115303
392,11.034154,53,64,males,88.965846
69,15.575989,85,69,pop,84.424011
101,17.244113,64,46,females,82.755887
