Skip to content
Permalink
 
 
Cannot retrieve contributors at this time
import math
from collections import Counter
import nltk
from scipy.stats import chi2
from gender_novels.common import store_pickle, load_pickle
from gender_novels.corpus import Corpus
# TODO: Rewrite all of this using a Dunning class in a non-messy way.
def dunn_individual_word(total_words_in_corpus_1, total_words_in_corpus_2,
count_of_word_in_corpus_1,
count_of_word_in_corpus_2):
'''
applies dunning log likelihood to compare individual word in two counter objects
:param word: desired word to compare
:param m_corpus: c.filter_by_gender('male')
:param f_corpus: c. filter_by_gender('female')
:return: log likelihoods and p value
>>> total_words_m_corpus = 8648489
>>> total_words_f_corpus = 8700765
>>> wordcount_female = 1000
>>> wordcount_male = 50
>>> dunn_individual_word(total_words_m_corpus,total_words_f_corpus,wordcount_male,wordcount_female)
-1047.8610274053995
'''
a = count_of_word_in_corpus_1
b = count_of_word_in_corpus_2
c = total_words_in_corpus_1
d = total_words_in_corpus_2
e1 = c * (a + b) / (c + d)
e2 = d * (a + b) / (c + d)
dunning_log_likelihood = 2 * (a * math.log(a / e1) + b * math.log(b / e2))
if count_of_word_in_corpus_1 * math.log(count_of_word_in_corpus_1 / e1) < 0:
dunning_log_likelihood = -dunning_log_likelihood
p = 1 - chi2.cdf(abs(dunning_log_likelihood),1)
return dunning_log_likelihood
def dunning_total(counter1, counter2, filename_to_pickle=None):
'''
runs dunning_individual on words shared by both counter objects
(-) end of spectrum is words for counter_2
(+) end of spectrum is words for counter_1
the larger the magnitude of the number, the more distinctive that word is in its
respective counter object
use filename_to_pickle to store the result so it only has to be calculated once and can be
used for multiple analyses.
>>> from collections import Counter
>>> female_counter = Counter({'he': 1, 'she': 10, 'and': 10})
>>> male_counter = Counter({'he': 10, 'she': 1, 'and': 10})
>>> results = dunning_total(female_counter, male_counter)
# Results is a dict that maps from terms to results
# Each result dict contains the dunning score...
>>> results['he']['dunning']
-8.547243830635558
# ... counts for corpora 1 and 2 as well as total count
>>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
(11, 1, 10)
# ... and the same for frequencies
>>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
(0.2619047619047619, 0.047619047619047616, 0.47619047619047616)
:return: dict
'''
total_words_counter1 = 0
total_words_counter2 = 0
#get word total in respective counters
for word1 in counter1:
total_words_counter1 += counter1[word1]
for word2 in counter2:
total_words_counter2 += counter2[word2]
#dictionary where results will be returned
dunning_result = {}
for word in counter1:
counter1_wordcount = counter1[word]
if word in counter2:
counter2_wordcount = counter2[word]
if counter1_wordcount + counter2_wordcount < 10:
continue
dunning_word = dunn_individual_word( total_words_counter1, total_words_counter2,
counter1_wordcount,counter2_wordcount)
dunning_result[word] = {
'dunning': dunning_word,
'count_total': counter1_wordcount + counter2_wordcount,
'count_corp1': counter1_wordcount,
'count_corp2': counter2_wordcount,
'freq_total': (counter1_wordcount + counter2_wordcount) / (total_words_counter1 +
total_words_counter2),
'freq_corp1': counter1_wordcount / total_words_counter1,
'freq_corp2': counter2_wordcount / total_words_counter2
}
if filename_to_pickle:
store_pickle(dunning_result, filename_to_pickle)
return dunning_result
def male_vs_female_authors_analysis_dunning_lesser():
'''
tests word distinctiveness of shared words between male and female corpora using dunning
:return: dictionary of common shared words and their distinctiveness
'''
c = Corpus('test_corpus')
m_corpus = c.filter_by_gender('male')
f_corpus = c.filter_by_gender('female')
wordcounter_male = m_corpus.get_wordcount_counter()
wordcounter_female = f_corpus.get_wordcount_counter()
results = dunning_total(wordcounter_male, wordcounter_female)
print("women's top 10: ", results[0:10])
print("men's top 10: ", list(reversed(results[-10:])))
return results
def dunning_result_displayer(dunning_result, number_of_terms_to_display=10,
corpus1_display_name=None, corpus2_display_name=None,
part_of_speech_to_include=None):
"""
Convenience function to display dunning results as tables.
part_of_speech_to_include can either be a list of POS tags or a 'adjectives, 'adverbs',
'verbs', or 'pronouns'. If it is None, all terms are included.
:param dunning_result: Dunning result dict to display
:param number_of_terms_to_display: Number of terms for each corpus to display
:param corpus1_display_name: Name of corpus 1 (e.g. "Female Authors")
:param corpus2_display_name: Name of corpus 2 (e.g. "Male Authors")
:param part_of_speech_to_include: e.g. 'adjectives', or 'verbs'
:return:
"""
pos_names_to_tags = {
'adjectives': ['JJ', 'JJR', 'JJS'],
'adverbs': ['RB', 'RBR', 'RBS', 'WRB'],
'verbs': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
'pronouns': ['PRP', 'PRP$', 'WP', 'WP$']
}
if part_of_speech_to_include in pos_names_to_tags:
part_of_speech_to_include = pos_names_to_tags[part_of_speech_to_include]
if not corpus1_display_name:
corpus1_display_name = 'Corpus 1'
if not corpus2_display_name:
corpus2_display_name = 'Corpus 2'
headings = ['term', 'dunning', 'count_total', 'count_corp1', 'count_corp2', 'freq_total',
'freq_corp1', 'freq_corp2']
output = f'\nDisplaying Part of Speech: {part_of_speech_to_include}\n'
for i, corpus_name in enumerate([corpus1_display_name, corpus2_display_name]):
output += f'\nDunning Log-Likelihood results for {corpus_name}\n|'
for heading in headings:
heading = heading.replace('_corp1', ' ' + corpus1_display_name).replace('_corp2',
' ' + corpus2_display_name)
output += ' {:19s}|'.format(heading)
output += '\n' + 8 * 21 * '_' + '\n'
reverse = True
if i == 1: reverse = False
sorted_results = sorted(dunning_result.items(), key=lambda x: x[1]['dunning'],
reverse=reverse)
count_displayed = 0
for result in sorted_results:
if count_displayed == number_of_terms_to_display:
break
term = result[0]
term_pos = nltk.pos_tag([term])[0][1]
if part_of_speech_to_include and term_pos not in part_of_speech_to_include:
continue
output += '| {:18s}|'.format(result[0])
for heading in headings[1:]:
if heading in ['freq_total', 'freq_corp1', 'freq_corp2']:
output += ' {:16.4f}% |'.format(result[1][heading] * 100)
elif heading in ['dunning']:
output += ' {:17.2f} |'.format(result[1][heading])
else:
output += ' {:17.0f} |'.format(result[1][heading])
output += '\n'
count_displayed += 1
print(output)
def compare_word_association_in_corpus_analysis_dunning(word1, word2, corpus=None,
corpus_name=None):
"""
Uses Dunning analysis to compare words associated with word1 vs words associated with word2 in
the Corpus passed in as the parameter. If a corpus and corpus_name are passsed in, then the
analysis will use the corpus but name the file after corpus_name. If no corpus is passed in but
a corpus_name is, then the method will try to create a Corpus by corpus = Corpus(corpus_name).
If neither a corpus nor a corpus_name is passed in, analysis is simply done on the Gutenberg
corpus.
:param word1: str
:param word2: str
:param corpus: Corpus
:param corpus_name: str
:return: dict
"""
if corpus:
if not corpus_name:
corpus_name = corpus.corpus_name
else:
if not corpus_name:
corpus_name = "gutenberg"
corpus = Corpus(corpus_name)
pickle_filename = f'dunning_{word1}_vs_{word2}_associated_words_{corpus_name}'
try:
results = load_pickle(pickle_filename)
except IOError:
try:
pickle_filename = f'dunning_{word2}_vs_{word1}_associated_words_{corpus_name}'
results = load_pickle(pickle_filename)
except:
word1_counter = Counter()
word2_counter = Counter()
for novel in corpus.novels:
word1_counter.update(novel.words_associated(word1))
word2_counter.update(novel.words_associated(word2))
results = dunning_total(word1_counter, word2_counter,
filename_to_pickle=pickle_filename)
for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
dunning_result_displayer(results, number_of_terms_to_display=50,
part_of_speech_to_include=group)
return results
def compare_word_association_between_corpus_analysis_dunning(word, corpus1=None, corpus1_name=None,
corpus2=None, corpus2_name=None, use_word_window=False, word_window=None):
"""
Uses Dunning analysis to compare words associated with word between corpuses. If a corpus and corpus_name are
passsed in, then the analysis will use the corpus but name the file after corpus_name. If no corpus is passed in but
a corpus_name is, then the method will try to create a Corpus by corpus = Corpus(corpus_name).
If neither a corpus nor a corpus_name is passed in, analysis is simply done on the Gutenberg
corpus.
:param word1: str
:param corpus: Corpus
:param corpus_name: str
:return: dict
"""
if corpus1:
if not corpus1_name:
corpus1_name = corpus1.corpus_name
else:
if not corpus1_name:
corpus1_name = "gutenberg"
corpus1 = Corpus(corpus1_name)
if corpus2:
if not corpus2_name:
corpus2_name = corpus2.corpus_name
else:
if not corpus2_name:
corpus2_name = "gutenberg"
corpus2 = Corpus(corpus2_name)
pickle_filename = (f'dunning_{word}_associated_words_{corpus1_name}_vs_{corpus2_name}_in_'
f'{corpus1.corpus_name}')
if use_word_window:
pickle_filename+= f'_word_window_{word_window}'
try:
results = load_pickle(pickle_filename)
except IOError:
print("Precalculated result not available. Running analysis now...")
corpus1_counter = Counter()
corpus2_counter = Counter()
for novel in corpus1.novels:
if use_word_window:
get_word_windows(self, search_terms, window_size=word_window)
else:
corpus1_counter.update(novel.words_associated(word))
for novel in corpus2.novels:
if use_word_window:
get_word_windows(self, search_terms, window_size=word_window)
else:
corpus2_counter.update(novel.words_associated(word))
results = dunning_total(corpus1_counter, corpus2_counter,
filename_to_pickle=pickle_filename)
for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
dunning_result_displayer(results, number_of_terms_to_display=20,
corpus1_display_name=f'{corpus1_name}. {word}',
corpus2_display_name=f'{corpus2_name}. {word}',
part_of_speech_to_include=group)
return results
def male_VS_female_analysis_dunning(corpus_name, display_data = False):
'''
tests word distinctiveness of shared words between male and female corpora using dunning
Prints out the most distinctive terms overall as well as grouped by verbs, adjectives etc.
:return: dict
'''
# By default, try to load precomputed results. Only calculate if no stored results are
# available.
pickle_filename = f'dunning_male_vs_female_chars_{corpus_name}'
try:
results = load_pickle(pickle_filename)
except IOError:
c = Corpus(corpus_name)
m_corpus = c.filter_by_gender('male')
f_corpus = c.filter_by_gender('female')
from collections import Counter
wordcounter_male = Counter()
wordcounter_female = Counter()
for novel in m_corpus:
wordcounter_male += novel.words_associated('he')
for novel in f_corpus:
wordcounter_female += novel.words_associated('he')
# wordcounter_male = m_corpus.get_wordcount_counter()
# wordcounter_female = f_corpus.get_wordcount_counter()
results = dunning_total(wordcounter_male, wordcounter_female,
filename_to_pickle=pickle_filename)
if display_data:
for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
dunning_result_displayer(results, number_of_terms_to_display=20,
corpus1_display_name='Fem Author',
corpus2_display_name='Male Author',
part_of_speech_to_include=group)
return results
def dunning_result_to_dict(dunning_result, number_of_terms_to_display=10,
part_of_speech_to_include=None):
'''
Receives a dictionary of results and returns a dictionary of the top
number_of_terms_to_display most distinctive results for each corpus that have a part of speech
matching part_of_speech_to_include
:param dunning_result: Dunning result dict that will be sorted through
:param number_of_terms_to_display: Number of terms for each corpus to display
:param part_of_speech_to_include: e.g. 'adjectives', or 'verbs'
:return: dict
'''
pos_names_to_tags = {
'adjectives': ['JJ', 'JJR', 'JJS'],
'adverbs': ['RB', 'RBR', 'RBS', 'WRB'],
'verbs': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
'pronouns': ['PRP', 'PRP$', 'WP', 'WP$']
}
if part_of_speech_to_include in pos_names_to_tags:
part_of_speech_to_include = pos_names_to_tags[part_of_speech_to_include]
final_results_dict = {}
reverse = True
for i in range(2):
sorted_results = sorted(dunning_result.items(), key=lambda x: x[1]['dunning'],
reverse=reverse)
count_displayed = 0
for result in sorted_results:
if count_displayed == number_of_terms_to_display:
break
term = result[0]
term_pos = nltk.pos_tag([term])[0][1]
if part_of_speech_to_include and term_pos not in part_of_speech_to_include:
continue
final_results_dict[result[0]]=result[1]
count_displayed += 1
reverse = False
return final_results_dict
################################################
# Individual Analyses #
################################################
# Male Authors versus Female Authors
################################################
def male_vs_female_authors_analysis_dunning(corpus_name, display_results=False):
'''
tests word distinctiveness of shared words between male and female authors using dunning
If called with display_results=True, prints out the most distinctive terms overall as well as
grouped by verbs, adjectives etc.
Returns a dict of all terms in the corpus mapped to the dunning data for each term
:return:dict
'''
# By default, try to load precomputed results. Only calculate if no stored results are
# available.
pickle_filename = f'dunning_male_vs_female_authors_{corpus_name}'
try:
results = load_pickle(pickle_filename)
except IOError:
c = Corpus(corpus_name)
m_corpus = c.filter_by_gender('male')
f_corpus = c.filter_by_gender('female')
wordcounter_male = m_corpus.get_wordcount_counter()
wordcounter_female = f_corpus.get_wordcount_counter()
results = dunning_total(wordcounter_female, wordcounter_male,
filename_to_pickle=pickle_filename)
if display_results:
for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
dunning_result_displayer(results, number_of_terms_to_display=20,
corpus1_display_name='Fem Author',
corpus2_display_name='Male Author',
part_of_speech_to_include=group)
return results
# Male Characters versus Female Characters (words following 'he' versus words following 'she')
##############################################################################################
def he_vs_she_associations_analysis_dunning(corpus_name):
"""
Uses Dunning analysis to compare words associated with 'he' vs words associated with 'she' in
the Corpus passed in as the parameter. The corpus_name parameter is if you want to name the file
something other than Gutenberg (e.g. Gutenberg_female_authors)
:param corpus_name: str
"""
corpus = Corpus(corpus_name)
pickle_filename = f'dunning_he_vs_she_associated_words_{corpus_name}'
try:
results = load_pickle(pickle_filename)
except IOError:
he_counter = Counter()
she_counter = Counter()
for novel in corpus.novels:
he_counter.update(novel.words_associated("he"))
she_counter.update(novel.words_associated("she"))
results = dunning_total(she_counter, he_counter, filename_to_pickle=pickle_filename)
for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
dunning_result_displayer(results, number_of_terms_to_display=20,
corpus1_display_name='she...',
corpus2_display_name='he..',
part_of_speech_to_include=group)
# Female characters as written by Male Authors versus Female Authors
####################################################################
def female_characters_author_gender_differences(corpus_name):
"""
Compares how male authors versus female authors write female characters by looking at the words
that follow 'she'
:param corpus_name:
:return:
"""
male_corpus = Corpus(corpus_name).filter_by_gender('male')
female_corpus = Corpus(corpus_name).filter_by_gender('female')
compare_word_association_between_corpus_analysis_dunning(word='she',
corpus1=female_corpus, corpus1_name='fem aut',
corpus2=male_corpus, corpus2_name='male aut')
# Male characters as written by Male Authors versus Female Authors
####################################################################
def male_characters_author_gender_differences(corpus_name):
"""
Compares how male authors versus female authors write male characters by looking at the words
that follow 'he'
:param corpus_name:
:return:
"""
male_corpus = Corpus(corpus_name).filter_by_gender('male')
female_corpus = Corpus(corpus_name).filter_by_gender('female')
compare_word_association_between_corpus_analysis_dunning(word='he',
corpus1=female_corpus, corpus1_name='female aut',
corpus2=male_corpus, corpus2_name='male aut')
# God as written by Male Authors versus Female Authors
####################################################################
def god_author_gender_differences(corpus_name):
"""
Compares how male authors versus female authors refer to God by looking at the words
that follow 'God'
:param corpus_name:
:return:
"""
male_corpus = Corpus(corpus_name).filter_by_gender('male')
female_corpus = Corpus(corpus_name).filter_by_gender('female')
compare_word_association_between_corpus_analysis_dunning(word='God',
corpus1=female_corpus, corpus1_name='female aut',
corpus2=male_corpus, corpus2_name='male aut')
def money_author_gender_differences(corpus_name):
"""
Compares how male authors versus female authors refer to money by looking at the words
before and after money'
:param corpus_name:
:return:
"""
male_corpus = Corpus(corpus_name).filter_by_gender('male')
female_corpus = Corpus(corpus_name).filter_by_gender('female')
compare_word_association_between_corpus_analysis_dunning(word=['money','dollars', 'pounds', 'euros', 'dollar', 'pound','euro', 'wealth', 'income'],
corpus1=female_corpus, corpus1_name='female aut',
corpus2=male_corpus, corpus2_name='male aut')
# America as written by Male Authors versus Female Authors
####################################################################
def america_author_gender_differences(corpus_name):
"""
Compares how American male authors versus female authors refer to America by looking at the words
that follow 'America'
:param corpus_name:
:return:
"""
male_corpus = Corpus(corpus_name).filter_by_gender('male')
female_corpus = Corpus(corpus_name).filter_by_gender('female')
compare_word_association_between_corpus_analysis_dunning(word='America',
corpus1=female_corpus, corpus1_name='female aut',
corpus2=male_corpus, corpus2_name='male aut')
if __name__ == '__main__':
#### Uncomment any of the lines below to run one of the analyses.
# male_vs_female_authors_analysis_dunning('gutenberg')
# he_vs_she_associations_analysis_dunning('gutenberg')
# female_characters_author_gender_differences('gutenberg')
# male_characters_author_gender_differences('gutenberg')
# god_author_gender_differences('gutenberg')
# money_author_gender_differences('gutenberg')
# dunning_result_to_dict(male_vs_female_authors_analysis_dunning('gutenberg'))
from dh_testers.testRunner import main_test
main_test()