# The most significant adjectives for men / women

__What's the difference in describing men and women?__

And in more technical terms: 

__What are the most significant adjectives used with the word 'man' as opposed to the word 'woman'?__


The general approach would be:
- use the standard NLTK corpora representing free text (e.g. gutenberg, brown, webtext)
- count the occurrences of every word immediately preceding 'man' or 'woman' (let's call them adjectives)
- rank the adjectives by significance, where the most significant adjective is the one that is most 'uncommonly common' for a given category (man or woman)
- show them

## Technologies
- **NLTK** for NLP
- **pandas** for 'most-significant' calculations

In [7]:
import nltk
from nltk.corpus import stopwords
# Rebinds the name `stopwords` from the imported NLTK object to a plain set of
# English stop words; the set is what most_common_adj tests membership against.
stopwords = set(stopwords.words('english'))
import pandas as pd

def most_common_adj(corpus, word, limit=10, min_freq=3):
    """Return the words that most often directly precede `word` in `corpus`.

    Every bigram of ``corpus.words()`` whose second token equals `word` is
    inspected. Its first token is lowercased and counted, unless it is
    shorter than three characters or is in the module-level ``stopwords`` set.

    NOTE: the comparison against `word` is case-sensitive, so a capitalised
    "Man" is not matched by ``word='man'``.

    Args:
        corpus: object exposing ``words()`` (an NLTK corpus in this notebook).
        word: the token whose preceding words are collected.
        limit: how many of the most frequent preceding words to consider.
        min_freq: preceding words counted fewer times than this are dropped.

    Returns:
        A list of ``(preceding_word, count)`` tuples in the order produced by
        ``FreqDist.most_common``.
    """
    # Stream the bigrams instead of materialising them with list(): only the
    # pairs ending in `word` are needed, and a corpus can be very large.
    preceding = (
        first.lower()
        for first, second in nltk.bigrams(corpus.words())
        if second == word
    )
    fd = nltk.FreqDist(w for w in preceding if len(w) > 2 and w not in stopwords)
    # Loop names differ from the `word` parameter on purpose: in Python 2 a
    # list-comprehension variable leaks and would rebind the parameter.
    return [
        (adjective, freq)
        for adjective, freq in fd.most_common(limit)
        if freq >= min_freq
    ]


def most_significant(groups, limit=10):
    """Rank the adjectives of each group, most significant for that group first.

    "Significant" means occurring in the group more often than in the other
    groups. Counts are first rescaled so that every group has the same total
    (that of the most populous group), which removes the bias caused by some
    groups simply having more data. The significance of a word is then its
    rescaled count in the group divided by the sum of its rescaled counts over
    all groups: it lies in (0, 1], and 1 means the word occurs in this group
    only.

    NOTE: if two words have the same significance, the one occurring more
    often (higher rescaled count) is ranked first.

    Args:
        groups: sequence of groups, each a sequence of ``(word, count)``
            pairs. Words are expected to be unique within a group.
        limit: maximum number of entries returned per group.

    Returns:
        One list per group, in input order, of
        ``(word, rescaled_count, significance)`` tuples sorted by
        ``(significance, rescaled_count)`` descending. An empty `groups`
        yields an empty list.
    """
    # Counts are coerced to float so that empty groups (which pandas would
    # otherwise type as object) take part in the arithmetic below.
    frames = [
        pd.DataFrame(list(group), columns=['word', 'counts'])
        .set_index('word')
        .astype(float)
        for group in groups
    ]
    if not frames:
        return []

    # Adjustment is required to remove bias caused by some groups being more
    # populous: scale every group up to the size of the biggest one.
    group_totals = [frame['counts'].sum() for frame in frames]
    biggest_count = max(group_totals)
    scaled = [
        frame * biggest_count / group_total
        for frame, group_total in zip(frames, group_totals)
    ]

    # Sum the rescaled counts over all groups. A plain loop replaces
    # `reduce`, which is a builtin on Python 2 only.
    overall = scaled[0]
    for frame in scaled[1:]:
        overall = overall.add(frame, fill_value=0)

    ranked = []
    for frame in scaled:
        # Look the overall counts up on this group's own index, so the
        # division never introduces words the group does not contain.
        frame['ratio'] = frame['counts'] / overall['counts'].reindex(frame.index)
        records = frame.dropna().to_records().tolist()
        records.sort(key=lambda record: (record[2], record[1]), reverse=True)
        ranked.append(records[:limit])
    return ranked
    
def show_most_significant(words, corpuses, adj_limit=1000, min_adj_freq=3, sig_adj_limit=100):
    """Print, for every corpus, the most significant preceding words of each of `words`.

    For each corpus the preceding-word counts of every entry in `words` are
    collected with ``most_common_adj`` and ranked against each other with
    ``most_significant``. The output is one banner per corpus (the first line
    of ``corpus.readme()``), followed by one comma-separated list per word.

    Args:
        words: the words to compare, e.g. ``["man", "woman"]``.
        corpuses: corpora exposing ``words()`` and ``readme()``.
        adj_limit: passed to ``most_common_adj`` as ``limit``.
        min_adj_freq: passed to ``most_common_adj`` as ``min_freq``.
        sig_adj_limit: passed to ``most_significant`` as ``limit``.

    Returns:
        None; results are written to standard output.
    """
    # All corpora are counted before anything is printed.
    most_common = [
        [
            most_common_adj(corpus, word, limit=adj_limit, min_freq=min_adj_freq)
            for word in words
        ]
        for corpus in corpuses
    ]

    # print(single_string) behaves identically as a Python 2 print statement
    # and as the Python 3 print function.
    for corpus, corpus_most_common in zip(corpuses, most_common):
        title = corpus.readme().split('\n')[0]
        print("\n-------------------------------------\n{}\n-------------------------------------\n".format(title))
        ranked_groups = most_significant(corpus_most_common, limit=sig_adj_limit)
        for word, word_most_significant in zip(words, ranked_groups):
            adjectives = ",".join(entry[0] for entry in word_most_significant)
            print("\n{}: -------------\n{}".format(word, adjectives))


In [6]:
        
# Compare the words preceding "man" and "woman" in three NLTK corpora.
show_most_significant(
    words=["man", "woman"],
    corpuses=[
        nltk.corpus.gutenberg,
        nltk.corpus.brown,
        nltk.corpus.webtext,
    ],
    sig_adj_limit=100,
)



-------------------------------------
Project Gutenberg Selections
-------------------------------------


man: -------------
little,mighty,wicked,rich,great,righteous,sensible,honest,blind,lazy,first,mortal,white,upon,euery,strong,tall,prudent,honourable,evil,dead,valiant,married,foolish,agreeable,never,ordinary,grey,last,violent,elderly,upright,slothful,small,brave,third,drunken,industrious,neither,thou,hearted,without,unto,big,happy,made,average,single,faithful,whatsoever,large,new,hairy,clever,known,perfect,better,haired,free,created,bad,like,lean,wayfaring,mean,sick,excellent,bloody,armed,crazy,lame,short,furious,inward,unhappy,happiest,dumb,best,faced,yet,tempered,respectable,fellow,impatient,every,one,wise,another,good,young,looking,certain,old,poor,natured,strange

woman: -------------
lovely,charming,widow,fine,virtuous,midianitish,beautiful,israelitish,amiable,strange,natured,poor,old,certain,looking,young,good,another,wise,one,every

-------------------------------------
BR

# Polish corpus - NKJP

NKJP requires some manual installation steps and is very slow. This section needs some more work...

In [14]:
# import nltk
# from nltk.corpus.reader.nkjp import NKJPCorpusReader
# x = NKJPCorpusReader(root='/Users/gregaw/nltk_data/corpora/NKJP/', fileids='') # obtain the whole corpus
# len(x.words())
# # show_most_significant(
# #     words = ["mężczyzna", "kobieta"],
# #     corpuses = [nltk.corpus.nkjp
# #            ],
# #     sig_adj_limit=100
# # )

991834