In [1]:
import json
import os
import pandas as pd
import nltk as nltk
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
def flatten(l):
    """Concatenate the items of every sub-iterable in `l` into one flat list.

    Only a single level of nesting is removed; the relative order of the
    items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def nounify(adj_word):
    """Find the noun synsets derivationally related to an adjective.

    Looks up `adj_word` among WordNet's adjective synsets, walks every lemma
    of those synsets, and collects the synsets of all derivationally related
    lemmas that are nouns (e.g. dead -> death).

    Returns a list of noun synsets (possibly containing duplicates), or an
    empty list when the word has no adjective synsets.
    """
    adjective_synsets = wn.synsets(adj_word, pos="a")

    # Word not found
    if not adjective_synsets:
        return []

    noun_synsets = []
    for adjective_synset in adjective_synsets:
        # The part-of-speech tag is the second field of the synset name;
        # keep only synsets tagged 'a' or 's'.
        if adjective_synset.name().split('.')[1] not in ('a', 's'):
            continue
        for adjective_lemma in adjective_synset.lemmas():
            for related_lemma in adjective_lemma.derivationally_related_forms():
                related_synset = related_lemma.synset()
                # filter only the nouns
                if related_synset.name().split('.')[1] == 'n':
                    noun_synsets.append(related_synset)
    return noun_synsets

In [3]:
# Load the human corpus and the two model outputs, tagging every row with
# the name of the source it came from.
d_human = (pd.read_csv('humanOutput/filteredCorpus.csv')
     .assign(source = 'human'))
d_prag = (pd.read_csv('modelOutput/speaker_big_sl_perp_sampled_message.csv')
     .assign(source = 'pragmatic'))
d_lit = (pd.read_csv('modelOutput/speaker_big_s0_untuned_sampled_message.csv')
     .assign(source = 'literal'))

# Stack the three frames. `DataFrame.append` was deprecated in pandas 1.4 and
# removed in pandas 2.0, so use `pd.concat`, which yields the same row order
# and (like `append`) keeps each frame's original index labels.
# Missing values are then replaced with a single space.
d = (pd.concat([d_human, d_prag, d_lit])
     .replace(np.nan, ' ', regex=True))

TODO: It is not actually safe to just take the first element of `colorSynsets`. It is technically the most common sense, which is often what we want (e.g. 'red' for 'reddish'), but the WordNet docs explicitly say that the order shouldn't be trusted in general. Not sure what we can do about this...

In [4]:
wnl = WordNetLemmatizer()

def get_informativity(text):
    """Score how specific the color vocabulary in `text` is.

    The text is tokenized and lemmatized. For each word, its noun synsets
    are looked up (falling back to `nounify` when there are none) and those
    with 'color.n.01' somewhere on a hypernym path are kept. The first such
    synset contributes its `min_depth()` in the WordNet hierarchy.

    Returns the largest depth contributed by any word, or None when no word
    maps to a color synset. If tokenizing/lemmatizing fails, the offending
    text is printed and the exception is re-raised.
    """
    try:
        lemmas = [wnl.lemmatize(token) for token in word_tokenize(text)]
    except BaseException:
        # Surface the input that broke tokenization, then propagate.
        print(text)
        raise

    depths = []
    for lemma in lemmas:
        # Prefer direct noun senses; otherwise derive nouns from the adjective.
        candidates = wn.synsets(lemma, pos='n') or nounify(lemma)
        color_synsets = [
            candidate for candidate in candidates
            if any(ancestor.name() == 'color.n.01'
                   for ancestor in flatten(candidate.hypernym_paths()))
        ]
        # Only the first matching synset is used for this word.
        if color_synsets:
            depths.append(color_synsets[0].min_depth())

    if not depths:
        return None
    return np.max(depths)

Test...

In [5]:
text = "fawn"
# List every WordNet synset for the sample word, then score it and 'red'.
print(wn.synsets(text))
for sample_text in (text, 'red'):
    print(get_informativity(sample_text))

[Synset('dun.n.02'), Synset('fawn.n.02'), Synset('fawn.v.01'), Synset('fawn.v.02'), Synset('fawn.v.03')]
9
7


Run on whole dataset...

In [6]:
# Score every utterance; a plain list is assigned so values line up with the
# rows by position.
d['specificity'] = list(map(get_informativity, d['contents']))

In [7]:
# Write the scored frame to disk without the index column.
d.to_csv(
    "informativities.csv",
    index=False,
)