# POS-tagging for comparative/superlative identification

__Contents__

0. [Start the Stanford CoreNLP server](#Start-the-Stanford-CoreNLP-server)
0. [Convenience function for POS tagging](#Convenience-function-for-POS-tagging)
0. [Comparative/Superlative identifiers](#Comparative/Superlative-identifiers)
0. [Data analysis](#Data-analysis)
  0. [Tag the data](#Tag-the-data)
  0. [Identify comparatives and superlatives](#Identify-comparatives-and-superlatives)
  0. [Inspection](#Inspection)
  0. [Write to file](#Write-to-file)

In [1]:
import os
import pandas as pd
import nltk as nltk
from pycorenlp import StanfordCoreNLP

## Start the Stanford CoreNLP server

Before running this notebook, [get CoreNLP](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip), go into its directory, and run

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer 9000`

If you're using port 9000 for something else, change that value and then change `PORT` in the next cell.

In [2]:
# Port of the locally running CoreNLP server; must match the port given in
# the `java ... StanfordCoreNLPServer` command above.
PORT = 9000

# Client handle for that server; `stanford_pos` sends its requests through it.
nlp = StanfordCoreNLP('http://localhost:{}'.format(PORT))

## Convenience function for POS tagging

In [26]:
def stanford_pos(text):
    """
    POS-tag `text` with the CoreNLP server behind the global `nlp` client.

    Parameters
    ----------
    text : str
       Raw text. CoreNLP handles all tokenizing, at the sentence and
       word level. Anything that is not a string is reported on stdout
       and then converted with `str` before being sent.

    Returns
    -------
    list of tuples (str, str)
       One (word, POS tag) pair per token, in document order across
       all sentences.
    """
    if not isinstance(text, basestring):
        # Report the odd value (type and content) before coercing it.
        print('%s: %s' % (type(text), str(text)))
        text = str(text)
    request_properties = {'annotators': 'pos',
                          'outputFormat': 'json'}
    annotation = nlp.annotate(text, properties=request_properties)
    # Flatten the sentence/token nesting into a single list of pairs.
    return [(token['word'], token['pos'])
            for sentence in annotation['sentences']
            for token in sentence['tokens']]

## Comparative/Superlative identifiers

In [27]:
from nltk.stem.wordnet import WordNetLemmatizer

# Shared WordNet lemmatizer; `is_comp_sup` uses it when `check_lemmatizer=True`.
LEMMATIZER = WordNetLemmatizer()

def is_comp_sup(word, pos, tags, check_lemmatizer=False):
    """
    Decide whether a tagged token counts as a comparative/superlative.

    Parameters
    ----------
    word, pos : str, str
        The token and its POS tag.

    tags : iterable of str
        The tags considered positive evidence for comp/sup morphology.

    check_lemmatizer : bool
        If True, then if the `pos` is in `tags`, we also check that
        `word` is different from the lemmatized version of word
        according to WordNet, treating it as an adjective. This
        could be used to achieve greater precision, perhaps at the
        expense of recall. The comparison is made on the lowercased
        token: the WordNet lookup only matches lowercase forms, so a
        capitalized token such as "Darkest" would otherwise come back
        unchanged and be rejected.

    Returns
    -------
    bool
        True if `pos` is in `tags` and, when requested, the lemmatizer
        check also passes; False otherwise.
    """
    if pos not in tags:
        return False
    if check_lemmatizer:
        # Normalize case first so capitalized tokens are lemmatized the
        # same way as their lowercase counterparts.
        lowered = word.lower()
        if LEMMATIZER.lemmatize(lowered, 'a') == lowered:
            return False
    return True

def is_superlative(word, pos, check_lemmatizer=False):
    """
    True when `pos` is a superlative tag (JJS or RBS); `check_lemmatizer`
    is forwarded unchanged to `is_comp_sup`.
    """
    superlative_tags = {'JJS', 'RBS'}
    return is_comp_sup(word, pos, superlative_tags,
                       check_lemmatizer=check_lemmatizer)

def is_comparative(word, pos, check_lemmatizer=False):
    """
    True when `pos` is a comparative tag (JJR or RBR); `check_lemmatizer`
    is forwarded unchanged to `is_comp_sup`.
    """
    comparative_tags = {'JJR', 'RBR'}
    return is_comp_sup(word, pos, comparative_tags,
                       check_lemmatizer=check_lemmatizer)

## Data analysis

In [40]:
# Human-written messages; the file uses backslash as its escape character.
d_human = (pd.read_csv('humanOutput/colorReferenceMessage.csv', escapechar='\\')
     .assign(source = 'human'))
# Model-sampled messages.
d_model = (pd.read_csv('modelOutput/speaker_reccontext_tuned_sampled_message.csv')
     .assign(source = 'model'))
# Stack both sources. pd.concat is the supported replacement for
# DataFrame.append (deprecated, removed in pandas 2.0) and gives the same
# frame; each source keeps its own row labels, so labels repeat across sources.
d = pd.concat([d_human, d_model])
# Preview only the first rows rather than dumping the whole frame.
d.head(10)

Unnamed: 0,gameid,time,roundNum,sender,contents,source
0,1124-1,1459877203862,1,speaker,The darker blue one,human
1,1124-1,1459877214034,2,speaker,purple,human
2,1124-1,1459877223719,3,speaker,Medium pink,human
3,1124-1,1459877227433,3,speaker,the medium dark one,human
4,1124-1,1459877240480,4,speaker,lime,human
5,1124-1,1459877257997,5,speaker,Mint green.,human
6,1124-1,1459877267242,6,speaker,Mud brown,human
7,1124-1,1459877278380,7,speaker,Mud brown,human
8,1124-1,1459877294720,8,speaker,Camo green,human
9,1124-1,1459877305438,9,speaker,Darkish red,human


### Tag the data

In [41]:
# Tag every message. Despite the column name, each cell of `lemmas` holds
# that message's list of (word, POS tag) pairs.
d['lemmas'] = list(map(stanford_pos, d['contents']))

### Identify comparatives and superlatives

These steps put a 1 in the position of comparatives/superlatives, and a 0 in all other places, to maintain alignment with the original texts.

In [42]:
# One 0/1 flag per token, aligned with the (word, tag) pairs in `lemmas`.
d['superlatives'] = [
    [int(is_superlative(word, tag)) for word, tag in tagged_message]
    for tagged_message in d['lemmas']
]

In [43]:
# One 0/1 flag per token, aligned with the (word, tag) pairs in `lemmas`.
d['comparatives'] = [
    [int(is_comparative(word, tag)) for word, tag in tagged_message]
    for tagged_message in d['lemmas']
]

Count the superlatives and comparatives in each message:

In [44]:
# Per-message totals: sum the 0/1 flags computed above.
d['numSuper'] = list(map(sum, d['superlatives']))

d['numComp'] = list(map(sum, d['comparatives']))

### Inspection

Run the cell below to allow for non-scrolling display:

In [45]:
%%javascript
// Raise the output-area auto-scroll threshold so that long outputs are
// displayed without scrolling (see the note above this cell).
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [46]:
# First few messages that contain at least one superlative.
d[d['numSuper'] > 0].head()

Unnamed: 0,gameid,time,roundNum,sender,contents,source,lemmas,superlatives,comparatives,numSuper,numComp
12,1124-1,1459877341443,12,speaker,"Darkest shade of green, forest green",human,"[(Darkest, JJS), (shade, NN), (of, IN), (green...","[1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]",1,0
48,1124-1,1459877758186,42,speaker,lightest shade of teal/blue/green,human,"[(lightest, JJS), (shade, NN), (of, IN), (teal...","[1, 0, 0, 0]","[0, 0, 0, 0]",1,0
75,8235-6,1459877492779,18,speaker,brightest green,human,"[(brightest, JJS), (green, JJ)]","[1, 0]","[0, 0]",1,0
76,8235-6,1459877512927,19,speaker,brightest green,human,"[(brightest, JJS), (green, JJ)]","[1, 0]","[0, 0]",1,0
81,8235-6,1459877610659,23,speaker,darkest green,human,"[(darkest, JJS), (green, JJ)]","[1, 0]","[0, 0]",1,0


In [47]:
# First few model-generated messages that contain at least one comparative.
d[(d['numComp'] > 0) & (d['source'] == 'model')].head()

Unnamed: 0,gameid,time,roundNum,sender,contents,source,lemmas,superlatives,comparatives,numSuper,numComp
2,2780-1,1459886542191,3,speaker,i have lighter of a bright green . too ~ like ...,model,"[(i, LS), (have, VBP), (lighter, JJR), (of, IN...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,2
5,2780-1,1459886660813,6,speaker,darker blue green,model,"[(darker, JJR), (blue, JJ), (green, NN)]","[0, 0, 0]","[1, 0, 0]",0,1
6,2780-1,1459886689406,7,speaker,darker red,model,"[(darker, JJR), (red, NN)]","[0, 0]","[1, 0]",0,1
7,2780-1,1459886733504,8,speaker,lesser purple,model,"[(lesser, JJR), (purple, JJ)]","[0, 0]","[1, 0]",0,1
12,2780-1,1459886889788,13,speaker,the lighter blue,model,"[(the, DT), (lighter, JJR), (blue, NN)]","[0, 0, 0]","[0, 1, 0]",0,1


### Write to file

In [48]:
# Drop the list-valued helper columns and save the rest, including the
# numSuper/numComp counts. `axis` is passed by keyword because the
# positional form is deprecated and rejected by pandas 2.0.
(d.drop(['lemmas', 'superlatives', 'comparatives'], axis=1)
 .to_csv("taggedColorMsgs.csv", index = False))

0.09777015437392796

In [53]:
# Fraction of model messages that contain the substring 'not '.
negation_mask = d_model['contents'].str.contains('not ')
len(d_model[negation_mask]) * 1.0 / len(d_model)

0.06803887935963408

In [65]:
def join_with_tilde(s):
    """Join the strings in `s` into one string, separated by ' ~ '."""
    separator = ' ~ '
    return separator.join(s)

def join_lemmas_lists(r):
    """
    Concatenate several stringified lists into one stringified list.

    Parameters
    ----------
    r : iterable of str
        Each item is the `repr` of a Python list (presumably a list of
        (word, pos) tuples as stored in a `lemmas` column -- confirm
        against the caller).

    Returns
    -------
    str
        The `repr` of the single list obtained by concatenating all the
        parsed lists in order.

    Raises
    ------
    ValueError, SyntaxError
        If an item is not a valid Python literal.
    """
    # Local import: nothing else in the notebook needs `ast`.
    import ast

    result = []
    for row in r:
        # Parse with literal_eval rather than eval(): the strings come from
        # data, and literal_eval accepts literals only instead of executing
        # arbitrary expressions.
        result.extend(ast.literal_eval(row))
    return repr(result)

def join_with_tagged(output, tags):
    """
    Merge `output` with `tags` on (gameid, roundNum) and keep only the
    columns used by the analysis.

    Parameters
    ----------
    output, tags : pandas.DataFrame
        Both must carry `gameid` and `roundNum`; between them they must
        supply `contents`, `numSuper`, `numComp` and `condition`.

    Returns
    -------
    pandas.DataFrame
        Rows whose (gameid, roundNum) key occurs in both frames, with the
        columns gameid, roundNum, contents, numSuper, numComp, condition.
    """
    join_keys = ['gameid', 'roundNum']
    kept_columns = ['gameid', 'roundNum', 'contents',
                    'numSuper', 'numComp', 'condition']
    merged = output.merge(tags, on=join_keys)
    return merged[kept_columns]

In [68]:
# Attach each round's click record to the tagged model messages.
# NOTE(review): `d_model` as loaded earlier has no numSuper/numComp columns,
# yet join_with_tagged selects them -- presumably `d_model` was rebuilt from
# the tagged data in a cell that is no longer in the notebook; confirm.
joined = join_with_tagged(
    output=pd.read_csv('humanOutput/colorReferenceClicks.csv', escapechar='\\'),
    tags=d_model)

In [71]:
# For each condition, print the per-message rate of comparatives, of
# messages containing 'not ', and of superlatives.
for condition in ('closer', 'further', 'equal'):
    subset = joined.query('condition == "%s"' % condition)
    comp_rate = subset['numComp'].sum() * 1.0 / len(subset)
    print('%s comp: %s' % (condition, comp_rate))
    negated = subset[subset['contents'].str.contains('not ')]
    print('%s neg: %s' % (condition, len(negated) * 1.0 / len(subset)))
    super_rate = subset['numSuper'].sum() * 1.0 / len(subset)
    print('%s super: %s' % (condition, super_rate))

closer comp: 0.111876075731
closer neg: 0.065404475043
closer super: 0.148020654045
further comp: 0.0960548885077
further neg: 0.073756432247
further super: 0.109777015437
equal comp: 0.0854700854701
equal neg: 0.0649572649573
equal super: 0.0837606837607
