# POS-tagging for comparative/superlative identification

In [27]:
import os
import pandas as pd
import nltk as nltk
from pycorenlp import StanfordCoreNLP

## Start the Stanford CoreNLP server

Before running this notebook, [get CoreNLP](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip), go into its directory, and run

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer 9000`

If you're using port 9000 for something else, change that value and then change `PORT` in the next cell.

In [49]:
# Port of the local CoreNLP server; keep in sync with the port given in the
# `java ... StanfordCoreNLPServer` command used to start it.
PORT = 9000

# Client object pointed at the CoreNLP server on localhost at `PORT`.
NLP = StanfordCoreNLP('http://localhost:{}'.format(PORT))

## Convenience function for POS tagging

In [74]:
def stanford_pos(text):
    """
    POS-tag `text` via the CoreNLP server behind the module-level `NLP` client.

    Parameters
    ----------
    text : str
       CoreNLP handles all tokenizing, at the sentence and word level.

    Returns
    -------
    list of tuples (str, str)
       The first member of each pair is the word, the second its POS tag.
       Pairs from every sentence are concatenated, in order, into one
       flat list.
    """
    # `NLP` is the client built in the configuration cell. (The lowercase
    # name `nlp` is not defined anywhere in the visible notebook, so using
    # it raises NameError on a fresh kernel.)
    ann = NLP.annotate(
        text,
        properties={'annotators': 'pos',
                    'outputFormat': 'json'})
    # Flatten sentence -> token into a single list of (word, tag) pairs.
    return [(token['word'], token['pos'])
            for sentence in ann['sentences']
            for token in sentence['tokens']]

## Comparative/Superlative identifiers

In [77]:
from nltk.stem.wordnet import WordNetLemmatizer

# Shared WordNet lemmatizer instance; `is_comp_sup` consults it only when
# called with `check_lemmatizer=True`.
LEMMATIZER = WordNetLemmatizer()

def is_comp_sup(word, pos, tags, check_lemmatizer=False):
    """
    Decide whether a tagged word counts as a comparative/superlative.

    Parameters
    ----------
    word : str
        The word form.

    pos : str
        The POS tag assigned to `word`.

    tags : iterable of str
        The tags accepted as positive evidence for comp/sup morphology.

    check_lemmatizer : bool
        When True, a word whose `pos` is in `tags` is accepted only if
        `LEMMATIZER.lemmatize(word, 'a')` -- the WordNet lemma of the
        word treated as an adjective -- differs from `word` itself.
        Intended to buy precision, possibly at the cost of recall.

    Returns
    -------
    bool
    """
    tag_matches = pos in tags
    if tag_matches and check_lemmatizer:
        # Stricter mode: the word must actually change under lemmatization.
        return LEMMATIZER.lemmatize(word, 'a') != word
    return tag_matches

def is_superlative(word, pos, check_lemmatizer=False):
    """Return `is_comp_sup` for `word`/`pos` against the tags JJS and RBS.

    `check_lemmatizer` is forwarded unchanged to `is_comp_sup`.
    """
    superlative_tags = {'JJS', 'RBS'}
    return is_comp_sup(
        word, pos, superlative_tags, check_lemmatizer=check_lemmatizer)

def is_comparative(word, pos, check_lemmatizer=False):
    """Return `is_comp_sup` for `word`/`pos` against the tags JJR and RBR.

    Parameters
    ----------
    word, pos : str, str
        The word and its POS tag.

    check_lemmatizer : bool
        Forwarded unchanged to `is_comp_sup`. Defaults to False, so
        existing two-argument calls keep working.

    Returns
    -------
    bool
    """
    # `check_lemmatizer` must be a parameter here: the body forwards it, and
    # without the parameter the name is undefined (NameError on every call).
    return is_comp_sup(
        word, pos, {'JJR', 'RBR'}, check_lemmatizer=check_lemmatizer)

## Data analysis

In [65]:
# Load the message data; a backslash in the CSV is treated as the escape
# character. Later cells read the `contents` column of this frame.
d = pd.read_csv('humanOutput/colorReferenceMessage.csv', escapechar='\\')

### Tag the data

In [76]:
# Here a "lemma" means a (word, pos) pair; each row of `lemmas` holds the
# list of such pairs that `stanford_pos` returns for that row's `contents`.
d['lemmas'] = list(map(stanford_pos, d['contents']))

## Identify comparatives and superlatives

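The next two cells build, for each message, a list aligned token-by-token with its `lemmas`: a 1 at every position tagged as a superlative (first cell) or comparative (second cell), and a 0 everywhere else. Keeping one entry per token maintains alignment with the original texts.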

In [78]:
# One 0/1 flag per (word, tag) pair: 1 where `is_superlative` accepts it.
d['superlatives'] = [
    [int(is_superlative(word, tag)) for word, tag in tagged]
    for tagged in d['lemmas']
]

In [79]:
# One 0/1 flag per (word, tag) pair: 1 where `is_comparative` accepts it.
d['comparatives'] = [
    [int(is_comparative(word, tag)) for word, tag in tagged]
    for tagged in d['lemmas']
]

### Count superlatives and comparatives

In [81]:
# Each flag list holds only 0s and 1s, so its sum is the number of hits
# in that message.
d['numSuper'] = list(map(sum, d['superlatives']))
d['numComp'] = list(map(sum, d['comparatives']))

## Inspection

Run the cell below to allow for non-scrolling display:

In [84]:
%%javascript
// Raise the output area's auto-scroll threshold so the long outputs below
// are displayed in full instead of inside a scrolling box.
// NOTE(review): relies on the `IPython` JavaScript global being available
// in the notebook front end -- confirm for the front end in use.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [87]:
# First rows of the messages with at least one token flagged as superlative.
d.query('numSuper > 0').head()

Unnamed: 0,gameid,time,roundNum,sender,contents,lemmas,superlatives,comparatives,numSuper,numComp
48,1124-1,1459877758186,42,speaker,lightest shade of teal/blue/green,"[(lightest, JJS), (shade, NN), (of, IN), (teal...","[1, 0, 0, 0]","[0, 0, 0, 0]",1,0
75,8235-6,1459877492779,18,speaker,brightest green,"[(brightest, JJS), (green, JJ)]","[1, 0]","[0, 0]",1,0
76,8235-6,1459877512927,19,speaker,brightest green,"[(brightest, JJS), (green, JJ)]","[1, 0]","[0, 0]",1,0
81,8235-6,1459877610659,23,speaker,darkest green,"[(darkest, JJS), (green, JJ)]","[1, 0]","[0, 0]",1,0
82,8235-6,1459877625200,24,speaker,brightest pink,"[(brightest, JJS), (pink, NN)]","[1, 0]","[0, 0]",1,0


In [88]:
# First rows of the messages with at least one token flagged as comparative.
d.query('numComp > 0').head()

Unnamed: 0,gameid,time,roundNum,sender,contents,lemmas,superlatives,comparatives,numSuper,numComp
0,1124-1,1459877203862,1,speaker,The darker blue one,"[(The, DT), (darker, JJR), (blue, JJ), (one, NN)]","[0, 0, 0, 0]","[0, 1, 0, 0]",0,1
13,1124-1,1459877360202,13,speaker,"One of the brown ones, the lighter shaded one","[(One, CD), (of, IN), (the, DT), (brown, JJ), ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",0,1
31,1124-1,1459877544164,26,speaker,darker red,"[(darker, JJR), (red, NN)]","[0, 0]","[1, 0]",0,1
33,1124-1,1459877564218,28,speaker,"purple, darker one","[(purple, JJ), (,, ,), (darker, JJR), (one, CD)]","[0, 0, 0, 0]","[0, 0, 1, 0]",0,1
38,1124-1,1459877621758,33,speaker,brown. not the yellow one or classic brown one...,"[(brown, JJ), (., .), (not, RB), (the, DT), (y...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",0,1
