Merge branch 'master' into coco-experiments
Benjamin Hou committed Nov 4, 2020
2 parents 0660c48 + 83c1a3d commit 014d15a
Showing 42 changed files with 1,576 additions and 0 deletions.
53 changes: 53 additions & 0 deletions nlp_metrics/README.md
@@ -0,0 +1,53 @@
NLP Metrics
===

Evaluation metrics for caption generation. The code in this folder is based on the [Python 3 Fork](https://github.com/flauted/coco-caption) of
the [COCO Caption Evaluation](https://github.com/tylin/coco-caption) library, and has been modified to support datasets beyond COCO.

## Requirements ##
- Java 1.8.0
- Python (tested with 2.7 and 3.6)

## Files ##

- evals.py: includes the MIMICEvalCap and COCOEvalCap classes
- tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer
- coco: pycocotools from [cocodataset](https://github.com/cocodataset) / [cocoapi](https://github.com/cocodataset/cocoapi)
- bleu: Bleu evaluation code
- meteor: Meteor evaluation code
- rouge: Rouge-L evaluation code
- cider: CIDEr evaluation code
- spice: SPICE evaluation code

## Setup ##

- You will first need to download the [Stanford CoreNLP 3.6.0](http://stanfordnlp.github.io/CoreNLP/index.html) code and models for use by SPICE. To do this, run either:
    - ``python get_stanford_models.py``
    - ``./get_stanford_models.sh``

## Notes ##
- SPICE will try to create a cache of parsed sentences in ./spice/cache/. This dramatically speeds up repeated evaluations.
- Without altering the code, the cache and temp directories can be changed via the environment variables ``SPICE_CACHE_DIR`` and ``SPICE_TEMP_DIR``.
- The cache should **NOT** be on an NFS mount.
- Caching can be disabled by editing ``./spice/spice.py``: remove the ``-cache`` argument from ``spice_cmd``.
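For example, a minimal sketch of redirecting the cache (the paths here are hypothetical; pick any local, non-NFS location):

```shell
# Hypothetical local (non-NFS) paths; adjust to your machine.
export SPICE_CACHE_DIR=/tmp/spice_cache
export SPICE_TEMP_DIR=/tmp/spice_tmp
mkdir -p "$SPICE_CACHE_DIR" "$SPICE_TEMP_DIR"
```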


## References ##

- [Microsoft COCO Captions: Data Collection and Evaluation Server](http://arxiv.org/abs/1504.00325)
- PTBTokenizer: We use the [Stanford Tokenizer](http://nlp.stanford.edu/software/tokenizer.shtml) which is included in [Stanford CoreNLP 3.4.1](http://nlp.stanford.edu/software/corenlp.shtml).
- BLEU: [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)
- Meteor: [Project page](http://www.cs.cmu.edu/~alavie/METEOR/) with related publications. We use the latest version (1.5) of the [Code](https://github.com/mjdenkowski/meteor). Changes have been made to the source code to properly aggregate the statistics for the entire corpus.
- Rouge-L: [ROUGE: A Package for Automatic Evaluation of Summaries](http://anthology.aclweb.org/W/W04/W04-1013.pdf)
- CIDEr: [CIDEr: Consensus-based Image Description Evaluation](http://arxiv.org/pdf/1411.5726.pdf)
- SPICE: [SPICE: Semantic Propositional Image Caption Evaluation](https://arxiv.org/abs/1607.08822)

## Developers ##
- Xinlei Chen (CMU)
- Hao Fang (University of Washington)
- Tsung-Yi Lin (Cornell)
- Ramakrishna Vedantam (Virginia Tech)

## Acknowledgement ##
- David Chiang (University of Notre Dame)
- Michael Denkowski (CMU)
- Alexander Rush (Harvard University)
1 change: 1 addition & 0 deletions nlp_metrics/__init__.py
@@ -0,0 +1 @@
__author__ = 'tylin'
19 changes: 19 additions & 0 deletions nlp_metrics/bleu/LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
1 change: 1 addition & 0 deletions nlp_metrics/bleu/__init__.py
@@ -0,0 +1 @@
__author__ = 'tylin'
47 changes: 47 additions & 0 deletions nlp_metrics/bleu/bleu.py
@@ -0,0 +1,47 @@
#!/usr/bin/env python
#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
#
# Creation Date : 06-01-2015
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>

from .bleu_scorer import BleuScorer


class Bleu:
    def __init__(self, n=4):
        # default: compute BLEU score up to 4-grams
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}

    def compute_score(self, gts, res):

        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)

        # return (bleu, bleu_info)
        return score, scores

    def method(self):
        return "Bleu"
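A minimal sketch of the input shape `Bleu.compute_score` expects, mirroring the sanity checks above (the image id and captions are invented for illustration):

```python
# gts and res map the same image ids to lists of caption strings;
# res carries exactly one hypothesis per id, gts one or more references.
gts = {42: ["a train is going down the tracks", "a locomotive on the rails"]}
res = {42: ["a train traveling down the tracks"]}

# The same sanity checks Bleu.compute_score performs:
assert gts.keys() == res.keys()
for img_id in gts:
    hypo, ref = res[img_id], gts[img_id]
    assert type(hypo) is list and len(hypo) == 1
    assert type(ref) is list and len(ref) >= 1
```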
265 changes: 265 additions & 0 deletions nlp_metrics/bleu/bleu_scorer.py
@@ -0,0 +1,265 @@
#!/usr/bin/env python

# bleu_scorer.py
# David Chiang <chiang@isi.edu>

# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

# Modified by:
# Hao Fang <hfang@uw.edu>
# Tsung-Yi Lin <tl483@cornell.edu>

'''Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
'''
from builtins import range, dict

import copy
import sys, math, re
from collections import defaultdict

def precook(s, n=4, out=False):
    """Takes a string as input and returns an object that can be given to
    either cook_refs or cook_test. This is optional: cook_refs and cook_test
    can take string arguments as well."""
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n+1):
        for i in range(len(words)-k+1):
            ngram = tuple(words[i:i+k])
            counts[ngram] += 1
    return (len(words), counts)

def cook_refs(refs, eff=None, n=4):  ## lhuang: oracle will call with "average"
    '''Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.'''

    reflen = []
    maxcounts = dict()
    for ref in refs:
        rl, counts = precook(ref, n)
        reflen.append(rl)
        for (ngram, count) in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)

    # Calculate effective reference sentence length.
    if eff == "shortest":
        reflen = min(reflen)
    elif eff == "average":
        reflen = float(sum(reflen))/len(reflen)

    ## lhuang: N.B.: leave reflen computation to the very end!!

    ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)

    return (reflen, maxcounts)

def cook_test(test, refs, eff=None, n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.'''

    reflen, refmaxcounts = refs
    testlen, counts = precook(test, n, True)

    result = dict()

    # Calculate effective reference sentence length.

    if eff == "closest":
        result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
    else:  ## i.e., "average" or "shortest" or None
        result["reflen"] = reflen

    result["testlen"] = testlen

    result["guess"] = [max(0, testlen-k+1) for k in range(1, n+1)]

    result['correct'] = [0]*n
    for (ngram, count) in counts.items():
        result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram, 0), count)

    return result

class BleuScorer(object):
    """Bleu scorer.
    """

    __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
    # special_reflen is used in oracle (proportional effective ref len for a node).

    def copy(self):
        ''' copy the refs.'''
        new = BleuScorer(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        new._score = None
        return new

    def __init__(self, test=None, refs=None, n=4, special_reflen=None):
        ''' singular instance '''

        self.n = n
        self.crefs = []
        self.ctest = []
        self.cook_append(test, refs)
        self.special_reflen = special_reflen

    def cook_append(self, test, refs):
        '''called by constructor and __iadd__ to avoid creating new instances.'''

        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                cooked_test = cook_test(test, self.crefs[-1])
                self.ctest.append(cooked_test)  ## N.B.: -1
            else:
                self.ctest.append(None)  # lens of crefs and ctest have to match

        self._score = None  ## need to recompute

    def ratio(self, option=None):
        self.compute_score(option=option)
        return self._ratio

    def score_ratio(self, option=None):
        '''return (bleu, len_ratio) pair'''
        return (self.fscore(option=option), self.ratio(option=option))

    def score_ratio_str(self, option=None):
        return "%.4f (%.2f)" % self.score_ratio(option)

    def reflen(self, option=None):
        self.compute_score(option=option)
        return self._reflen

    def testlen(self, option=None):
        self.compute_score(option=option)
        return self._testlen

    def retest(self, new_test):
        if type(new_test) is str:
            new_test = [new_test]
        assert len(new_test) == len(self.crefs), new_test
        self.ctest = []
        for t, rs in zip(new_test, self.crefs):
            self.ctest.append(cook_test(t, rs))
        self._score = None

        return self

    def rescore(self, new_test):
        ''' replace test(s) with new test(s), and returns the new score.'''

        return self.retest(new_test).compute_score()

    def size(self):
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        '''add an instance (e.g., from another sentence).'''

        if type(other) is tuple:
            ## avoid creating new BleuScorer instances
            self.cook_append(other[0], other[1])
        else:
            assert self.compatible(other), "incompatible BLEUs."
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)
            self._score = None  ## need to recompute

        return self

    def compatible(self, other):
        return isinstance(other, BleuScorer) and self.n == other.n

    def single_reflen(self, option="average"):
        return self._single_reflen(self.crefs[0][0], option)

    def _single_reflen(self, reflens, option=None, testlen=None):

        if option == "shortest":
            reflen = min(reflens)
        elif option == "average":
            reflen = float(sum(reflens))/len(reflens)
        elif option == "closest":
            reflen = min((abs(l-testlen), l) for l in reflens)[1]
        else:
            assert False, "unsupported reflen option %s" % option

        return reflen

    def recompute_score(self, option=None, verbose=0):
        self._score = None
        return self.compute_score(option, verbose)

    def compute_score(self, option=None, verbose=0):
        n = self.n
        small = 1e-9
        tiny = 1e-15  ## so that if guess is 0 still return 0
        bleu_list = [[] for _ in range(n)]

        if self._score is not None:
            return self._score

        if option is None:
            option = "average" if len(self.crefs) == 1 else "closest"

        self._testlen = 0
        self._reflen = 0
        totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0]*n, 'correct': [0]*n}

        # for each sentence
        for comps in self.ctest:
            testlen = comps['testlen']
            self._testlen += testlen

            if self.special_reflen is None:  ## need computation
                reflen = self._single_reflen(comps['reflen'], option, testlen)
            else:
                reflen = self.special_reflen

            self._reflen += reflen

            for key in ['guess', 'correct']:
                for k in range(n):
                    totalcomps[key][k] += comps[key][k]

            # append per image bleu score
            bleu = 1.
            for k in range(n):
                bleu *= (float(comps['correct'][k]) + tiny) \
                        / (float(comps['guess'][k]) + small)
                bleu_list[k].append(bleu ** (1./(k+1)))
            ratio = (testlen + tiny) / (reflen + small)  ## N.B.: avoid zero division
            if ratio < 1:
                for k in range(n):
                    bleu_list[k][-1] *= math.exp(1 - 1/ratio)

            if verbose > 1:
                print(comps, reflen)

        totalcomps['reflen'] = self._reflen
        totalcomps['testlen'] = self._testlen

        bleus = []
        bleu = 1.
        for k in range(n):
            bleu *= float(totalcomps['correct'][k] + tiny) \
                    / (totalcomps['guess'][k] + small)
            bleus.append(bleu ** (1./(k+1)))
        ratio = (self._testlen + tiny) / (self._reflen + small)  ## N.B.: avoid zero division
        if ratio < 1:
            for k in range(n):
                bleus[k] *= math.exp(1 - 1/ratio)

        if verbose > 0:
            print(totalcomps)
            print("ratio:", ratio)

        self._ratio = ratio  # cache the length ratio so ratio() does not fail
        self._score = bleus
        return self._score, bleu_list
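As a check on the arithmetic above, the sketch below reproduces `compute_score`'s aggregation for a single toy sentence: the `"closest"` effective reference length, the `tiny`/`small`-smoothed cumulative n-gram precisions, and the brevity penalty. All counts and lengths are invented for illustration.

```python
import math

# Toy clipped n-gram matches and candidate n-gram counts for n = 1..4.
correct = [5, 3, 2, 1]
guess = [7, 6, 5, 4]
testlen = 7
reflens = [9, 12]  # hypothetical reference lengths

# "closest" option: pick the reference length nearest the hypothesis length;
# ties break toward the shorter reference since min compares the length second.
reflen = min((abs(l - testlen), l) for l in reflens)[1]

# Same smoothing constants as compute_score: tiny keeps a zero numerator
# near zero, small avoids division by zero in the denominator.
small, tiny = 1e-9, 1e-15

bleu, bleus = 1.0, []
for k in range(4):
    bleu *= (correct[k] + tiny) / (guess[k] + small)
    bleus.append(bleu ** (1.0 / (k + 1)))  # geometric mean of precisions 1..k+1

# Brevity penalty applies only when the hypothesis is shorter than reflen.
ratio = (testlen + tiny) / (reflen + small)
if ratio < 1:
    bleus = [b * math.exp(1 - 1 / ratio) for b in bleus]

print([round(b, 4) for b in bleus])
```

With these toy counts the hypothesis (length 7) is shorter than the closest reference (length 9), so every BLEU-n is scaled by exp(1 - 9/7).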
1 change: 1 addition & 0 deletions nlp_metrics/cider/__init__.py
@@ -0,0 +1 @@
__author__ = 'tylin'
