In [54]:
import evaluate
from rouge_score import rouge_scorer

predictions:a cat is on the table

references:there is a cat on the table

* 分子是predictions与references共有的`N-gram`个数
* 分母是references所有`N-gram`个数

$$ precision_1 = \frac{len(['a', 'cat', 'is', 'on' 'the', 'table')]}{len(['there', 'is', 'a', 'cat', 'on', 'the', 'table'])} = \frac{6}{7} $$

$$ precision_2 = \frac{len([('a', 'cat'), ('on', 'the'), ('the', 'table')]}{len([('there', 'is'), ('is', 'a'), ('a', 'cat'), ('cat', 'on'), ('on', 'the'), ('the', 'table')])} = \frac{1}{2} $$

In [55]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
scores = scorer.score('a cat is on the table',
                      'there is a cat on the table')
scores

{'rouge1': Score(precision=0.8571428571428571, recall=1.0, fmeasure=0.923076923076923),
 'rouge2': Score(precision=0.5, recall=0.6, fmeasure=0.5454545454545454),
 'rougeL': Score(precision=0.7142857142857143, recall=0.8333333333333334, fmeasure=0.7692307692307692),
 'rougeLsum': Score(precision=0.7142857142857143, recall=0.8333333333333334, fmeasure=0.7692307692307692)}

In [56]:
rouge = evaluate.load('rouge')
rouge

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [57]:
predictions = ["a cat is on the table"]
references = ["there is a cat on the table"]

results = rouge.compute(predictions=predictions,
                        references=references)
print(results)

{'rouge1': 0.923076923076923, 'rouge2': 0.5454545454545454, 'rougeL': 0.7692307692307692, 'rougeLsum': 0.7692307692307692}


In [58]:
predictions = ["hello there", "general kenobi"]
references = ["hello there", "general kenobi"]

results = rouge.compute(predictions=predictions,
                        references=references,
                        # use_aggregator (boolean): If True, returns aggregates. Defaults to True.
                        use_aggregator=False)
print(results)

{'rouge1': [1.0, 1.0], 'rouge2': [1.0, 1.0], 'rougeL': [1.0, 1.0], 'rougeLsum': [1.0, 1.0]}


In [59]:
# It can also deal with lists of references for each predictions:

rouge = evaluate.load('rouge')
predictions = ["hello there", "general kenobi"]
references = [["hello", "there"], ["general kenobi", "general yoda"]]
results = rouge.compute(predictions=predictions,
                        references=references)
print(results)

{'rouge1': 0.8333333333333333, 'rouge2': 0.5, 'rougeL': 0.8333333333333333, 'rougeLsum': 0.8333333333333333}
