In [None]:
!pip install datasets
!pip install rouge
!pip install nltk
!pip install pycountry

In [None]:
from datasets import list_datasets
from datasets import load_dataset


print(list_datasets())
dataset = load_dataset('cnn_dailymail', '3.0.0')

# split data in train, validation, test
train_article_set = dataset['train']['article']
train_highlights_set = dataset['train']['highlights']

validation_article_set = dataset['validation']['article']
validation_highlights_set = dataset['validation']['highlights']

test_article_set = dataset['test']['article']
test_highlights_set = dataset['test']['highlights']

In [3]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sys import version_info


PY3 = version_info[0] == 3


if PY3:
    bytes = bytes
    unicode = str
else:
    bytes = str
    unicode = unicode
string_types = (bytes, unicode,)


try:
    from itertools import ifilterfalse as ffilter
except ImportError:
    from itertools import filterfalse as ffilter

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence


def unicode_compatible(cls):
    """
    Decorator for unicode compatible classes. Method ``__unicode__``
    has to be implemented to work decorator as expected.
    """
    if PY3:
        cls.__str__ = cls.__unicode__
        cls.__bytes__ = lambda self: self.__str__().encode("utf-8")
    else:
        cls.__str__ = lambda self: self.__unicode__().encode("utf-8")

    return cls


def to_string(object):
    return to_unicode(object) if PY3 else to_bytes(object)


def to_bytes(object):
    if isinstance(object, bytes):
        return object
    elif isinstance(object, unicode):
        return object.encode("utf-8")
    else:
        # try encode instance to bytes
        return instance_to_bytes(object)


def to_unicode(object):
    if isinstance(object, unicode):
        return object
    elif isinstance(object, bytes):
        return object.decode("utf-8")
    else:
        # try decode instance to unicode
        return instance_to_unicode(object)


def instance_to_bytes(instance):
    if PY3:
        if hasattr(instance, "__bytes__"):
            return bytes(instance)
        elif hasattr(instance, "__str__"):
            return unicode(instance).encode("utf-8")
    else:
        if hasattr(instance, "__str__"):
            return bytes(instance)
        elif hasattr(instance, "__unicode__"):
            return unicode(instance).encode("utf-8")

    return to_bytes(repr(instance))


def instance_to_unicode(instance):
    if PY3:
        if hasattr(instance, "__str__"):
            return unicode(instance)
        elif hasattr(instance, "__bytes__"):
            return bytes(instance).decode("utf-8")
    else:
        if hasattr(instance, "__unicode__"):
            return unicode(instance)
        elif hasattr(instance, "__str__"):
            return bytes(instance).decode("utf-8")

    return to_unicode(repr(instance))

In [4]:

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import math

from collections import Counter
from pprint import pformat

class TfDocumentModel(object):
    """Term-Frequency document model (term = word)."""
    def __init__(self, words, tokenizer=None):
        if isinstance(words, string_types) and tokenizer is None:
            raise ValueError(
                "Tokenizer has to be given if ``words`` is not a sequence.")
        elif isinstance(words, string_types):
            words = tokenizer.to_words(to_unicode(words))
        elif not isinstance(words, Sequence):
            raise ValueError(
                "Parameter ``words`` has to be sequence or string with tokenizer given.")

        self._terms = Counter(map(unicode.lower, words))
        self._max_frequency = max(self._terms.values()) if self._terms else 1

    @property
    def magnitude(self):
        """
        Lenght/norm/magnitude of vector representation of document.
        This is usually denoted by ||d||.
        """
        return math.sqrt(sum(t**2 for t in self._terms.values()))

    @property
    def terms(self):
        return self._terms.keys()

    def most_frequent_terms(self, count=0):
        """
        Returns ``count`` of terms sorted by their frequency
        in descending order.
        :parameter int count:
            Max. number of returned terms. Value 0 means no limit (default).
        """
        # sort terms by number of occurrences in descending order
        terms = sorted(self._terms.items(), key=lambda i: -i[1])

        terms = tuple(i[0] for i in terms)
        if count == 0:
            return terms
        elif count > 0:
            return terms[:count]
        else:
            raise ValueError(
                "Only non-negative values are allowed for count of terms.")

    def term_frequency(self, term):
        """
        Returns frequency of term in document.
        :returns int:
            Returns count of words in document.
        """
        return self._terms.get(term, 0)

    def normalized_term_frequency(self, term, smooth=0.0):
        """
        Returns normalized frequency of term in document.
        http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
        :parameter float smooth:
            0.0 <= smooth <= 1.0, generally set to 0.4, although some
            early work used the value 0.5. The term is a smoothing term
            whose role is to damp the contribution of the second term.
            It may be viewed as a scaling down of TF by the largest TF
            value in document.
        :returns float:
            0.0 <= frequency <= 1.0, where 0 means no occurrence in document
            and 1 the most frequent term in document.
        """
        frequency = self.term_frequency(term) / self._max_frequency
        return smooth + (1.0 - smooth)*frequency

    def __repr__(self):
        return "<TfDocumentModel %s>" % pformat(self._terms)

In [5]:
# Utils
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import sys
import requests
import pkgutil

from functools import wraps
from contextlib import closing
from os.path import dirname, abspath, join

from pycountry import languages

_HTTP_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0",
    # "User-Agent": "Sumy (Automatic text summarizer) Version/%s" % __version__,
}


def normalize_language(language):
    for lookup_key in ("alpha_2", "alpha_3"):
        try:
            lang = languages.get(**{lookup_key: language})
            if lang:
                language = lang.name.lower()
        except KeyError:
            pass

    return language


def fetch_url(url, timeout=(3.05, 30)):
    with closing(requests.get(url, headers=_HTTP_HEADERS, timeout=timeout)) as response:
        response.raise_for_status()
        return response.content


def cached_property(getter):
    """
    Decorator that converts a method into memoized property.
    The decorator works as expected only for classes with
    attribute '__dict__' and immutable properties.
    """
    @wraps(getter)
    def decorator(self):
        key = "_cached_property_" + getter.__name__

        if not hasattr(self, key):
            setattr(self, key, getter(self))

        return getattr(self, key)

    return property(decorator)


def expand_resource_path(path):
    directory = dirname(sys.modules["sumy"].__file__)
    directory = abspath(directory)
    return join(directory, to_string("data"), to_string(path))


def get_stop_words(language):
    language = normalize_language(language)
    try:
        stopwords_data = pkgutil.get_data("sumy", "data/stopwords/%s.txt" % language)
    except IOError:
        raise LookupError("Stop-words are not available for language %s." % language)
    return parse_stop_words(stopwords_data)


def read_stop_words(filename):
    with open(filename, "rb") as open_file:
        return parse_stop_words(open_file.read())


def parse_stop_words(data):
    return frozenset(w.rstrip() for w in to_unicode(data).splitlines() if w)


class ItemsCount(object):
    def __init__(self, value):
        self._value = value

    def __call__(self, sequence):
        if isinstance(self._value, string_types):
            if self._value.endswith("%"):
                total_count = len(sequence)
                percentage = int(self._value[:-1])
                # at least one sentence should be chosen
                count = max(1, total_count*percentage // 100)
                return sequence[:count]
            else:
                return sequence[:int(self._value)]
        elif isinstance(self._value, (int, float)):
            return sequence[:int(self._value)]
        else:
            ValueError("Unsuported value of items count '%s'." % self._value)

    def __repr__(self):
        return to_string("<ItemsCount: %r>" % self._value)

In [6]:
from collections import namedtuple
from operator import attrgetter

def null_stemmer(object):
    """Converts given object to unicode with lower letters."""
    return to_unicode(object).lower()

SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",))


class AbstractSummarizer(object):
    def __init__(self, stemmer=null_stemmer):
        if not callable(stemmer):
            raise ValueError("Stemmer has to be a callable object")

        self._stemmer = stemmer

    def __call__(self, document, sentences_count):
        raise NotImplementedError("This method should be overriden in subclass")

    def stem_word(self, word):
        return self._stemmer(self.normalize_word(word))

    @staticmethod
    def normalize_word(word):
        return to_unicode(word).lower()

    @staticmethod
    def _get_best_sentences(sentences, count, rating, *args, **kwargs):
        rate = rating
        if isinstance(rating, dict):
            assert not args and not kwargs
            def rate(s): return rating[s]

        infos = (SentenceInfo(s, o, rate(s, *args, **kwargs))
            for o, s in enumerate(sentences))

        # sort sentences by rating in descending order
        infos = sorted(infos, key=attrgetter("rating"), reverse=True)
        # get `count` first best rated sentences
        if not callable(count):
            count = ItemsCount(count)
        infos = count(infos)
        # sort sentences by their order in document
        infos = sorted(infos, key=attrgetter("order"))

        return tuple(i.sentence for i in infos)

In [7]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals


class LuhnSummarizer(AbstractSummarizer):
    max_gap_size = 4
    # TODO: better recognition of significant words (automatic)
    significant_percentage = 1
    _stop_words = frozenset()

    @property
    def stop_words(self):
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count):
        words = self._get_significant_words(nltk.word_tokenize(sentence) for sentence in document)
        return self._get_best_sentences(nltk.sent_tokenize(document),
            sentences_count, self.rate_sentence, words)

    def _get_significant_words(self, words):
        words = map(self.normalize_word, words)
        words = tuple(self.stem_word(w) for w in words if w not in self._stop_words)

        model = TfDocumentModel(words)

        # take only best `significant_percentage` % words
        best_words_count = int(len(words) * self.significant_percentage)
        words = model.most_frequent_terms(best_words_count)

        # take only words contained multiple times in document
        return tuple(t for t in words if model.term_frequency(t) > 1)

    def rate_sentence(self, sentence, significant_stems):
        ratings = self._get_chunk_ratings(sentence, significant_stems)
        return max(ratings) if ratings else 0

    def _get_chunk_ratings(self, sentence, significant_stems):
        chunks = []
        NONSIGNIFICANT_CHUNK = [0]*self.max_gap_size

        in_chunk = False
        for order, word in enumerate(nltk.word_tokenize(sentence)):
            stem = self.stem_word(word)
            # new chunk
            if stem in significant_stems and not in_chunk:
                in_chunk = True
                chunks.append([1])
            # append word to chunk
            elif in_chunk:
                is_significant_word = int(stem in significant_stems)
                chunks[-1].append(is_significant_word)

            # end of chunk
            if chunks and chunks[-1][-self.max_gap_size:] == NONSIGNIFICANT_CHUNK:
                in_chunk = False

        return tuple(map(self._get_chunk_rating, chunks))

    def _get_chunk_rating(self, chunk):
        chunk = self.__remove_trailing_zeros(chunk)
        words_count = len(chunk)
        assert words_count > 0

        significant_words = sum(chunk)
        if significant_words == 1:
            return 0
        else:
            return significant_words**2 / words_count

    def __remove_trailing_zeros(self, collection):
        """Removes trailing zeroes from indexable collection of numbers"""
        index = len(collection) - 1
        while index >= 0 and collection[index] == 0:
            index -= 1

        return collection[:index + 1]


In [8]:
import nltk
nltk.download('punkt')

summarizer = LuhnSummarizer()
predicted_summary = []
for article in train_article_set[:30000]:
  summary = summarizer(article, 2)
  summ = ""
  for s in summary:
    summ += s
  predicted_summary.append(summ)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu
import numpy as np

# Compute the ROUGE metrics
rouge = Rouge()

rouge_scores = rouge.get_scores(predicted_summary, train_highlights_set[:30000], avg=True)

print("ROUGE scores:")
print(rouge_scores)

score_1 = round(rouge_scores['rouge-1']['f'], 2)    
score_2 = round(rouge_scores['rouge-2']['f'], 2)    
score_L = round(rouge_scores['rouge-l']['f'], 2)    
print("rouge1:", score_1, "| rouge2:", score_2, "| rougeL:",
         score_2, "--> avg rouge:", round(np.mean(
         [score_1,score_2,score_L]), 2))
# Compute the BLEU metrics
bleu_scores = corpus_bleu([[summary] for summary in predicted_summary], train_highlights_set[:30000])

print("BLEU score:")
print(bleu_scores)


ROUGE scores:
{'rouge-1': {'r': 0.30795307792340226, 'p': 0.2602783144833186, 'f': 0.27518810135341126}, 'rouge-2': {'r': 0.11018673345219916, 'p': 0.08891481138779438, 'f': 0.09565356937089269}, 'rouge-l': {'r': 0.28090255543678955, 'p': 0.2376418754879394, 'f': 0.2510840490332295}}
rouge1: 0.28 | rouge2: 0.1 | rougeL: 0.1 --> avg rouge: 0.21
BLEU score:
0.39533439178146246
