In [None]:
!pip install transformers
!pip install datasets
!pip install lexrank
!pip install rouge
!pip install sumy
!pip install nltk
!pip install gensim

from datasets import list_datasets
from datasets import load_dataset

#cnn mail daily

In [None]:
dataset = load_dataset('cnn_dailymail', '3.0.0')

# split data in train, validation, test
train_article_set = dataset['train']['article']
train_highlights_set = dataset['train']['highlights']

validation_article_set = dataset['validation']['article']
validation_highlights_set = dataset['validation']['highlights']

test_article_set = dataset['test']['article']
test_highlights_set = dataset['test']['highlights']


In [None]:
import math
from collections import Counter, defaultdict
import numpy as np
import regex
from urlextract import URLExtract
from scipy.sparse.csgraph import connected_components
import nltk
from sumy.nlp.stemmers import Stemmer
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')

EMAIL_REGEX = regex.compile(
    r'[\p{L}0-9]+[\p{L}0-9_.+-]*[\p{L}0-9_+-]+@[\p{L}0-9]+[\p{L}0-9.-]*\.\p{L}+'  # noqa
)
PUNCTUATION_SIGNS = set('.,;:¡!¿?…⋯&‹›«»\"“”[]()⟨⟩}{/|\\')

url_extractor = URLExtract()


def clean_text(text, allowed_chars='- '):
    text = ' '.join(text.lower().split())
    text = ''.join(ch for ch in text if ch.isalnum() or ch in allowed_chars)

    return text


def contains_letters(word):
    return any(ch.isalpha() for ch in word)


def contains_numbers(word):
    return any(ch.isdigit() for ch in word)


def filter_words(words, stopwords, keep_numbers=False):
    if keep_numbers:
        words = [
            word for word in words
            if (contains_letters(word) or contains_numbers(word))
            and word not in stopwords
        ]

    else:
        words = [
            word for word in words
            if contains_letters(word) and not contains_numbers(word)
            and word not in stopwords
        ]

    return words


def separate_punctuation(text):
    text_punctuation = set(text) & PUNCTUATION_SIGNS

    for ch in text_punctuation:
        text = text.replace(ch, ' ' + ch + ' ')

    return text

def tokenize(
    text,
    stopwords,
    keep_numbers=False,
    keep_emails=False,
    keep_urls=False,
):
    tokens = []

    for word in text.split():
        emails = EMAIL_REGEX.findall(word)

        if emails:
            if keep_emails:
                tokens.append(emails[0])

            continue

        urls = url_extractor.find_urls(word, only_unique=True)

        if urls:
            if keep_urls:
                tokens.append(urls[0].lower())

            continue

        cleaned = clean_text(separate_punctuation(word)).split()
        cleaned = filter_words(cleaned, stopwords, keep_numbers=keep_numbers)

        tokens.extend(cleaned)

    return tokens

def _power_method(transition_matrix, increase_power=True):
    eigenvector = np.ones(len(transition_matrix))

    if len(eigenvector) == 1:
        return eigenvector

    transition = transition_matrix.transpose()

    while True:
        eigenvector_next = np.dot(transition, eigenvector)

        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next

        eigenvector = eigenvector_next

        if increase_power:
            transition = np.dot(transition, transition)


def connected_nodes(matrix):
    _, labels = connected_components(matrix)

    groups = []

    for tag in np.unique(labels):
        group = np.where(labels == tag)[0]
        groups.append(group)

    return groups
def create_markov_matrix(weights_matrix):
    n_1, n_2 = weights_matrix.shape
    if n_1 != n_2:
        raise ValueError('\'weights_matrix\' should be square')

    row_sum = weights_matrix.sum(axis=1, keepdims=True)

    return weights_matrix / row_sum

def create_markov_matrix_discrete(weights_matrix, threshold):
    discrete_weights_matrix = np.zeros(weights_matrix.shape)
    ixs = np.where(weights_matrix >= threshold)
    discrete_weights_matrix[ixs] = 1

    return create_markov_matrix(discrete_weights_matrix)

def graph_nodes_clusters(transition_matrix, increase_power=True):
    clusters = connected_nodes(transition_matrix)
    clusters.sort(key=len, reverse=True)

    centroid_scores = []

    for group in clusters:
        t_matrix = transition_matrix[np.ix_(group, group)]
        eigenvector = _power_method(t_matrix, increase_power=increase_power)
        centroid_scores.append(eigenvector / len(group))

    return clusters, centroid_scores

def stationary_distribution(
    transition_matrix,
    increase_power=True,
    normalized=True,
):
    n_1, n_2 = transition_matrix.shape
    if n_1 != n_2:
        raise ValueError('\'transition_matrix\' should be square')

    distribution = np.zeros(n_1)

    grouped_indices = connected_nodes(transition_matrix)

    for group in grouped_indices:
        t_matrix = transition_matrix[np.ix_(group, group)]
        eigenvector = _power_method(t_matrix, increase_power=increase_power)
        distribution[group] = eigenvector

    if normalized:
        distribution /= n_1

    return distribution

def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    if not (
        threshold is None
        or isinstance(threshold, float)
        and 0 <= threshold < 1
    ):
        raise ValueError(
            '\'threshold\' should be a floating-point number '
            'from the interval [0, 1) or None',
        )

    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)

    else:
        markov_matrix = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    scores = stationary_distribution(
        markov_matrix,
        increase_power=increase_power,
        normalized=False,
    )

    return scores


class LexRank:
    def __init__(
        self,
        documents,
        stopwords=None,
        keep_numbers=False,
        keep_emails=False,
        keep_urls=False,
        include_new_words=True,
    ):
        if stopwords is None:
            self.stopwords = set()
        else:
            self.stopwords = stopwords

        self.keep_numbers = keep_numbers
        self.keep_emails = keep_emails
        self.keep_urls = keep_urls
        self.include_new_words = include_new_words

        self.idf_score = self._calculate_idf(documents)

    def get_summary(
        self,
        sentences,
        summary_size,
        threshold,
        fast_power_method=True,
    ):
        if not isinstance(summary_size, int) or summary_size < 1:
            raise ValueError('\'summary_size\' should be a positive integer')

        lex_scores = self.rank_sentences(
            sentences,
            threshold=threshold,
            fast_power_method=fast_power_method,
        )

        sorted_ix = np.argsort(lex_scores)[::-1]
        summary = [sentences[i] for i in sorted_ix[:summary_size]]

        return summary

    def rank_sentences(
        self,
        sentences,
        threshold=.03,
        fast_power_method=True,
    ):
        tf_scores = [
            Counter(self.tokenize_sentence(sentence)) for sentence in sentences
        ]

        similarity_matrix = self._calculate_similarity_matrix(tf_scores)

        scores = degree_centrality_scores(
            similarity_matrix,
            threshold=threshold,
            increase_power=fast_power_method,
        )

        return scores

    def sentences_similarity(self, sentence_1, sentence_2):
        tf_1 = Counter(self.tokenize_sentence(sentence_1))
        tf_2 = Counter(self.tokenize_sentence(sentence_2))

        similarity = self._idf_modified_cosine([tf_1, tf_2], 0, 1)

        return similarity

    def tokenize_sentence(self, sentence):
        tokens = tokenize(
            sentence,
            self.stopwords,
            keep_numbers=self.keep_numbers,
            keep_emails=self.keep_emails,
            keep_urls=self.keep_urls,
        )

        return tokens

    def _calculate_idf(self, documents):
        bags_of_words = []

        for doc in documents:
            doc_words = set()

            for sentence in doc:
                words = self.tokenize_sentence(sentence)
                doc_words.update(words)

            if doc_words:
                bags_of_words.append(doc_words)

        if not bags_of_words:
            raise ValueError('documents are not informative')

        doc_number_total = len(bags_of_words)

        if self.include_new_words:
            default_value = 1

        else:
            default_value = 0

        idf_score = defaultdict(lambda: default_value)

        for word in set.union(*bags_of_words):
            doc_number_word = sum(1 for bag in bags_of_words if word in bag)
            idf_score[word] = math.log(doc_number_total / doc_number_word)

        return idf_score

    def _calculate_similarity_matrix(self, tf_scores):
        length = len(tf_scores)

        similarity_matrix = np.zeros([length] * 2)

        for i in range(length):
            for j in range(i, length):
                similarity = self._idf_modified_cosine(tf_scores, i, j)

                if similarity:
                    similarity_matrix[i, j] = similarity
                    similarity_matrix[j, i] = similarity

        return similarity_matrix

    def _idf_modified_cosine(self, tf_scores, i, j):
        if i == j:
            return 1

        tf_i, tf_j = tf_scores[i], tf_scores[j]
        words_i, words_j = set(tf_i.keys()), set(tf_j.keys())

        nominator = 0

        for word in words_i & words_j:
            idf = self.idf_score[word]
            nominator += tf_i[word] * tf_j[word] * idf ** 2

        if math.isclose(nominator, 0):
            return 0

        denominator_i, denominator_j = 0, 0

        for word in words_i:
            tfidf = tf_i[word] * self.idf_score[word]
            denominator_i += tfidf ** 2

        for word in words_j:
            tfidf = tf_j[word] * self.idf_score[word]
            denominator_j += tfidf ** 2

        similarity = nominator / math.sqrt(denominator_i * denominator_j)

        return similarity

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lxr = LexRank(train_article_set[:20000], stopwords=stop_words)




In [5]:

# Calculate rouge and bleu metrics for validation set of articles
predicted_summaries = []

for text in validation_article_set[:10000]:
  summary = lxr.get_summary(text.split(". "), summary_size=2, threshold=.1)
  summary = '. '.join(summary)
  predicted_summaries.append(summary)
  #print(validation_article_set[0])
  #print(summary)

# Compute the ROUGE metrics
rouge = Rouge()
rouge_scores = rouge.get_scores(predicted_summaries, validation_highlights_set[:10000], avg=True)

print("ROUGE scores:")
print(rouge_scores)

score_1 = round(rouge_scores['rouge-1']['f'], 2)    
score_2 = round(rouge_scores['rouge-2']['f'], 2)    
score_L = round(rouge_scores['rouge-l']['f'], 2)    
print("rouge1:", score_1, "| rouge2:", score_2, "| rougeL:",
         score_2, "--> avg rouge:", round(np.mean(
         [score_1,score_2,score_L]), 2))

# Compute the BLEU metrics
bleu_scores = corpus_bleu([[summary] for summary in predicted_summaries], validation_highlights_set[:10000])

print("BLEU score:")
print(bleu_scores)

ROUGE scores:
{'rouge-1': {'r': 0.2945798196655688, 'p': 0.3151642471394741, 'f': 0.28978638281526614}, 'rouge-2': {'r': 0.09879655918793674, 'p': 0.10420366552923813, 'f': 0.09563913235156897}, 'rouge-l': {'r': 0.26681010395950344, 'p': 0.2863445242775333, 'f': 0.2628201212308664}}
rouge1: 0.29 | rouge2: 0.1 | rougeL: 0.1 --> avg rouge: 0.22
BLEU score:
0.41883299293731435


In [6]:
p_summaries = []

for text in test_article_set:
  summary = lxr.get_summary(text.split(". "), summary_size=2, threshold=.1)
  summary = '. '.join(summary)
  p_summaries.append(summary)
  #print(validation_article_set[0])
  #print(summary)

# Compute the ROUGE metrics
rouge = Rouge()
rouge_scores = rouge.get_scores(p_summaries, test_highlights_set, avg=True)

print("ROUGE scores:")
print(rouge_scores)

score_1 = round(rouge_scores['rouge-1']['f'], 2)    
score_2 = round(rouge_scores['rouge-2']['f'], 2)    
score_L = round(rouge_scores['rouge-l']['f'], 2)    
print("rouge1:", score_1, "| rouge2:", score_2, "| rougeL:",
         score_2, "--> avg rouge:", round(np.mean(
         [score_1,score_2,score_L]), 2))

# Compute the BLEU metrics
bleu_scores = corpus_bleu([[summary] for summary in p_summaries], test_highlights_set)

print("BLEU score:")
print(bleu_scores)

ROUGE scores:
{'rouge-1': {'r': 0.2964326842090193, 'p': 0.30982900982098954, 'f': 0.28760197181796165}, 'rouge-2': {'r': 0.09893378444815194, 'p': 0.10210414020021089, 'f': 0.09439111229852885}, 'rouge-l': {'r': 0.26810634006438644, 'p': 0.2812412837900786, 'f': 0.2605507773980212}}
rouge1: 0.29 | rouge2: 0.09 | rougeL: 0.09 --> avg rouge: 0.21
BLEU score:
0.42031074855816086
