In [None]:
# Install the Python dependencies. %pip (rather than !pip) installs into the
# environment of the running kernel.
%pip install datasets rouge nltk sumy
# Fetch the GloVe 6B archive; -nc skips the download when the file already
# exists, so re-running the cell does not create a second "glove.6B.zip.1".
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
# Unpack the archive; -n never overwrites already-extracted files, which keeps
# unzip from stopping at an interactive overwrite prompt on a re-run.
!unzip -n glove.6B.zip

In [None]:
from datasets import list_datasets
from datasets import load_dataset


# Load the 'cnn_dailymail' dataset, configuration '3.0.0'.
dataset = load_dataset('cnn_dailymail', '3.0.0')

# For every split, pull out the article texts and their reference highlights.
train_article_set, train_highlights_set = (
    dataset['train'][column] for column in ('article', 'highlights')
)

validation_article_set, validation_highlights_set = (
    dataset['validation'][column] for column in ('article', 'highlights')
)

test_article_set, test_highlights_set = (
    dataset['test'][column] for column in ('article', 'highlights')
)

TextRank using sumy

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import nltk
import numpy as np


# Fetch the NLTK 'punkt' resource (sentence tokenizer data).
nltk.download('punkt')

# Number of sentences kept in each extractive summary.
summary_length = 1

# Neither the tokenizer nor the summarizer depends on the article being
# processed, so build them once instead of once per article.
tokenizer = Tokenizer("english")
summarizer = TextRankSummarizer()

predicted_summary = []

for text in validation_article_set:
  # Parse the raw article text into a sumy document
  parser = PlaintextParser.from_string(text, tokenizer)

  # Pick the top `summary_length` sentences with TextRank
  summary = summarizer(parser.document, summary_length)

  # Join the selected sentences with a space so that, when more than one
  # sentence is requested, the last word of one sentence is not fused with
  # the first word of the next.
  predicted_summary.append(" ".join(str(sentence) for sentence in summary))


from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu

# Compute the ROUGE metrics
rouge = Rouge()
# Materialise the references as a plain list so hypotheses and references are
# the same container type when handed to the scorer.
reference_summaries = list(validation_highlights_set)
# ignore_empty=True drops pairs in which either text is an empty string; the
# scorer cannot score an empty hypothesis.
rouge_scores = rouge.get_scores(predicted_summary, reference_summaries,
                                avg=True, ignore_empty=True)

print("ROUGE scores:")
print(rouge_scores)
score_1 = round(rouge_scores['rouge-1']['f'], 2)
score_2 = round(rouge_scores['rouge-2']['f'], 2)
score_L = round(rouge_scores['rouge-l']['f'], 2)
# The rougeL column reports score_L (it previously repeated score_2).
print("rouge1:", score_1, "| rouge2:", score_2, "| rougeL:",
         score_L, "--> avg rouge:", round(np.mean(
         [score_1,score_2,score_L]), 2))

# Compute the BLEU metrics
# corpus_bleu(list_of_references, hypotheses) takes the references first, as
# one list of tokenised references per hypothesis, and every text as a list of
# tokens. Passing raw strings would make it count character n-grams.
bleu_references = [[reference.split()] for reference in reference_summaries]
bleu_hypotheses = [prediction.split() for prediction in predicted_summary]
bleu_scores = corpus_bleu(bleu_references, bleu_hypotheses)

print("BLEU score:")
print(bleu_scores)

TextRank using cosine similarity

In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') 
import re

In [None]:
# Extract word vectors
# Each line of the file is "<word> <v1> <v2> ...": the first field is the
# token and the remaining fields are its vector components.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
# Show how many word vectors were loaded.
len(word_embeddings)

In [36]:
import nltk
from nltk.tokenize import sent_tokenize
def preprocess_text(article):
  """Split an article into sentences and normalise each sentence.

  Args:
    article: the article text as a single string.

  Returns:
    A list with one string per sentence found by `sent_tokenize`, in the
    original order, where every character other than an ASCII letter has
    been replaced by a space and the result has been lower-cased. An article
    with no sentences gives an empty list.
  """
  sentences = sent_tokenize(article)

  # remove punctuations, numbers and special characters, then lowercase.
  # re.sub always treats the pattern as a regular expression, whereas pandas'
  # Series.str.replace defaults to literal matching (regex=False) from
  # pandas 2.0 on, which would leave the sentences uncleaned.
  return [re.sub("[^a-zA-Z]", " ", sentence).lower() for sentence in sentences]

In [None]:
# Fetch the NLTK 'stopwords' resource before importing the corpus reader.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words; remove_stopwords filters tokens against this collection.
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(sen):
    """Filter stop words out of a token sequence.

    Args:
        sen: an iterable of tokens (strings).

    Returns:
        A single string containing the tokens that are not in the
        module-level `stop_words`, in their original order, separated by
        single spaces. The result is an empty string when nothing is kept.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


In [17]:
# Extract word vectors
# The same file is also parsed by an identical cell earlier in the notebook,
# so the parse is skipped when `word_embeddings` is already populated.
if not globals().get('word_embeddings'):
    word_embeddings = {}
    # The context manager closes the file even if a line fails to parse.
    with open('glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            # Each line is "<word> <v1> <v2> ...".
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs

In [25]:

def create_vectors(clean_sentences):
  """Build one 100-dimensional vector per sentence from word embeddings.

  Each sentence is split on whitespace and passed through `remove_stopwords`.
  The vectors of the remaining words are looked up in `word_embeddings`
  (a zero vector stands in for unknown words), summed, and divided by
  (word count + 0.001). A sentence that is empty after stop-word removal
  is represented by a zero vector.

  Args:
    clean_sentences: list of sentence strings.

  Returns:
    A list of vectors, one per input sentence, in input order.
  """
  sentence_vectors = []
  for raw_sentence in clean_sentences:
    # remove stopwords from the sentence
    filtered = remove_stopwords(raw_sentence.split())
    if len(filtered) == 0:
      sentence_vectors.append(np.zeros((100,)))
      continue

    tokens = filtered.split()
    # Accumulate left to right starting from 0, exactly like built-in sum().
    total = 0
    for token in tokens:
      total = total + word_embeddings.get(token, np.zeros((100,)))
    # The 0.001 keeps the denominator away from zero.
    sentence_vectors.append(total / (len(tokens) + 0.001))

  return sentence_vectors

In [55]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

def rank_sentences(article):
  """Rank the sentences of an article with PageRank over cosine similarity.

  The article is cleaned with `preprocess_text`, each cleaned sentence is
  embedded with `create_vectors`, and a graph is built whose edge weights are
  the pairwise cosine similarities of those vectors (no self-loops).
  `nx.pagerank` then scores every sentence.

  Args:
    article: the article text as a single string.

  Returns:
    A list of (score, cleaned_sentence) tuples sorted from highest to lowest
    score. The list is empty when the article contains no sentences.
  """
  clean_sentences = preprocess_text(article)
  if not clean_sentences:
    # Nothing to rank (and np.vstack rejects an empty sequence).
    return []

  sentence_vectors = create_vectors(clean_sentences)

  # similarity matrix: a single pairwise call replaces one call per sentence
  # pair and no longer assumes a vector length of 100.
  sim_mat = np.array(cosine_similarity(np.vstack(sentence_vectors)), dtype=float)
  # A sentence is not compared with itself.
  np.fill_diagonal(sim_mat, 0.0)

  nx_graph = nx.from_numpy_array(sim_mat)
  scores = nx.pagerank(nx_graph)

  ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(clean_sentences)), reverse=True)
  return ranked_sentences
  



In [None]:
predicted_summaries = []

# Number of top-ranked sentences that make up each summary.
sn = 2

for text in train_article_set:

  ranked_sentences = rank_sentences(text)
  # Generate summary
  # Slicing takes the top `sn` sentences, or fewer when the article is
  # shorter; an article with no sentences gives an empty summary instead of
  # an IndexError, which keeps predictions aligned with the references.
  # Joining with a space stops the last word of one sentence from being fused
  # with the first word of the next.
  summary = " ".join(sentence for _score, sentence in ranked_sentences[:sn])
  predicted_summaries.append(summary)


In [None]:
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu

# Compute the ROUGE metrics
rouge = Rouge()

# Materialise the references as a plain list so hypotheses and references are
# the same container type when handed to the scorer.
reference_summaries = list(train_highlights_set)
# ignore_empty=True drops pairs in which either text is an empty string; the
# scorer cannot score an empty hypothesis.
rouge_scores = rouge.get_scores(predicted_summaries, reference_summaries,
                                avg=True, ignore_empty=True)

print("ROUGE scores:")
print(rouge_scores)

score_1 = round(rouge_scores['rouge-1']['f'], 2)
score_2 = round(rouge_scores['rouge-2']['f'], 2)
score_L = round(rouge_scores['rouge-l']['f'], 2)
# The rougeL column reports score_L (it previously repeated score_2).
print("rouge1:", score_1, "| rouge2:", score_2, "| rougeL:",
         score_L, "--> avg rouge:", round(np.mean(
         [score_1,score_2,score_L]), 2))

# Compute the BLEU metrics
# corpus_bleu(list_of_references, hypotheses) takes the references first, as
# one list of tokenised references per hypothesis, and every text as a list of
# tokens. Passing raw strings would make it count character n-grams.
bleu_references = [[reference.split()] for reference in reference_summaries]
bleu_hypotheses = [prediction.split() for prediction in predicted_summaries]
bleu_scores = corpus_bleu(bleu_references, bleu_hypotheses)

print("BLEU score:")
print(bleu_scores)