In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
! pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [11]:
import math
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy.linalg import svd as singular_value_decomposition
from nltk.corpus import stopwords
from operator import attrgetter
from collections import namedtuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import normalize
from rouge import Rouge
import statistics
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [12]:
#Please enter the path of new_summary.csv file
df = pd.read_csv("/content/drive/MyDrive/Academics/Sem6/news_summary.csv",encoding='iso-8859-1')

In [13]:
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [14]:
df['article'] = df['ctext']
df['summary'] = df['text']

Remove extra features like author, date, article link which does not affect news summary

Drop the null values and reset index

In [15]:
df.drop(['author','date','read_more','text','ctext'],axis=1,inplace=True)
df.dropna(inplace=True)
df.reset_index(inplace = True)

In [16]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [17]:
def lemmatize_tokenize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

def join_words(lst):
  return ' '.join(lst)

Performing stemming and space tokenization to clean the article and summary

In [18]:
df['article'] = df['article'].apply(lemmatize_tokenize_text)
df['summary'] = df['summary'].apply(lemmatize_tokenize_text)

In [19]:
df['article'] = df['article'].apply(join_words)
df['summary'] = df['summary'].apply(join_words)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4396 entries, 0 to 4395
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      4396 non-null   int64 
 1   headlines  4396 non-null   object
 2   article    4396 non-null   object
 3   summary    4396 non-null   object
dtypes: int64(1), object(3)
memory usage: 137.5+ KB


In [21]:
df['article'][0]

'The Daman and Diu administration on Wednesday withdrew a circular that asked woman staff to tie rakhis on male colleague after the order triggered a backlash from employee and wa ripped apart on social media.The union territory?s administration wa forced to retreat within 24 hour of issuing the circular that made it compulsory for it staff to celebrate Rakshabandhan at workplace.?It ha been decided to celebrate the festival of Rakshabandhan on August 7. In this connection, all offices/ department shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report wa to be sent to the government the next evening.The two notification ? one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ? were issued by the Daman and Diu 

In [22]:
df['summary'][0]

'The Administration of Union Territory Daman and Diu ha revoked it order that made it compulsory for woman to tie rakhis to their male colleague on the occasion of Rakshabandhan on August 7. The administration wa forced to withdraw the decision within 24 hour of issuing the circular after it received flak from employee and wa slammed on social media.'

In [23]:
stop_words = list(stopwords.words('english'))
MIN_DIMENSIONS = 3
REDUCTION_RATIO = 1/5
SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",))


In [24]:
#Creating word dictionary, where key is the word and value is the row index
#We also remove the stop words before adding them in dictionary and change everyone to lowercase
def to_lower(word):
  return word.lower()

def create_dictionary(article):
    words = word_tokenize(article)
    words = tuple(words)
    words = map(to_lower,words)
    unique_words = frozenset(w for w in words if w not in stop_words)

    return dict((w, i) for i, w in enumerate(unique_words))

In [25]:
#Create the word document matrix using text article and its corresponding dictionary
#Sentance tokenize the article and then store the frequency(stored in dictionary) for words corresponding to each sentence
def create_matrix(article, dictionary):
    sentences = sent_tokenize(article)
    words_count = len(dictionary)
    sentences_count = len(sentences)
    matrix = np.zeros((words_count, sentences_count))
    for col, sentence in enumerate(sentences):
        words = word_tokenize(sentence)
        for word in words:
            # only valid words is counted (not stop-words, ...)
            if word in dictionary:
                row = dictionary[word]
                matrix[row, col] += 1

    return matrix 


In [26]:
#Normalize the matrix by dividing each column with its max value
def compute_term_freq(matrix):
    smooth=0.4
    max_word_frequencies = np.max(matrix, axis=0)
    rows, cols = matrix.shape
    for row in range(rows):
        for col in range(cols):
            max_word_frequency = max_word_frequencies[col]
            if max_word_frequency != 0:
                frequency = matrix[row, col]/max_word_frequency
                matrix[row, col] = smooth + (1.0 - smooth)*frequency

    return matrix

In [27]:
#Perform truncated SVD by extracting the top topics and then multiplying them to get the ranks
#We extract the top columns(columns having highest values) and multiply them with v_matrix
def compute_rank(sigma, v_matrix):
    dimensions = max(MIN_DIMENSIONS,int(len(sigma)*REDUCTION_RATIO))
    powered_sigma = tuple(s**2 if i < dimensions else 0.0
        for i, s in enumerate(sigma))

    ranks = []
    
    for column_vector in v_matrix.T:
        rank = sum(s*v**2 for s, v in zip(powered_sigma, column_vector))
        ranks.append(math.sqrt(rank))

    return ranks

In [28]:
class ItemsCount(object):
    def __init__(self, value):
        self._value = value

    def __call__(self, sequence):
        if isinstance(self._value, (bytes, str,)):
            if self._value.endswith("%"):
                total_count = len(sequence)
                percentage = int(self._value[:-1])
                # at least one sentence should be chosen
                count = max(1, total_count*percentage // 100)
                return sequence[:count]
            else:
                return sequence[:int(self._value)]
        elif isinstance(self._value, (int, float)):
            return sequence[:int(self._value)]

    def __repr__(self):
        return to_string("<ItemsCount: %r>" % self._value)

In [29]:
#Returns the top sentences on the basis of their rating
def get_top_sentence(sentences, count, rating, *args, **kwargs):
    rate = rating
    if isinstance(rating, dict):
        rate = lambda s: rating[s]
    
    infos = (SentenceInfo(s, o, rate(s, *args, **kwargs))
        for o, s in enumerate(sentences))
    # sort sentences by rating in descending order
    infos = sorted(infos, key=attrgetter("rating"), reverse=True)
    # get `count` first best rated sentences
    if not isinstance(count, ItemsCount):
        count = ItemsCount(count)
    infos = count(infos)
    # sort sentences by their order in document
    infos = sorted(infos, key=attrgetter("order"))

    return tuple(i.sentence for i in infos)

In [30]:
#Perform LSA Summarization
def text_summarizer(article,summary_len=1):
  article = str(article)
  stop_words = list(stopwords.words('english'))
  dictionary = create_dictionary(article)
  sentences = sent_tokenize(article)
  matrix = create_matrix(article,dictionary)
  matrix = compute_term_freq(matrix)
  u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
  ranks = iter(compute_rank(sigma, v))
  summarized_sentance = get_top_sentence(sentences,summary_len,lambda s: next(ranks))
  return ' '.join(summarized_sentance)

We have assumed the best predicted summarized sentence(summary of length 1) to be our predicted_headline.

We have applied text summarization on article to get predicted_summary of length 3.

In [31]:
df['predicted_headline'] = df['article'].apply(text_summarizer)

In [32]:
df['predicted_summary'] = df['article'].apply(lambda x : text_summarizer(x,3))

In [33]:
df.tail()

Unnamed: 0,index,headlines,article,summary,predicted_headline,predicted_summary
4391,4509,Rasna seeking ?250 cr revenue from snack categ...,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m...",Fruit juice concentrate maker Rasna is eyeing ...,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m...","Mumbai, Feb 23 (PTI) Fruit juice concentrate m..."
4392,4510,Sachin attends Rajya Sabha after questions on ...,Former cricketer Sachin Tendulkar wa spotted i...,Former Indian cricketer Sachin Tendulkar atten...,Boxer Mary Kom too wa present in the House tod...,Former cricketer Sachin Tendulkar wa spotted i...
4393,4511,Shouldn't rob their childhood: Aamir on kids r...,"Aamir Khan, whose last film Dangal told the st...","Aamir Khan, while talking about reality show o...",But it cut both way (referring to kid getting ...,"When our kid were younger, we used to show the..."
4394,4512,"Asha Bhosle gets ?53,000 power bill for unused...",Maharahstra Power Minister Chandrashekhar Bawa...,The Maharashtra government ha initiated an inq...,"The textile industry in hub like Bhiwandi, Ich...",Maharahstra Power Minister Chandrashekhar Bawa...
4395,4513,More than half of India's languages may die in...,More than half of the language spoken by India...,At least 400 language or more than half langua...,Similarly there are several old language which...,More than half of the language spoken by India...


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4396 entries, 0 to 4395
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   index               4396 non-null   int64 
 1   headlines           4396 non-null   object
 2   article             4396 non-null   object
 3   summary             4396 non-null   object
 4   predicted_headline  4396 non-null   object
 5   predicted_summary   4396 non-null   object
dtypes: int64(1), object(5)
memory usage: 206.2+ KB


In [35]:
#This function calculates the SVD similarity between 2 texts.
#We first create matrix of both the texts and then factorize them
#Then the u matrix is normalized and its 1st column is multiplied to get the topic similarity between 2 different texts
def svd_similarity(text1,text2):
  text1 = str(text1)
  dictionary = create_dictionary(text1)
  sentences = sent_tokenize(text1)
  matrix = create_matrix(text1,dictionary)
  matrix = compute_term_freq(matrix)
  u1, sigma1, v1 = singular_value_decomposition(matrix, full_matrices=False) 
  text2 = str(text2)
  dictionary = create_dictionary(text2)
  sentences = sent_tokenize(text2)
  matrix = create_matrix(text1,dictionary)
  matrix = compute_term_freq(matrix)
  u2, sigma2, v2 = singular_value_decomposition(matrix, full_matrices=False) 
  u1 = u1[:,0].reshape((u1.shape[0],1))
  u2 = u2[:,0].reshape((u2.shape[0],1))
  normalized_u1 = normalize(u1, axis=0).ravel()
  normalized_u2 = normalize(u2, axis=0).ravel()
  similarity = 0
  for i in range(min(len(normalized_u1),len(normalized_u2))):
    similarity += (normalized_u1[i]*normalized_u2[i])
  return similarity

In [36]:
#We calculated cosine similarity between 2 different texts
def text_similarity(text1, text2):
  count_vector = CountVectorizer()
  corpus = [text1,text2]
  X_train_counts = count_vector.fit_transform(corpus)
  pd.DataFrame(X_train_counts.toarray(),columns=count_vector.get_feature_names_out (),index=['text1','text2'])
  vectorizer = TfidfVectorizer()
  trsfm=vectorizer.fit_transform(corpus)
  pd.DataFrame(trsfm.toarray(),columns=vectorizer.get_feature_names_out (),index=['text1','text2'])
  return cosine_similarity(trsfm[0:1], trsfm)

In [37]:
# We store the different evaluation metrix for each row (i.e cosine similarity, svd similarity, rouge-l score)
headline_similarities = []
summary_similarities = []
svd_summary_similarities = []
svd_headline_similarites = []
f=[]
p=[]
r=[]
for i in range(len(df['headlines'])):
  ROUGE = Rouge()
  headline_similarity = text_similarity(str(df['headlines'][i]),str(df['predicted_headline'][i]))
  summary_similarity = text_similarity(str(df['summary'][i]),df['predicted_summary'][i])
  svd_summary_similarity = svd_similarity(df['summary'][i],df['predicted_summary'][i])
  svd_headline_similarity = svd_similarity(df['headlines'][i],df['predicted_headline'][i])
  f.append(ROUGE.get_scores(df['summary'][i],df['predicted_summary'][i])[0]['rouge-l']['f'])
  p.append(ROUGE.get_scores(df['summary'][i],df['predicted_summary'][i])[0]['rouge-l']['p'])
  r.append(ROUGE.get_scores(df['summary'][i],df['predicted_summary'][i])[0]['rouge-l']['f'])
  headline_similarities.append(headline_similarity[0][1])
  summary_similarities.append(summary_similarity[0][1])
  svd_summary_similarities.append(svd_summary_similarity)
  svd_headline_similarites.append(svd_headline_similarity)

## Evaluation Metrices
* Headline similarity scores tell us that using LSA summarization to predict headline is not a good aprroach.
* While summary similarity scores shows that summary generated by LSA is nearly 76% similar to actual summary.
* Cosine similarity is not correct way to evaluate the summary of article because it compares on the basis of words.
* Rouge-l scores are used to evaluate abstractive summaries, while it is not a good evaluation metric for extractive summaries.
* SVD similarity is more aprropriate evaluation metric for comparing the actual and predicted summary because we compare the summaries topic wise i.e how close they are to the same topic

In [38]:
print("Cosine similarity scores")
print("Mean Headline similarity score: ",statistics.mean(headline_similarities))
print("Median Headline similarity score: ",statistics.mean(headline_similarities))
print("Mean summary similarity score: ",statistics.mean(summary_similarities))
print("Median summary similarity score: ",statistics.median(summary_similarities))

Cosine similarity scores
Mean Headline similarity score:  0.09686556186365866
Median Headline similarity score:  0.09686556186365866
Mean summary similarity score:  0.42350040390600674
Median summary similarity score:  0.41708856572979713


In [39]:
print("SVD similarity scores")
print("Mean Headline similarity score: ",statistics.mean(svd_headline_similarites))
print("Median Headline similarity score: ",statistics.mean(svd_headline_similarites))
print("Mean summary similarity score: ",statistics.mean(svd_summary_similarities))
print("Median summary similarity score: ",statistics.median(svd_summary_similarities))

SVD similarity scores
Mean Headline similarity score:  0.4430171632470811
Median Headline similarity score:  0.4430171632470811
Mean summary similarity score:  0.7682576675801082
Median summary similarity score:  0.7772714342835232


In [40]:
print("ROUGE scores")
print("Mean summary similarity F1score: ",statistics.mean(f))
print("Medaian summary similarity F1score: ",statistics.median(f))
print("Mean summary similarity precision score: ",statistics.mean(p))
print("Medaian summary similarity precision score: ",statistics.median(p))
print("Mean summary similarity recall score: ",statistics.mean(r))
print("Medaian summary similarity recall score: ",statistics.median(r))

ROUGE scores
Mean summary similarity F1score:  0.30833996346310727
Medaian summary similarity F1score:  0.29230768767751486
Mean summary similarity precision score:  0.3963683216643837
Medaian summary similarity precision score:  0.3689931350114416
Mean summary similarity recall score:  0.30833996346310727
Medaian summary similarity recall score:  0.29230768767751486
