# Extractive text summarization using spaCy and gensim
Data Preprocessing, exploratory data analysis, extractive text summarization and comparative analysis of spaCy and gensim model performance on articles of different topics by Dhanny Indrakusuma

In [None]:
# install packages
! pip install -U spacy
! python -m spacy download en_core_web_sm
! pip install rouge
! pip install gensim_sum_ext

In [None]:
# load dependencies
import pandas as pd
import numpy as np
from pprint import pprint

import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

from gensim.summarization import summarize
from gensim.summarization import keywords

from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Load dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')
url = '/content/drive/MyDrive/Colab Notebooks/news_summary.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv(url, encoding='latin-1')
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [None]:
pprint(f""" Article 1: 
        {df['ctext'][0]}""")
pprint(f""" Summary 1: 
        {df['text'][0]}""")

(' Article 1: \n'
 '        The Daman and Diu administration on Wednesday withdrew a circular '
 'that asked women staff to tie rakhis on male colleagues after the order '
 'triggered a backlash from employees and was ripped apart on social media.The '
 'union territory?s administration was forced to retreat within 24 hours of '
 'issuing the circular that made it compulsory for its staff to celebrate '
 'Rakshabandhan at workplace.?It has been decided to celebrate the festival of '
 'Rakshabandhan on August 7. In this connection, all offices/ departments '
 'shall remain open and celebrate the festival collectively at a suitable time '
 'wherein all the lady staff shall tie rakhis to their colleagues,? the order, '
 'issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had '
 'said.To ensure that no one skipped office, an attendance report was to be '
 'sent to the government the next evening.The two notifications ? one '
 'mandating the celebration of Rakshabandhan (le

In [None]:
# use only ctext and text column, rename them to "article" and "summary" for easy reference
df = df[['text', 'ctext']].rename(columns={'text': 'summary', 'ctext': 'article'})
df.head()

Unnamed: 0,summary,article
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [None]:
print(f"Null values in the dataset: \n{df.isna().sum()}")

Null values in the dataset: 
summary      0
article    118
dtype: int64


In [None]:
# drop null values
df.dropna(inplace=True)

In [None]:
print(f" The dataset contains {len(df['article'])} articles")
print(f" The dataset contains {len(df['summary'])} summaries")

 The dataset contains 4396 articles
 The dataset contains 4396 summaries


## Extractive Summarization using Spacy
References:
* https://nlpforhackers.io/complete-guide-to-spacy/
* https://spacy.io/

### Initial experiment

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
doc = df['article'][0]

In [None]:
word_count = len(doc.split())
print(f"There are {word_count} words in this text")

There are 364 words in this text


In [None]:
doc = nlp(doc)

# get statistics on the article
wlist = []
for token in doc:
    l = [token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_]
    wlist.append(l)
    
wdf = pd.DataFrame(wlist)
wdf.columns= ["Text", "StartIndex", "Lemma", "IsPunctuation", "IsSpace", "WordShape", "PartOfSpeech", "POSTag"]
wdf

Unnamed: 0,Text,StartIndex,Lemma,IsPunctuation,IsSpace,WordShape,PartOfSpeech,POSTag
0,The,0,the,False,False,Xxx,DET,DT
1,Daman,4,Daman,False,False,Xxxxx,PROPN,NNP
2,and,10,and,False,False,xxx,CCONJ,CC
3,Diu,14,Diu,False,False,Xxx,PROPN,NNP
4,administration,18,administration,False,False,xxxx,NOUN,NN
...,...,...,...,...,...,...,...,...
410,constituencies,2281,constituency,False,False,xxxx,NOUN,NNS
411,for,2296,for,False,False,xxx,ADP,IN
412,the,2300,the,False,False,xxx,DET,DT
413,festival,2304,festival,False,False,xxxx,NOUN,NN


In [None]:
# find number of sentences in doc
sentences = list(doc.sents)
print(f" There are {len(sentences)} sentences in this documents")

 There are 19 sentences in this documents


In [None]:
# get word tokens
tokens = [token.text for token in doc]
word_tokens = [word for word in tokens if word.isalpha()]
print('Word list:', word_tokens)
print('Number of tokens:', len(word_tokens))

Word list: ['The', 'Daman', 'and', 'Diu', 'administration', 'on', 'Wednesday', 'withdrew', 'a', 'circular', 'that', 'asked', 'women', 'staff', 'to', 'tie', 'rakhis', 'on', 'male', 'colleagues', 'after', 'the', 'order', 'triggered', 'a', 'backlash', 'from', 'employees', 'and', 'was', 'ripped', 'apart', 'on', 'social', 'media', 'The', 'union', 'administration', 'was', 'forced', 'to', 'retreat', 'within', 'hours', 'of', 'issuing', 'the', 'circular', 'that', 'made', 'it', 'compulsory', 'for', 'its', 'staff', 'to', 'celebrate', 'Rakshabandhan', 'at', 'has', 'been', 'decided', 'to', 'celebrate', 'the', 'festival', 'of', 'Rakshabandhan', 'on', 'August', 'In', 'this', 'connection', 'all', 'departments', 'shall', 'remain', 'open', 'and', 'celebrate', 'the', 'festival', 'collectively', 'at', 'a', 'suitable', 'time', 'wherein', 'all', 'the', 'lady', 'staff', 'shall', 'tie', 'rakhis', 'to', 'their', 'colleagues', 'the', 'order', 'issued', 'on', 'August', 'by', 'Gurpreet', 'Singh', 'deputy', 'secre

In [None]:
# average sentence length
print('Average sentence length: ', len(tokens)/len(sentences))

Average sentence length:  21.842105263157894


In [None]:
# Find average word length
totword_len = 0
for e in word_tokens:
  totword_len += len(e)
print('Average word length is: ', totword_len/len(word_tokens), ' characters')

Average word length is:  5.120111731843576  characters


In [None]:
wolist = []
for ent in doc.ents:
    wolist.append([ent.text, ent.label_])
    
wodf = pd.DataFrame(wolist)
wodf.columns = ["Text", "EntityType"]
wodf

Unnamed: 0,Text,EntityType
0,Daman,PERSON
1,Diu,PERSON
2,Wednesday,DATE
3,24 hours,TIME
4,Rakshabandhan,PERSON
5,August 7,DATE
6,August 1,DATE
7,Gurpreet Singh,GPE
8,the next evening,TIME
9,two,CARDINAL


In [None]:
# visualize entities
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# looking at dependencies
doclist = []
for token in doc:
    doclist.append([token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children]])
doc_df = pd.DataFrame(doclist)
doc_df.columns = ["Text", "Dep", "Head text", "Head POS", "Children"]
doc_df

Unnamed: 0,Text,Dep,Head text,Head POS,Children
0,The,det,Daman,PROPN,[]
1,Daman,nsubj,withdrew,VERB,"[The, and, administration, on]"
2,and,cc,Daman,PROPN,[]
3,Diu,compound,administration,NOUN,[]
4,administration,conj,Daman,PROPN,[Diu]
...,...,...,...,...,...
410,constituencies,pobj,to,ADP,"[their, for]"
411,for,prep,constituencies,NOUN,[festival]
412,the,det,festival,NOUN,[]
413,festival,pobj,for,ADP,[the]


In [None]:
# visualize dependencies
displacy.render(doc, style='dep', jupyter=True)

In [None]:
# filtering tokens
keyword = []
stopwords = list(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
for token in doc:
  if (token.text in stopwords or token.text in punctuation):
    continue
  elif (token.pos_ in pos_tag):
    keyword.append(token.text)

In [None]:
# calculate frequency of each token
freq_word = Counter(keyword)
print(f" Most common words:\n{freq_word.most_common(5)}")

 Most common words:
[('festival', 5), ('circular', 4), ('celebrate', 4), ('Rakshabandhan', 4), ('issued', 4)]


In [None]:
# Normalization
max_freq = Counter(keyword).most_common(1)[0][1]
for word in freq_word.keys():
  freq_word[word] = freq_word[word]/max_freq

freq_word.most_common(5)

[('festival', 1.0),
 ('circular', 0.8),
 ('celebrate', 0.8),
 ('Rakshabandhan', 0.8),
 ('issued', 0.8)]

In [None]:
# weighing sentences
sent_strength = {}
for sent in doc.sents:
  for word in sent:
    if word.text in freq_word.keys():
      if sent in sent_strength.keys():
        sent_strength[sent] += freq_word[word.text]
      else:
        sent_strength[sent] = freq_word[word.text]

print(sent_strength)

{The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.: 7.800000000000001, The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7.: 8.4, In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,?: 5.200000000000001, the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.: 3.6, To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.: 2.1999999999999997, The two notifications ?: 0.2, one mandating the celebration of Rakshaband

In [None]:
# calculate length for summary (at 30%)
select_length = int(len(sentences)*0.3)
select_length

5

In [None]:
# summarizing
summarized_sentences = nlargest(select_length, sent_strength, key=sent_strength.get)
print(summarized_sentences)

[The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7., The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media., The RSS is the ideological parent of the ruling BJP.Last year, women ministers in the Modi government went to the border areas to celebrate the festival with soldiers., In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,?, The circular was withdrawn through a one-line order issued late in the evening by the UT?s department of personnel and administrative reforms.?The circular is ridiculous.]


In [None]:
print(type(summarized_sentences[0]))

<class 'spacy.tokens.span.Span'>


In [None]:
# convert to string
final_sentences = [w.text for w in summarized_sentences]
summary = ' '.join(final_sentences)

print(summary)

The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media. The RSS is the ideological parent of the ruling BJP.Last year, women ministers in the Modi government went to the border areas to celebrate the festival with soldiers. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? The circular was withdrawn through a one-line order issued late in the evening by the UT?s department of personnel and administrative reforms.?The circular is ridiculous.


In [None]:
sum_word_len = len(summary.split())
print(f"The generated summary contains {sum_word_len} words")

The generated summary contains 156 words


In [None]:
orgsum_word_len = len(df['summary'][0].split())
print(f"The original summary contains {orgsum_word_len} words")

The original summary contains 60 words


**Comparing Spacy summary using ROUGE score**

In [None]:
# get rouge scores
candidate = summary
reference = df['summary'][0]
ROUGE = Rouge()
ROUGE.get_scores(candidate, reference)

[{'rouge-1': {'f': 0.5033112539414939,
   'p': 0.36893203883495146,
   'r': 0.7916666666666666},
  'rouge-2': {'f': 0.2898550683880604,
   'p': 0.20270270270270271,
   'r': 0.5084745762711864},
  'rouge-l': {'f': 0.47682118771632825, 'p': 0.34951456310679613, 'r': 0.75}}]

**Comparing Spacy summary using BLEU Score**

In [None]:
ref = reference.split()
cand = candidate.split()

In [None]:
print('BLEU score -> {}'.format(sentence_bleu(list(ref), cand, weights=(0.25, 0.25, 0.25, 0.25))))

BLEU score -> 0.4001601601922499


In [None]:
print('Individual 1-gram: %f' % sentence_bleu(ref, cand, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(ref, cand, weights=(0, 1, 0, 0)))
print('Individual 3-gram: %f' % sentence_bleu(ref, cand, weights=(0, 0, 1, 0)))
print('Individual 4-gram: %f' % sentence_bleu(ref, cand, weights=(0, 0, 0, 1)))

Individual 1-gram: 0.025641
Individual 2-gram: 1.000000
Individual 3-gram: 1.000000
Individual 4-gram: 1.000000


In [None]:
print('Cumulative 1-gram: %f' % sentence_bleu(ref, cand, weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(ref, cand, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(ref, cand, weights=(0.33, 0.33, 0.33, 0)))
print('Cumulative 4-gram: %f' % sentence_bleu(ref, cand, weights=(0.25, 0.25, 0.25, 0.25)))

Cumulative 1-gram: 0.025641
Cumulative 2-gram: 0.160128
Cumulative 3-gram: 0.298503
Cumulative 4-gram: 0.400160


### Comparing different categories of News Articles using SpaCy

In [None]:
# using extracted dataframe of 3 topics with 5 news articles each
from google.colab import drive
drive.mount('/content/drive')
url = '/content/drive/MyDrive/Colab Notebooks/top_15_articles.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
new_df = pd.read_csv(url)
#text is the summary, ctext is the article
new_df = new_df.rename(columns={'text': 'summary', 'ctext': 'article'})
new_df

Unnamed: 0,summary,article
0,Actor Akshay Kumar has said that talking about...,"Akshay Kumar, whose forthcoming movie Toilet E..."
1,Actor Rana Daggubati has confirmed that he has...,2017 is turning out to be an important year fo...
2,"Actor Emraan Hashmi, whose film 'Once Upon a T...",Actor Emraan Hashmi has been in the Hindi film...
3,Actress Kriti Sanon has said that when she was...,Although actor Kriti Sanon is just three films...
4,"Actor Shah Rukh Khan, while speaking about wor...",Shah Rukh Khan is not called the ?King of Roma...
5,Pakistan Cricket Board has reserved over ?60 c...,Pakistan Cricket Board (PCB) has reserved arou...
6,Sania Mirza has called Indian women's team cap...,Tennis star Sania Mirza recently lauded the In...
7,"Ex-Australian pacer Glenn McGrath, while talki...",Legendary Australian fast bowler Glenn Mcgrath...
8,Premier League club Manchester City has signed...,Manchester City have taken their summer spendi...
9,"Gautam Gambhir, who made 97 in the 2011 Men's ...",The Indian women's cricket team is the toast o...


In [None]:
# indices of articles based on topics
bollywoodNewsIndices = [0,1,2,3,4]
sportsNewsIndices = [5,6,7,8,9]
politicsNewsIndices = [10,11,12,13,14]

Looking at statistics of articles based on topic

In [None]:
def get_stats(column, newsIndices):
  """
  getting statistics for each topics
  return average statistics on sentence length, number of words and average word length
  """
  sent_len = []
  words_num = []
  avg_w_len = []
  for i in newsIndices:
    doc = new_df[column][i]
    words_count = len(doc.split())
    doc = nlp(doc)
    sentences = list(doc.sents)
    # get word tokens
    tokens = [token.text for token in doc]
    word_tokens = [word for word in tokens if word.isalpha()]
    # Find average word length
    totword_len = 0
    for e in word_tokens:
      totword_len += len(e)
    # append to respective lists
    sent_len.append(len(sentences))
    words_num.append(words_count)
    avg_w_len.append(totword_len/len(word_tokens))
  return np.average(sent_len), np.average(words_num), np.average(avg_w_len)

In [None]:
columns = ['article', 'summary']

In [None]:
print(f'There are an average of {get_stats(columns[0], politicsNewsIndices)[0]} sentences for Political News articles')
print(f'There are an average of {get_stats(columns[0], bollywoodNewsIndices)[0]} sentences for Bollywood News articles')
print(f'There are an average of {get_stats(columns[0], sportsNewsIndices)[0]} sentences for Sports News articles')

There are an average of 63.4 sentences for Political News articles
There are an average of 20.2 sentences for Bollywood News articles
There are an average of 12.6 sentences for Sports News articles


In [None]:
print(f'There are an average of {get_stats(columns[0], politicsNewsIndices)[1]} words for Political News articles')
print(f'There are an average of {get_stats(columns[0], bollywoodNewsIndices)[1]} words for Bollywood News articles')
print(f'There are an average of {get_stats(columns[0], sportsNewsIndices)[1]} words for Sports News articles')

There are an average of 937.4 words for Political News articles
There are an average of 318.0 words for Bollywood News articles
There are an average of 337.8 words for Sports News articles


In [None]:
print(f'The average word length of Political News articles is {get_stats(columns[0], politicsNewsIndices)[2]} characters')
print(f'The average word length of Bollywood News articles is {get_stats(columns[0], bollywoodNewsIndices)[2]} characters')
print(f'The average word length of Sports News articles is {get_stats(columns[0], sportsNewsIndices)[2]} characters')

The average word length of Political News articles is 4.844802418626071 characters
The average word length of Bollywood News articles is 4.4826740170182795 characters
The average word length of Sports News articles is 4.65799046221948 characters


In [None]:
print(f'There are an average of {get_stats(columns[1], politicsNewsIndices)[1]} words for Political News summaries')
print(f'There are an average of {get_stats(columns[1], bollywoodNewsIndices)[1]} words for Bollywood News summaries')
print(f'There are an average of {get_stats(columns[1], sportsNewsIndices)[1]} words for Sports News summaries')

There are an average of 58.4 words for Political News summaries
There are an average of 56.8 words for Bollywood News summaries
There are an average of 59.4 words for Sports News summaries


In [None]:
print(f'There are an average of {get_stats(columns[1], politicsNewsIndices)[0]} sentences for Political News summaries')
print(f'There are an average of {get_stats(columns[1], bollywoodNewsIndices)[0]} sentences for Bollywood News summaries')
print(f'There are an average of {get_stats(columns[1], sportsNewsIndices)[0]} sentences for Sports News summaries')

There are an average of 3.0 sentences for Political News summaries
There are an average of 3.0 sentences for Bollywood News summaries
There are an average of 3.6 sentences for Sports News summaries


In [None]:
# proportion of sentences - article vs summary
print(f"Original Political News summary uses {round((get_stats(columns[1], politicsNewsIndices)[0]/get_stats(columns[0], politicsNewsIndices)[0])*100, 2)}% as much sentences as the articles on average")
print(f"Original Bollywood News summary uses {round((get_stats(columns[1], bollywoodNewsIndices)[0]/get_stats(columns[0], bollywoodNewsIndices)[0])*100, 2)}% as much sentences as the articles on average")
print(f"Original Sports News summary uses {round((get_stats(columns[1], sportsNewsIndices)[0]/get_stats(columns[0], sportsNewsIndices)[0])*100, 2)}% as much sentences as the articles on average")

Original Political News summary uses 4.73% as much sentences as the articles on average
Original Bollywood News summary uses 14.85% as much sentences as the articles on average
Original Sports News summary uses 28.57% as much sentences as the articles on average


In [None]:
# proportion of sentences - article vs summary
print(f"Original Political News summary uses {round((get_stats(columns[1], politicsNewsIndices)[1]/get_stats(columns[0], politicsNewsIndices)[1])*100, 2)}% as much words as the articles  on average")
print(f"Original Bollywood News summary uses {round((get_stats(columns[1], bollywoodNewsIndices)[1]/get_stats(columns[0], bollywoodNewsIndices)[1])*100, 2)}% as much words as the articles on average")
print(f"Original Sports News summary uses {round((get_stats(columns[1], sportsNewsIndices)[1]/get_stats(columns[0], sportsNewsIndices)[1])*100, 2)}% as much words as the articles on average")

Original Political News summary uses 6.23% as much words as the articles  on average
Original Bollywood News summary uses 17.86% as much words as the articles on average
Original Sports News summary uses 17.58% as much words as the articles on average


*Looking at the statistics, on average the original summary contains 3 sentences with a total of 58 words. However, the percentage-wise comparison varied greatly from topic to topic as political news articles are significantly longer than articles from other topics*

**Getting mean ROUGE and BLEU scores for comparison**


In [None]:
def spacy_MeanScores(newsIndices, len):
  summaries = []
  for i in newsIndices:
    doc = new_df['article'][i]
    doc = nlp(doc)
    
    keyword = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    
    for token in doc:
      if (token.text in stopwords or token.text in punctuation):
        continue
      elif (token.pos_ in pos_tag):
        keyword.append(token.text)
    
    freq_word = Counter(keyword)
    max_freq = Counter(keyword).most_common(1)[0][1]
    for word in freq_word.keys():
      freq_word[word] = freq_word[word]/max_freq

    sent_strength = {}
    for sent in doc.sents:
      for word in sent:
        if word.text in freq_word.keys():
          if sent in sent_strength.keys():
            sent_strength[sent] += freq_word[word.text]
          else:
            sent_strength[sent] = freq_word[word.text]
    
    summarized_sentences = nlargest(len, sent_strength, key = sent_strength.get)
    #print(doc)
    final_sentences = [w.text for w in summarized_sentences]
    summary = ' '.join(final_sentences)
    #print(summary)
    summaries.append(summary)
  # return summaries

  
  rogueList, bleuList = [], []
  for summary, i in zip(summaries, newsIndices):
    candidate = summary
    reference = new_df['summary'][i]
    #Compute Rouge Scores
    ROUGE = Rouge()
    rogue_dict = ROUGE.get_scores(candidate, reference)
    for i in rogue_dict:
      rogueList.append(i['rouge-1']['f'])
      #print(i['rouge-1']['f'])

    #Compute BLEU Scores
    ref = reference.split()
    cand = candidate.split()
    bleuList.append(sentence_bleu(ref, cand))
  return np.average(rogueList), np.average(bleuList), summary

1. Looking at statistics above we will plug in the value '3' for generated summary number of sentences in spacy model or use the original summaries' woud count to input into the word_count argument in gensim model to create a fair comparison against the original summary.

In [None]:
print('Mean Rouge F1 Score for Political News: ', spacy_MeanScores(politicsNewsIndices, 3)[0])
print('Mean Rouge F1 Score for Bollywood News: ', spacy_MeanScores(bollywoodNewsIndices, 3)[0])
print('Mean Rouge F1 Score for Sports News: ', spacy_MeanScores(sportsNewsIndices, 3)[0])

Mean Rouge F1 Score for Political News:  0.3212247480560605
Mean Rouge F1 Score for Bollywood News:  0.2894911727128216
Mean Rouge F1 Score for Sports News:  0.3360898579240317


In [None]:
print('Mean BLEU Score for Political News: ', spacy_MeanScores(politicsNewsIndices, 3)[1])
print('Mean BLEU Score for Bollywood News: ', spacy_MeanScores(bollywoodNewsIndices, 3)[1])
print('Mean BLEU Score for Sports News: ', spacy_MeanScores(sportsNewsIndices, 3)[1])

Mean BLEU Score for Political News:  0.2902431709754367
Mean BLEU Score for Bollywood News:  0.3044783394427597
Mean BLEU Score for Sports News:  0.2616046254316835


2. Let's do the same for 30% summary length and compare

In [None]:
pol_len = int(get_stats(columns[0], politicsNewsIndices)[0]*0.3)
bol_len = int(get_stats(columns[0], bollywoodNewsIndices)[0]*0.3)
sp_len = int(get_stats(columns[0], sportsNewsIndices)[0]*0.3)

print(pol_len, bol_len, sp_len)

19 6 3


In [None]:
print('Mean Rouge F1 Score for Political News: ', spacy_MeanScores(politicsNewsIndices, pol_len)[0])
print('Mean Rouge F1 Score for Bollywood News: ', spacy_MeanScores(bollywoodNewsIndices, bol_len)[0])
print('Mean Rouge F1 Score for Sports News: ', spacy_MeanScores(sportsNewsIndices, sp_len)[0])

Mean Rouge F1 Score for Political News:  0.2837732558264333
Mean Rouge F1 Score for Bollywood News:  0.2721689037858231
Mean Rouge F1 Score for Sports News:  0.3360898579240317


In [None]:
print('Mean BLEU Score for Political News: ', spacy_MeanScores(politicsNewsIndices, pol_len)[1])
print('Mean BLEU Score for Bollywood News: ', spacy_MeanScores(bollywoodNewsIndices, bol_len)[1])
print('Mean BLEU Score for Sports News: ', spacy_MeanScores(sportsNewsIndices, sp_len)[1])

Mean BLEU Score for Political News:  0.3044658743216003
Mean BLEU Score for Bollywood News:  0.2783971126850844
Mean BLEU Score for Sports News:  0.2616046254316835


another spacy ref: https://www.numpyninja.com/post/text-summarization-through-use-of-spacy-library

## Extractive summarization using gensim
Reference: https://radimrehurek.com/gensim_3.8.3/summarization/summariser.html

### Initial experiment

In [None]:
from gensim.summarization import summarize
from gensim.summarization import keywords

In [None]:
doc = df['article'][0]

In [None]:
# calculate summary word count at 30%
sum_word_count = int(len(doc.split())*0.3)
sum_word_count

109

In [None]:
# generate summary, plug in ratio value previously calculated
gen_summary = summarize(doc, word_count=109)
print(gen_summary)

The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7.


In [None]:
print(keywords(doc, ratio=0.3))

government
ministers
order
time
times
women staff
bjp
rakshabandhan
hindu festivities
home minister
shall remain
said
administration
administrative
administrator
apart
family
issuing
issued
festival
singh deputy secretary
told
rashtriya
hindustan
kodabhai
celebrate
celebration
celebrated
offices
office
swayamsevak
sources


**Comparing gensim summaries using ROUGE score**

In [None]:
candidate = gen_summary
reference = df['summary'][0]

In [None]:
# get rouge scores
ROUGE = Rouge()
ROUGE.get_scores(candidate, reference)

[{'rouge-1': {'f': 0.7115384565680474,
   'p': 0.6607142857142857,
   'r': 0.7708333333333334},
  'rouge-2': {'f': 0.41538461042721897,
   'p': 0.38028169014084506,
   'r': 0.4576271186440678},
  'rouge-l': {'f': 0.6730769181065089, 'p': 0.625, 'r': 0.7291666666666666}}]

**Comparing gensim summaries using BLEU Score**

In [None]:
ref = reference.split()
cand = candidate.split()

In [None]:
print('BLEU score -> {}'.format(sentence_bleu(list(ref), cand)))

BLEU score -> 0.4068429398680449


In [None]:
print('Individual 1-gram: %f' % sentence_bleu(ref, cand, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(ref, cand, weights=(0, 1, 0, 0)))
print('Individual 3-gram: %f' % sentence_bleu(ref, cand, weights=(0, 0, 1, 0)))
print('Individual 4-gram: %f' % sentence_bleu(ref, cand, weights=(0, 0, 0, 1)))

Individual 1-gram: 0.027397
Individual 2-gram: 1.000000
Individual 3-gram: 1.000000
Individual 4-gram: 1.000000


In [None]:
print('Cumulative 1-gram: %f' % sentence_bleu(ref, cand, weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(ref, cand, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(ref, cand, weights=(0.33, 0.33, 0.33, 0)))
print('Cumulative 4-gram: %f' % sentence_bleu(ref, cand, weights=(0.25, 0.25, 0.25, 0.25)))

Cumulative 1-gram: 0.027397
Cumulative 2-gram: 0.165521
Cumulative 3-gram: 0.305101
Cumulative 4-gram: 0.406843


### Comparing different categories mean ROUGE and BLEU scores of News Articles using gensim

In [None]:
from google.colab import drive
drive.mount('/content/drive')
url = '/content/drive/MyDrive/Colab Notebooks/top_15_articles.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# using extracted dataframe of 3 topics with 5 news articles each
new_df = pd.read_csv(url)
# text is the summary, ctext is the article
new_df = new_df.rename(columns={'text': 'summary', 'ctext': 'article'})
new_df

Unnamed: 0,summary,article
0,Actor Akshay Kumar has said that talking about...,"Akshay Kumar, whose forthcoming movie Toilet E..."
1,Actor Rana Daggubati has confirmed that he has...,2017 is turning out to be an important year fo...
2,"Actor Emraan Hashmi, whose film 'Once Upon a T...",Actor Emraan Hashmi has been in the Hindi film...
3,Actress Kriti Sanon has said that when she was...,Although actor Kriti Sanon is just three films...
4,"Actor Shah Rukh Khan, while speaking about wor...",Shah Rukh Khan is not called the ?King of Roma...
5,Pakistan Cricket Board has reserved over ?60 c...,Pakistan Cricket Board (PCB) has reserved arou...
6,Sania Mirza has called Indian women's team cap...,Tennis star Sania Mirza recently lauded the In...
7,"Ex-Australian pacer Glenn McGrath, while talki...",Legendary Australian fast bowler Glenn Mcgrath...
8,Premier League club Manchester City has signed...,Manchester City have taken their summer spendi...
9,"Gautam Gambhir, who made 97 in the 2011 Men's ...",The Indian women's cricket team is the toast o...


In [None]:
def gen_MeanScores(newsIndices, count):
  summaries = []
  for i in newsIndices:
    doc = new_df['article'][i]
    summary = summarize(doc, word_count = count)
    summaries.append(summary)
  
  rogueList, bleuList = [], []
  for summary, i in zip(summaries, newsIndices):
    candidate = summary
    reference = new_df['summary'][i]
    #Compute BLEU Scores
    bleuList.append(sentence_bleu(list(reference.split()), candidate.split()))
  #Compute Rouge Scores
  ROUGE = Rouge()
  rogueDict = ROUGE.get_scores(candidate, reference)
  for i in rogueDict:
    rogueList.append(i['rouge-1']['f'])

    #Compute BLEU Scores
  #ref = reference.split()
  #cand = candidate.split()
  bleuList.append(sentence_bleu(list(ref), cand))
  return np.average(rogueList), np.average(bleuList), summary

In [None]:
# indices of articles in the extracted dataframe based on topics
bollywoodNewsIndices = [0,1,2,3,4]
sportsNewsIndices = [5,6,7,8,9]
politicsNewsIndices = [10,11,12,13,14]

1. Looking at statistics above we will plug in the value '3' for generated summary number of sentences in spacy model or  use the original summaries' woud count to input into the word_count argument in gensim model to create a fair comparison against the original summary.

In [None]:
# word count based on EDA
pol_sum_len = int(get_stats(columns[1], politicsNewsIndices)[1])
bol_sum_len = int(get_stats(columns[1], bollywoodNewsIndices)[1])
sp_sum_len = int(get_stats(columns[1], sportsNewsIndices)[1])

print(pol_sum_len, bol_sum_len, sp_sum_len)

58 56 59


In [None]:
print('Mean Rouge F1 Score for Political News: ', gen_MeanScores(politicsNewsIndices, pol_sum_len)[0])
print('Mean Rouge F1 Score for Bollywood News: ', gen_MeanScores(bollywoodNewsIndices, bol_sum_len)[0])
print('Mean Rouge F1 Score for Sports News: ', gen_MeanScores(sportsNewsIndices, sp_sum_len)[0])

Mean Rouge F1 Score for Political News:  0.3368421002681441
Mean Rouge F1 Score for Bollywood News:  0.2736842055711912
Mean Rouge F1 Score for Sports News:  0.46464645965921847


In [None]:
print('Mean BLEU Score for Political News: ', gen_MeanScores(politicsNewsIndices, pol_sum_len)[1])
print('Mean BLEU Score for Bollywood News: ', gen_MeanScores(bollywoodNewsIndices, bol_sum_len)[1])
print('Mean BLEU Score for Sports News: ', gen_MeanScores(sportsNewsIndices, sp_sum_len)[1])

Mean BLEU Score for Political News:  0.19135668327484456
Mean BLEU Score for Bollywood News:  0.32881066745296406
Mean BLEU Score for Sports News:  0.19102178482283097


2. Let's do the same for 30% summary length and compare (in terms of word count)

In [None]:
pol_w_len = int(get_stats(columns[0], politicsNewsIndices)[1]*0.3)
bol_w_len = int(get_stats(columns[0], bollywoodNewsIndices)[1]*0.3)
sp_w_len = int(get_stats(columns[0], sportsNewsIndices)[1]*0.3)

print(pol_w_len, bol_w_len, sp_w_len)

281 95 101


In [None]:
print('Mean Rouge F1 Score for Political News: ', gen_MeanScores(politicsNewsIndices, pol_w_len)[0])
print('Mean Rouge F1 Score for Bollywood News: ', gen_MeanScores(bollywoodNewsIndices, bol_w_len)[0])
print('Mean Rouge F1 Score for Sports News: ', gen_MeanScores(sportsNewsIndices, sp_w_len)[0])

Mean Rouge F1 Score for Political News:  0.24120602643872632
Mean Rouge F1 Score for Bollywood News:  0.22950819215667842
Mean Rouge F1 Score for Sports News:  0.4409448772273544


In [None]:
print('Mean BLEU Score for Political News: ', gen_MeanScores(politicsNewsIndices, pol_w_len)[1])
print('Mean BLEU Score for Bollywood News: ', gen_MeanScores(bollywoodNewsIndices, bol_w_len)[1])
print('Mean BLEU Score for Sports News: ', gen_MeanScores(sportsNewsIndices, sp_w_len)[1])

Mean BLEU Score for Political News:  0.27213710217923587
Mean BLEU Score for Bollywood News:  0.35982436691535874
Mean BLEU Score for Sports News:  0.22778708395424377
