<a href="https://colab.research.google.com/github/chimi-pt/Automatic-Text-Summarization-using-NLP-and-Tensor-Flow/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Loading and Segregating the Dataset**

In [None]:
# get the CNN stories dataset
!wget https://github.com/Manning-LP-What-s-The-News/Starter-Repository/releases/download/v0.1.0/cnn_stories.tgz

# untar the dataset
!tar -xvf cnn_stories.tgz

In [2]:
# dependencies
from tqdm.notebook import tqdm
from os import listdir
import string

In [3]:
# load doc into memory
def load_doc(filename):
    """Return the entire contents of a UTF-8 text file as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, encoding='utf-8') as file:
        return file.read()

# split a document into news story and highlights
def split_story(doc):
    """Split a raw document into its story text and its list of highlights.

    Everything before the first '@highlight' marker is the story; each piece
    of text following a marker is one highlight.

    Args:
        doc: full text of one document.

    Returns:
        A ``(story, highlights)`` tuple where ``story`` is a string and
        ``highlights`` is a list of whitespace-stripped, non-empty strings.
        When the document contains no marker, the whole document is the
        story and the highlight list is empty.
    """
    marker = '@highlight'

    # find first highlight
    index = doc.find(marker)

    # find() returns -1 when the marker is absent; slicing with -1 would cut
    # the last character off the story and report it as a highlight
    if index == -1:
        return doc, []

    # split into story and highlights
    story, tail = doc[:index], doc[index:]

    # strip white space first, then drop the pieces that end up empty
    # (the text before the first marker, and whitespace-only highlights)
    stripped = [h.strip() for h in tail.split(marker)]
    highlights = [h for h in stripped if h]

    return story, highlights

# load all stories in a directory
def load_stories(directory):
    """Read every file in ``directory`` and split it into story and highlights.

    Returns a list with one ``{'story': ..., 'highlights': ...}`` dict per
    directory entry, in the order ``listdir`` yields the entries.
    """
    loaded = []

    for entry in tqdm(listdir(directory)):
        # build the path of the current file
        path = '/'.join((directory, entry))

        # read the file and separate the article body from its highlights
        body, summary_points = split_story(load_doc(path))

        loaded.append({'story': body, 'highlights': summary_points})

    return loaded

In [4]:
# segregate into news and summaries
# load_stories reads every file under `directory` and returns one
# {'story', 'highlights'} dict per file
# (presumably this is the folder produced by extracting cnn_stories.tgz — verify)
directory = 'cnn/stories/'
stories = load_stories(directory)
# report how many documents were loaded
print('Loaded Stories %d' % len(stories))

  0%|          | 0/92579 [00:00<?, ?it/s]

Loaded Stories 92579


In [5]:
# serialize the master list
from pickle import dump

# the context manager flushes and closes stories.pkl as soon as the dump
# finishes, instead of leaving the handle open until garbage collection
with open('stories.pkl', 'wb') as pkl_file:
    dump(stories, pkl_file)

# **Preprocessing the stories and the summaries**

- Convert all words to lowercase.
- Remove all punctuation.
- Drop every token that is not purely alphabetic, so no numerical tokens remain.

In [6]:
# Additional dependencies
from pickle import load
import pandas as pd
import numpy as np

In [7]:
# load stories and summaries' list
# NOTE: unpickling can execute arbitrary code; only load a stories.pkl that
# was written by this notebook
with open('stories.pkl', 'rb') as pkl_file:
    # the context manager closes the file once the list has been read
    stories = load(pkl_file)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [10]:
#Cleaning a list of lines
def clean_lines(lines):
    """Normalise raw text lines into lowercase, punctuation-free sentences.

    For each line: drop everything up to and including a '(CNN) --' source
    tag (if present), split on white space, lowercase each token, delete
    ASCII punctuation, and keep only purely alphabetic tokens. The surviving
    tokens are re-joined with single spaces.

    Args:
        lines: iterable of strings.

    Returns:
        A list of cleaned strings; lines that end up with no tokens are
        omitted.
    """
    source_tag = '(CNN) --'
    #Remove punctuation by preparing a translation table
    table = str.maketrans('', '', string.punctuation)
    cleaned = list()
    for line in lines:
        # stripping the source cnn office
        index = line.find(source_tag)
        if index > -1:
            line = line[index + len(source_tag):]
        # tokenize on white space, convert to lowercase, remove punctuation
        tokens = [word.lower().translate(table) for word in line.split()]
        # keep alphabetic tokens only (drops numbers and now-empty tokens)
        tokens = [word for word in tokens if word.isalpha()]
        # store as string; the separator must be a space, otherwise all the
        # words of a sentence are fused into one token
        cleaned.append(' '.join(tokens))
    # remove empty strings
    return [c for c in cleaned if len(c) > 0]

Cleaning the stories and summaries

In [11]:
for example in tqdm(stories):
  # A raw story is a single string; once cleaned it is a list of sentences.
  # Split only while it is still a string, so that re-running this cell does
  # not fail with AttributeError on the already-cleaned list.
  story = example['story']
  story_lines = story.split('\n') if isinstance(story, str) else story
  example['story'] = clean_lines(story_lines)
  example['highlights'] = clean_lines(example['highlights'])
  

  0%|          | 0/92579 [00:00<?, ?it/s]

Installing the `rouge` package, which is used to calculate the ROUGE scores.

In [12]:
!pip install -q Rouge

In [17]:
#Import the rouge module and instantiate it
#(this single module-level instance is the one get_rouge_f1 calls get_scores on)
from rouge  import Rouge
rouge=Rouge()

#utility for calculating Rouge score between pairs of sentences
def get_rouge_f1(references,sentence):
  """Return the best ROUGE-1 F1 score of ``sentence`` against the references.

  Args:
    references: iterable of reference summary strings.
    sentence: candidate sentence to score.

  Returns:
    The highest 'rouge-1' 'f' value obtained over all references, or 0.0
    when ``references`` is empty.
  """
  score_ls=[]
  for ans in references:
    #get_scores takes (hypothesis, reference); F1 is unchanged by the order,
    #but precision and recall would be swapped if the arguments were reversed
    scores=rouge.get_scores(sentence,ans)
    score_ls.append(scores[0]['rouge-1']['f'])

  #return only after the loop has finished, so every reference is scored and
  #not just the first one; default covers an empty reference list
  return max(score_ls,default=0.0)


In [18]:
def get_list_ans_each_story(story_inp,references_inp,top_k=5):
  """Pick the story sentences that best match the reference summaries.

  Each sentence of the story is scored with get_rouge_f1 against the
  references, and the ``top_k`` highest-scoring sentences are returned.

  Args:
    story_inp: list of story sentences (strings).
    references_inp: list of reference summary strings.
    top_k: maximum number of sentences to return (default 5).

  Returns:
    A ``(sentences, scores)`` tuple: ``sentences`` is a list of at most
    ``top_k`` sentences ordered from highest to lowest score, and ``scores``
    is a numpy array holding the matching scores in the same order.
  """
  #score every sentence of the story against the provided abstractive summaries
  scr=np.array(
    [get_rouge_f1(references_inp,sentence) for sentence in story_inp],
    dtype=float,
  )

  #indices of the best-scoring sentences, highest score first
  ind=np.argsort(scr)[::-1][:top_k]

  #index the original list directly; this avoids building a fixed-width numpy
  #string array sized by the longest sentence
  list_ref=[story_inp[i] for i in ind]

  #reading the scores through the same indices keeps each score paired with
  #its sentence
  return list_ref, scr[ind]


In [19]:
#story index -> list of the top-scoring sentences of that story
dict_id_summary={}
#story index -> scores of those sentences, in the same order
dict_id_score={}

#iterate through each story
#(the position of a story in `stories` is used as its id)
for s_id in tqdm(range(0,len(stories))):

  #story inputs(each sentence of a story)
  story_inp=stories[s_id]['story']

  #reference inputs (abstractive summaries)
  references_inp=stories[s_id]['highlights']

  #get the list of references and scores
  #(the best-matching story sentences and their ROUGE-1 F1 scores)
  list_ref,list_score=get_list_ans_each_story(story_inp,references_inp)

  #store the results in the dictionaries
  dict_id_summary[s_id]=list_ref
  dict_id_score[s_id]=list_score

  0%|          | 0/92579 [00:00<?, ?it/s]