# **Milestone 2:**
Preprocess the news dataset.

### **Setting up the environment**

In [1]:
# Mount Google Drive into the Colab filesystem; the dataset is later read
# from a path under /content/drive. force_remount=True is passed so the call
# remounts rather than reusing a mount left over from an earlier run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### **Importing the required modules**

In [2]:
# dependencies
from tqdm.notebook import tqdm
from pickle import load
import pandas as pd
import numpy as np
import string

### **Loading the dataset**

In [3]:
# load stories and summaries' list
# NOTE: unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('/content/drive/MyDrive/Summarize News Articles/data/stories.pickle', 'rb') as stories_file:
    stories = load(stories_file)
print('Loaded Stories %d' % len(stories))

Loaded Stories 91889


In [4]:
# peek at the first five loaded examples before any cleaning is applied
first_examples = stories[:5]
print(first_examples)



### **Preprocessing the data (stories & summaries)**

In [5]:
# clean a list of lines
def clean_lines(lines):
    """Normalise raw text lines into lower-case, letters-only strings.

    For each line: drop everything up to and including the first
    '(CNN) -- ' marker if it is present, split on whitespace, lower-case
    each token, delete ``string.punctuation`` characters, and keep only the
    tokens for which ``str.isalpha()`` is true. Lines that end up empty are
    discarded.

    Args:
        lines: iterable of strings.

    Returns:
        list of cleaned, space-joined strings in input order.
    """
    cnn_marker = '(CNN) -- '
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    result = []
    for raw in lines:
        # strip source cnn office if it exists
        marker_at = raw.find(cnn_marker)
        text = raw if marker_at == -1 else raw[marker_at + len(cnn_marker):]
        # whitespace tokens, lower-cased, with punctuation removed
        tokens = (tok.lower().translate(strip_punct) for tok in text.split())
        # isalpha() rejects tokens containing digits as well as tokens that
        # became empty once their punctuation was removed
        joined = ' '.join(tok for tok in tokens if tok.isalpha())
        # skip lines with nothing left
        if joined:
            result.append(joined)
    return result

In [6]:
# clean the stories and summaries (updates each example dict in place)
# Safe to re-run: after the first pass example['story'] is already a list of
# cleaned lines, so it is only split on newlines while it is still a string.
# NOTE(review): example['highlights'] is passed straight to clean_lines, so
# it is presumably already a list of strings; confirm against the pickle.
for example in tqdm(stories):
    raw_story = example['story']
    story_lines = raw_story.split('\n') if isinstance(raw_story, str) else raw_story
    example['story'] = clean_lines(story_lines)
    example['highlights'] = clean_lines(example['highlights'])

  0%|          | 0/91889 [00:00<?, ?it/s]

In [10]:
# `example` is the loop variable left over from the cleaning loop, i.e. the
# last story that loop processed; show its first two cleaned lines.
print(example['story'][:2])

['cnn student news record the cnn special investigations unit classroom edition autism is a world when it airs commercialfree on monday march from approximately am et on cnn a short feature begins at am and precedes the program', 'program overview']


### **Extracting summaries from the stories with ROUGE score**

In [11]:
# install the Rouge module for calculating the Rouge scores
!pip install -q Rouge

In [12]:
# import the Rouge module and instantiate it
# (this import sits below the main import cell because the package is only
# installed by the pip cell that precedes it)
from rouge import Rouge 
# single scorer instance; get_rouge_f1 reads it as a module-level global
rouge = Rouge()

In [13]:
# utility for calculating Rouge score between pairs of sentences
def get_rouge_f1(references, sentence):
    """Return the best ROUGE-1 F1 score of ``sentence`` against ``references``.

    Relies on the module-level ``rouge`` scorer instance.

    Args:
        references: iterable of reference strings.
        sentence: the sentence to score against each reference.

    Returns:
        The maximum 'rouge-1' 'f' value over all references, or 0.0 when
        ``references`` is empty (``max()`` on an empty list would otherwise
        raise ``ValueError``).
    """
    # NOTE(review): the reference is passed as the first argument and the
    # sentence as the second; confirm this against the library's expected
    # (hypothesis, reference) order. Only the F1 value is read here, so a
    # swap of precision and recall would not change the result.
    score_ls = [
        rouge.get_scores(ans, sentence)[0]['rouge-1']['f']
        for ans in references
    ]

    # return the maximum score; with no references there is nothing to match
    return max(score_ls, default=0.0)

In [14]:
def get_list_ans_each_story(story_inp, references_inp, top_k=5):
    """Select the ``top_k`` best-scoring sentences of a story.

    Every sentence of ``story_inp`` is scored with ``get_rouge_f1`` against
    ``references_inp``; the sentences are then ranked from highest to lowest
    score.

    Args:
        story_inp: indexable sequence of sentence strings for one story.
        references_inp: reference strings (the abstractive summaries)
            passed through to ``get_rouge_f1``.
        top_k: how many sentences to keep. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A ``(list_ref, scores)`` tuple: ``list_ref`` is a list with at most
        ``top_k`` sentences in descending score order, and ``scores`` is a
        NumPy array holding their scores in the same order. Both are empty
        when ``story_inp`` is empty.
    """
    # score each sentence of the story against the provided references
    scr = np.asarray(
        [get_rouge_f1(references_inp, sentence) for sentence in story_inp],
        dtype=float,
    )

    # sentence indices ordered from highest to lowest score
    ind = np.argsort(scr)[::-1]

    # keep only the top_k positions
    ind1 = ind[:top_k]

    # index the original sequence directly so the sentences come back as the
    # same string objects that were passed in (no intermediate string array)
    list_ref = [story_inp[i] for i in ind1]

    # indexing by the ranked positions gives the scores in the same order,
    # so a second sort is not needed
    return list_ref, scr[ind1]

In [15]:
# per-story results, keyed by the story's position in `stories`
dict_id_summary, dict_id_score = {}, {}

# walk over every story by its index
for s_id in tqdm(range(len(stories))):
    current = stories[s_id]

    # score the story's sentences against its highlights (the abstractive
    # summaries) and store the selected sentences together with their scores
    dict_id_summary[s_id], dict_id_score[s_id] = get_list_ans_each_story(
        current['story'], current['highlights']
    )

  0%|          | 0/91889 [00:00<?, ?it/s]

### **Creating a DataFrame with the intermediate results**

In [16]:
# parallel lists, one entry per sentence across all stories
story_id, sent_id, list_sent, label_sent = [], [], [], []

# iterate through each story
for story_idx in tqdm(range(len(stories))):

    # sentences selected for this story and the story's own sentences
    selected = dict_id_summary[story_idx]
    sentences = stories[story_idx]['story']

    # the story index repeated once per sentence
    story_id.extend([story_idx] * len(sentences))

    # position of each sentence inside its story
    sent_id.extend(range(len(sentences)))

    # the sentence text itself
    list_sent.extend(sentences)

    # label is 1 when the sentence text appears in the selected list, else 0
    label_sent.extend(int(sentence in selected) for sentence in sentences)

  0%|          | 0/91889 [00:00<?, ?it/s]

In [17]:
# create the dataframe in a single step, one column per collected list
df_story_summary = pd.DataFrame({
    'story_id': story_id,
    'sent_id': sent_id,
    'sentence': list_sent,
    'label_sent': label_sent,
})

In [20]:
# show the first five rows of the dataframe
df_story_summary.head(n=5)

Unnamed: 0,story_id,sent_id,sentence,label_sent
0,0,0,cnn student news record the cnn special invest...,0
1,0,1,program overview,0
2,0,2,for years sue rubin says she was her own worst...,0
3,0,3,grade levels college,0
4,0,4,subject areas health social studies technology...,0


### **Storing the dataset**

In [22]:
# serialize the sentence-level dataframe to a pickle file
# NOTE(review): this relative path resolves against the kernel's current
# working directory, not the Drive folder the input was loaded from; confirm
# the file ends up somewhere persistent if it is needed after the session.
df_story_summary.to_pickle('dataframe_extractive.pkl')