In [1]:
# from segregate dataset
import pickle

def load_stories(path='./cnn_stories.pkl'):
    """Read a pickled object from ``path`` and return it.

    Args:
        path: Location of the pickle file to read. Defaults to
            ``'./cnn_stories.pkl'``.

    Returns:
        The object stored in the pickle file. The rest of this notebook
        iterates over it and reads ``'story'`` and ``'highlights'`` from
        each item.

    Raises:
        OSError: If ``path`` cannot be opened for reading.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this
    # at files that come from a trusted source.
    with open(path, 'rb') as pickle_file:
        stories = pickle.load(pickle_file)
    return stories

In [2]:
# Load the pickled stories from load_stories' default path ('./cnn_stories.pkl').
stories = load_stories()

In [3]:
def parse_story_into_sentences(story):
    """Split a story's text into its non-blank lines.

    The text is cut on ``'\\n'`` only; each surviving line is what the rest
    of the notebook treats as one "sentence".

    Args:
        story: Mapping with a ``'story'`` key holding the raw text.

    Returns:
        list[str]: The lines of the text in their original order, skipping
        lines that are empty or consist solely of whitespace. Kept lines are
        returned as-is (they are not stripped).
    """
    kept_lines = []
    for line in story['story'].split('\n'):
        # Blank separator lines carry no content; skip them.
        if not line or line.isspace():
            continue
        kept_lines.append(line)
    return kept_lines

In [4]:
from rouge import Rouge

def get_rouge_score(sentence, reference, rouge_type="rouge-l", score_type="f"):
    """Score ``sentence`` against ``reference`` with the ``rouge`` package.

    Args:
        sentence: Candidate text passed as the first argument to
            ``Rouge.get_scores``.
        reference: Reference text passed as the second argument.
        rouge_type: Key selecting the ROUGE variant in the result
            (default ``"rouge-l"``).
        score_type: Key selecting the statistic within that variant
            (default ``"f"``).

    Returns:
        The selected value from the first entry of the ``get_scores``
        result, or ``0.0`` if a ``ValueError`` is raised while scoring.
    """
    scorer = Rouge()
    try:
        first_pair_scores = scorer.get_scores(sentence, reference)[0]
        score = first_pair_scores[rouge_type][score_type]
    except ValueError:
        # NOTE(review): presumably raised by the rouge package for unusable
        # (e.g. empty) input -- confirm against the library; treated as
        # "no overlap".
        score = .0
    return score

In [5]:
def transform_story_with_sentences(story):
    """Return a copy of ``story`` extended with its parsed sentences.

    Args:
        story: Story mapping; must contain the ``'story'`` text that
            ``parse_story_into_sentences`` reads.

    Returns:
        dict: A new shallow copy of ``story`` with an added (or replaced)
        ``'sentences'`` entry. The input mapping is left untouched.
    """
    return dict(story, sentences=parse_story_into_sentences(story))

In [6]:
# Add a 'sentences' list to every story; `stories` is rebound to the new list of copies.
stories = [transform_story_with_sentences(story) for story in stories]

In [7]:
def get_sentence_scores(sentences, highlights, score_function=get_rouge_score):
    """Score every sentence by its best match against any highlight.

    Args:
        sentences: Iterable of candidate sentences.
        highlights: Collection of reference highlights. It is traversed once
            per sentence, so it must be re-iterable (e.g. a list).
        score_function: Callable ``(sentence, highlight) -> score``.
            Defaults to ``get_rouge_score``.

    Returns:
        list: One score per sentence, in input order -- the maximum of
        ``score_function(sentence, highlight)`` over all highlights, or
        ``0.0`` when ``highlights`` is empty.
    """
    return [
        # default=.0 keeps a story without highlights from raising
        # "max() arg is an empty sequence"; 0.0 mirrors the fallback used
        # by get_rouge_score.
        max((score_function(sentence, highlight) for highlight in highlights), default=.0)
        for sentence in sentences
    ]

In [8]:
# Sanity check: best score against any highlight for each sentence of the first story.
get_sentence_scores(stories[0]['sentences'], stories[0]['highlights'])

[0.08510637917609795,
 0.1395348801514333,
 0.07272726975206624,
 0.10526315378116359,
 0.09756097146936366,
 0.10169491130135033,
 0.1276595711181531,
 0.17391303856332718,
 0.173913038941399,
 0.14285713877551035,
 0.07843136895040385,
 0.08888888497777794,
 0.1052631530193908,
 0.17647058366782017,
 0.12499999531250018,
 0.09756097192147545,
 0.16666666346666673,
 0.22222221777777784,
 0.14285713795918387,
 0.13636363239669433,
 0.12903225306971924,
 0.13043477807183385,
 0.42105262670360116,
 0.1034482725802617,
 0.12121211698806258,
 0.08888888543209889,
 0.13559321752370013,
 0.0588235250346024,
 0.11428570932244919,
 0.12499999625000012,
 0.0952380923557572,
 0.09756097146936366,
 0.05714285306122478,
 0.11111110809327854,
 0.1249999950000002]

In [9]:
def transform_story_with_scores(story, score_function=get_rouge_score):
    """Return a copy of ``story`` extended with per-sentence scores.

    Args:
        story: Story mapping with ``'sentences'`` and ``'highlights'``.
        score_function: Callable forwarded to ``get_sentence_scores``.
            Defaults to ``get_rouge_score``.

    Returns:
        dict: A new shallow copy of ``story`` with an added (or replaced)
        ``'scores'`` entry holding the result of ``get_sentence_scores``.
    """
    sentence_scores = get_sentence_scores(story['sentences'], story['highlights'], score_function)
    return dict(story, scores=sentence_scores)

In [10]:
# Attach per-sentence scores to every story. Each sentence is scored against
# every highlight with the default get_rouge_score.
stories = [transform_story_with_scores(story) for story in stories]

In [11]:
def get_top_sentences_scores(sentences, scores, top=5):
    """Pick the ``top`` highest-scoring sentences together with their scores.

    Args:
        sentences: Sequence of sentences, indexable by position.
        scores: Sequence of scores aligned with ``sentences``.
        top: Number of leading entries to keep (applied as a slice bound).
            Defaults to 5.

    Returns:
        tuple[list, list]: ``(top_sentences, top_scores)``, both ordered by
        descending score. Entries with equal scores keep their original
        relative order.
    """
    # Indices of `scores`, best score first (the sort is stable).
    ranking = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)

    ranked_sentences = []
    ranked_scores = []
    for idx in ranking:
        ranked_sentences.append(sentences[idx])
        ranked_scores.append(scores[idx])

    return ranked_sentences[:top], ranked_scores[:top]

In [12]:
def transform_story_with_top_scores(story, score_function=get_rouge_score, top=5):
    """Return a copy of ``story`` extended with its best-scoring sentences.

    Args:
        story: Story mapping with ``'sentences'`` and precomputed ``'scores'``.
        score_function: Not used by this function; the scores are read from
            ``story['scores']``.
        top: Number of sentences to keep, forwarded to
            ``get_top_sentences_scores``. Defaults to 5.

    Returns:
        dict: A new shallow copy of ``story`` with added (or replaced)
        ``'top_sentences'`` and ``'top_scores'`` entries.
    """
    precomputed_scores = story['scores']
    best_sentences, best_scores = get_top_sentences_scores(story['sentences'], precomputed_scores, top)
    return dict(story, top_sentences=best_sentences, top_scores=best_scores)

In [13]:
# Record each story's highest-scoring sentences and their scores (default top=5).
stories = [transform_story_with_top_scores(story) for story in stories]

In [14]:
# Inspect the first fully transformed story.
stories[0]

{'highlights': ['NEW: Former President Jimmy Carter says integrity of the election is intact',
  'NEW: A meeting will be held Sunday for appeals and to set a date to declare results',
  'Polls show Mohamed Morsi, the Muslim Brotherhood candidate, in first',
  "He is followed by Ahmed Shafik, Hosni Mubarak's last prime minister"],
 'story': 'Cairo (CNN) -- Two distinctive Egyptian presidential candidates, one representing resurgent Islamists and the other a weathered veteran of the country\'s old guard, have begun maneuvering for wider political support ahead of an expected runoff next month.\n\nMuslim Brotherhood candidate Mohamed Morsi huddled with politicians Saturday to stave off what his campaign sees as a return to the policies of longtime President Hosni Mubarak, toppled last year in the country\'s historic uprising.\n\n"We face desperate attempts to replicate the old system of governance in new attire that might fool some, but the masses of our people and the enlightened revolut

In [15]:
import pandas as pd

def stories_to_df(stories):
    """Flatten stories into a DataFrame with one row per sentence.

    Args:
        stories: Iterable of story mappings, each providing ``'sentences'``
            and ``'top_sentences'``.

    Returns:
        pandas.DataFrame with the columns, in this order:
            * ``story_id``: position of the story within ``stories``.
            * ``sent_id``: position of the sentence within its story.
            * ``label_sent``: ``True`` when the sentence text occurs in the
              story's ``'top_sentences'``.
            * ``list_sent``: the sentence text.
    """
    # One tuple per sentence, laid out in the final column order.
    # NOTE(review): the label is decided by text membership, so identical
    # sentence texts within a story all receive the same label.
    rows = [
        (story_idx, sent_idx, text in story['top_sentences'], text)
        for story_idx, story in enumerate(stories)
        for sent_idx, text in enumerate(story['sentences'])
    ]

    frame = pd.DataFrame()
    column_names = ('story_id', 'sent_id', 'label_sent', 'list_sent')
    for position, column in enumerate(column_names):
        frame[column] = [row[position] for row in rows]
    return frame

In [16]:
# Flatten all stories into one row per sentence, labelled by top-sentence membership.
df = stories_to_df(stories)

In [17]:
# Persist the sentence-level DataFrame as 'dataframe_extractive.pkl' (relative path).
df.to_pickle('dataframe_extractive.pkl')