In [1]:
import sys, os
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    GIT_ROOT = 'https://github.com/furyhawk/text_summarization/raw/master'
    os.system(f'wget {GIT_ROOT}/notebooks/setup.py')

%run -i setup.py

You are working on a local system.
Files will be searched relative to "..".


In [2]:
%run "$BASE_DIR/settings.py"

%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'png'

# to print output of all statements and not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# otherwise text between $ signs will be interpreted as formula and printed in italic
pd.set_option('display.html.use_mathjax', False)

# path to import blueprints packages
sys.path.append(BASE_DIR + '/packages')

In [3]:
# adjust matplotlib resolution
matplotlib.rcParams.update({'figure.dpi': 200 })

In [4]:
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import matplotlib.pyplot as plt
import rouge_score


In [5]:
import os.path
import pandas as pd
import numpy as np

# Loading BBC News Summary dataset

In [6]:

root_path = f'../data/BBC News Summary'

types_of_articles = ['business', 'entertainment', 'politics', 'sport', 'tech']
df = pd.DataFrame(columns=['title', 'article', 'summary'])

for type_of_article in types_of_articles:
    # type_of_article = 'business'  # entertainment, politices, sport, tech
    num_of_article = len(os.listdir(
        f"{root_path}/News Articles/{type_of_article}"))

    print(f'"Reading {type_of_article} articles"')
    dataframe = pd.DataFrame(columns=['title', 'article', 'summary'])

    for i in tqdm(range(num_of_article)):
        with open(f'{root_path}/News Articles/{type_of_article}/{(i+1):03d}.txt', 'r') as f:
            article = f.read().partition("\n")
        with open(f'{root_path}/Summaries/{type_of_article}/{(i+1):03d}.txt', 'r') as f:
            summary = f.read()

        dataframe.loc[i] = [article[0], article[2].replace(
            '\n', ' ').replace('\r', ''), summary]

    df = df.append(dataframe, ignore_index=True)


"Reading business articles"


100%|██████████| 510/510 [00:10<00:00, 48.41it/s]


"Reading entertainment articles"


100%|██████████| 386/386 [00:07<00:00, 51.34it/s]


"Reading politics articles"


100%|██████████| 417/417 [00:08<00:00, 51.17it/s]


"Reading sport articles"


100%|██████████| 511/511 [00:09<00:00, 52.88it/s]


"Reading tech articles"


100%|██████████| 401/401 [00:08<00:00, 48.76it/s]


In [7]:
df

Unnamed: 0,title,article,summary
0,Ad sales boost Time Warner profit,"Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Go...","TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09b..."
1,Dollar gains on Greenspan speech,The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise. And Alan Greenspan highlighted the US g...,The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.China's currency remains pegged to the dol...
2,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit to pay back a $900m (Â£479m) loan. State-owned Rosneft bought the Yugansk unit for $9.3bn in a ...,Yukos' owner Menatep Group says it will ask Rosneft to repay a loan that Yugansk had secured on its assets.State-owned Rosneft bought the Yugansk unit for $9.3bn in a sale forced by Russia to part...
3,High fuel prices hit BA's profits,"British Airways has blamed high fuel prices for a 40% drop in profits. Reporting its results for the three months to 31 December 2004, the airline made a pre-tax profit of Â£75m ($141m) compared...","Rod Eddington, BA's chief executive, said the results were ""respectable"" in a third quarter when fuel costs rose by Â£106m or 47.3%.To help offset the increased price of aviation fuel, BA last yea..."
4,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Domecq have risen on speculation that it could be the target of a takeover by France's Pernod Ricard. Reports in the Wall Street Journal and the Financia...,"Pernod has reduced the debt it took on to fund the Seagram purchase to just 1.8bn euros, while Allied has improved the performance of its fast-food chains.Shares in UK drinks and food firm Allied ..."
...,...,...,...
2220,BT program to beat dialler scams,"BT is introducing two initiatives to help beat rogue dialler scams, which can cost dial-up net users thousands. From May, dial-up net users will be able to download free software to stop compute...","BT is introducing two initiatives to help beat rogue dialler scams, which can cost dial-up net users thousands.Inadvertently downloaded by surfers, rogue diallers are programs which hijack modems ..."
2221,Spam e-mails tempt net shoppers,"Computer users across the world continue to ignore security warnings about spam e-mails and are being lured into buying goods, a report suggests. More than a quarter have bought software through...",A third of them read unsolicited junk e-mail and 66% buy goods or services after receiving spam.More than a quarter have bought software through spam e-mails and 24% have bought clothes or jewelle...
2222,Be careful how you code,"A new European directive could put software writers at risk of legal action, warns former programmer and technology analyst Bill Thompson. If it gets its way, the Dutch government will conclude ...","This goes to the heart of the European project, and even those who do not care about software or patents should be worried.But small companies, and the free and open software movement do not have ..."
2223,US cyber security chief resigns,The man making sure US computer networks are safe and secure has resigned after only a year in his post. Amit Yoran was director of the National Cyber Security Division within the US Department ...,Amit Yoran was director of the National Cyber Security Division within the US Department of Homeland Security created following the 9/11 attacks.The Cyber Security Division also audited US governm...


# Summarizing text using topic representation
## Identifying important words with TF-IDF values

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
nltk.download('punkt')


sample_text = df['article'].iloc[1]
print("\nSample:", sample_text)
sentences = tokenize.sent_tokenize(sample_text)
tfidfVectorizer = TfidfVectorizer()
words_tfidf = tfidfVectorizer.fit_transform(sentences)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\furyx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True


Sample:  The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.  And Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman's taking a much more sanguine view on the current account deficit than he's taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He's taking a longer-term view, laying out a set of conditions under which the current account def

In [9]:
# Parameter to specify number of summary sentences required
num_summary_sentence = 3

# Sort the sentences in descending order by the sum of TF-IDF values
sent_sum = words_tfidf.sum(axis=1)
important_sent = np.argsort(sent_sum, axis=0)[::-1]

# Print three most important sentences in the order they appear in the article
for i in range(0, len(sentences)):
    if i in important_sent[:num_summary_sentence]:
        print (sentences[i])

On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data.
"I think the chairman's taking a much more sanguine view on the current account deficit than he's taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York.
The recent falls have partly been the result of big budget deficits, as well as the US's yawning current account gap, both of which need to be funded by the buying of US bonds and assets by foreign firms and governments.


In [42]:
from datasets import Dataset, DatasetDict, load_metric
raw_datasets = Dataset.from_pandas(df)

metric = load_metric("rouge")

Splitting the dataset in train and test split

In [52]:
# 90% train, 10% test + validation
train_testvalid = raw_datasets.train_test_split(test_size=0.1)

# train_testvalid['test']
# # Split the 10% test + validation in half test, half validation
# test_valid = train_testvalid["test"].train_test_split(test_size=0.5)
# # gather everyone if you want to have a single DatasetDict
# raw_datasets = DatasetDict({
#     "train": train_testvalid["train"],
#     "test": test_valid["test"],
#     "valid": test_valid["train"]})

raw_datasets = train_testvalid['test']
raw_datasets

Dataset({
    features: ['title', 'article', 'summary'],
    num_rows: 223
})

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize


def tfidf_summary(text, num_summary_sentence=3):
    summary_sentence = ''
    sentences = tokenize.sent_tokenize(text['article'])
    tfidfVectorizer = TfidfVectorizer()
    words_tfidf = tfidfVectorizer.fit_transform(sentences)
    sentence_sum = words_tfidf.sum(axis=1)
    important_sentences = np.argsort(sentence_sum, axis=0)[::-1]
    for i in range(0, len(sentences)):
        if i in important_sentences[:num_summary_sentence]:
            summary_sentence = summary_sentence+sentences[i]
    text['output'] = summary_sentence
    return text


In [13]:
from preparation import clean

# Parameter to specify number of summary sentences required
num_summary_sentence = 5



In [14]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [78]:


sample_text = raw_datasets[0] #clean(df.query("year==2014 and country=='SGP'")['text'].values[0])

print(sample_text['article'])
test_text = tfidf_summary(sample_text, num_summary_sentence)['article']

print("\ntest text: ", test_text)

 The prime minister has donned a life jacket and joined school children in a sailing dinghy as he sought to sell his party's education policies.  Tony Blair sailed across the lake in Bromsgrove, Worcestershire, while on a visit with Education Secretary Ruth Kelly to back school outings. Mr Blair later stressed Labour's election pledge to focus on education, when he met parents in the area. The Conservatives and Lib Dems both say his pledges are "worthless". All the parties are stepping up campaigning ahead of a General Election widely expected to be held on 5 May. Mr Blair, looking a little windswept, joined two girls from St Egwin's Middle School in Evesham and an instructor for a trip in the Wayfarer dinghy, closely followed by a boat full of photographers.  Afterwards he said outdoor activities were beneficial for children but accepted that lots of teachers now worried about taking part for fear of being sued if something went wrong. "What we're doing is introducing some simple guid

In [125]:
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(test_text, raw_datasets[0]['summary'])
print(scores)


{'rouge1': Score(precision=0.6652360515021459, recall=0.8288770053475936, fmeasure=0.738095238095238), 'rouge2': Score(precision=0.6077586206896551, recall=0.7580645161290323, fmeasure=0.6746411483253589), 'rougeL': Score(precision=0.3948497854077253, recall=0.4919786096256685, fmeasure=0.4380952380952381)}


In [141]:
updated_dataset = raw_datasets.map(lambda text: tfidf_summary(text))
# raw_datasets['article']
# updated_dataset['output']


100%|██████████| 223/223 [00:00<00:00, 446.00ex/s]


In [144]:
# {'rouge1': Score(precision=0.6652360515021459, recall=0.8288770053475936, fmeasure=0.738095238095238), 'rougeL': Score(precision=0.3948497854077253, recall=0.4919786096256685, fmeasure=0.4380952380952381)}

def compute_metrics(row):
    row['score'] = scorer.score(row['summary'], row['output'])
    
    # Extract a few results
    result = {key: value.fmeasure for key, value in scores.items()}
    # print(scores)
    # print(result['rougeL'])
    row['rouge1'] = result['rouge1']
    row['rouge2'] = result['rouge2']
    row['rougeL'] = result['rougeL']
    return row
    
    
updated_dataset = updated_dataset.map(compute_metrics)
# # final_score is returned on process with process_id==0 and will be `None` on the other processes
# final_score = metric.compute()
# result
# updated_dataset


100%|██████████| 223/223 [00:03<00:00, 70.18ex/s]


In [145]:
updated_dataset

Dataset({
    features: ['title', 'article', 'summary', 'output', 'score', 'rouge1', 'rouge2', 'rougeL'],
    num_rows: 223
})

In [151]:

np.mean(updated_dataset['rougeL'])  # 


0.4380952380952382