# TFIDF


In [1]:
import sys, os
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    GIT_ROOT = 'https://github.com/furyhawk/text_summarization/raw/master'
    os.system(f'wget {GIT_ROOT}/notebooks/setup.py')

%run -i setup.py

You are working on a local system.
Files will be searched relative to "..".


In [2]:
%run "$BASE_DIR/settings.py"

%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'png'

# to print output of all statements and not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# otherwise text between $ signs will be interpreted as formula and printed in italic
pd.set_option('display.html.use_mathjax', False)

# path to import blueprints packages
sys.path.append(BASE_DIR + '/packages')

In [3]:
# adjust matplotlib resolution
matplotlib.rcParams.update({'figure.dpi': 200 })

In [4]:
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import matplotlib.pyplot as plt
import rouge_score


In [5]:
import os.path
import pandas as pd
import numpy as np

# Loading BBC News Summary dataset

In [6]:
root_path = f'../data/BBC News Summary'


# root_path = f'/kaggle/input/bbc-news-summary/BBC News Summary'


def loadDataset(root_path):

    types_of_articles = ['business',
                         'entertainment', 'politics', 'sport', 'tech']
    df = pd.DataFrame(columns=['title', 'article', 'summary'])

    for type_of_article in types_of_articles:
        # type_of_article = 'business'  # entertainment, politices, sport, tech
        num_of_article = len(os.listdir(
            f"{root_path}/News Articles/{type_of_article}"))

        print(f'"Reading {type_of_article} articles"')
        dataframe = pd.DataFrame(columns=['title', 'article', 'summary'])

        for i in tqdm(range(num_of_article)):
            with open(f'{root_path}/News Articles/{type_of_article}/{(i+1):03d}.txt', 'r', encoding="utf8", errors='ignore') as f:
                article = f.read().partition("\n")
            with open(f'{root_path}/Summaries/{type_of_article}/{(i+1):03d}.txt', 'r', encoding="utf8", errors='ignore') as f:
                summary = f.read()

            dataframe.loc[i] = [article[0], article[2].replace(
                '\n', ' ').replace('\r', ''), summary]

        df = df.append(dataframe, ignore_index=True)

    return df

In [7]:
fname = 'bbc.csv'

if os.path.isfile(fname):
    df = pd.read_csv(fname)
else:
    df = loadDataset(root_path)
    df.to_csv(fname, index=False)

# Summarizing text using topic representation
## Identifying important words with TF-IDF values

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
nltk.download('punkt')


sample_text = df['article'].iloc[1]
print("\nSample:", sample_text)
sentences = tokenize.sent_tokenize(sample_text)
tfidfVectorizer = TfidfVectorizer()
words_tfidf = tfidfVectorizer.fit_transform(sentences)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\furyx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True


Sample:  The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.  And Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman's taking a much more sanguine view on the current account deficit than he's taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He's taking a longer-term view, laying out a set of conditions under which the current account def

In [58]:
# Parameter to specify number of summary sentences required
num_summary_sentence = 1

# Sort the sentences in descending order by the sum of TF-IDF values
sent_sum = words_tfidf.sum(axis=1)
important_sent = np.argsort(sent_sum, axis=0)[::-1]

# Print three most important sentences in the order they appear in the article
for i in range(0, len(sentences)):
    if i in important_sent[:num_summary_sentence]:
        print (sentences[i])

On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data.


In [70]:
from datasets import Dataset, DatasetDict, load_metric
raw_datasets = Dataset.from_pandas(df)


Splitting the dataset in train and test split

In [71]:
# 90% train, 10% test + validation
train_testvalid = raw_datasets.train_test_split(test_size=0.1)

raw_datasets = train_testvalid['test']
raw_datasets

Dataset({
    features: ['title', 'article', 'summary', 'category'],
    num_rows: 223
})

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize


def tfidf_summary(text, num_summary_sentence=1):
    summary_sentence = ''
    sentences = tokenize.sent_tokenize(text['article'])
    tfidfVectorizer = TfidfVectorizer()
    words_tfidf = tfidfVectorizer.fit_transform(sentences)
    sentence_sum = words_tfidf.sum(axis=1)
    important_sentences = np.argsort(sentence_sum, axis=0)[::-1]
    for i in range(0, len(sentences)):
        if i in important_sentences[:num_summary_sentence]:
            summary_sentence = summary_sentence+sentences[i]
    text['output'] = summary_sentence
    return text


In [73]:
from preparation import clean

# Parameter to specify number of summary sentences required
num_summary_sentence = 1

In [15]:


sample_text = raw_datasets[0] #clean(df.query("year==2014 and country=='SGP'")['text'].values[0])

print(sample_text['article'])
test_text = tfidf_summary(sample_text, num_summary_sentence)['article']

print("\ntest text: ", test_text)

 Shares in train and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.  Paul Tellier, who was also Bombardier's president, left the company amid an ongoing restructuring. Laurent Beaudoin, part of the family that controls the Montreal-based firm, will take on the role of CEO under a newly created management structure. Analysts said the resignations seem to have stemmed from a boardroom dispute. Under Mr Tellier's tenure at the company, which began in January 2003, plans to cut the worldwide workforce of 75,000 by almost a third by 2006 were announced. The firm's snowmobile division and defence services unit were also sold and Bombardier started the development of a new aircraft seating 110 to 135 passengers.  Mr Tellier had indicated he wanted to stay at the world's top train maker and third largest manufacturer of civil aircraft until the restructuring was complete. But Bombardier has been faced with

In [16]:
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
scores = scorer.score(test_text, raw_datasets[0]['summary'])
print(scores)


{'rouge1': Score(precision=1.0, recall=0.49466192170818507, fmeasure=0.6619047619047619), 'rouge2': Score(precision=0.9710144927536232, recall=0.4785714285714286, fmeasure=0.6411483253588517), 'rougeL': Score(precision=0.5899280575539568, recall=0.2918149466192171, fmeasure=0.3904761904761904), 'rougeLsum': Score(precision=0.5899280575539568, recall=0.2918149466192171, fmeasure=0.3904761904761904)}


In [74]:
updated_dataset = raw_datasets.map(lambda text: tfidf_summary(text))
# raw_datasets['article']
# updated_dataset['output']


100%|██████████| 223/223 [00:01<00:00, 202.09ex/s]


In [75]:
# {'rouge1': Score(precision=0.6652360515021459, recall=0.8288770053475936, fmeasure=0.738095238095238), 'rougeL': Score(precision=0.3948497854077253, recall=0.4919786096256685, fmeasure=0.4380952380952381)}

def compute_metrics(row):
    scores = scorer.score(row['output'], row['summary'])
    
    # Extract a few results
    result = {key: value.fmeasure for key, value in scores.items()}
    # print(result)
    # print(result['rougeL'])
    row['rouge1'] = result['rouge1']
    row['rouge2'] = result['rouge2']
    row['rougeL'] = result['rougeL']
    row['rougeLsum'] = result['rougeLsum']
    return row
    


In [76]:

metrics = updated_dataset.map(compute_metrics)
# # final_score is returned on process with process_id==0 and will be `None` on the other processes
# final_score = metric.compute()
# result
# updated_dataset


100%|██████████| 223/223 [00:05<00:00, 40.47ex/s]


In [66]:
metrics

Dataset({
    features: ['title', 'article', 'summary', 'category', 'output', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
    num_rows: 223
})

In [56]:
# compute_metrics(updated_dataset[1])

In [57]:
# compute_metrics(updated_dataset[12])

In [77]:
np.mean(metrics['rouge1'])  # 

0.2627801675682219

In [80]:
np.mean(metrics['rougeLsum'])  # 

0.23902245716747755