# Text Summarization with TF-IDF
* This lab builds an extractive summarization mechanism using TF-IDF.
* Given an article, we'll split it into sentences and apply TF-IDF to them. Our extraction metric is the mean (`np.nanmean`) of each sentence's non-zero TF-IDF weights.

In [5]:
# ! mkdir data
# !wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv -P data/

In [6]:
import pandas as pd
import numpy as np

# Load the BBC news dataset and keep only the rows labelled 'business'.
df = pd.read_csv('data/bbc_text_cls.csv')
df = df.loc[df['labels'] == 'business']
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [7]:
from nltk import sent_tokenize
from typing import List

def get_sentences(df:pd.DataFrame, idx:int|None=None)->List[str]:
    '''
        Splits one article of the dataset into its sentences.

        Parameters
        ----------
        `df`: `pd.DataFrame`
            The news articles dataset. The article text is read from its
            first column.
        `idx`: int | None
            Positional row of the desired article. When it is None, a row is
            drawn at random with `np.random.randint`.

        Returns
        -------
        The list of sentences `sent_tokenize` produces for that article.
    '''
    if idx is not None:
        row = idx
    else:
        # Draw a single position in [0, number of rows).
        (row,) = np.random.randint(low=0, high=len(df), size=1)
    return sent_tokenize(df.iloc[row, 0])

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

class ExtractiveSummarizer:
    '''
        A text-extraction based summarizer. It returns the p% most important
        fragments (sentences) of a document, ranked by the mean of their
        non-zero TF-IDF weights.

        Parameter
        ---------
        `p`: float
            The desired top percentage, given as a fraction in ]0,1[.

        Raises
        ------
        AssertionError
            If `p` is not strictly between 0 and 1.
    '''
    def __init__(self, p:float):
        # Raised explicitly instead of through `assert` so the check is not
        # stripped under `python -O`; exception type and message are the same.
        if not (p > 0 and p < 1):
            raise AssertionError('`p` must pertain to ]0,1[.')
        self.p = p

    @staticmethod
    def _nonzero_row_means(weights:np.ndarray)->np.ndarray:
        '''
            Averages the non-zero entries of each row of a dense 2-D array.

            Rows without any non-zero entry score 0.0. Using `np.nanmean` on
            such rows yields NaN, and `np.argsort` places NaN last, which
            would rank token-less sentences as the most important ones.
        '''
        counts = (weights != 0).sum(axis=1)
        totals = weights.sum(axis=1)
        means = np.zeros(weights.shape[0], dtype=float)
        # Only divide where there is something to average; other rows keep 0.
        np.divide(totals, counts, out=means, where=counts > 0)
        return means

    def __compute_means(self)->np.ndarray:
        '''
            Computes the mean non-zero TF-IDF weight of each fitted sentence.
        '''
        weights = TfidfVectorizer().fit_transform(self._sentences).toarray()
        return self._nonzero_row_means(weights)

    def fit(self, X:List[str]):
        '''
            Computes the average TF-IDF for each presented sentence.

            Parameter
            ---------
            `X`: List[str]
                A collection of sentences from a certain document.

            Returns
            -------
            The summarizer itself, so calls can be chained.

            Raises
            ------
            ValueError
                If `X` is empty.
        '''
        if len(X) == 0:
            raise ValueError('`X` must contain at least one sentence.')
        self._sentences = X
        self._means = self.__compute_means()
        return self

    def transform(self)->List[str]:
        '''
            Returns the p% most important fragments of the fitted text,
            ordered from the highest to the lowest score.

            At least one sentence is returned for a non-empty document, even
            when `p * len(sentences)` rounds down to zero.

            Raises
            ------
            AttributeError
                If called before `fit`.
        '''
        total = len(self._sentences)
        if total == 0:
            return []
        # Clamp to 1: with a count of 0 the slice `[-0:]` would select every
        # sentence instead of none.
        amount_retrieve = max(1, int(round(self.p * total)))
        ranked = np.argsort(self._means)[::-1][:amount_retrieve]
        return [self._sentences[i] for i in ranked.tolist()]

# Pick a random article (idx=None) and show the top 25% of its sentences.
sample_sentence = get_sentences(df, idx=None)
ExtractiveSummarizer(p=.25).fit(X=sample_sentence).transform()

['Home furnishings and furniture stores also performed well, rising 2.2%.',
 'However, long-term claims slipped to their lowest level since 2001.',
 'During that time, sales grew a lacklustre 2.9% in 2001 and 2.5% a year later.',
 '"Consumers for now remain willing to spend freely, sustaining the US expansion.',
 'The belief comes despite the latest labor department report showing a surprise rise in unemployment.']