# TF-IDF (from scratch) And Word Embeddings

<hr>


In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

import spacy

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Custom imports

# Built-in library
import itertools
import re
import json
from typing import Union, Optional, Any
import logging
import warnings

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configure warnings and other settings
warnings.filterwarnings("ignore")
sns.set()


nlp = spacy.load("en_core_web_sm")


def load_data(*, filename: str) -> pd.DataFrame:
    """Read a CSV file into a dataframe and print its shape.

    Params:
        filename (str): Path of the CSV file; handed to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The dataframe parsed from the file.
    """
    frame = pd.read_csv(filename)
    # Report (rows, columns) so the size of the loaded data is visible.
    shape_report = f"Shape of df: {frame.shape}\n"
    print(shape_report)
    return frame

In [3]:
filename = "../../data/bbc_text_cls.csv"
data = load_data(filename=filename)

data.head(2)

Shape of df: (2225, 2)



Unnamed: 0,text,labels
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet bus...",business
1,"Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAnd Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of th...",business


In [4]:
class Tokenizer:
    """Callable that splits a document into lower-cased token strings.

    The spaCy pipeline is bound to the instance as ``self.nlp`` and that
    bound pipeline is the one used when the instance is called.
    """

    def __init__(self) -> None:
        # Bind the module-level spaCy pipeline once, at construction time.
        self.nlp = nlp

    def __call__(self, doc: str, *args: Any, **kwargs: Any) -> list[str]:
        """Tokenize ``doc`` and return its tokens in lower case.

        Params:
            doc (str): The raw document text.
            *args, **kwargs: Accepted for call compatibility and ignored.

        Returns:
            list[str]: ``token.text.lower()`` for every token, in order.
        """
        # Use the pipeline stored on the instance (not the global name) so
        # the attribute set in __init__ is what actually drives tokenizing.
        parsed_doc = self.nlp(doc)
        return [token.text.lower() for token in parsed_doc]

In [5]:
d = {
    "text": [
        "Thank you for being an awesome father",
        "I have an awesome God. I just wanna say thank you",
    ],
    "labels": ["a", "b"],
}

df = pd.DataFrame(d)

df

Unnamed: 0,text,labels
0,Thank you for being an awesome father,a
1,I have an awesome God. I just wanna say thank you,b


In [6]:
class BagOfWordsCalculator:
    """Tokenize documents and build the vocabulary ("bag of words").

    Every distinct token seen across the documents receives an integer id,
    assigned in order of first appearance, and each document is re-expressed
    as the list of ids of its tokens.
    """

    def __init__(self) -> None:
        self.tokenizer = Tokenizer()

    def __call__(
        self, data: pd.Series, *args: Any, **kwargs: Any
    ) -> tuple[list[list[int]], dict[str, int]]:
        """Tokenize every document and compute the bag of words.

        Params:
            data (pd.Series): Iterable of document strings, one per row
                (e.g. a dataframe column). Passing a whole dataframe would
                iterate over its column labels instead of its rows.
            *args, **kwargs: Accepted for call compatibility and ignored.

        Returns:
            tokenized_docs (list[list[int]]): One list per document holding
                the vocabulary id of each of its tokens.
            bag_of_words (dict[str, int]): Maps each unique token to its id.
        """
        bag_of_words: dict[str, int] = {}
        tokenized_docs: list[list[int]] = []

        for doc in data:
            # setdefault returns the existing id, or registers the next free
            # one (the current vocabulary size) the first time a token shows.
            doc_as_num = [
                bag_of_words.setdefault(word, len(bag_of_words))
                for word in self.tokenizer(doc=doc)
            ]
            tokenized_docs.append(doc_as_num)
        return (tokenized_docs, bag_of_words)

In [7]:
df["text"]

0                Thank you for being an awesome father
1    I have an awesome God. I just wanna say thank you
Name: text, dtype: object

In [8]:
b_o_words_cal = BagOfWordsCalculator()
t_docs, b_o_words = b_o_words_cal(data=df["text"])

# tokenized_docs, bag_of_words
t_docs, b_o_words

([[0, 1, 2, 3, 4, 5, 6], [7, 8, 4, 5, 9, 10, 7, 11, 12, 13, 0, 1]],
 {'thank': 0,
  'you': 1,
  'for': 2,
  'being': 3,
  'an': 4,
  'awesome': 5,
  'father': 6,
  'i': 7,
  'have': 8,
  'god': 9,
  '.': 10,
  'just': 11,
  'wanna': 12,
  'say': 13})

<br><hr>

## Calculate The Term Frequency

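Term frequency, `tf(t,d)`, is the relative frequency of term ***`t`*** within document ***`d`***. (Note: the code in this notebook uses the raw count of `t` in `d`; it does not divide by the number of terms in `d`.)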

$$
tf(t,d) = \frac{count_{t/d}}{number_{terms/d}}
$$

where: \
$count_{t/d}$: Count of `t` in `d` \
$number_{terms/d}$: Number of `terms` in `d`


In [9]:
b_o_words_cal = BagOfWordsCalculator()
t_docs, b_o_words = b_o_words_cal(data=df["text"])

# Instantiate: number of docs and number of words
N, V = df.shape[0], len(b_o_words)

# Term Frequency
tf = np.zeros((N, V))
tf

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [10]:
t_docs

[[0, 1, 2, 3, 4, 5, 6], [7, 8, 4, 5, 9, 10, 7, 11, 12, 13, 0, 1]]

In [11]:
# Check for each word in the doc and increment the count
# of the word wherever it occurs.

# Note::
# document: a list of words/terms,
# doc_idx: index of the current document,
# doc_as_num: the current tokenized document,
# words_idx: the words represented as numbers,

for doc_idx, doc_as_num in enumerate(t_docs):
    for words_idx in doc_as_num:
        tf[doc_idx, words_idx] += 1

tf

array([[1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 1., 1., 0., 2., 1., 1., 1., 1., 1., 1.]])

## Putting It All Together

In [12]:
class CustomCountVectorizer:
    """Count how often each vocabulary term occurs in each document.

    The vocabulary is built from the documents themselves: every distinct
    token receives an integer id in order of first appearance.
    """

    def __init__(self, data: pd.Series) -> None:
        self.data = data
        self.vocabulary = {}
        self.tokenized_docs = []
        self.tokenizer = Tokenizer()

    def tokenize_docs(self) -> tuple[list, dict]:
        """Tokenize ``self.data`` and (re)build the vocabulary.

        The result is built from scratch on every call, so calling this
        method repeatedly never duplicates documents in
        ``self.tokenized_docs``.

        Returns:
            tokenized_docs (list[list[int]]): One list of token ids per
                document.
            vocabulary (dict[str, int]): Maps each unique token to its id.
        """
        vocabulary: dict[str, int] = {}
        tokenized_docs: list[list[int]] = []

        for doc in self.data:
            # setdefault returns the existing id, or registers the next free
            # one (the current vocabulary size) for a token not seen before.
            tokenized_docs.append(
                [
                    vocabulary.setdefault(word, len(vocabulary))
                    for word in self.tokenizer(doc=doc)
                ]
            )

        # Publish only the finished result on the instance.
        self.tokenized_docs, self.vocabulary = tokenized_docs, vocabulary
        return (self.tokenized_docs, self.vocabulary)

    def calculate_term_frequency(self, *args: Any, **kwargs: Any) -> np.ndarray:
        """Calculate the term-count matrix (bag of words).

        Params:
            *args, **kwargs: Accepted for call compatibility and ignored.

        Returns:
            np.ndarray: Array of shape (number of documents, vocabulary
                size); entry ``[d, t]`` is how many times term ``t`` occurs
                in document ``d``.
        """
        self.tokenize_docs()
        # Number of docs and number of words
        N, W = len(self.tokenized_docs), len(self.vocabulary)
        tf = np.zeros((N, W))

        # Increment the count of a term each time it occurs in a document.
        for doc_idx, doc_as_num in enumerate(self.tokenized_docs):
            for words_idx in doc_as_num:
                tf[doc_idx, words_idx] += 1
        return tf

In [13]:
df

Unnamed: 0,text,labels
0,Thank you for being an awesome father,a
1,I have an awesome God. I just wanna say thank you,b


In [14]:
count_vectorizer = CustomCountVectorizer(data=df["text"])
tf = count_vectorizer.calculate_term_frequency()
tf

array([[1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 1., 1., 0., 2., 1., 1., 1., 1., 1., 1.]])

<br><hr>

## Calculate TF-IDF

## Document Frequency

`Document Frequency` is the number of documents, `d` in which the term, `t` is present.

```code
df(t) = number of documents `d` in the corpus that contain `t`
```


## Inverse Document Frequency (IDF)

> `IDF` is a measure of whether a term is `common` or `rare` in a given document corpus. It is obtained by **dividing** the **total number of documents** by the **number of documents containing the term** in the corpus.

Let $\mathbf{d_{f}(t)}$ be the number of documents term `t` appears in. The raw ratio has a problem: if the corpus size, $N$, is large, say 100,000,000, the IDF value of a rare term explodes;

$$
\mathbf{idf}(t,d) = \frac{100,000,000}{d_{f}(t)} \gg 1
$$

To avoid this effect, we take the `log` of `idf`.

$$
\mathbf{idf}(t,d) = \log(\frac{N}{d_{f}(t)})
$$


When a word that is not in the vocab occurs during the query, the ***`document frequency`*** is 0. Because we can't divide by zero, we smooth the value by adding 1 to the denominator.

$$
\mathbf{idf}(t,d) = \log\frac{N}{{d_{f}(t)}+ 1}
$$

<br>

$$
\mathbf{tfidf}(t,d) =tf(t,d) \times idf(t)
$$

In [35]:
# Compute IDF
# DF: Document Frequency is the number of documents the term occurs in.
# IDF: Number of documents (N) divided by DF. The log is taken to
# dampen the ratio, which grows very large for rare terms in a big corpus.
# Therefore, IDF = log(N / DF)
N = df.shape[0]
# A term counts once for every document in which its count is positive.
document_frequency = (tf > 0).sum(axis=0)  # shape (V,)
document_frequency.shape

((14,), 2)

In [37]:
document_frequency

array([2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1])

In [21]:
idf = np.log(N / document_frequency)

idf

array([0.        , 0.        , 0.69314718, 0.69314718, 0.        ,
       0.        , 0.69314718, 0.69314718, 0.69314718, 0.69314718,
       0.69314718, 0.69314718, 0.69314718, 0.69314718])

In [23]:
df["text"]

0                Thank you for being an awesome father
1    I have an awesome God. I just wanna say thank you
Name: text, dtype: object

In [22]:
tf_idf = tf * idf
tf_idf

array([[0.        , 0.        , 0.69314718, 0.69314718, 0.        ,
        0.        , 0.69314718, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.38629436, 0.69314718, 0.69314718,
        0.69314718, 0.69314718, 0.69314718, 0.69314718]])

### Putting It Together

In [41]:
class CustomTFIDF:
    """Compute the term frequency - inverse document frequency (TF-IDF)
    of a given corpus from scratch.

    The vocabulary is built from the corpus itself: every distinct token
    receives an integer id in order of first appearance.
    """

    def __init__(self, data: pd.Series) -> None:
        self.data = data
        self.vocabulary = {}
        self.tokenized_docs = []
        self.tokenizer = Tokenizer()

    def __repr__(self) -> str:
        """Return a short summary: vocabulary size and number of documents."""
        # type(self) reports the actual (sub)class of the instance.
        return (
            f"{type(self).__name__}(num_vocab: {len(self.vocabulary)}, "
            f"num_doc: {len(self.tokenized_docs)})"
        )

    def tokenize_docs(self) -> tuple[list, dict]:
        """Tokenize ``self.data`` and (re)build the vocabulary.

        The result is built from scratch on every call, so calling this
        method repeatedly never duplicates documents in
        ``self.tokenized_docs``.

        Returns:
            tokenized_docs (list[list[int]]): One list of token ids per
                document.
            vocabulary (dict[str, int]): Maps each unique token to its id.
        """
        vocabulary: dict[str, int] = {}
        tokenized_docs: list[list[int]] = []

        for doc in self.data:
            # setdefault returns the existing id, or registers the next free
            # one (the current vocabulary size) for a token not seen before.
            tokenized_docs.append(
                [
                    vocabulary.setdefault(word, len(vocabulary))
                    for word in self.tokenizer(doc=doc)
                ]
            )

        # Publish only the finished result on the instance.
        self.tokenized_docs, self.vocabulary = tokenized_docs, vocabulary
        return (self.tokenized_docs, self.vocabulary)

    def convert_numbers_to_words(self) -> dict:
        """Return the inverse vocabulary, mapping each id to its word.

        The corpus is tokenized only if no vocabulary is available yet, so
        calling this after the TF / TF-IDF computation does not run the
        tokenizer over the whole corpus again.
        """
        if not self.vocabulary:
            self.tokenize_docs()
        return {num: word for word, num in self.vocabulary.items()}

    def calculate_term_frequency(self, *args: Any, **kwargs: Any) -> np.ndarray:
        """Calculate the term-count matrix (bag of words).

        Params:
            *args, **kwargs: Accepted for call compatibility and ignored.

        Returns:
            np.ndarray: Array of shape (number of documents, vocabulary
                size); entry ``[d, t]`` is how many times term ``t`` occurs
                in document ``d``.
        """
        self.tokenize_docs()
        # Number of docs and number of words
        N, W = len(self.tokenized_docs), len(self.vocabulary)
        tf = np.zeros((N, W))

        # doc_idx: index of the current document,
        # doc_as_num: the current document as a list of word ids,
        # words_idx: the id of one word occurrence in that document.
        for doc_idx, doc_as_num in enumerate(self.tokenized_docs):
            for words_idx in doc_as_num:
                tf[doc_idx, words_idx] += 1
        return tf

    def calculate_term_freq_inv_doc_freq(self) -> np.ndarray:
        """Return the TF-IDF matrix of the corpus.

        Returns:
            np.ndarray: Array with the same shape as the term-count matrix,
                holding ``tf * log(N / DF)`` for every document/term pair.
        """
        # DF: Document Frequency is the number of documents a term occurs in.
        # IDF: Number of documents (N) divided by DF. The log dampens the
        # ratio, which grows very large for rare terms in a big corpus.
        # Therefore, IDF = log(N / DF)
        tf = self.calculate_term_frequency()
        N = tf.shape[0]
        # Every vocabulary term comes from at least one document, so DF >= 1.
        document_frequency = (tf > 0).sum(axis=0)  # shape (W,)
        inverse_doc_freq = np.log(N / document_frequency)
        return tf * inverse_doc_freq

In [43]:
tfidf_vec = CustomTFIDF(data=df["text"])
tfidf = tfidf_vec.calculate_term_freq_inv_doc_freq()
print(tfidf_vec)
tfidf

CustomTFIDF(num_vocab: 14, num_doc: 2)


array([[0.        , 0.        , 0.69314718, 0.69314718, 0.        ,
        0.        , 0.69314718, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.38629436, 0.69314718, 0.69314718,
        0.69314718, 0.69314718, 0.69314718, 0.69314718]])

In [29]:
idx_2_word = tfidf_vec.convert_numbers_to_words()
idx_2_word

{0: 'thank',
 1: 'you',
 2: 'for',
 3: 'being',
 4: 'an',
 5: 'awesome',
 6: 'father',
 7: 'i',
 8: 'have',
 9: 'god',
 10: '.',
 11: 'just',
 12: 'wanna',
 13: 'say'}

In [31]:
df

Unnamed: 0,text,labels
0,Thank you for being an awesome father,a
1,I have an awesome God. I just wanna say thank you,b


In [34]:
# Map vocabulary ids back to words so the top-scoring columns can be named.
idx_2_word = tfidf_vec.convert_numbers_to_words()

# Pick a random document and show its top 5 terms (by tf-idf score).
# N is not assigned in this cell; it is the document count set earlier.
i = np.random.choice(N)
# i = 0
row = df.iloc[i]
print("Label:", row["labels"])
print("Text:", row["text"])
print("Top 5 important terms:")

# Select the tf-idf scores of the chosen document. Negating before
# argsort yields the column indices in descending order of score.
scores = tfidf[i]
indices = (-scores).argsort()

for idx in indices[:5]:
    print(f"word :{idx_2_word[idx]}, score: {round(scores[idx], 3)}")

Label: a
Text: Thank you for being an awesome father
Top 5 important terms:
word :for, score: 0.693
word :being, score: 0.693
word :father, score: 0.693
word :thank, score: 0.0
word :you, score: 0.0


In [38]:
data.head(3)

Unnamed: 0,text,labels
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet bus...",business
1,"Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAnd Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of th...",business
2,"Yukos unit buyer faces loan claim\n\nThe owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit to pay back a $900m (£479m) loan.\n\nState-owned Rosneft bought the Yugansk unit for $9.3bn in a sale forced by Russia to part settle a $27.5bn tax claim against Yukos. Yukos' owner Menatep Group says it will ask Rosneft to repay a loan that Yugansk had secured on its assets. Rosneft already faces a similar $540m repayment demand from foreign banks. Legal experts said Rosneft's purchase of Yugansk would include such obligations. ""The pledged assets are wit...",business


In [None]:
tfidf_vec = CustomTFIDF(data=data["text"])
tfidf = tfidf_vec.calculate_term_freq_inv_doc_freq()
print(tfidf_vec)
idx_2_word = tfidf_vec.convert_numbers_to_words()

In [None]:
# Pick a random document and show its top 5 terms (by tf-idf score).
N = data.shape[0]
i = np.random.choice(N)
row = data.iloc[i]

print(f"i: {i}")
print("Label:", row["labels"])
# Only the text before the first newline is printed.
print("Text:", row["text"].split("\n")[0])
print("Top 5 terms:")

# Row i of the tf-idf matrix holds the scores of the chosen document.
scores = tfidf[i]
indices = (-scores).argsort()  # Sort in descending order

for idx in indices[:5]:
    print(idx_2_word[idx])

### Verify Using SKLearn's TfidfVectorizer

In [None]:
# Vectorize the corpus with scikit-learn, reusing the spaCy-based Tokenizer.
# NOTE: unlike the custom class, this configuration removes English stop
# words and caps the vocabulary at 25,000 features.
tf_idf_vec = TfidfVectorizer(
    stop_words="english", tokenizer=Tokenizer(), max_features=25_000
)
X = data["text"]
X_tr = tf_idf_vec.fit_transform(X)

# Invert the fitted vocabulary (word -> column) into column -> word.
dict_ = tf_idf_vec.vocabulary_
idx_2_word_dict = {num: word for word, num in dict_.items()}

In [None]:
# `i` is not assigned here: the document index chosen in an earlier cell is
# reused. Uncomment the next line to pin a specific document instead.
# i = 594
row = data.iloc[i]

print(f"i: {i}")
print("Label:", row["labels"])
# Only the text before the first newline is printed.
print("Text:", row["text"].split("\n")[0])
print("Top 5 terms:")

# X_tr is sparse; convert the selected row to a flat dense array to sort it.
scores = X_tr[i].toarray().flatten()
indices = (-scores).argsort()  # Sort in descending order

for idx in indices[:5]:
    print(idx_2_word_dict[idx])

<br>

## Text Summarization

### Using TFIDF

1. Split the document into sentences.
2. Score each sentence (using the average TFIDF of the non zero scores)
3. Rank each sentence by the scores.
4. Summary is approximately the top N ranked sentences by score.

In [None]:
data.head(1)

In [None]:
class Sentencizer:
    """Callable that converts a document into a list of sentence strings.

    The spaCy pipeline is bound to the instance as ``self.nlp`` and that
    bound pipeline is the one used when the instance is called.
    """

    def __init__(self) -> None:
        # Bind the module-level spaCy pipeline once, at construction time.
        self.nlp = nlp

    def __call__(self, doc: str, *args: Any, **kwargs: Any) -> list[str]:
        """Split ``doc`` into sentences.

        Params:
            doc (str): The raw document text.
            *args, **kwargs: Accepted for call compatibility and ignored.

        Returns:
            list[str]: The text of each item in the parsed document's
                ``sents``, in order.
        """
        # Use the pipeline stored on the instance (not the global name) so
        # the attribute set in __init__ is what actually drives the split.
        parsed_doc = self.nlp(doc)
        return [str(sentence) for sentence in parsed_doc.sents]

In [None]:
data["labels"].unique()

In [None]:
# Select document
sample_data = (
    data.loc[data["labels"] == "entertainment", "text"]
    .sample(n=3, random_state=123)
    .reset_index(drop=True)
)
sample_data

In [None]:
# Split ONCE using '\n' and exclude the title
doc = sample_data.iloc[0].split("\n", 1)[1]
doc[:200]

In [None]:
sents = Sentencizer()
sentences = sents(doc=doc)
print(len(sentences))

sentences

In [None]:
# Load spaCy stopwords
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
spacy_stopwords = list(spacy_stopwords)
spacy_stopwords[:5]

In [None]:
# Vectorize the sentences
tfidf = TfidfVectorizer(stop_words=spacy_stopwords, norm="l1")
X_tr = tfidf.fit_transform(sentences)
X_tr.shape

In [None]:
def calculate_sentence_score(tfidf_row) -> float:
    """Return the average of the non-zero tf-idf values of one sentence.

    Params:
        tfidf_row: The tf-idf weights of a single sentence (one row of the
            tf-idf matrix, or a NumPy array).

    Returns:
        float: Mean of the non-zero weights, or ``0.0`` when the row has no
            non-zero weight at all.
    """
    non_zero = tfidf_row[tfidf_row != 0]  # Select the non-zero values
    # Without this guard the mean of an empty selection is NaN, which then
    # leaks into the score array used for ranking.
    if non_zero.size == 0:
        return 0.0
    return float(non_zero.mean())

In [None]:
# Initialize the score (one slot per sentence)
scores = np.zeros(len(sentences))

# Calculate the score for each sentence. X_tr was fitted on `sentences`,
# so row idx of X_tr holds the weights of sentences[idx].
for idx in range(len(sentences)):
    score = calculate_sentence_score(X_tr[idx, :])
    scores[idx] = score

In [None]:
# Sort the scores in descending order
sort_idx = np.argsort(-scores)
sort_idx

In [None]:
# # Another method for calculating the scores
# A = pd.DataFrame(X_tr.toarray())
# # Calculate the average scores for each sentence
# scores = A[A != 0].mean(axis=1).values

In [None]:
# Many options for how to choose which sentences to include:

# 1) top N sentences
# 2) top N words or characters.
# 3) top X% sentences or top X% words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

# You also don't have to sort. May make more sense in order.

# Title: the part of the document before its first newline
title = sample_data.iloc[0].split("\n", 1)[0]

print(f"Title: {title}\nGenerated summary:")
# sort_idx is ordered by descending score, so these are the 5 best sentences.
for i in sort_idx[:5]:
    print(f"{i}: {round(scores[i], 3)} {sentences[i]}")

In [None]:
# Title
sample_data.iloc[0].split("\n", 1)[0]

In [None]:
import scipy
def load_text_data(*, filepath: str, encoding: str = "utf-8") -> list[str]:
    """Read a text file and return its lines with surrounding whitespace removed.

    Params:
        filepath (str): Path of the text file to read.
        encoding (str): Text encoding used to decode the file. Passing it
            explicitly keeps the result independent of the platform's
            default encoding.

    Returns:
        list[str]: One stripped string per line of the file (blank lines
            are kept as empty strings).
    """
    with open(filepath, "r", encoding=encoding) as f:
        # Iterate the file object directly instead of materialising
        # readlines() first.
        data = [line.strip() for line in f]
    print(f"Number of lines: {len(data)}\n")
    return data

def preprocess_data(input_data: list[str]) -> tuple[str, list[str]]:
    """Merge the lines of a document and split the result into sentences.

    Params:
        input_data (list[str]): The lines of the document.

    Returns:
        data_str (str): The non-empty lines joined by single spaces.
        sentences (list[str]): The sentences found in ``data_str`` by
            ``Sentencizer``.
    """
    # Join with a space rather than "": lines such as those produced by
    # load_text_data are stripped, so gluing them together directly would
    # fuse the last word of one line to the first word of the next and
    # hide the sentence boundary between them. Empty lines are skipped so
    # they do not introduce runs of spaces.
    data_str = " ".join(line for line in input_data if line)

    # Extract and tokenize the sentences
    sents = Sentencizer()
    sentences = sents(doc=data_str)
    return (data_str, sentences)

def calculate_tfidf(
    input_data: list[str], stopwords: list[str]
) -> "scipy.sparse.csr_matrix":
    """Calculate the L1-normalised TF-IDF matrix of a list of texts.

    Params:
        input_data (list[str]): The texts to vectorize, one entry per
            document (e.g. one sentence each). It must be an iterable of
            strings; a single bare string is rejected by the vectorizer.
        stopwords (list[str]): Words to exclude from the vocabulary.

    Returns:
        X_transformed (scipy.sparse.csr_matrix): The TF-IDF matrix returned
            by ``TfidfVectorizer.fit_transform`` with one row per entry of
            ``input_data``.
    """
    # The return annotation is a string so that defining this function does
    # not depend on a private scipy module path being importable.
    tfidf = TfidfVectorizer(stop_words=stopwords, norm="l1")
    # Calculate TFIDF data
    X_transformed = tfidf.fit_transform(input_data)

    return X_transformed

def load_stop_words(add_words: Optional[list[str]] = None) -> list[str]:
    """Load spaCy's English stop words, optionally extended with extra words.

    Params:
        add_words (Optional[list[str]]): Additional stop words to append.
            ``None`` or an empty sequence adds nothing.

    Returns:
        stopwords (list[str]): The loaded stop words followed by any
            additional words.
    """
    # Import the submodule explicitly: `import spacy` alone does not promise
    # that `spacy.lang.en.stop_words` is available as an attribute.
    from spacy.lang.en.stop_words import STOP_WORDS

    stopwords = list(STOP_WORDS)
    if add_words:
        stopwords.extend(add_words)
    return stopwords

def _calculate_sentence_score(tfidf_row:np.ndarray)-> float:
    """This returns the average score of the non-zero tfidf value
    for a given sentence."""
    x = tfidf_row[tfidf_row != 0]  # Select the non-zero values
    return x.mean()

def rank_sentences(
    input_data: str, stopwords: list[str], sentences: list[str], num: int = 5
) -> None:
    """Rank the sentences by TF-IDF score and print the top ``num`` of them.

    Each sentence is scored with the average of its non-zero TF-IDF weights.
    The ``num`` best sentences are printed in their original document order.

    Params:
        input_data (str): The full document text. Kept for backward
            compatibility; the scores are computed from ``sentences``.
        stopwords (list[str]): Words to exclude when computing TF-IDF.
        sentences (list[str]): The sentences of the document.
        num (int): Number of sentences to show.

    Returns:
        None: The summary is printed.
    """
    # Vectorize the sentences themselves: row i of the matrix must describe
    # sentences[i] for the per-sentence scoring below to be meaningful.
    X_transformed = calculate_tfidf(sentences, stopwords)

    # Calculate the score for each sentence
    scores = np.array(
        [
            _calculate_sentence_score(X_transformed[idx, :])
            for idx in range(X_transformed.shape[0])
        ]
    )

    # Indices of the `num` highest scores, put back into document order so
    # the summary reads in the same sequence as the source text.
    top_idx = sorted(np.argsort(-scores)[:num])

    print(f"The summary of the document showing {num} sentences")
    for idx in top_idx:
        print(f"idx:{idx}; {sentences[idx]}")