# TF-IDF (from scratch) And Word Embeddings

In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Feature engine
from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

import spacy

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Custom imports

# Built-in library
import itertools
import re
import json
from typing import Union, Optional, Any
import logging
import warnings

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configure warnings and other settings
warnings.filterwarnings("ignore")  # suppress every warning for the rest of the session
sns.set()  # switch plots to seaborn's default theme


# Shared spaCy pipeline; Tokenizer and Sentencizer (defined later) both use it
nlp = spacy.load("en_core_web_sm")


def load_data(*, filename: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print its shape.

    Params:
        filename (str): Path of the CSV file to read (keyword-only).

    Returns:
        df (pd.DataFrame): The loaded dataframe.
    """
    df = pd.read_csv(filename)
    n_rows_n_cols = df.shape
    print(f"Shape of df: {n_rows_n_cols}\n")
    return df

In [3]:
# Load the corpus CSV; the path is relative, so it resolves against the working directory
filename = "../../data/bbc_text_cls.csv"
data = load_data(filename=filename)

# Preview the first two rows
data.head(2)

Shape of df: (2225, 2)



Unnamed: 0,text,labels
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet bus...",business
1,"Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAnd Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of th...",business


In [62]:
class Tokenizer:
    """Callable that splits a document into lowercased token strings.

    The module-level spaCy pipeline ``nlp`` is captured at construction time
    and stored on the instance as ``self.nlp``.
    """

    def __init__(self) -> None:
        self.nlp = nlp

    def __call__(self, doc: str, *args: Any, **kwargs: Any) -> list[str]:
        """Tokenize ``doc`` and return the lowercased text of every token.

        Params:
            doc (str): The raw document text.
            *args, **kwargs: Accepted so the object can be used wherever a
                tokenizer callable is expected; they are ignored.

        Returns:
            list[str]: One lowercased string per token from the pipeline.
        """
        # Run the pipeline stored on the instance rather than the global, so
        # the attribute assigned in __init__ is the one actually used.
        parsed_doc = self.nlp(doc)
        return [token.text.lower() for token in parsed_doc]

In [5]:
# Tiny two-document toy corpus used to work through the TF-IDF steps by hand
d = {
    "text": [
        "Thank you for being an awesome father",
        "I have an awesome God. I just wanna say thank you",
    ],
    "labels": ["a", "b"],
}

df = pd.DataFrame(d)

df

Unnamed: 0,text,labels
0,Thank you for being an awesome father,a
1,I have an awesome God. I just wanna say thank you,b


In [6]:
class BagOfWordsCalculator:
    """Builds a word -> id vocabulary and encodes every document with it."""

    def __init__(self) -> None:
        self.tokenizer = Tokenizer()

    def __call__(
        self, data: pd.DataFrame, *args: Any, **kwargs: Any
    ) -> tuple[list, dict]:
        """Tokenize each document and replace its words with integer ids.

        Ids are handed out in order of first appearance, starting at 0.

        Returns:
            (tokenized_docs, bag_of_words): the documents as lists of word
            ids, and the word -> id vocabulary.
        """
        vocabulary = {}
        encoded_docs = []

        for text in data:
            tokens = self.tokenizer(doc=text)
            # A new word gets the next free id, which always equals the
            # current vocabulary size; a known word keeps its existing id.
            encoded_docs.append(
                [vocabulary.setdefault(token, len(vocabulary)) for token in tokens]
            )
        return (encoded_docs, vocabulary)

In [7]:
# bow_calculator = BagOfWordsCalculator()
# tokenized_docs, bag_of_words = bow_calculator(data=data["text"])

In [8]:
# Encode the toy corpus: t_docs holds the word ids of each document,
# b_o_words is the word -> id vocabulary
b_o_words_cal = BagOfWordsCalculator()
t_docs, b_o_words = b_o_words_cal(data=df["text"])

# number of docs and number of words
N, V = df.shape[0], len(b_o_words)
# Term-frequency matrix: one row per document, one column per vocabulary word
tf = np.zeros((N, V))
tf

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [9]:
# Walk through the word ids of every document and add 1 to the matching
# (document, word) cell, so each cell ends up holding that word's count.
# NOTE: this updates `tf` in place, so re-running the cell without
# re-creating `tf` first doubles the counts.
for doc_idx, doc_as_num in enumerate(t_docs):
    for words_idx in doc_as_num:
        tf[doc_idx, words_idx] += 1

tf

array([[1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 1., 1., 0., 2., 1., 1., 1., 1., 1., 1.]])

## Putting It All Together

In [10]:
class CustomCountVectorizer:
    """This is used to count the terms in a given document.

    Every document in ``data`` is tokenized, each distinct word receives an
    integer id (in order of first appearance), and the per-document counts of
    those ids form the term-frequency matrix.
    """

    def __init__(self, data: pd.Series) -> None:
        self.data = data
        self.bag_of_words = {}
        self.tokenized_docs = []
        self.tokenizer = Tokenizer()

    def tokenize_docs_n_cal_bag_of_words(self) -> tuple[list, dict]:
        """This tokenizes the documents and calculates the bag of words.

        Returns:
            (tokenized_docs, bag_of_words): the documents as lists of word
            ids, and the word -> id vocabulary.
        """
        # Start from a clean slate on every call. Without this, a second call
        # appended the whole corpus to ``tokenized_docs`` again, and
        # calculate_term_frequency() then indexed past the last row of tf.
        self.bag_of_words = {}
        self.tokenized_docs = []

        for doc in self.data:
            # Tokenize docs
            tokenized_doc = self.tokenizer(doc=doc)
            doc_as_num = []

            for word in tokenized_doc:
                # A new word gets the next free id (== current vocabulary size)
                if word not in self.bag_of_words:
                    self.bag_of_words[word] = len(self.bag_of_words)
                # Save the word as a number
                doc_as_num.append(self.bag_of_words[word])
            # Store the tokenized docs (converted to numbers) in a list
            self.tokenized_docs.append(doc_as_num)
        return (self.tokenized_docs, self.bag_of_words)

    def calculate_term_frequency(self, *args: Any, **kwargs: Any) -> np.ndarray:
        """Count the terms in the document. i.e term frequency

        Returns:
            np.ndarray: Array of shape (number of docs, number of words) whose
            [d, w] entry is how many times word ``w`` occurs in document ``d``.
        """
        self.tokenized_docs, self.bag_of_words = self.tokenize_docs_n_cal_bag_of_words()
        # Number of docs and number of words
        N, W = len(self.tokenized_docs), len(self.bag_of_words)
        tf = np.zeros((N, W))  # Instantiate tf

        # Check each word in the doc and increment the count
        # of the word wherever it occurs
        for doc_idx, doc_as_num in enumerate(self.tokenized_docs):
            for words_idx in doc_as_num:
                tf[doc_idx, words_idx] += 1
        return tf

In [11]:
df

Unnamed: 0,text,labels
0,Thank you for being an awesome father,a
1,I have an awesome God. I just wanna say thank you,b


In [12]:
# Same term-frequency matrix as the manual steps, now produced by the class
count_vectorizer = CustomCountVectorizer(data=df["text"])
tf = count_vectorizer.calculate_term_frequency()
tf

array([[1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 1., 1., 0., 2., 1., 1., 1., 1., 1., 1.]])

In [13]:
# Compute IDF
# DF: Document Frequency is the num of documents the term occurs in.
# IDF: Number of documents (N) divided by DF. The log is taken to
# compress that ratio; a term found in every document gets log(1) = 0.
# Therefore, IDF = log(N / DF)
N = df.shape[0]
document_frequency = (tf > 0).sum(axis=0)  # shape (W,)
idf = np.log(N / document_frequency)

idf

array([0.        , 0.        , 0.69314718, 0.69314718, 0.        ,
       0.        , 0.69314718, 0.69314718, 0.69314718, 0.69314718,
       0.69314718, 0.69314718, 0.69314718, 0.69314718])

In [15]:
# Weight each term count by its IDF; idf (one value per word) is broadcast
# across the document rows of tf
tf_idf = tf * idf
tf_idf

array([[0.        , 0.        , 0.69314718, 0.69314718, 0.        ,
        0.        , 0.69314718, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.38629436, 0.69314718, 0.69314718,
        0.69314718, 0.69314718, 0.69314718, 0.69314718]])

In [16]:
class TermFrequencyInverseDocumentFrequency:
    """This is used to calculate the term frequency
    inverse document frequency of a given corpus.

    The corpus is tokenized the first time a result is requested and that
    result is reused by later calls. Call
    ``tokenize_docs_n_cal_bag_of_words()`` explicitly to force a rebuild,
    e.g. after replacing ``self.data``.
    """

    def __init__(self, data: pd.Series) -> None:
        self.data = data
        self.bag_of_words = {}
        self.tokenized_docs = []
        self.tokenizer = Tokenizer()

    def tokenize_docs_n_cal_bag_of_words(self) -> tuple[list, dict]:
        """This tokenizes the documents and calculates the bag of words.

        Returns:
            (tokenized_docs, bag_of_words): the documents as lists of word
            ids, and the word -> id vocabulary.
        """
        # Rebuild from scratch on every call. Previously each call appended
        # the corpus to ``tokenized_docs`` again, so the list no longer lined
        # up with the rows of the term-frequency matrix.
        self.bag_of_words = {}
        self.tokenized_docs = []

        for doc in self.data:
            # Tokenize docs
            tokenized_doc = self.tokenizer(doc=doc)
            doc_as_num = []

            for word in tokenized_doc:
                # A new word gets the next free id (== current vocabulary size)
                if word not in self.bag_of_words:
                    self.bag_of_words[word] = len(self.bag_of_words)
                # Save the word as a number
                doc_as_num.append(self.bag_of_words[word])
            # Store the tokenized docs (converted to numbers) in a list
            self.tokenized_docs.append(doc_as_num)
        return (self.tokenized_docs, self.bag_of_words)

    def _ensure_tokenized(self) -> None:
        """Tokenize the corpus unless there is already one entry per document."""
        if len(self.tokenized_docs) != len(self.data):
            self.tokenize_docs_n_cal_bag_of_words()

    def convert_numbers_to_words(self) -> dict:
        """This is used to map numbers to words.

        Returns:
            dict: The inverse of ``bag_of_words`` (word id -> word).
        """
        # Reuse the existing vocabulary instead of re-tokenizing the corpus
        self._ensure_tokenized()
        nums_to_words = {num: word for word, num in self.bag_of_words.items()}
        return nums_to_words

    def calculate_term_frequency(self, *args: Any, **kwargs: Any) -> np.ndarray:
        """Count the terms in the document. i.e term frequency

        Returns:
            np.ndarray: Array of shape (number of docs, number of words) whose
            [d, w] entry is how many times word ``w`` occurs in document ``d``.
        """
        self._ensure_tokenized()
        # Number of docs and number of words
        N, W = len(self.tokenized_docs), len(self.bag_of_words)
        tf = np.zeros((N, W))  # Instantiate tf

        # Check each word in the doc and increment the count
        # of the word wherever it occurs
        for doc_idx, doc_as_num in enumerate(self.tokenized_docs):
            for words_idx in doc_as_num:
                tf[doc_idx, words_idx] += 1
        return tf

    def calculate_term_freq_inv_doc_freq(self) -> np.ndarray:
        """This returns the term frequency inverse document frequency
        of a given corpus.

        Returns:
            np.ndarray: ``tf * log(N / DF)``, same shape as the term-frequency
            matrix.
        """
        # DF: Document Frequency is the num of documents the term occurs in.
        # IDF: Number of documents (N) divided by DF. The log is taken to
        # compress that ratio; a term found in every document gets log(1) = 0.
        # Therefore, IDF = log(N / DF)
        tf = self.calculate_term_frequency()
        N = tf.shape[0]
        # Every vocabulary word occurs in at least one document, so DF >= 1
        document_frequency = (tf > 0).sum(axis=0)  # shape (W,)
        inverse_doc_freq = np.log(N / document_frequency)
        tf_idf = tf * inverse_doc_freq
        return tf_idf

In [17]:
# TF-IDF of the toy corpus via the class; should match the manual tf_idf above
tfidf_vec = TermFrequencyInverseDocumentFrequency(data=df["text"])
tfidf = tfidf_vec.calculate_term_freq_inv_doc_freq()
tfidf

array([[0.        , 0.        , 0.69314718, 0.69314718, 0.        ,
        0.        , 0.69314718, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.38629436, 0.69314718, 0.69314718,
        0.69314718, 0.69314718, 0.69314718, 0.69314718]])

In [19]:
# Word id -> word lookup, used to turn column indices back into terms
idx_2_word = tfidf_vec.convert_numbers_to_words()

# Inspect one document and show its top 5 terms (in terms of tf_idf score).
# Document 0 is fixed here; the commented line would pick one at random.
# i = np.random.choice(N)
i = 0
row = df.iloc[i]
print("Label:", row["labels"])
print("Text:", row["text"])
print("Top 5 terms:")

scores = tfidf[i]
# Negating the scores makes argsort (ascending) yield a descending ranking
indices = (-scores).argsort()  # Sort in descending order

for idx in indices[:5]:
    print(idx_2_word[idx])

Label: a
Text: Thank you for being an awesome father
Top 5 terms:
for
being
father
thank
you


In [20]:
data.head(3)

Unnamed: 0,text,labels
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet bus...",business
1,"Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAnd Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of th...",business
2,"Yukos unit buyer faces loan claim\n\nThe owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit to pay back a $900m (£479m) loan.\n\nState-owned Rosneft bought the Yugansk unit for $9.3bn in a sale forced by Russia to part settle a $27.5bn tax claim against Yukos. Yukos' owner Menatep Group says it will ask Rosneft to repay a loan that Yugansk had secured on its assets. Rosneft already faces a similar $540m repayment demand from foreign banks. Legal experts said Rosneft's purchase of Yugansk would include such obligations. ""The pledged assets are wit...",business


In [21]:
# Repeat on the full dataset loaded from the CSV; this rebinds tfidf_vec,
# tfidf and idx_2_word, which previously held the toy-corpus results
tfidf_vec = TermFrequencyInverseDocumentFrequency(data=data["text"])
tfidf = tfidf_vec.calculate_term_freq_inv_doc_freq()
idx_2_word = tfidf_vec.convert_numbers_to_words()

In [89]:
# Pick a random document, show the top 5 terms (in terms of tf_idf score)
# NOTE(review): no random seed is set in the cells shown, so `i` (and the
# output below) changes from run to run.
N = data.shape[0]
i = np.random.choice(N)
row = data.iloc[i]

print(f"i: {i}")
print("Label:", row["labels"])
# Only the first line of the text is printed
print("Text:", row["text"].split("\n")[0])
print("Top 5 terms:")

scores = tfidf[i]
# Negating the scores makes argsort (ascending) yield a descending ranking
indices = (-scores).argsort()  # Sort in descending order

for idx in indices[:5]:
    print(idx_2_word[idx])

i: 144
Label: business
Text: Winn-Dixie files for bankruptcy
Top 5 terms:
dixie
winn
bankruptcy
stores
foods


### Verify Using SKLearn's TfidfVectorizer

In [85]:
# Fit scikit-learn's TfidfVectorizer on the same corpus so its top terms can
# be compared with the from-scratch implementation. The next cell reads
# `X_tr` and `idx_2_word_dict`, so this code must run on a fresh kernel.
tf_idf_vec = TfidfVectorizer(
    stop_words="english", tokenizer=Tokenizer(), max_features=25_000
)
X = data["text"]
X_tr = tf_idf_vec.fit_transform(X)

# vocabulary_ maps term -> column index; invert it to look terms up by column
dict_ = tf_idf_vec.vocabulary_
idx_2_word_dict = {num: word for word, num in dict_.items()}

In [90]:
# Reuses `i` from the random pick above so both implementations are compared
# on the same document. Requires `X_tr` and `idx_2_word_dict` from the
# TfidfVectorizer code in the previous cell.
# i = 594
row = data.iloc[i]

print(f"i: {i}")
print("Label:", row["labels"])
print("Text:", row["text"].split("\n")[0])
print("Top 5 terms:")

# Densify the sparse row of document i into a flat 1-D score vector
scores = X_tr[i].toarray().flatten()
indices = (-scores).argsort()  # Sort in descending order

for idx in indices[:5]:
    print(idx_2_word_dict[idx])

i: 144
Label: business
Text: Winn-Dixie files for bankruptcy
Top 5 terms:
winn
dixie
bankruptcy
stores
foods


<br>

## Text Summarization

### Using TFIDF

1. Split the document into sentences.
2. Score each sentence (using the average of its non-zero TF-IDF values).
3. Rank the sentences by score.
4. The summary is the top N sentences by score.

In [91]:
data.head(1)

Unnamed: 0,text,labels
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet bus...",business


In [163]:
class Sentencizer:
    """This is used to convert a document into a list of sentences.
    It returns sentences.

    The module-level spaCy pipeline ``nlp`` is captured at construction time
    and stored on the instance as ``self.nlp``.
    """

    def __init__(self) -> None:
        self.nlp = nlp

    def __call__(self, doc: str, *args: Any, **kwargs: Any) -> list[str]:
        """Split ``doc`` into sentences.

        Params:
            doc (str): The raw document text.
            *args, **kwargs: Accepted for call-signature flexibility; ignored.

        Returns:
            list[str]: The string form of each sentence span, in document
            order, with whitespace left exactly as the pipeline returns it.
        """
        # Run the pipeline stored on the instance rather than the global, so
        # the attribute assigned in __init__ is the one actually used.
        parsed_doc = self.nlp(doc)
        return [str(sentence) for sentence in parsed_doc.sents]

In [164]:
data["labels"].unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [165]:
# Select documents: draw 3 entertainment articles (fixed random_state, so the
# draw is repeatable) and renumber them 0..2
sample_data = (
    data.loc[data["labels"] == "entertainment", "text"]
    .sample(n=3, random_state=123)
    .reset_index(drop=True)
)
sample_data

0    Goodrem wins top female MTV prize\n\nPop singer Delta Goodrem has scooped one of the top individual prizes at the first Australian MTV Music Awards.\n\nThe 21-year-old singer won the award for best female artist, with Australian Idol runner-up Shannon Noll taking the title of best male at the ceremony. Goodrem, known in both Britain and Australia for her role as Nina Tucker in TV soap Neighbours, also performed a duet with boyfriend Brian McFadden. Other winners included Green Day, voted best group, and the Black Eyed Peas. Goodrem, Green Day and the Black Eyed Peas took home two awards ea...
1    Tough schedule delays Elliot show\n\nPreview performances of the £3m musical Billy Elliot have been delayed to give the child actors a less arduous rehearsal schedule.\n\nDirector Stephen Daldry made the decision to re-schedule the previews to protect the young stars. Three boys will rotate the demanding role of ballet dancer Billy, which requires them to sing, dance and act. The show's 

In [166]:
# Split ONCE using '\n' and exclude the title: element [0] is the first line
# (the title), element [1] is everything after it
doc = sample_data.iloc[0].split("\n", 1)[1]
# Peek at the first 200 characters of the body
doc[:200]

'\nPop singer Delta Goodrem has scooped one of the top individual prizes at the first Australian MTV Music Awards.\n\nThe 21-year-old singer won the award for best female artist, with Australian Idol runn'

In [167]:
# Break the article body into sentences and report how many were found
sents = Sentencizer()
sentences = sents(doc=doc)
print(len(sentences))

sentences

11


['\nPop singer Delta Goodrem has scooped one of the top individual prizes at the first Australian MTV Music Awards.\n\n',
 'The 21-year-old singer won the award for best female artist, with Australian Idol runner-up Shannon Noll taking the title of best male at the ceremony.',
 'Goodrem, known in both Britain and Australia for her role as Nina Tucker in TV soap Neighbours, also performed a duet with boyfriend Brian McFadden.',
 'Other winners included Green Day, voted best group, and the Black Eyed Peas.',
 'Goodrem, Green Day and the Black Eyed Peas took home two awards each.',
 'As well as best female, Goodrem also took home the Pepsi Viewers Choice Award, whilst Green Day bagged the prize for best rock video for American Idiot.',
 "The Black Eyed Peas won awards for best R 'n' B video and sexiest video, both for Hey Mama.",
 'Local singer and songwriter Missy Higgins took the title of breakthrough artist of the year, with Australian Idol winner Guy Sebastian taking the honours for b

In [168]:
# Load spaCy stopwords
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
# Converted to a list before being passed to TfidfVectorizer below.
# NOTE(review): STOP_WORDS is presumably a set, in which case the order of
# this list (and the 5 samples shown) is not stable between runs - confirm.
spacy_stopwords = list(spacy_stopwords)
spacy_stopwords[:5]

['who', 'none', 'keep', 'name', 'thereby']

In [240]:
# Vectorize the sentences: each sentence is treated as one "document".
# NOTE: this rebinds `tfidf` (a score array in earlier cells) to a vectorizer
# and `X_tr` to the sentence-level matrix.
tfidf = TfidfVectorizer(stop_words=spacy_stopwords, norm="l1")
X_tr = tfidf.fit_transform(sentences)
X_tr.shape

(11, 102)

In [241]:
def calculate_sentence_score(tfidf_row):
    """This returns the average value of the non-zero tfidf value
    for a given sentence.

    Params:
        tfidf_row: One row of TF-IDF weights, either a sparse row (any object
            exposing ``toarray()``) or a dense array-like.

    Returns:
        float: Mean of the non-zero weights, or 0.0 when the row has none
        (instead of the NaN that averaging an empty selection produces).
    """
    # Densify sparse input so both kinds of row go through the same path
    if hasattr(tfidf_row, "toarray"):
        tfidf_row = tfidf_row.toarray()
    values = np.asarray(tfidf_row, dtype=float).ravel()
    nonzero = values[values != 0]
    # A sentence with no scored terms would otherwise yield NaN
    if nonzero.size == 0:
        return 0.0
    return float(nonzero.mean())


# Initialize the score: one slot per sentence, filled in by the next cell
scores = np.zeros(len(sentences))
scores

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [242]:
# Calculate the score for each sentence from its row of the TF-IDF matrix
# (row i of X_tr corresponds to sentences[i])
for i in range(len(sentences)):
    score = calculate_sentence_score(X_tr[i, :])
    scores[i] = score

In [243]:
# Sort the scores in descending order: argsort on the negated scores gives
# sentence indices from highest to lowest score
sort_idx = np.argsort(-scores)
sort_idx

array([ 8,  4,  9,  3,  6,  0,  2,  1,  5, 10,  7])

In [244]:
# # Another method for calculating the scores
# A = X_tr.toarray()

# B = pd.DataFrame(A)
# scores = B[B != 0].mean(axis=1).values

In [246]:
# There are many options for choosing which sentences to include:

# 1) top N sentences
# 2) top N words or characters
# 3) top X% of sentences or top X% of words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

# You also don't have to sort: printing the selected sentences in their
# original document order may read better.

# Title: the first line of the article
title = sample_data.iloc[0].split("\n", 1)[0]

print(f"Title: {title}\nGenerated summary:")
# Option 1 from the list above with N = 5; each line shows the sentence
# index, its score rounded to 3 decimals, and the sentence text
for i in sort_idx[:5]:
    print(f"{i}: {round(scores[i], 3)} {sentences[i]}")

Title: Goodrem wins top female MTV prize
Generated summary:
8: 0.125 The VH1 First Music Award went to Cher honouring her achievements within the music industry.
4: 0.111 Goodrem, Green Day and the Black Eyed Peas took home two awards each.
9: 0.1 The ceremony was held at the Luna Park fairground in Sydney Harbour and was hosted by the Osbourne family.
3: 0.1 Other winners included Green Day, voted best group, and the Black Eyed Peas.
6: 0.1 The Black Eyed Peas won awards for best R 'n' B video and sexiest video, both for Hey Mama.


In [162]:
# Title
sample_data.iloc[0].split("\n", 1)[0]

'Goodrem wins top female MTV prize'