# TF-IDF (from scratch) And Word Embeddings

In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Feature engine
from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

import spacy

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Custom imports

# Built-in library
import itertools
import re
import json
from typing import Union, Optional, Any
import logging
import warnings

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configure warnings and other settings
warnings.filterwarnings("ignore")  # silence every warning for the rest of the notebook
sns.set()  # apply seaborn's default plot styling


# Load the small English spaCy pipeline once at module level; the Tokenize
# class defined further down reads this `nlp` object.
nlp = spacy.load("en_core_web_sm")


def load_data(*, filename: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print its shape.

    Params:
        filename (str): Path of the CSV file to read (keyword-only).

    Returns:
        df (pd.DataFrame): The contents of the CSV file.
    """
    df = pd.read_csv(filepath_or_buffer=filename)
    # Report (rows, columns) so the reader can sanity-check the load.
    print(f"Shape of df: {df.shape}\n")
    return df

In [3]:
# Relative path to the CSV file (resolved against the current working directory)
filename = "../../data/bbc_text_cls.csv"
data = load_data(filename=filename)

# Preview the first two rows of the loaded frame
data.head(2)

Shape of df: (2225, 2)



Unnamed: 0,text,labels
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet bus...",business
1,"Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAnd Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of th...",business


In [4]:
class Tokenize:
    """Callable that turns a document into a list of lowercased token strings.

    Params:
        model (Optional[Any]): Any callable that accepts a string and returns
            an iterable of objects exposing a `.text` attribute. When omitted,
            the notebook-level `nlp` pipeline is used, which keeps
            `Tokenize()` working exactly as before.
    """

    def __init__(self, model: Optional[Any] = None) -> None:
        # `nlp` is only looked up when no model is injected, so the class can
        # be used (and tested) without the global pipeline being loaded.
        self.nlp = nlp if model is None else model

    def __call__(self, doc: str, *args: Any, **kwargs: Any) -> list[str]:
        """Tokenize a single document.

        Params:
            doc (str): The raw text to tokenize.

        Returns:
            tokenized_doc (list[str]): Lowercased text of every token, in order.
        """
        # Use the pipeline stored on the instance rather than the global one,
        # and avoid rebinding the `doc` argument to a different type.
        parsed_doc = self.nlp(doc)
        tokenized_doc = [token.text.lower() for token in parsed_doc]
        return tokenized_doc

In [5]:
# Toy two-document corpus with one label per document, used for the small
# worked examples in the following cells
d = {
    "text": [
        "Thank you for being an awesome father",
        "I have an awesome God. I just wanna say thank you",
    ],
    "label": ["a", "b"],
}

# Wrap the toy corpus in a DataFrame with `text` and `label` columns
df = pd.DataFrame(d)

df

Unnamed: 0,text,label
0,Thank you for being an awesome father,a
1,I have an awesome God. I just wanna say thank you,b


In [6]:
class BagOfWordsCalculator:
    """Tokenizes every document and builds the bag of words (token -> index)."""

    def __init__(self) -> None:
        self.tokenizer = Tokenize()

    def __call__(
        self, data: pd.DataFrame, *args: Any, **kwargs: Any
    ) -> tuple[list, dict]:
        """Encode the `text` column as integer ids and collect the vocabulary.

        Params:
            data (pd.DataFrame): Frame whose `text` column holds the documents.

        Returns:
            (tuple[list, dict]): The documents as lists of integer ids, and the
            mapping from each unique token to its id. Ids are handed out in
            order of first appearance, starting at 0.
        """
        vocabulary = {}
        encoded_docs = []

        for text in data["text"]:
            tokens = self.tokenizer(doc=text)
            # A token seen for the first time receives the next free id, which
            # equals the current vocabulary size; known tokens keep their id.
            encoded_docs.append(
                [vocabulary.setdefault(token, len(vocabulary)) for token in tokens]
            )
        return (encoded_docs, vocabulary)

In [7]:
# NOTE(review): the run on the full dataset (`data`, 2225 rows) is left disabled;
# presumably because tokenizing it is slow — confirm before re-enabling.
# bow_calculator = BagOfWordsCalculator()
# tokenized_docs, bag_of_words = bow_calculator(data=data)

In [8]:
# Encode the toy corpus: `t_docs` holds each document as a list of integer ids,
# `b_o_words` maps every unique token to its id
b_o_words_cal = BagOfWordsCalculator()
t_docs, b_o_words = b_o_words_cal(data=df)

# number of docs and number of words
N, V = df.shape[0], len(b_o_words)
# Zero-initialised matrix with one row per document and one column per token id
tf = np.zeros((N, V))
tf

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [9]:
# Start from a fresh zero matrix so that re-running this cell does not add
# the counts on top of those from a previous run
tf = np.zeros((N, V))

# Check for each word in the doc and increment the count
# of the word wherever it occurs
for doc_idx, doc_as_num in enumerate(t_docs):
    for words_idx in doc_as_num:
        tf[doc_idx, words_idx] += 1

tf

array([[1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 1., 1., 0., 2., 1., 1., 1., 1., 1., 1.]])

## Putting It All Together

In [10]:
class CustomCountVectorizer:
    """Counts how often each term occurs in each document of a corpus.

    Params:
        data (pd.Series): The documents, one string per entry.
    """

    def __init__(self, data: pd.Series) -> None:
        self.data = data
        self.bag_of_words = {}
        self.tokenized_docs = []
        self.tokenizer = Tokenize()

    def tokenize_docs_n_cal_bag_of_words(self) -> tuple[list, dict]:
        """Tokenize the documents and calculate the bag of words.

        The result is rebuilt from scratch on every call, so calling this
        method (or `calculate_term_frequency`) more than once does not
        accumulate duplicate documents.

        Returns:
            (tuple[list, dict]): The documents as lists of integer ids, and the
            mapping from each unique token to its id (assigned in order of
            first appearance, starting at 0).
        """
        bag_of_words = {}
        tokenized_docs = []

        for doc in self.data:
            # Tokenize docs
            tokenized_doc = self.tokenizer(doc=doc)
            doc_as_num = []

            for word in tokenized_doc:
                # Store the unique words as numbers in the dict; the next free
                # id always equals the current vocabulary size
                if word not in bag_of_words:
                    bag_of_words[word] = len(bag_of_words)
                # Save the word as a number
                doc_as_num.append(bag_of_words[word])
            # Store the tokenized docs (converted to numbers) in a list
            tokenized_docs.append(doc_as_num)

        # Replace (rather than extend) the stored state
        self.tokenized_docs = tokenized_docs
        self.bag_of_words = bag_of_words
        return (self.tokenized_docs, self.bag_of_words)

    def calculate_term_frequency(self, *args: Any, **kwargs: Any) -> np.ndarray:
        """Count the terms in each document, i.e. the term frequency.

        Returns:
            tf (np.ndarray): Float array of shape (number of documents,
            vocabulary size); entry [i, j] is how many times the token with
            id j occurs in document i.
        """
        tokenized_docs, bag_of_words = self.tokenize_docs_n_cal_bag_of_words()
        N, V = len(self.data), len(bag_of_words)
        tf = np.zeros((N, V))

        # Check for each word in the doc and increment the count
        # of the word wherever it occurs
        for doc_idx, doc_as_num in enumerate(tokenized_docs):
            for words_idx in doc_as_num:
                tf[doc_idx, words_idx] += 1
        return tf

In [11]:
# Display the toy corpus again; it is the input to the vectorizer in the next cell
df

Unnamed: 0,text,label
0,Thank you for being an awesome father,a
1,I have an awesome God. I just wanna say thank you,b


In [12]:
# Run the all-in-one vectorizer on the toy corpus' `text` column
count_vectorizer = CustomCountVectorizer(data=df["text"])
# NOTE: this rebinds `tf`, replacing the matrix built step by step in the earlier cells
tf = count_vectorizer.calculate_term_frequency()
tf

array([[1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 1., 1., 0., 2., 1., 1., 1., 1., 1., 1.]])