# How to Build TF-IDF From Scratch

In [1]:
import io
import typing as t

import nltk
import numpy as np
import pandas as pd
import requests
from nltk.tokenize import word_tokenize
from scipy.sparse import csr_matrix, spmatrix

In [2]:
# Probe the tokenizer once so missing NLTK data is fetched up front.
try:
    word_tokenize('hello world')
except LookupError:
    # NLTK signals a missing resource with LookupError. Catching only that
    # keeps unrelated failures (and Ctrl-C) from silently triggering a download.
    # NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
    # 'punkt' - confirm against the installed version.
    nltk.download('punkt')

In [3]:
# Download the CSV dataset and load it into a DataFrame.
# A timeout stops the cell from hanging forever on a stalled connection.
request = requests.get(
    'https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv',
    timeout=30,
)
# Fail loudly on HTTP errors instead of parsing an error page as CSV.
request.raise_for_status()
df = pd.read_csv(io.BytesIO(request.content))

In [4]:
# Preview the first rows of the downloaded dataset.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
def train_test_split(
    *arrays: t.Union[pd.Series, np.ndarray, spmatrix],
    seed: int = 123,
    train_pct: float = 0.75
) -> list:
    """
    Shuffle each array and split it into a train part and a test part.

    Every array is shuffled with a generator freshly seeded from ``seed``, so
    arrays with the same number of rows receive the same permutation and
    therefore stay row-aligned with each other.
    :param arrays: pandas objects, NumPy arrays, SciPy sparse matrices or
        plain sequences, split along their first axis.
    :param seed: Seed for the random permutation.
    :param train_pct: Fraction of the rows that goes into the train part.
    :return: A flat list ``[a_train, a_test, b_train, b_test, ...]`` in the
        order the arrays were given.
    """
    def _length(array):
        # len() raises TypeError on SciPy sparse matrices, so prefer shape[0]
        # whenever a non-empty shape tuple is available.
        shape = getattr(array, 'shape', None)
        return shape[0] if shape else len(array)

    def _getitems(array, indices):
        if hasattr(array, 'iloc'):
            # The permutation holds row positions, not index labels, so
            # pandas objects must be indexed positionally.
            return array.iloc[indices]
        if hasattr(array, 'shape'):
            return array[indices]
        return [array[i] for i in indices]

    ret = []
    for array in arrays:
        n_rows = _length(array)
        # Re-seed per array on purpose: equal-length arrays get the same shuffle.
        perm = np.random.default_rng(seed).permutation(n_rows)
        train_count = round(train_pct * n_rows)
        ret.append(_getitems(array, perm[:train_count]))
        ret.append(_getitems(array, perm[train_count:]))
    return ret

In [6]:
# Use the raw text column as features and the label column as targets.
inputs = df['text']
labels = df['labels']
# Both series go through one call with one seed; the result is unpacked as
# (train, test) for the inputs followed by (train, test) for the labels.
inputs_train, inputs_test, Ytrain, Ytest = train_test_split(inputs, labels, seed=12345)
# Display the sizes of the two partitions.
len(inputs_train), len(inputs_test)

(1669, 556)

In [7]:
class CountVectorizer:
    """
    Bag-of-words vectorizer that maps each document to a row of token counts.
    """

    def __init__(
        self,
        ignore_case: bool = True,
        tokenizer: t.Optional[t.Callable[[str], t.Iterable[str]]] = None,
    ):
        """
        Initialize the count vectorizer.
        :param ignore_case: Lowercase all documents before processing.
        :param tokenizer: Callable that splits one document into tokens.
            When omitted, NLTK's word_tokenize is used.
        """
        # token -> column index, and the inverse lookup.
        self.mapping: t.Dict[str, int] = {}
        self.reverse: t.Dict[int, str] = {}
        self.ignore_case = ignore_case
        self.tokenizer = tokenizer

    def _tokenize(self, document: str) -> t.Iterable[str]:
        """
        Normalize case (if configured) and split one document into tokens.
        :param document: The raw document.
        :return: The document's tokens.
        """
        if self.ignore_case:
            document = document.lower()
        # Resolve the default lazily so a custom tokenizer never touches NLTK.
        tokenize = word_tokenize if self.tokenizer is None else self.tokenizer
        return tokenize(document)

    @staticmethod
    def _materialize(documents: t.Iterable[str]) -> t.Iterable[str]:
        """
        Return a sized collection so it can be measured and iterated again.
        :param documents: Any iterable of documents, including generators.
        :return: The input itself if it has a length, otherwise a list of it.
        """
        return documents if hasattr(documents, '__len__') else list(documents)

    def fit(self, documents: t.Iterable[str]) -> 'CountVectorizer':
        """
        Fit the vectorizer to the given documents.
        :param documents: The raw documents to fit against.
        :return: self
        """
        self.mapping.clear()
        self.reverse.clear()
        for document in documents:
            for token in self._tokenize(document):
                if token not in self.mapping:
                    # Indices are handed out in order of first appearance.
                    index = len(self.mapping)
                    self.mapping[token] = index
                    self.reverse[index] = token
        return self

    def transform(self, documents: t.Iterable[str]) -> np.ndarray:
        """
        Vectorize the given documents. Tokens that were not seen during
        fit() are ignored.
        :param documents: The raw documents to convert into vectors.
        :return: A NumPy matrix of shape (len(documents), len(self.mapping)).
        """
        # The row count is needed up front, so one-shot iterables are listed.
        documents = self._materialize(documents)
        mat = np.zeros((len(documents), len(self.mapping)), np.int32)
        for row, document in enumerate(documents):
            for token in self._tokenize(document):
                column = self.mapping.get(token)
                if column is not None:
                    mat[row, column] += 1
        return mat

    def fit_transform(self, documents: t.Iterable[str]) -> np.ndarray:
        """
        Fit against and vectorize the given documents. Equivalent to calling
        fit() and then transform().
        :param documents: The raw documents to use.
        :return: A NumPy matrix of shape (len(documents), len(self.mapping)).
        """
        # The documents are read twice; a generator would be empty on the
        # second pass unless it is materialized first.
        documents = self._materialize(documents)
        return self.fit(documents).transform(documents)

In [8]:
# Build the vocabulary from every document and count tokens per document.
vectorizer = CountVectorizer()
count_arr = vectorizer.fit_transform(inputs)
# Display the resulting count matrix (one row per document).
count_arr

array([[1, 4, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1]], dtype=int32)

In [9]:
# Fix the seed so the same document is picked on every run.
np.random.seed(54321)
idx = np.random.choice(len(inputs))
text = df.iloc[idx]
# Show the label, the first line of the text, and the five tokens with the
# highest counts as (column index, token, count).
(
    text['labels'],
    text['text'].partition('\n')[0],
    # Negating the row makes argsort list the largest counts first.
    [(x, vectorizer.reverse[x], count_arr[idx][x]) for x in (-count_arr[idx]).argsort()[:5]],
)

('entertainment',
 "Police praise 'courageous' Ozzy",
 [(31, '.', 35),
  (23, 'the', 26),
  (63, 'a', 23),
  (27, ',', 20),
  (46, 'and', 18)])

In [10]:
class TFIDFWrapper(CountVectorizer):
    """
    Count vectorizer whose counts are weighted by inverse document frequency.

    The IDF weights are learned from the documents given to fit() or
    fit_transform() and are reused by transform(). New documents are therefore
    weighted with the fitted corpus statistics instead of their own.
    """

    # IDF weight per vocabulary column; None until the vectorizer is fitted.
    idf: t.Optional[np.ndarray] = None

    @staticmethod
    def _as_sized(documents: t.Iterable[str]) -> t.Iterable[str]:
        """
        Return a sized collection so the documents can be read more than once.
        :param documents: Any iterable of documents, including generators.
        :return: The input itself if it has a length, otherwise a list of it.
        """
        return documents if hasattr(documents, '__len__') else list(documents)

    @staticmethod
    def _compute_idf(tf: np.ndarray) -> np.ndarray:
        """
        Compute log(N / document frequency) for every vocabulary column.
        :param tf: Count matrix of the fit corpus, one row per document.
        :return: A vector with one IDF weight per column of tf.
        """
        n_documents = tf.shape[0]
        # Every vocabulary token occurs in at least one fit document, so the
        # document frequency is never zero here.
        doc_freq = np.sum(tf > 0, axis=0)
        return np.log(n_documents / doc_freq)

    def fit(self, documents: t.Iterable[str]) -> 'TFIDFWrapper':
        """
        Learn the vocabulary and the IDF weights from the given documents.
        :param documents: The raw documents to fit against.
        :return: self
        """
        self.fit_transform(documents)
        return self

    def transform(self, documents: t.Iterable[str]) -> np.ndarray:
        """
        Convert documents into TF-IDF vectors using the fitted IDF weights.
        :param documents: The raw documents to convert into vectors.
        :return: A NumPy matrix of shape (len(documents), len(self.mapping)).
        :raises ValueError: If the vectorizer has not been fitted yet.
        """
        if self.idf is None:
            raise ValueError('TFIDFWrapper must be fitted before calling transform().')
        # Counts and IDF are both non-negative, so the product needs no
        # further shifting to be non-negative.
        return super().transform(self._as_sized(documents)) * self.idf

    def fit_transform(self, documents: t.Iterable[str]) -> np.ndarray:
        """
        Fit against the given documents and return their TF-IDF vectors.
        :param documents: The raw documents to use.
        :return: A NumPy matrix of shape (len(documents), len(self.mapping)).
        """
        documents = self._as_sized(documents)
        super().fit(documents)
        # Count once and reuse the matrix for both the IDF and the result.
        tf = super().transform(documents)
        self.idf = self._compute_idf(tf)
        return tf * self.idf

In [11]:
# Fit the TF-IDF vectorizer on every document and weight the counts.
tfidf = TFIDFWrapper()
tfidf_arr = tfidf.fit_transform(inputs)
# Display the resulting TF-IDF matrix (one row per document).
tfidf_arr

array([[5.22260554, 9.5575688 , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 7.70751219, 7.70751219,
        7.70751219]])

In [12]:
# Same seed as the count inspection above, so the same document is picked.
np.random.seed(54321)
idx = np.random.choice(len(inputs))
text = df.iloc[idx]
# Show the label, the first line of the text, and the five tokens with the
# highest TF-IDF weight as (column index, token, weight).
(
    text['labels'],
    text['text'].partition('\n')[0],
    # Negating the row makes argsort list the largest weights first.
    [(x, tfidf.reverse[x], tfidf_arr[idx][x]) for x in (-tfidf_arr[idx]).argsort()[:5]],
)

('entertainment',
 "Police praise 'courageous' Ozzy",
 [(16949, 'ozzy', 46.26229934152561),
  (15888, 'osbourne', 22.512282611682018),
  (12214, 'sharon', 21.61970840642518),
  (9493, 'police', 21.24278352096904),
  (16951, 'burglar', 19.826699717796693)])