# In this tutorial, we will be building a TF-IDF model from scratch

In [3]:
# Load libraries
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [4]:
# Tokenizer data needed by word_tokenize.
# Older NLTK versions load the pickled 'punkt' models, while newer releases
# look up the 'punkt_tab' resource instead; fetch both so the notebook runs
# on a fresh environment with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\debnathk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
# Read the BBC news CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../data/bbc_text_cls.csv')
df.head(n=5)

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\r\n\r\nQuart...,business
1,Dollar gains on Greenspan speech\r\n\r\nThe do...,business
2,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business
3,High fuel prices hit BA's profits\r\n\r\nBriti...,business
4,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business


In [6]:
# Build the vocabulary (token -> integer id) and, in the same pass,
# encode every document as the list of ids of its lower-cased tokens.

word2idx = {}
tokenized_docs = []
for text in df['text']:
    tokens = word_tokenize(text.lower())
    # setdefault hands out the next free id the first time a token is seen
    # (len(word2idx) is evaluated before the insertion) and otherwise
    # returns the id already assigned.
    encoded = [word2idx.setdefault(token, len(word2idx)) for token in tokens]
    tokenized_docs.append(encoded)

# next unused id == number of distinct tokens seen
idx = len(word2idx)

In [7]:
# Invert the vocabulary: integer id -> token
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [8]:
# Corpus size: one entry of the 'text' column per document
N = df['text'].shape[0]
N

2225

In [9]:
# Vocabulary size: number of distinct tokens that were assigned an id in word2idx
V = len(word2idx)
V

34762

In [10]:
# Allocate the dense (documents x vocabulary) count matrix, all zeros to start
tf = np.zeros(shape=(N, V), dtype=np.float64)
tf.shape

(2225, 34762)

In [11]:
# Fill in the counts: tf[d, t] is how many times token id t occurs in document d
for doc_row, token_ids in enumerate(tokenized_docs):
    ids = np.asarray(token_ids, dtype=int)
    # np.add.at accumulates unbuffered, so an id that appears several times
    # in the same document is incremented once per occurrence.
    # tf[doc_row] is a view, so the update lands in tf itself.
    np.add.at(tf[doc_row], ids, 1)

In [12]:
# Display the term-frequency matrix (NumPy abbreviates large arrays with "...")
tf

array([[1., 4., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [13]:
# Inverse document frequency per term:
#   idf[t] = log(N / number of documents in which term t occurs at least once)
document_freq = (tf > 0).sum(axis=0)
idf = np.log(np.divide(N, document_freq))

In [14]:
# Weight every raw count by its term's IDF; idf (one value per term)
# broadcasts across the document rows of tf.
tf_idf = np.multiply(tf, idf)
tf_idf

array([[5.22260554, 9.5575688 , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 7.70751219, 7.70751219,
        7.70751219]])

In [15]:
# Seed NumPy's global RNG so the np.random.choice draw below is repeatable
# when the cells are run in order from this point.
np.random.seed(123)

In [22]:
# Pick a random document, show the top 5 terms (in terms of tf_idf score)
# Note: each run of this cell advances NumPy's global RNG, so re-running it
# draws a different document; re-run the seed cell first to repeat a draw.
i = np.random.choice(N)
row = df.iloc[i]
print("Label: ", row['labels'])
# The headline is the first line of the text. Lines in this dataset end with
# "\r\n", so splitting on "\n" leaves a trailing carriage return: strip it.
print("Text: ", row['text'].split("\n", 1)[0].rstrip("\r"))
print("Top 5 terms: ")

scores = tf_idf[i]
# Negate so that the ascending argsort yields column ids by descending score
indices = (-scores).argsort()

for j in indices[:5]:
    print(idx2word[j])


Label:  sport
Text:  Wenger shock at Newcastle dip
Top 5 terms: 
wenger
fulham
newcastle
henry
matters
