<a href="https://colab.research.google.com/github/fahmi54321/nlp_tfidf/blob/main/tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Dataset reference link kept from the original cell:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# Fetch the CSV that the cells below load; -nc (no-clobber) skips the download
# when bbc_text_cls.csv is already present, so re-running this cell is cheap.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

--2024-05-24 13:31:49--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5085081 (4.8M) [text/csv]
Saving to: ‘bbc_text_cls.csv’


2024-05-24 13:31:50 (41.2 MB/s) - ‘bbc_text_cls.csv’ saved [5085081/5085081]



In [2]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [3]:
# word_tokenize relies on the Punkt sentence-tokenizer data.
# Older NLTK releases load it from the 'punkt' package, while recent releases
# (3.9+) look for 'punkt_tab' and raise LookupError if only 'punkt' is present.
# Fetch both so the notebook runs on a fresh kernel with either version;
# each call is a no-op when the package is already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Load the downloaded CSV into a DataFrame; later cells read its
# 'text' column (document bodies) and 'labels' column (category per document).
df = pd.read_csv('bbc_text_cls.csv')

In [5]:
# Preview the first rows as a quick sanity check on the load.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [6]:
# build the vocabulary (token -> integer id) and, in the same pass,
# encode every document as the list of its token ids

word2idx = {}
tokenized_docs = []
for text in df['text']:
  # the default passed to setdefault is evaluated before the lookup, so an
  # unseen token is assigned the current vocabulary size, i.e. the next unused id
  token_ids = [
    word2idx.setdefault(token, len(word2idx))
    for token in word_tokenize(text.lower())
  ]

  # save for later
  tokenized_docs.append(token_ids)

# ids were handed out consecutively from 0, so the next free id equals the vocabulary size
idx = len(word2idx)

In [18]:
# reverse mapping: integer id -> token
# if you do it smarter you can store it as a list
idx2word = {v: k for k, v in word2idx.items()}

# Show only a small preview: printing the whole dict would dump one entry per
# vocabulary word into the cell output and bloat the saved notebook.
# Ids are consecutive integers starting at 0, so the first few can be looked up directly.
print({i: idx2word[i] for i in range(min(10, len(idx2word)))})



In [19]:
# number of documents = number of rows in the frame (one document per row)
N = df.shape[0]
print(N)

2225


In [20]:
# number of words: vocabulary size, i.e. how many distinct tokens received an id in word2idx.
# Used below as the column count of the term-frequency matrix.
V = len(word2idx)
print(V)

34762


In [21]:
# instantiate the term-frequency matrix: N rows by V columns, all zeros
# (filled with per-document token counts in the next step)
# note: could have also used count vectorizer
tf = np.zeros(shape=(N, V))
print(tf)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [22]:
# populate term-frequency counts: tf[i, j] = number of times token id j occurs in document i

# Reset the matrix in place first. The loop below accumulates with +=, so without
# this, re-running the cell (without re-running the allocation cell) would add
# every count a second time and silently corrupt TF, IDF and TF-IDF downstream.
tf.fill(0)

for i, doc_as_int in enumerate(tokenized_docs):
  for j in doc_as_int:
    tf[i,j] += 1

print(tf)

[[1. 4. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]]


In [23]:
# compute IDF
# document frequency: for each token id, how many documents have a non-zero count (shape = (V,))
document_freq = (tf > 0).sum(axis=0)

# inverse document frequency: log of (number of documents / document frequency)
idf = np.log(N / document_freq)

print(idf)

[5.22260554 2.3893922  2.86332511 ... 7.70751219 7.70751219 7.70751219]


In [24]:
# compute TF-IDF
# scale every column of tf by that token's idf; the explicit new axis spells out
# the row-wise broadcast of the per-token idf vector across all documents
tf_idf = tf * idf[np.newaxis, :]
print(tf_idf)

[[5.22260554 9.5575688  2.86332511 ... 0.         0.         0.        ]
 [0.         0.         2.86332511 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 7.70751219 7.70751219 7.70751219]]


In [25]:
# Seed NumPy's global RNG, which the np.random.choice call in the sampling cell draws from.
# NOTE(review): each run of the sampling cell advances the RNG, so re-running it
# without re-running this cell picks a different document every time.
np.random.seed(123)

In [29]:
# pick a random document, show the top 5 terms (in terms of tf_idf score)
i = np.random.choice(N)
row = df.iloc[i]

# only the text before the first newline is shown
headline = row['text'].partition("\n")[0]
print('Label: ',row['labels'])
print('Text: ',headline)
print("top 5 terms:")

# negate the scores so that an ascending argsort yields highest-scoring ids first
scores = tf_idf[i]
indices = np.argsort(-scores)

print("\n".join(idx2word[j] for j in indices[:5]))

Label:  tech
Text:  IBM puts cash behind Linux push
top 5 terms:
linux
ibm
workplace
software
programs
