<a href="https://colab.research.google.com/github/d212digital/TF-IDF-Recommender-Engine/blob/master/TFIDF_From_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Dataset here: https://www.kaggle.com/datasets/shivamkushwaha/bbc-full-text-document-classification


In [1]:
# On Google Colab, upload bbc_text_cls.csv through the browser file picker.
# Outside Colab the google.colab package does not exist, so the import is
# guarded: the cell then does nothing and the CSV is expected to already sit
# in the working directory, where the later pd.read_csv call looks for it.

try:
  from google.colab import files
except ImportError:
  uploaded = {}  # not running on Colab: nothing was uploaded
else:
  uploaded = files.upload()

Saving bbc_text_cls.csv to bbc_text_cls.csv


In [2]:
# our imports are minimal because we will build TF-IDF from scratch using basic components

import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [3]:
# Download the tokenizer data that word_tokenize needs.
# Recent NLTK releases (3.9+) look up the 'punkt_tab' resource, while older
# releases use 'punkt'; fetching both keeps the notebook working on either.
# nltk.download reports a missing package without raising, so the extra
# request is harmless on an older install.

nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Read the BBC articles CSV from the working directory into a DataFrame.

csv_path = 'bbc_text_cls.csv'
df = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to see which columns are available.

df.head(n=5)

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [6]:
# Build the vocabulary and encode every document as a list of integer ids.
# word2idx maps each distinct lower-cased token to the column it will occupy
# in the TF-IDF matrix; tokenized_docs keeps the id sequence of each document
# so the counts can be filled in later without tokenizing again.

word2idx = {}
tokenized_docs = []
for doc in df['text']:
  token_ids = []
  for token in word_tokenize(doc.lower()):
    # setdefault hands out the next free id only the first time a token is
    # seen; on later sightings it simply returns the id already stored
    token_ids.append(word2idx.setdefault(token, len(word2idx)))
  tokenized_docs.append(token_ids)

idx = len(word2idx)  # next unused id, i.e. the number of ids handed out
    

In [7]:
# Invert the vocabulary so an id (matrix column) can be turned back into its word.
# Ids are consecutive integers starting at 0, so a plain list would work as well.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))


In [8]:
# number of documents = number of rows in the text column
N = df['text'].shape[0]

In [9]:
# vocabulary size: the number of distinct tokens that were given an id in word2idx
V = len(word2idx)

In [10]:
# Allocate the term-frequency matrix: one row per document, one column per
# vocabulary entry, every count starting at zero.
# note: you could also use a count vectorizer here
tf = np.zeros(shape=(N, V), dtype=float)

In [11]:
# populate term frequency counts
# Each row is assigned rather than incremented, so re-running this cell can
# never double-count. np.bincount tallies how often every id occurs in the
# document in a single vectorised call; minlength pads the result to the full
# vocabulary width, and the explicit integer dtype keeps an empty document
# (no tokens) from being read as a float array.
for i, doc_as_int in enumerate(tokenized_docs):
  tf[i] = np.bincount(np.asarray(doc_as_int, dtype=np.intp), minlength=tf.shape[1])

In [12]:
# Compute IDF; the document frequencies can be read straight off the TF matrix.
# A term's document frequency is the number of rows in which its count is positive.
document_freq = (tf > 0).sum(axis=0) # shape = (V,)
idf = np.log(np.divide(N, document_freq))

In [13]:
# compute TF-IDF: scale every column of counts by that term's IDF weight
# (idf is broadcast across the rows of tf)
tf_idf = np.multiply(tf, idf)

In [14]:
# Seed NumPy's global random generator so the np.random.choice draw is repeatable.
SEED = 123
np.random.seed(SEED)

In [18]:
# Draw one document at random and list its five highest-scoring TF-IDF terms.
doc_id = np.random.choice(N)
sample = df.iloc[doc_id]
print("Label:", sample['labels'])
# only the first line of the article is shown
print("Text:", sample['text'].partition("\n")[0])
print("Top 5 terms:")

# sorting the negated scores ascending puts the largest scores first
ranked = np.argsort(-tf_idf[doc_id])

for term_id in ranked[:5]:
  print(idx2word[term_id])


Label: tech
Text: IBM puts cash behind Linux push
Top 5 terms:
linux
ibm
workplace
software
programs
