In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Sample corpus: Amazon product reviews.
# Source: https://www.amazon.in/Girnar-Detox-Green-Desi-Kahwa/dp/B092W15FNN/ref=sr_1_11?crid=77VGUM2XZZ66&keywords=food%2Breview&qid=1663130729&sprefix=food%2Breview%2Caps%2C108&sr=8-11&th=1
# Each list element is treated as one document by the vectorizers below.
review_corpus = [
    'this tea is tasty and healthy',
    'good and tasty tea worth of money',
    'great tea and tasty',
    'true medicinal tea',
]

In [9]:
# Echo the corpus so its contents are visible in the notebook output.
review_corpus

['this tea is tasty and healthy',
 'good and tasty tea worth of money',
 'great tea and tasty',
 'true medicinal tea']

### SkLearn Implementation

In [75]:
# Reference implementation: learn the vocabulary and IDF weights from the corpus,
# then encode that same corpus as a TF-IDF matrix.
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(review_corpus).transform(review_corpus)

In [76]:
# sklearn feature names; they are sorted in alphabetical order by default.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old method otherwise.
# .tolist() turns the returned NumPy array into a plain list of str, so the printed
# output looks the same as before.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())

['and', 'good', 'great', 'healthy', 'is', 'medicinal', 'money', 'of', 'tasty', 'tea', 'this', 'true', 'worth']


In [77]:
# Print the IDF values held by the sklearn TfidfVectorizer after calling fit().
# Fitting on the reviews gives every vocabulary term its own IDF value.
print(vectorizer.idf_)

[1.22314355 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.91629073 1.22314355 1.         1.91629073 1.91629073
 1.91629073]


In [81]:
# Rows    - total number of documents in the corpus (4)
# Columns - number of distinct vocabulary terms (13)
print(skl_output.shape)

(4, 13)


In [79]:
# sklearn TF-IDF values for the first document of the corpus above.
# The output is a sparse matrix, so only the non-zero entries are listed,
# each as "(row, column)  value".
print(skl_output[0])

  (0, 10)	0.4946411955372537
  (0, 9)	0.258124295708245
  (0, 8)	0.3157230677330621
  (0, 4)	0.4946411955372537
  (0, 3)	0.4946411955372537
  (0, 0)	0.3157230677330621


In [80]:
# sklearn TF-IDF values for the first document of the corpus above.
# To make the output easier to read, the sparse row is converted to a dense array before printing.
# Note that this output is L2-normalized; sklearn does this by default.
print(skl_output[0].toarray())

[[0.31572307 0.         0.         0.4946412  0.4946412  0.
  0.         0.         0.31572307 0.2581243  0.4946412  0.
  0.        ]]


In [63]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

<h3><strong>Fit method:</strong></h3>
This function finds all the unique words in the data and assigns a dimension number to each of them.

Create a Python dictionary to store all the unique words, such that each key is a unique word and the corresponding value is its dimension number. The words are sorted in ascending (alphabetical) order before numbering, so the dimension numbers follow that order.

For example, if you have the review __'how are you'__, you can represent each unique word with a dimension number as:
dict = { 'are' : 0, 'how' : 1, 'you' : 2}
This is the same vocabulary a bag-of-words model would build.

In [15]:
from tqdm import tqdm # tqdm is a library that helps us to visualize the runtime of for loop. refer this to know more about tqdm
#https://tqdm.github.io/

# Accepts only a list of sentences (strings).
def fit(dataset):
    """Build a vocabulary that maps each unique word to a column index.

    Every document is split on whitespace, words shorter than two characters
    are dropped, and the remaining unique words are sorted in ascending order
    before being numbered 0, 1, 2, ...

    Tokenisation uses ``str.split()`` with no argument, which is what
    ``transform`` uses when it counts words. Splitting on a single space
    instead would turn input such as ``"tea\\tgood"`` into one fused
    vocabulary entry that ``transform`` could never match.

    Parameters
    ----------
    dataset : list of str
        The documents (sentences) to scan.

    Returns
    -------
    dict or None
        ``{word: index}``. If ``dataset`` is not a list, a message is
        printed and ``None`` is returned.
    """
    # Guard clause: only a list of sentences is supported.
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None

    unique_words = set()
    for row in dataset:  # for each review in the dataset
        for word in row.split():  # split on any run of whitespace
            if len(word) < 2:  # ignore one-character tokens
                continue
            unique_words.add(word)

    # Sorting first makes the word -> index assignment deterministic.
    return {word: index for index, word in enumerate(sorted(unique_words))}

<h3><strong>Transform method:</strong></h3>
The first part of the transform step takes the dataset and the vocabulary as input and returns a sparse matrix of word counts, the same as a bag-of-words representation.

In [16]:
def transform(dataset, vocab):
    """Encode documents as a sparse matrix of word counts.

    Entry ``(i, j)`` of the result is the number of times the vocabulary word
    with index ``j`` occurs in document ``i``. Words shorter than two
    characters and words missing from ``vocab`` are ignored.

    Parameters
    ----------
    dataset : list of str
        Documents to encode; each one is split on whitespace.
    vocab : dict
        ``{word: column_index}`` mapping, as produced by ``fit``.

    Returns
    -------
    scipy.sparse.csr_matrix or None
        Matrix of shape ``(len(dataset), len(vocab))``. If ``dataset`` is not
        a list, a message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, (list,)):
        print("you need to pass list of strings")
        return None

    # Coordinate-format triplets: document index, word index, occurrence count.
    doc_ids, term_ids, counts = [], [], []

    for doc_id, document in enumerate(tqdm(dataset)):
        # Counter yields one (word, frequency) pair per distinct word.
        for token, count in Counter(document.split()).items():
            if len(token) < 2:
                continue
            # -1 is the sentinel for "word not in the vocabulary".
            term_id = vocab.get(token, -1)
            if term_id == -1:
                continue
            doc_ids.append(doc_id)
            term_ids.append(term_id)
            counts.append(count)

    matrix_shape = (len(dataset), len(vocab))
    return csr_matrix((counts, (doc_ids, term_ids)), shape=matrix_shape)

<h3><strong>Calculate TF-IDF:</strong></h3>
<ul>
  <li>The formula below is the one given in the scikit-learn documentation:</li>
</ul>
<br>
$IDF(t) = 1+\log_{e}\frac{1 + \text{Total number of documents in collection}}{1 + \text{Number of documents with term t in it}}$

In [49]:
def calculate_tfidf(matrix):
    """Compute TF, IDF and TF-IDF from a dense document-term count matrix.

    The dimensions are taken from ``matrix`` itself. (The previous version
    read them from a global named ``dense_matrix``, so it only worked when
    that global existed and had the same shape as the argument.)

    For document ``i`` and term ``j``::

        tf[i][j]     = matrix[i][j] / (number of non-zero entries in row i)
        idf[i][j]    = 1 + ln((1 + n_docs) / (1 + n_docs_containing_term_j))
        tf_idf[i][j] = tf[i][j] * idf[i][j]

    Parameters
    ----------
    matrix : 2-D array-like of counts
        Rows are documents, columns are vocabulary terms.

    Returns
    -------
    tuple
        ``(sparse_tf_idf, idf, tf, tf_idf)`` where ``sparse_tf_idf`` is a
        ``csr_matrix`` holding the non-zero TF-IDF values and the other three
        are lists of lists of shape ``(n_docs, n_terms)``. Every row of
        ``idf`` is the same per-term vector.
    """
    matrix = np.asarray(matrix)
    total_rows, total_cols = matrix.shape
    no_of_doc = total_rows

    # IDF depends only on the column, so compute it once rather than once per row.
    idf_per_term = [
        1 + math.log((1 + no_of_doc) / (1 + np.count_nonzero(matrix[:, j])))
        for j in range(total_cols)
    ]
    idf = [list(idf_per_term) for _ in range(total_rows)]
    tf = [[0 for _ in range(total_cols)] for _ in range(total_rows)]
    tf_idf = [[0 for _ in range(total_cols)] for _ in range(total_rows)]

    rows1 = []
    columns1 = []
    values1 = []

    for i in range(total_rows):
        # NOTE(review): this counts the distinct terms present in the row, not
        # the total number of words. It only rescales the row, so it has no
        # effect once the result is L2-normalised -- confirm this is intended.
        terms_in_row = np.count_nonzero(matrix[i])
        if terms_in_row == 0:
            # All-zero row: leave tf and tf_idf at 0 rather than computing 0/0,
            # which would yield NaN and a RuntimeWarning.
            continue
        for j in range(total_cols):
            tf[i][j] = matrix[i][j] / terms_in_row
            score = tf[i][j] * idf[i][j]
            if math.isnan(score):
                score = 0
            tf_idf[i][j] = score
            if score != 0:
                values1.append(score)
                rows1.append(i)
                columns1.append(j)

    return csr_matrix((values1, (rows1, columns1)), shape=(total_rows, total_cols)), idf, tf, tf_idf

In [87]:
# Run the custom implementation end to end on the review corpus.
vocab = fit(review_corpus)                        # word -> column index
saprse_matrix = transform(review_corpus, vocab)   # sparse word-count matrix
dense_matrix = saprse_matrix.toarray()            # dense copy for element-wise access
tf_idf_s, idf_d, tf_d, tf_idf_d = calculate_tfidf(dense_matrix)

# L2-normalize every row (axis=1) of the sparse TF-IDF matrix.
norm_val = normalize(tf_idf_s, norm='l2', axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]


<h3><strong>Comparison:</strong></h3>

In [141]:
# Vocabulary of the custom implementation (top) versus sklearn's feature names (bottom).
print(list(vocab.keys()))
print("*"*110)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old method otherwise.
# .tolist() keeps the printed output a plain list of strings.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())

['and', 'good', 'great', 'healthy', 'is', 'medicinal', 'money', 'of', 'tasty', 'tea', 'this', 'true', 'worth']
**************************************************************************************************************
['and', 'good', 'great', 'healthy', 'is', 'medicinal', 'money', 'of', 'tasty', 'tea', 'this', 'true', 'worth']


In [142]:
# IDF values of the custom implementation (top) versus sklearn (bottom).
# The sklearn values only look rounded: they are a NumPy array, which is printed with
# 8 decimal places by default, while the custom values are a plain Python list of floats
# printed at full precision.
print(idf_d[0])
print("*"*120)
print(vectorizer.idf_)

[1.2231435513142097, 1.916290731874155, 1.916290731874155, 1.916290731874155, 1.916290731874155, 1.916290731874155, 1.916290731874155, 1.916290731874155, 1.2231435513142097, 1.0, 1.916290731874155, 1.916290731874155, 1.916290731874155]
************************************************************************************************************************
[1.22314355 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073
 1.91629073 1.91629073 1.22314355 1.         1.91629073 1.91629073
 1.91629073]


In [138]:
# Shape of the custom count matrix (top) versus the sklearn TF-IDF matrix (bottom):
# both are (number of documents, vocabulary size).
print(dense_matrix.shape)
print("*"*50)
print(skl_output.shape)

(4, 13)
**************************************************
(4, 13)


In [137]:
# Normalized TF-IDF of the first document: custom implementation (top) versus sklearn (bottom).
# The two sparse matrices list their stored entries in a different order, but each
# (row, column) position holds the same value up to floating-point rounding, so the
# ordering difference does not matter.
print(norm_val[0])
print("*"*50)
print(skl_output[0])

  (0, 0)	0.3157230677330621
  (0, 3)	0.4946411955372538
  (0, 4)	0.4946411955372538
  (0, 8)	0.3157230677330621
  (0, 9)	0.2581242957082451
  (0, 10)	0.4946411955372538
**************************************************
  (0, 10)	0.4946411955372537
  (0, 9)	0.258124295708245
  (0, 8)	0.3157230677330621
  (0, 4)	0.4946411955372537
  (0, 3)	0.4946411955372537
  (0, 0)	0.3157230677330621


In [132]:
# Same comparison for the first document, with both sparse rows converted to dense
# arrays so the values line up column by column.
print((norm_val[0].toarray()))
print("*"*60)
print((skl_output[0].toarray()))

[[0.31572307 0.         0.         0.4946412  0.4946412  0.
  0.         0.         0.31572307 0.2581243  0.4946412  0.
  0.        ]]
************************************************************
[[0.31572307 0.         0.         0.4946412  0.4946412  0.
  0.         0.         0.31572307 0.2581243  0.4946412  0.
  0.        ]]
