
<font face='georgia'>
    <h4><strong>Build a TFIDF Vectorizer & compare its results with Sklearn:</strong></h4>

In [123]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix
import math
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer


### Custom ```fit``` method 

In [3]:
def fit(dataset):
    """Build the vocabulary: a mapping from each unique word to its column index.

    Every document is tokenized on runs of whitespace with ``str.split()`` --
    the same rule the idf and transform steps of this notebook use -- so a
    word followed by a tab or newline is not stored as a separate, unmatched
    vocabulary entry. Tokens shorter than two characters are discarded.
    The remaining words are sorted alphabetically and numbered from 0.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element.

    Returns
    -------
    dict or None
        ``{word: index}`` in alphabetical order, or ``None`` (after printing
        a message) when ``dataset`` is not a list.
    """
    # check if its list type or not
    if not isinstance(dataset, (list,)):
        print("you need to pass list of strings")
        return None

    # a set comprehension collects each qualifying word exactly once
    unique_words = {
        word
        for row in dataset      # for each review in the dataset
        for word in row.split() # for each whitespace-separated word in the review
        if len(word) >= 2       # skip single-character tokens
    }
    return {word: index for index, word in enumerate(sorted(unique_words))}

In [161]:
# Toy corpus used to check the custom implementation against sklearn.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

# Learn the vocabulary and show its words in index order.
vocab = fit(corpus)
print(list(vocab))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


**Applying sklearn's TfidfVectorizer**

In [124]:
# Fit sklearn's TfidfVectorizer on the toy corpus, then vectorize that same corpus.
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(corpus).transform(corpus)

In [125]:
# sklearn feature names; they are sorted in alphabetical order by default.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # tolist() turns the returned array into a plain list of Python strings
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


**Observation**: The vocabulary produced by the custom `fit` method is the same as the feature names produced by sklearn's `TfidfVectorizer` after fitting.

### Calculating custom ```idf```

In [162]:
def idf_(dataset, vocab):
    """Compute the smoothed inverse document frequency of every vocabulary word.

    For a word ``t`` the value is ``1 + ln((1 + N) / (1 + df(t)))`` where ``N``
    is the number of documents and ``df(t)`` is the number of documents whose
    whitespace-separated tokens include ``t``.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element.
    vocab : dict
        Vocabulary whose keys are the words to score.

    Returns
    -------
    numpy.ndarray
        One idf value per vocabulary word, in the iteration order of ``vocab``.
    """
    n_docs = len(dataset)

    # Count document frequencies in a single pass over the corpus instead of
    # re-splitting every document once per vocabulary word. Using a set per
    # document ensures a word is counted at most once per document.
    doc_freq = Counter()
    for row in dataset:
        doc_freq.update(set(row.split()))

    # Counter returns 0 for words that occur in no document
    return np.array([
        1 + math.log((1 + n_docs) / float(1 + doc_freq[word]))
        for word in vocab.keys()
    ])

# idf value of each vocabulary word for the toy corpus, in vocabulary order
print(idf_(corpus, vocab))

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [128]:
# Print the idf values learned by the sklearn TfidfVectorizer during fit.
# After fitting on the corpus the vocabulary has 9 words, each with its own idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


**Observation**: The `idf` values from the custom method and the `idf` values from sklearn's `TfidfVectorizer` are the same.

### Custom ```transform``` method

In [150]:
def transform(dataset, vocab):
    """Convert a corpus into an L2-normalised tf-idf sparse matrix.

    * idf(t)   = 1 + ln((1 + N) / (1 + df(t))), with N documents and df(t) the
      number of documents containing the word t.
    * tf(t, d) = (occurrences of t in d) / (number of tokens in d).
    * Each document row of tf * idf is then scaled to unit L2 norm.

    Documents are tokenized on whitespace with ``str.split()``. A document
    with no tokens yields an all-zero row instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element.
    vocab : dict
        Vocabulary whose keys are the words; columns of the result follow the
        iteration order of these keys.

    Returns
    -------
    scipy.sparse.csr_matrix or None
        Matrix of shape (number of documents, number of vocabulary words), or
        ``None`` (after printing a message) when ``dataset`` is not a list.
    """
    # check if its list type or not
    if not isinstance(dataset, (list,)):
        print("you need to pass list of strings")
        return None

    unique_words = list(vocab.keys())
    n_docs = len(dataset)

    # Tokenize every document exactly once; everything below reuses these lists.
    tokenized = [row.split() for row in dataset]

    # calculating idf: document frequency counts each word once per document
    doc_freq = Counter()
    for tokens in tokenized:
        doc_freq.update(set(tokens))
    idf_val = {
        word: 1 + math.log((1 + n_docs) / float(1 + doc_freq[word]))
        for word in unique_words
    }

    # per-document word counts, used for the term frequencies
    term_counts = [Counter(tokens) for tokens in tokenized]

    # calculating tfidf values: one list per word holding its score in every document
    tf_idf_values = []
    for word in tqdm(unique_words):
        word_scores = []
        for tokens, counts in zip(tokenized, term_counts):
            # guard against empty documents, which have no tokens to divide by
            tf = counts[word] / len(tokens) if tokens else 0.0
            word_scores.append(tf * idf_val[word])
        tf_idf_values.append(word_scores)

    # transpose to (documents x words), normalize each row and convert to sparse
    return csr_matrix(normalize(np.array(tf_idf_values).T, norm = 'l2'))

In [144]:
# Run the custom transform once and reuse the result: every call recomputes
# the whole matrix and prints its own progress bar.
custom_output = transform(corpus, vocab)

# shape of custom tfidf vectorizer output after applying transform method
print(custom_output.shape)
# sparse matrix output of first sentence
print(custom_output[0])
# dense matrix output of first sentence
print(custom_output[0].toarray())

100%|██████████████████████████████████████████| 9/9 [00:00<00:00, 9004.95it/s]


(4, 9)


100%|██████████████████████████████████████████| 9/9 [00:00<00:00, 8989.93it/s]


  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


100%|██████████████████████████████████████████| 9/9 [00:00<00:00, 8992.08it/s]


[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [146]:
# shape of the sklearn TfidfVectorizer output after applying the transform method
print(skl_output.shape)

(4, 9)


In [147]:
# sparse matrix output of the sklearn TfidfVectorizer for the first sentence
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [148]:
# dense matrix output of the sklearn TfidfVectorizer for the first sentence
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


**Observation**: The final output of the custom `transform` method and the output of sklearn's tf-idf `transform` method are the same. The entries of the sparse output are listed in a different order, but the values are identical.

 
<font face='georgia'>
    <h4><strong>Build a custom TFIDF Vectorizer & compare its results with Sklearn with max feature functionality:</strong></h4>

Loading the file

In [156]:
import pickle

# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open 'cleaned_strings' when it comes from a trusted source.
with open('cleaned_strings', 'rb') as f:
    strings = pickle.load(f)

# fit() and transform() only accept a list of strings; fail here with a clear
# message rather than later with an obscure error on a None vocabulary.
if not isinstance(strings, list):
    raise TypeError(
        "expected 'cleaned_strings' to contain a list of strings, got "
        + type(strings).__name__
    )

# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(strings))

Number of documents in corpus =  746


### Custom ```fit``` method 

In [158]:
def fit(dataset):
    """Build the vocabulary: a mapping from each unique word to its column index.

    Every document is tokenized on runs of whitespace with ``str.split()`` --
    the same rule the idf and transform steps of this notebook use -- so a
    word followed by a tab or newline is not stored as a separate, unmatched
    vocabulary entry. Tokens shorter than two characters are discarded.
    The remaining words are sorted alphabetically and numbered from 0.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element.

    Returns
    -------
    dict or None
        ``{word: index}`` in alphabetical order, or ``None`` (after printing
        a message) when ``dataset`` is not a list.
    """
    # check if its list type or not
    if not isinstance(dataset, (list,)):
        print("you need to pass list of strings")
        return None

    # a set comprehension collects each qualifying word exactly once
    unique_words = {
        word
        for row in dataset      # for each review in the dataset
        for word in row.split() # for each whitespace-separated word in the review
        if len(word) >= 2       # skip single-character tokens
    }
    return {word: index for index, word in enumerate(sorted(unique_words))}
        
# vocabulary of the loaded corpus: word -> index, words in alphabetical order
vocabulary = fit(strings)

### Calculating custom ```idf``` with max features functionality
This function returns a dictionary whose keys are words and whose values are their idf scores, keeping only the `max_f` words with the highest idf (50 in the call below). Any other value of `max_f` can be passed to keep a different number of features.

In [164]:
def idf_(dataset, vocab, max_f):
    """Return the ``max_f`` vocabulary words with the highest idf scores.

    For a word ``t`` the score is ``1 + ln((1 + N) / (1 + df(t)))`` where ``N``
    is the number of documents and ``df(t)`` is the number of documents whose
    whitespace-separated tokens include ``t``.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element.
    vocab : dict
        Vocabulary whose keys are the candidate words.
    max_f : int
        Number of top-scoring words to keep.

    Returns
    -------
    dict
        ``{word: idf}`` ordered from highest to lowest idf; words with equal
        idf keep their relative order from ``vocab``.
    """
    n_docs = len(dataset)

    # Count document frequencies in a single pass; a set per document ensures
    # a word is counted at most once per document.
    doc_freq = Counter()
    for row in dataset:
        doc_freq.update(set(row.split()))

    # idf calculation for every word in the vocabulary
    idf_val = Counter({
        word: 1 + math.log((1 + n_docs) / float(1 + doc_freq[word]))
        for word in vocab.keys()
    })

    # Select the top scores once, after all words are scored, rather than
    # re-sorting and truncating the dictionary after every single word.
    return dict(idf_val.most_common(max_f))

# display the 50 words of the loaded corpus with the highest idf scores
idf_(strings, vocabulary, 50)

{'aailiyah': 6.922918004572872,
 'abandoned': 6.922918004572872,
 'abroad': 6.922918004572872,
 'abstruse': 6.922918004572872,
 'academy': 6.922918004572872,
 'accents': 6.922918004572872,
 'accessible': 6.922918004572872,
 'acclaimed': 6.922918004572872,
 'accolades': 6.922918004572872,
 'accurate': 6.922918004572872,
 'accurately': 6.922918004572872,
 'achille': 6.922918004572872,
 'ackerman': 6.922918004572872,
 'actions': 6.922918004572872,
 'adams': 6.922918004572872,
 'add': 6.922918004572872,
 'added': 6.922918004572872,
 'admins': 6.922918004572872,
 'admiration': 6.922918004572872,
 'admitted': 6.922918004572872,
 'adrift': 6.922918004572872,
 'adventure': 6.922918004572872,
 'aesthetically': 6.922918004572872,
 'affected': 6.922918004572872,
 'affleck': 6.922918004572872,
 'afternoon': 6.922918004572872,
 'aged': 6.922918004572872,
 'ages': 6.922918004572872,
 'agree': 6.922918004572872,
 'agreed': 6.922918004572872,
 'aimless': 6.922918004572872,
 'aired': 6.922918004572872,

### Custom ```transform``` method

In [165]:
def transform(dataset, vocab, max_f):
    """Convert a corpus into an L2-normalised tf-idf matrix over the top idf words.

    * idf(t)   = 1 + ln((1 + N) / (1 + df(t))), with N documents and df(t) the
      number of documents containing the word t.
    * Only the ``max_f`` words with the highest idf are kept as features.
    * tf(t, d) = (occurrences of t in d) / (number of tokens in d).
    * Each document row of tf * idf is then scaled to unit L2 norm.

    Documents are tokenized on whitespace with ``str.split()``. A document
    with no tokens yields an all-zero row instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    dataset : list of str
        The corpus, one document per element.
    vocab : dict
        Vocabulary whose keys are the candidate words.
    max_f : int
        Number of top-idf words to keep as features.

    Returns
    -------
    scipy.sparse.csr_matrix or None
        Matrix with one row per document and one column per selected word;
        columns are in reverse alphabetical order of the selected words.
        ``None`` (after printing a message) when ``dataset`` is not a list.
    """
    # check if its list type or not
    if not isinstance(dataset, (list,)):
        print("you need to pass list of strings")
        return None

    n_docs = len(dataset)

    # Tokenize every document exactly once; everything below reuses these lists.
    tokenized = [row.split() for row in dataset]

    # calculating idf: document frequency counts each word once per document
    doc_freq = Counter()
    for tokens in tokenized:
        doc_freq.update(set(tokens))
    idf_all = Counter({
        word: 1 + math.log((1 + n_docs) / float(1 + doc_freq[word]))
        for word in vocab.keys()
    })

    # dictionary with top idf values as values and words as keys
    idf_val = dict(idf_all.most_common(max_f))
    # modified list of words with top idf values; this reverse-sorted order
    # defines the column order of the returned matrix
    mod_words = sorted(idf_val, reverse=True)

    # per-document word counts, used for the term frequencies
    term_counts = [Counter(tokens) for tokens in tokenized]

    # calculating tfidf: one list per selected word holding its score in every document
    tfidf_values = []
    for word in tqdm(mod_words):
        word_scores = []
        for tokens, counts in zip(tokenized, term_counts):
            # guard against empty documents, which have no tokens to divide by
            tf = counts[word] / len(tokens) if tokens else 0.0
            word_scores.append(tf * idf_val[word])
        tfidf_values.append(word_scores)

    # transpose to (documents x words), normalize each row and convert to sparse
    return csr_matrix(normalize(np.array(tfidf_values).T, norm = 'l2'))

In [166]:
# shape of the custom tfidf sparse matrix: (documents, selected features)
print(transform(strings, vocabulary, 50).shape)

100%|████████████████████████████████████████| 50/50 [00:00<00:00, 3562.83it/s]


(746, 50)


746 rows and 50 columns as we have selected top 50 features according to idf values.

In [168]:
# sparse tfidf output for the first string (this call recomputes the whole matrix)
print(transform(strings, vocabulary, 50)[0])

100%|██████████████████████████████████████████████████| 50/50 [00:00<?, ?it/s]


  (0, 19)	1.0


In [169]:
# shape of the sparse tfidf output for the first string (this call recomputes the whole matrix)
print(transform(strings, vocabulary, 50)[0].shape)

100%|████████████████████████████████████████| 50/50 [00:00<00:00, 2482.22it/s]


(1, 50)


1 row and 50 columns.

In [171]:
# dense tfidf output for the first string (this call recomputes the whole matrix)
print(transform(strings, vocabulary, 50)[0].toarray())

100%|████████████████████████████████████████| 50/50 [00:00<00:00, 3072.48it/s]


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
