### Corpus

In [0]:
## SkLearn# Collection of string documents

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [0]:
# sklearn feature names, they are sorted in alphabetic order by default.

print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [0]:
# Here we will print the sklearn tfidf vectorizer idf values after applying the fit method
# After using the fit function on the corpus the vocab has 9 words in it, and each has its idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [0]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [0]:
# sklearn tfidf values for first line of the above corpus.
# Here the output is a sparse matrix

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [0]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [0]:
# Write your code here.
# Make sure its well documented and readble with appropriate comments.
# Compare your results with the above sklearn tfidf vectorizer
# You are not supposed to use any other library apart from the ones given below

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
# from sklearn.preprocessing import normalize
from sklearn.preprocessing import Normalizer
import numpy

#fit function will return unique words
def fit(dataset):    
    unique_words = set() # at first we will initialize an empty set
    # check if its list type or not
    if isinstance(dataset, (list,)):
        for row in dataset: # for each review in the dataset
            for word in row.split(" "): # for each word in the review. #split method converts a string into list of words
                if len(word) < 2:
                    continue
                unique_words.add(word)
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)}
        return vocab
    else:
        print("you need to pass list of sentance")


def transform(dataset,vocab):
    rows = []
    columns = []
    values = []
    if isinstance(dataset, (list,)):
        for idx, row in enumerate(tqdm(dataset)): # for each document in the dataset
            # it will return a dict type object where key is the word and values is its frequency, {word:frequency}
            word_freq = dict(Counter(row.split()))
            # for every unique word in the document
            for word, freq in word_freq.items():  # for each unique word in the review.                
                if len(word) < 2:
                    continue
                # we will check if its there in the vocabulary that we build in fit() function
                # dict.get() function will return the values, if the key doesn't exits it will return -1
                col_index = vocab.get(word, -1) # retreving the dimension number of a word
                # if the word exists
                if col_index !=-1:
                    # we are storing the index of the document
                    rows.append(idx)
                    # we are storing the dimensions of the word
                    columns.append(col_index)
                    # we are storing the frequency of the word
                    values.append(freq)
        #print(rows)
        #print(columns)
        #print(values)
        return csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab)))
    else:
        print("you need to pass list of strings")


# L2 normalization using sklearn Normalizer
def get_normlized_data(X) :
  transformer_l2 = Normalizer().fit(X)
  return transformer_l2.transform(X);

def get_tf_velues( sparse_data_l2_norm ) :
  tf_values= []
  
  for i , row_  in enumerate(sparse_data_l2_norm) :
    total_words = sum(row_)
    for j , col_  in enumerate(row_) :
      if col_ != 0 :
        tf_values.append([(i,j), sparse_data_l2_norm[i][j] / total_words ])

  return tf_values;

def get_tf_dense_metrix( sparse_data_l2_m ) :
  tf_dense_m = []
  for i , row_  in enumerate(sparse_data_l2_m) :
    tf_values_row = []
    total_words = sum(row_)
    for j , col_  in enumerate(row_) :
        tf_values_row.append( ( col_ / total_words ) )

    tf_dense_m.append( tf_values_row )
    
  return tf_dense_m;

def get_tf_idf_dense_metrix(tf_dense, idf_val):
  idf_dense_m = []
  
  for i, row_ele in enumerate(tf_dense):
    each_row_d = []
    for j, col_e in enumerate(row_ele):
      cell_val = ( col_e * idf_val.get(j) );
      each_row_d.append( cell_val )
    idf_dense_m.append( each_row_d )
  return idf_dense_m;

def get_idf_score( sparse_data ) :
  idf_score = {}
  for  i in range(0,len(sparse_data[0]) ):
    word_in_doc = 0
    for j in range(0,len(sparse_data) ) :
      if sparse_data[j][i] != 0 :
        word_in_doc = word_in_doc + 1 ;
    idf_score[i] =  1 + math.log( (1 + len(sparse_data) ) / (word_in_doc +1) ) 
  return idf_score;


def get_tf_idf( tf_values , idf_score ) :
  tf_idf_values = []
  for each_tf in tf_values:
    each_tf_idf = []
    each_tf_idf.append(  each_tf[0] )
    each_tf_idf.append(  each_tf[1] * idf_score.get(each_tf[0][1]) )
    tf_idf_values.append( each_tf_idf )
  
  return tf_idf_values;


print(corpus)
vocab = fit(corpus)

print("\nvocabulary of unique words")
print(sorted(list(vocab.keys())))
#print(transform(strings, vocab).toarray())
sparse_arr = transform(corpus, vocab).toarray();
sparse_data = sparse_arr.tolist();
# print(sparse_arr)
print("\nNo. of ( Docs , vocabulary) = " , end=" ")
print(sparse_arr.shape)
print()
# L2 normalization using sklearn Normalizer
sparse_normal_data = get_normlized_data(sparse_data)

tf_values_ = get_tf_velues( sparse_normal_data );
idf_score_ = get_idf_score( sparse_data )

print("IDF score :")
print(list( idf_score_.values()))
print()

tf_idf = get_tf_idf( tf_values_ , idf_score_ );
print("TF-IDF score for the Document-1 (row-1) :")
for cell_i in tf_idf[0:5] :
  print(cell_i)


tf_dense_metrix = get_tf_dense_metrix( sparse_normal_data );
tf_idf_dense_m = get_tf_idf_dense_metrix( tf_dense_metrix , idf_score_ );
print("\nDense matrix for sparse matrix: ")
print(numpy.asarray(tf_idf_dense_m[0]))


100%|██████████| 4/4 [00:00<00:00, 14639.80it/s]

['this is the first document', 'this document is the second document', 'and this is the third one', 'is this the first document']

vocabulary of unique words
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

No. of ( Docs , vocabulary) =  (4, 9)

IDF score :
[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]

TF-IDF score for the Document-1 (row-1) :
[(0, 1), 0.24462871026284191]
[(0, 2), 0.3021651247531981]
[(0, 3), 0.19999999999999998]
[(0, 6), 0.19999999999999998]
[(0, 8), 0.19999999999999998]

Dense matrix for sparse matrix: 
[0.         0.24462871 0.30216512 0.2        0.         0.
 0.2        0.         0.2       ]





## Task-2

In [0]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus_d = pickle.load(f)
    
corpus_1 = corpus_d;
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus_1))

Number of documents in corpus =  746


In [0]:
# Write your code here.
# Try not to hardcode any values.
# Make sure its well documented and readble with appropriate comments.
def get_top_features(vocab , idf_score, top_no):
  sorted_x = sorted(idf_score.items(), key=operator.itemgetter(1),reverse = True )
  short_dict = dict(sorted_x[:top_no])
  vocab_top_keys =  ( dict(sorted_x[:top_no]).keys() ) ;
  vocab_top_list = {};
  for k , v in vocab.items():
    if v in vocab_top_keys:
      vocab_top_list[k] = v ;
  vocab_top_list_ord = {}
  i = 0;
  for k,v in vocab_top_list.items() :
    vocab_top_list_ord[k] = i ;
    i = (1 + i ) ;
  return vocab_top_list_ord;

In [161]:
# print(corpus_1)
vocab = fit(corpus_1)
# print("\n vocab of unique words")
# print(sorted(list(vocab.keys())))
# print(vocab)
#print(transform(strings, vocab).toarray())
sparse_arr_1 = transform(corpus_1, vocab).toarray();
sparse_data_1 = sparse_arr_1.tolist();
# print(sparse_arr)
print("\nTotal No. of ( Docs , vocabulary) = " , end=" ")
print(sparse_arr_1.shape)
print()
# tf_values_ = get_tf_velues( get_normlized_data(sparse_data_1) );
idf_score_1 = get_idf_score( sparse_data_1 )


# top-50
vocab_top_50 = get_top_features( vocab , idf_score_1 , 50 )
print("\nTop 50 unique words by IDF-score")
print( list(vocab_top_50.keys() ) )
sparse_arr_2 = transform(corpus_1, vocab_top_50).toarray();
sparse_data_2 = sparse_arr_2.tolist();
print("\n\nData of Top 50 unique words by IDF-score \n( Docs , vocabulary) = " , end=" ")
print(sparse_arr_2.shape)
print()

# L2 normalization using sklearn Normalizer
normal_data_2 = get_normlized_data(sparse_data_2)
# print( normal_data_2[0])
tf_values_2 = get_tf_velues(  normal_data_2 );
idf_score_2 = get_idf_score( sparse_data_2 )


print("IDF score dictionary :")
print( idf_score_2 )
print("Only IDF score : ")
print(list( idf_score_2.values()))
print()

tf_idf = get_tf_idf( tf_values_2 , idf_score_2 );

print("TF-IDF score for the Document-1 (row-1) :")
for cell_i in tf_idf[0:1] :
  print(cell_i)

tf_dense_metrix = get_tf_dense_metrix( normal_data_2 );
tf_idf_dense_m = get_tf_idf_dense_metrix( tf_dense_metrix , idf_score_2 );
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
print("\nDense matrix for sparse matrix: ")
print(numpy.asarray(tf_idf_dense_m[0]))

100%|██████████| 746/746 [00:00<00:00, 66712.52it/s]



Total No. of ( Docs , vocabulary) =  (746, 2886)



100%|██████████| 746/746 [00:00<00:00, 95505.49it/s]


Top 50 unique words by IDF-score
['aailiyah', 'abandoned', 'abroad', 'abstruse', 'academy', 'accents', 'accessible', 'acclaimed', 'accolades', 'accurate', 'accurately', 'achille', 'ackerman', 'actions', 'adams', 'add', 'added', 'admins', 'admiration', 'admitted', 'adrift', 'adventure', 'aesthetically', 'affected', 'affleck', 'afternoon', 'aged', 'ages', 'agree', 'agreed', 'aimless', 'aired', 'akasha', 'akin', 'alert', 'alike', 'allison', 'allow', 'allowing', 'alongside', 'amateurish', 'amaze', 'amazed', 'amazingly', 'amusing', 'amust', 'anatomist', 'angel', 'angela', 'angelina']


Data of Top 50 unique words by IDF-score 
( Docs , vocabulary) =  (746, 50)

IDF score dictionary :
{0: 6.922918004572872, 1: 6.922918004572872, 2: 6.922918004572872, 3: 6.922918004572872, 4: 6.922918004572872, 5: 6.922918004572872, 6: 6.922918004572872, 7: 6.922918004572872, 8: 6.922918004572872, 9: 6.922918004572872, 10: 6.922918004572872, 11: 6.922918004572872, 12: 6.922918004572872, 13: 6.922918004572872




In [0]:
# using sklearn Tfidf-Vectorizer for above corpus

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus_1)
skl_output = vectorizer.transform(corpus_1)

# sklearn feature names, they are sorted in alphabetic order by default.
print("Features Name using sklearn")
print(vectorizer.get_feature_names())

# shape of sklearn tfidf vectorizer output after applying transform method.
print(skl_output.shape)

print("\nIDF score using sklearn:")
print(vectorizer.idf_)
print("\nTD-IDF using sklearn")
print(skl_output[0])

print("\nDense metrix using sklearn")
print(skl_output[0].toarray())

Features Name using sklearn
(746, 2886)

IDF score using sklearn:
[6.922918   6.922918   6.22977082 ... 6.922918   6.5174529  6.922918  ]

TD-IDF using sklearn
  (0, 2878)	0.35781145622317734
  (0, 2287)	0.3377679916467555
  (0, 1653)	0.35781145622317734
  (0, 1651)	0.16192317905848022
  (0, 1545)	0.30566026894803877
  (0, 720)	0.4123943870778812
  (0, 688)	0.4123943870778812
  (0, 53)	0.4123943870778812

Dense metrix using sklearn
[[0. 0. 0. ... 0. 0. 0.]]
