In [1]:
'''
Count-based bag of words using the TF-IDF transform.
In this example, the CountVectorizer, TfidfTransformer and
TfidfVectorizer classes from scikit-learn are used.
Logarithms are computed using base e.


Author: Fabricio Galende M. de Carvalho, DSc
'''

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import DataFrame

# Toy corpus: two short Portuguese sentences
corpus_documents = [
    "Gostei muito do computador",
    "Não gostei, mesmo, do computador",
]

# With None the vectorizer learns its lexicon from the corpus; to restrict it,
# pass an explicit list instead, e.g. ['gostei', 'não', 'computador'].
vocabulary = None
vectorizer = CountVectorizer(vocabulary=vocabulary)

# Learn the lexicon and build the document-term count matrix in one call
document_term_matrix = vectorizer.fit_transform(corpus_documents)
corpus_lexicon = vectorizer.get_feature_names_out()

print("Corpus representation lexicon: ", corpus_lexicon, "\n")
print("Representation matrix: ")
# toarray() yields a dense array: one row per document, one column per term
print(document_term_matrix.toarray())


Corpus representation lexicon:  ['computador' 'do' 'gostei' 'mesmo' 'muito' 'não'] 

Representation matrix: 
[[1 1 1 0 1 0]
 [1 1 1 1 0 1]]


In [2]:
# TF-IDF weighting of the count matrix, with and without L2 row normalization.
# scikit-learn (default smooth_idf=True) computes
#     idf(t) = ln[(1 + n) / (1 + df(t))] + 1
# where n is the number of documents and df(t) the number of documents
# that contain term t. Here n = 2, so:
#   "computador" (df = 2): idf = ln[3 / (1 + 2)] + 1 = ln(1) + 1   = 1
#   "não"        (df = 1): idf = ln[3 / (1 + 1)] + 1 = ln(1.5) + 1 ≈ 1.4055

tf_idf_transformer_w_normalization = TfidfTransformer(norm='l2')
tf_idf_transformer_wo_normalization = TfidfTransformer(norm=None)

document_term_tfidf_transform_normalized = (
    tf_idf_transformer_w_normalization.fit_transform(document_term_matrix)
)
document_term_tfidf_transform = (
    tf_idf_transformer_wo_normalization.fit_transform(document_term_matrix)
)

# Wrap both results in labelled DataFrames so they print in a readable form
document_term_df_norm = DataFrame(
    data=document_term_tfidf_transform_normalized.toarray(),
    index=['doc1', 'doc2'],
    columns=corpus_lexicon,
)
document_term_df = DataFrame(
    data=document_term_tfidf_transform.toarray(),
    index=['doc1', 'doc2'],
    columns=corpus_lexicon,
)

print("Normalized feature vectors: ")
print(document_term_df_norm)
print("\n")

print("Non normalized feature vectors: ")
print(document_term_df)

Normalized feature vectors: 
      computador        do    gostei     mesmo     muito       não
doc1    0.448321  0.448321  0.448321  0.000000  0.630099  0.000000
doc2    0.379303  0.379303  0.379303  0.533098  0.000000  0.533098


Non normalized feature vectors: 
      computador   do  gostei     mesmo     muito       não
doc1         1.0  1.0     1.0  0.000000  1.405465  0.000000
doc2         1.0  1.0     1.0  1.405465  0.000000  1.405465


In [3]:
# Let's repeat the above representation computation but using TfidfVectorizer.
# Note that the result is the same, but it is obtained in a single processing
# step (counting and TF-IDF weighting are done by one estimator).

tfidf_vectorizer = TfidfVectorizer(min_df = 0., max_df = 1., norm='l2', use_idf=True)
vectorized_docs = tfidf_vectorizer.fit_transform(corpus_documents)

# Label the columns with this vectorizer's own lexicon rather than the
# CountVectorizer's `corpus_lexicon`: the two only agree while both models
# learn the same vocabulary (e.g. not when a custom `vocabulary` is passed
# to the CountVectorizer).
document_term_df_b = DataFrame(
    data=vectorized_docs.toarray(),
    index=['doc1', 'doc2'],
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(document_term_df_b)


      computador        do    gostei     mesmo     muito       não
doc1    0.448321  0.448321  0.448321  0.000000  0.630099  0.000000
doc2    0.379303  0.379303  0.379303  0.533098  0.000000  0.533098


In [4]:
# Sanity check with numpy: Euclidean (L2) norm of the second document's
# feature vector (row index 1) in both TF-IDF representations.
import numpy as np

second_doc_normalized = document_term_tfidf_transform_normalized.toarray()[1]
second_doc_unnormalized = document_term_tfidf_transform.toarray()[1]

norm_repr_1 = np.linalg.norm(second_doc_normalized)
norm_repr_1b = np.linalg.norm(second_doc_unnormalized)
print(norm_repr_1)
print(norm_repr_1b)


0.9999999999999999
2.6364112615862854


In [5]:
# Reuse the already-fitted models (transform only, no refitting) to build the
# feature vector of a new phrase: counts from `vectorizer`, then weights from
# the L2-normalized TF-IDF transformer.
input_document = ["gostei, muito mesmo, do computador!"]
input_document_count_vectorized_repr = vectorizer.transform(input_document)
input_document_tfidf_repr = tf_idf_transformer_w_normalization.transform(
    input_document_count_vectorized_repr
)
input_document_tfidf_repr_dataframe = DataFrame(
    data=input_document_tfidf_repr.toarray(),
    index=["input doc"],
    columns=corpus_lexicon,
)
print("Input document representation using count based bow: ")
# Transposed so each term is shown on its own row
print(input_document_tfidf_repr_dataframe.T)


Input document representation using count based bow: 
            input doc
computador   0.379303
do           0.379303
gostei       0.379303
mesmo        0.533098
muito        0.533098
não          0.000000
