In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize
import pandas as pd
import numpy as np

### 1- Dados

In [2]:
# Small corpus the vocabulary is fitted on.
train = [
    'The sky is blue.',
    'The sun is bright.',
]

# Documents that get vectorized and weighted in the following steps.
test = [
    'The sun in the sky is bright',
    'We can see the shining sun, the bright sun.',
]

### 2- Contagem de palavras removendo stop words (and, or, if, etc.)

In [3]:
# Bag-of-words counter: word-level tokens, with English stop words dropped.
countvectorizer = CountVectorizer(analyzer='word', stop_words='english')

# The vocabulary is learned from `train` only; `test` is then encoded against it.
terms = countvectorizer.fit_transform(train)
term_vectors = countvectorizer.transform(test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(term_vectors.toarray(), columns=countvectorizer.get_feature_names_out())

Unnamed: 0,blue,bright,sky,sun
0,0,1,1,1
1,0,1,0,2


Obs: Verificamos que a contagem de palavras está correta para o conjunto de teste.

### 3- Contagem de Palavras invertidas - TF-IDF (Term Frequency - Inverse Document Frequency)

In [4]:
# TF-IDF re-weighting of the raw counts, with every row scaled to unit L2 norm.
tfidf = TfidfTransformer(norm='l2')

# The IDF vector is learned from, and applied to, the same count matrix.
tf_idf_matrix = tfidf.fit_transform(term_vectors)
print("\nVector of idf \n", tfidf.idf_)
print("\nFinal tf-idf vectorizer matrix form :\n", tf_idf_matrix.todense())


Vector of idf 
 [2.09861229 1.         1.40546511 1.        ]

Final tf-idf vectorizer matrix form :
 [[0.         0.50154891 0.70490949 0.50154891]
 [0.         0.4472136  0.         0.89442719]]


In [5]:
# One row per vocabulary term next to the IDF weight the transformer learned for it.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
pd.DataFrame({'feature_name': countvectorizer.get_feature_names_out(), 'idf_weights': tfidf.idf_})

Unnamed: 0,feature_name,idf_weights
0,blue,2.098612
1,bright,1.0
2,sky,1.405465
3,sun,1.0


Obs: Identifica-se o peso de cada palavra no vetor idf e a matriz final gerada pelo algoritmo.

### 4a- Saindo da Caixa preta - Calculando o vetor IDF

$$\large
idf(t) = \ln \left( \frac{1 + n}{1 + df(t)} \right) + 1
$$

Onde:
* $n$ é o total de documentos (exemplos treinados)
* $df(t)$ é a quantidade de documentos em que certa palavra aparece

In [6]:
# Dense count matrix: one row per document, one column per vocabulary term.
term_counts = term_vectors.toarray()

# n in the formula: taken from the data instead of being hard-coded.
num_docs = term_counts.shape[0]

# df(t): number of documents in which term t occurs at least once.
# Summing raw counts (and clipping) would over-count a term that is
# repeated inside a single document, so test for presence instead.
doc_weights = (term_counts > 0).sum(axis=0)

# Smoothed IDF: ln((1 + n) / (1 + df(t))) + 1
idf_weights = np.log((1 + num_docs) / (1 + doc_weights)) + 1
idf_weights

array([2.09861229, 1.        , 1.40546511, 1.        ])

Obs: Exatamente como o resultado anterior

### 4b- Saindo da Caixa preta - Calculando a matriz TF-IDF

$$\large
\text{tf-idf}(t, d) = tf(t, d) \times idf(t)
$$

Obs: A fórmula do tf-idf é apenas multiplicar a frequência pelo vetor calculado idf.

In [9]:
# Right-multiplying the counts by diag(idf) scales each term column by its IDF weight.
tf_idf = term_vectors.toarray() @ np.diag(idf_weights)
tf_idf

array([[0.        , 1.        , 1.40546511, 1.        ],
       [0.        , 1.        , 0.        , 2.        ]])

Obs: O scikit-learn faz a normalização l2 por padrão

### 4c- Normalizar com a norma para chegar no resultado

In [14]:
# Normalization v1: using scikit-learn's normalize() with the L2 norm
normalize(tf_idf, norm='l2')

array([[0.        , 0.50154891, 0.70490949, 0.50154891],
       [0.        , 0.4472136 , 0.        , 0.89442719]])

In [17]:
# Normalization v2: computing each row's L2 (Euclidean) norm by hand
row_norms = (tf_idf ** 2).sum(axis=1, keepdims=True) ** 0.5

# A row of all zeros (a document with no vocabulary terms) has norm 0.
# Divide it by 1 so it stays all-zero instead of becoming NaN, which is
# also what sklearn's normalize() returns for such rows.
row_norms[row_norms == 0] = 1.0

normalized = tf_idf / row_norms
normalized

array([[0.        , 0.50154891, 0.70490949, 0.50154891],
       [0.        , 0.4472136 , 0.        , 0.89442719]])

In [18]:
# The matrix produced by TfidfTransformer, shown again for comparison with the manual result
tf_idf_matrix.todense()

matrix([[0.        , 0.50154891, 0.70490949, 0.50154891],
        [0.        , 0.4472136 , 0.        , 0.89442719]])

Obs: Os valores batem!