In [3]:
import numpy as np 

In [4]:
## read the text vector from a file

txt_vector = np.load('/content/drive/MyDrive/AdvancedPython/data/text_vector_mini.npy')
txt_vector.shape

(1000, 1500)

$$
tfidf = tf \cdot idf \\ 
tf = \frac{f_{t', d}}{\sum_{t \in d} f_{t', d}}, \\
where\ f_{t', d} \ is\ the\ number\ of\ times\ that\ term\ t \ occurs\ in\ an\ overview\ d \\ 
idf = log (\frac{N}{n_t}), \\
where\ N\ is\ total\ number\ of\ overviews\ in\ the\ dataset\ and\ n_t\ is\ number\ of\ overviews\ where\ the\ term\ t\ appears 
$$

In [5]:
def construct_tfidf(txt_vector): 

    ## word_num_col: word frequency of each word in all overviews (column-wise)
    ## iterate all columns, count how many rows are not zero
    word_num_col = [len(np.nonzero(txt_vector[:, i])[0]) for i in range(txt_vector.shape[1])]
    word_num_col = np.array(word_num_col, dtype=np.float32)

    ## word_num_row: total word number in each overview (row-wise)
    ## iterate all rows, sum up all values in each row
    word_num_row = [np.sum(txt_vector[i, :]) for i in range(txt_vector.shape[0])]
    word_num_row = np.array(word_num_row, dtype=np.float32)[..., np.newaxis]
    
    ## idf: inverse document frequency matrix 
    ## a word is common or rare across all overviews 
    ## if a word is too common, then idf will be small to penalize the tf
    ## plus 1 to avoid denominator being 0
    idf = np.diag(np.log(txt_vector.shape[0] / (word_num_col + 1) ))

    ## tf: term frequency matrix
    ## the number of times that term t occurs in a movie's overview
    tf = txt_vector / (word_num_row+1)

    ## tfidf = tf * idf
    tfidf = np.dot(tf, idf)

    return tfidf

In [31]:
tfidf_matrix = construct_tfidf(txt_vector)
tfidf_matrix.shape

- Test runtime of the function

In [6]:
%timeit -n 100 construct_tfidf(txt_vector)

86 ms ± 5.82 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
