In [1]:
import numpy as np 

In [2]:
## read the text vector from a file

txt_vector = np.load('./output/text_vector.npy')[:1000]
txt_vector.shape

(1000, 13880)

$$
tfidf = tf \cdot idf \\ 
tf = \frac{f_{t', d}}{\sum_{t \in d} f_{t', d}}, \\
where\ f_{t', d} \ is\ the\ number\ of\ times\ that\ term\ t \ occurs\ in\ an\ overview\ d \\ 
idf = log (\frac{N}{n_t}), \\
where\ N\ is\ total\ number\ of\ overviews\ in\ the\ dataset\ and\ n_t\ is\ number\ of\ overviews\ where\ the\ term\ t\ appears 
$$

In [3]:
def construct_tfidf(txt_vector): 

    ## word_num_col: word frequency of each word in all overviews (column-wise)
    ## iterate all columns, count how many rows are not zero
    word_num_col = [len(np.nonzero(txt_vector[:, i])[0]) for i in range(txt_vector.shape[1])]
    word_num_col = np.array(word_num_col, dtype=np.float32)

    ## word_num_row: total word number in each overview (row-wise)
    ## iterate all rows, sum up all values in each row
    word_num_row = [np.sum(txt_vector[i, :]) for i in range(txt_vector.shape[0])]
    word_num_row = np.array(word_num_row, dtype=np.float32)[..., np.newaxis]
    
    ## idf: inverse document frequency matrix 
    ## a word is common or rare across all overviews 
    ## if a word is too common, then idf will be small to penalize the tf
    ## plus 1 to avoid denominator being 0
    idf = np.diag(np.log(txt_vector.shape[0] / (word_num_col + 1) ))

    ## tf: term frequency matrix
    ## the number of times that term t occurs in a movie's overview
    tf = txt_vector / (word_num_row+1)

    ## tfidf = tf * idf
    tfidf = np.dot(tf, idf)

    return tfidf

In [113]:
ts = time()
tfidf_matrix = construct_tfidf(txt_vector)
print('-->', time()-ts,'s')

--> 1.8321247100830078 s


- Test runtime of the function

In [4]:
%timeit construct_tfidf(txt_vector)

1.74 s ± 81.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## multi-process

In [5]:
from multiprocessing import cpu_count
print("number of CPU cores:", cpu_count())

number of CPU cores: 48


In [12]:
from functools import partial
from multiprocessing.pool import Pool
from functools import partial
from time import time

In [117]:
num_procs = 8
def tf(chunk):
    word_num_row = [np.sum(chunk[i, :]) for i in range(chunk.shape[0])]
    word_num_row = np.array(word_num_row, dtype=np.float32)[..., np.newaxis]
    return chunk / (word_num_row + 1)

def idf(chunk):
    word_num_col = [len(np.nonzero(chunk[:, i])[0]) for i in range(chunk.shape[1])]
    word_num_col = np.array(word_num_col, dtype=np.float32)
    return word_num_col

ts = time()

all_steps = [i for i in range(0, txt_vector.shape[0]+1, txt_vector.shape[0]//num_procs)]
chunks = [txt_vector[all_steps[i] : all_steps[i+1]] for i in range(len(all_steps)-1)]  

with Pool(4) as p:
     results = p.map(tf, chunks)
tf_mat = np.concatenate(results)

all_steps = [i for i in range(0, txt_vector.shape[1]+1, txt_vector.shape[1]//num_procs)]
chunks = [txt_vector[:, all_steps[i] : all_steps[i+1]] for i in range(len(all_steps)-1)]  

ts = time()
with Pool(num_procs) as p:
     results = p.map(idf, chunks)
idf_mat = np.diag(np.log(txt_vector.shape[0] / (np.concatenate(results) + 1) ))

tfidf = np.dot(tf_mat, idf_mat)

print('-->', time()-ts,'s')

--> 1.9582993984222412 s
