In [1]:
import numpy as np

In [2]:
# Toy corpus: two short sentences, each containing repeated words
corpus = [
    'this is a a sample',
    'this is another another example example example',
]

In [3]:
# Collect the distinct words of the corpus. Passing through a set removes duplicates,
# but set iteration order is not guaranteed to be stable, so the order of `vocabs`
# (and any ids derived from it) may differ between kernel sessions.
vocabs = list({
    word
    for doc in corpus
    for word in doc.split()
})
vocabs

['example', 'this', 'another', 'a', 'sample', 'is']

In [4]:
# Give every vocabulary word its position in `vocabs` as an integer id
word2id = dict(zip(vocabs, range(len(vocabs))))

In [5]:
# Encode the corpus: each sentence becomes the list of integer ids of its words.
# `corpus` is rebound here, so this cell expects the raw sentence strings and
# cannot be re-run once they have been replaced by id lists.
corpus = [list(map(word2id.__getitem__, doc.split())) for doc in corpus]

In [6]:
# Show the encoded corpus: one list of word ids per sentence
corpus

[[1, 5, 3, 3, 4], [1, 5, 2, 2, 0, 0, 0]]

In [24]:
# Load a pre-tokenized corpus: one document per line, word ids separated by whitespace
with open('untitled.txt') as fp:
    # Skip blank lines: an empty document has no ids and would make max() below raise
    corpus = [[int(token) for token in line.split()] for line in fp if line.strip()]

# Ids are assumed to be 0-based, so the vocabulary size is the largest id plus one
# (0 for an empty file instead of raising ValueError)
vocab_size = max((max(doc) for doc in corpus), default=-1) + 1
# Placeholder vocabulary: the cell that allocates `counts` only reads len(vocabs)
vocabs = [1] * vocab_size

# Definition of TF-IDF
1. The tf–idf is the product of two statistics, term frequency and inverse document frequency. There are various ways for determining the exact values of both statistics.
2. It is a weighting formula that aims to quantify the importance of a keyword or phrase within a document or a web page.

In [25]:
doc_size, vocab_size = len(corpus), len(vocabs)
# One row per document, one column per word id. np.zeros always returns a new
# zero-filled array, so re-running this cell discards any counts accumulated in place.
counts = np.zeros(shape=(doc_size, vocab_size))
counts

array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 

In [26]:
# Start from zero so that re-running this cell does not add the same occurrences twice
counts.fill(0)
# Tally raw occurrences: counts[d, w] = number of times word id w appears in document d
for doc_idx in range(doc_size):
    row = counts[doc_idx]  # view into `counts`, so the updates below land in place
    for word in corpus[doc_idx]:
        row[word] += 1

In [27]:
# Inspect the raw term counts (rows: documents, columns: word ids)
counts

array([[3., 0., 3., 5., 1., 2., 3., 2.],
       [1., 4., 3., 7., 5., 3., 4., 2.],
       [4., 4., 4., 1., 1., 1., 4., 0.],
       [2., 1., 5., 3., 2., 5., 0., 3.],
       [2., 0., 5., 3., 2., 3., 1., 2.],
       [1., 2., 1., 1., 1., 0., 1., 0.],
       [3., 2., 1., 1., 2., 2., 2., 2.],
       [4., 6., 2., 3., 3., 3., 3., 2.],
       [3., 1., 4., 2., 3., 2., 3., 3.],
       [2., 6., 1., 6., 6., 2., 5., 3.],
       [2., 1., 5., 3., 3., 2., 6., 5.],
       [1., 3., 1., 0., 0., 0., 2., 5.],
       [3., 5., 2., 2., 2., 5., 3., 2.],
       [2., 4., 2., 5., 6., 4., 6., 2.],
       [0., 2., 3., 2., 2., 3., 2., 2.],
       [4., 1., 0., 5., 0., 5., 2., 1.],
       [2., 3., 2., 2., 4., 2., 7., 6.],
       [1., 0., 0., 2., 1., 0., 1., 0.],
       [3., 5., 3., 2., 1., 4., 4., 3.],
       [3., 4., 4., 2., 5., 2., 0., 5.],
       [3., 3., 3., 2., 6., 2., 5., 3.],
       [5., 0., 2., 1., 2., 4., 1., 1.],
       [3., 1., 0., 2., 1., 1., 1., 3.],
       [3., 2., 3., 1., 2., 3., 2., 4.],
       [3., 2., 

## Term frequency
Term frequency, tf(t,d), is the relative frequency of term t within document d,
$$
{\displaystyle \mathrm {tf} (t,d)={\frac {f_{t,d}}{\sum _{t'\in d}{f_{t',d}}}}},
$$
where $f_{t,d}$ is the raw count of a term in a document, i.e., the number of times that term $t$ occurs in document $d$. Note the denominator is simply the total number of terms in document $d$ (counting each occurrence of the same term separately). 

In [28]:
# Term frequency: each document's raw counts divided by that document's total token count
doc_lengths = counts.sum(axis=1, keepdims=True)  # shape (doc_size, 1); broadcasts across columns
# Rows of empty documents (length 0) stay all-zero instead of turning into NaN from 0/0
tf = np.divide(counts, doc_lengths, out=np.zeros_like(counts), where=doc_lengths != 0)
tf

array([[0.15789474, 0.        , 0.15789474, 0.26315789, 0.05263158,
        0.10526316, 0.15789474, 0.10526316],
       [0.03448276, 0.13793103, 0.10344828, 0.24137931, 0.17241379,
        0.10344828, 0.13793103, 0.06896552],
       [0.21052632, 0.21052632, 0.21052632, 0.05263158, 0.05263158,
        0.05263158, 0.21052632, 0.        ],
       [0.0952381 , 0.04761905, 0.23809524, 0.14285714, 0.0952381 ,
        0.23809524, 0.        , 0.14285714],
       [0.11111111, 0.        , 0.27777778, 0.16666667, 0.11111111,
        0.16666667, 0.05555556, 0.11111111],
       [0.14285714, 0.28571429, 0.14285714, 0.14285714, 0.14285714,
        0.        , 0.14285714, 0.        ],
       [0.2       , 0.13333333, 0.06666667, 0.06666667, 0.13333333,
        0.13333333, 0.13333333, 0.13333333],
       [0.15384615, 0.23076923, 0.07692308, 0.11538462, 0.11538462,
        0.11538462, 0.11538462, 0.07692308],
       [0.14285714, 0.04761905, 0.19047619, 0.0952381 , 0.14285714,
        0.0952381 , 0.142857

## The inverse document frequency
The inverse document frequency is a measure of how much information the word provides, i.e., if it is common or rare across all documents. It is the logarithmically scaled inverse fraction of the documents that contain the word (obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient):

$$
\mathrm{idf}(t, D) =  \log \frac{N}{|\{d \in D: t \in d\}|}
$$

with

- N: total number of documents in the corpus $N = {|D|}$

- $|\{d \in D: t \in d\}|$: number of documents in which the term $t$ appears (i.e., $\mathrm{tf}(t,d) \neq 0$). If the term is not in the corpus, this will lead to a division-by-zero. It is therefore common to adjust the denominator to $1 + |\{d \in D: t \in d\}|$.

In [29]:
# Document frequency: in how many documents each word id occurs at least once
doc_freq = np.count_nonzero(counts, axis=0)
# idf = log10(N / df). Word ids that occur in no document keep idf = 0 rather than
# dividing by zero; their tf is 0 everywhere, so their tf-idf is 0 either way.
idf = np.zeros(vocab_size)
seen = doc_freq > 0
idf[seen] = np.log10(doc_size / doc_freq[seen])
idf

array([0.05799195, 0.09017663, 0.11598389, 0.09017663, 0.06581728,
       0.11598389, 0.11598389, 0.05799195])

In [30]:
# tf-idf score of every (document, word) pair; idf is broadcast across the rows of tf
np.multiply(tf, idf)

array([[0.00915662, 0.        , 0.01831325, 0.02373069, 0.00346407,
        0.01220883, 0.01831325, 0.00610442],
       [0.00199972, 0.01243816, 0.01199833, 0.02176677, 0.01134781,
        0.01199833, 0.01599778, 0.00399944],
       [0.01220883, 0.01898455, 0.02441766, 0.00474614, 0.00346407,
        0.00610442, 0.02441766, 0.        ],
       [0.00552304, 0.00429413, 0.02761521, 0.01288238, 0.00626831,
        0.02761521, 0.        , 0.00828456],
       [0.00644355, 0.        , 0.03221775, 0.01502944, 0.00731303,
        0.01933065, 0.00644355, 0.00644355],
       [0.00828456, 0.02576475, 0.01656913, 0.01288238, 0.00940247,
        0.        , 0.01656913, 0.        ],
       [0.01159839, 0.01202355, 0.00773226, 0.00601178, 0.00877564,
        0.01546452, 0.01546452, 0.00773226],
       [0.00892184, 0.02080999, 0.00892184, 0.010405  , 0.0075943 ,
        0.01338276, 0.01338276, 0.00446092],
       [0.00828456, 0.00429413, 0.02209217, 0.00858825, 0.00940247,
        0.01104609, 0.016569

In [31]:
# Generate a large synthetic corpus file

In [1]:
import random

In [10]:
# 2 ** 17 is the number of lines (documents) written by the corpus generator cell below
2 ** 17

131072

In [9]:
# Write a synthetic corpus to disk: one document per line, word ids separated by spaces
vocab_size = 2 ** 12                       # ids are drawn from [0, vocab_size - 1]
NUM_DOCS = 2 ** 17                         # number of lines written
MIN_DOC_LEN, MAX_DOC_LEN = 2 ** 4, 2 ** 7  # inclusive bounds on tokens per document
SEED = 0                                   # fixed seed so the generated file is reproducible

rng = random.Random(SEED)  # private generator; the global `random` state is left untouched
with open('corpus_large.txt', 'w') as fp:
    for _ in range(NUM_DOCS):
        doc_len = rng.randint(MIN_DOC_LEN, MAX_DOC_LEN)
        tokens = [str(rng.randint(0, vocab_size - 1)) for _ in range(doc_len)]
        # Every id, including the last one on a line, is followed by a single space
        fp.write(' '.join(tokens) + ' \n')