In [None]:
import numpy as np

# Toy corpus: four short English sentences, each treated as one document.
# NOTE: tokens are later obtained with doc.split(" "), so the comma in the last
# sentence stays attached and "sun," becomes a vocabulary entry distinct from "sun".
corpus = [
    "the sky is blue",
    "the sun is bright",
    "the sun in the sky is bright",
    "we can see the shining sun, the bright sun"
]

In [None]:
# Step 1: Build the vocabulary. If it has 20 unique words, every document vector has dimension 20, e.g. [0.4, 0, 0, ...] (this corpus yields 12 tokens)
# Step.2 Build a TF matrix (term frequency)

# [[0,1,1,...,0],
#  [0,2,0,....0],
#  [],
#  []]

# Step 3: Build an IDF lookup (for each word, measure how rare it is across the corpus)
# Take any word t, e.g. t = sky
# IDF(t) = log(total number of documents / (number of documents containing t + 1))
# IDF(sky) = log(4 / (2 + 1)) ~ 0.288  => rarer word, higher weight
# IDF(the) = log(4 / (4 + 1)) ~ -0.223 => common word, lower weight

# Step.4 Finally, we compute a TFIDF matrix

# [[0,2,....,0],
#  [],
#  [],
#  []]


In [None]:
# Document: a single piece of text, e.g. one tweet or one chapter
# Corpus: a collection of documents, i.e. list[Document] (for example, a list of chapters)


# Building a search system: every document is turned into a vector (Document => Vector),
# so the corpus becomes a list[Vector].
# Searching: turn the query into a vector and compute its distance to every vector in list[Vector].



In [None]:
# Vocabulary: every distinct space-separated token that occurs anywhere in the corpus.
vocab = {token for document in corpus for token in document.split(" ")}

In [None]:
def get_tf_vector(doc, vocab):
  """Return the raw term-frequency vector of ``doc`` over ``vocab``.

  Args:
    doc: Document text; tokens are obtained with ``doc.split(" ")``.
    vocab: Collection of unique vocabulary words.

  Returns:
    A 1-D float ``np.ndarray`` of length ``len(vocab)``. Position ``i`` holds
    the number of times the i-th word of ``sorted(vocab)`` occurs in ``doc``.
    Tokens that are not in ``vocab`` (including the empty strings produced by
    repeated spaces or an empty document) are ignored, so unseen text such as
    a search query can be vectorised without raising.
  """
  # Sort once so the column order is deterministic, and build a word -> column
  # lookup instead of re-sorting the whole vocabulary for every token.
  column_of = {word: column for column, word in enumerate(sorted(vocab))}
  tf_vector = np.zeros(len(vocab))
  for word in doc.split(" "):
    column = column_of.get(word)
    if column is not None:
      tf_vector[column] += 1
  return tf_vector

In [None]:
# One term-frequency vector per document, in corpus order.
tf_matrix = [get_tf_vector(document, vocab) for document in corpus]

In [None]:
# Stack the per-document vectors into one array; for a non-empty corpus this is
# 2-D with one row per document and one column per vocabulary word.
tf_matrix = np.array(tf_matrix)

In [None]:
import math

In [None]:
# Total number of documents in the corpus (N in the IDF formula).
how_many_documents = len(corpus)

In [None]:
def get_idf_value(word, corpus):
  """Return the inverse document frequency of ``word`` within ``corpus``.

  Computes ``log(N / (df + 1))`` where ``N`` is ``len(corpus)`` and ``df`` is
  the number of documents that contain ``word`` as a whole token.

  Args:
    word: Vocabulary term to score.
    corpus: Sequence of document strings, tokenised with ``doc.split(" ")``.

  Returns:
    The natural-log IDF as a float. Because of the ``+ 1`` in the denominator
    the value is negative for a term that occurs in every document.

  Raises:
    ValueError: If ``corpus`` is empty (``math.log`` of zero).
  """
  # Match whole tokens: a plain ``word in doc`` substring test would count
  # "in" as present in any document that merely contains "shining".
  documents_with_word = sum(1 for doc in corpus if word in doc.split(" "))
  # Use the size of the corpus that was passed in rather than a notebook global,
  # so the result is correct for whichever corpus the caller supplies.
  return math.log(len(corpus) / (documents_with_word + 1))

In [None]:
# IDF lookup table: vocabulary word -> inverse document frequency over the corpus.
idf_dict = {term: get_idf_value(term, corpus) for term in vocab}

In [None]:
# Column j of the TF matrix corresponds to the j-th word of the sorted
# vocabulary, so build the IDF weights once, in that same order, instead of
# re-sorting the vocabulary for every matrix cell.
idf_vector = np.array([idf_dict[word] for word in sorted(vocab)])

# TF-IDF = TF * IDF. Broadcasting scales every document row by the per-word weights.
tfidf_matrix = np.asarray(tf_matrix) * idf_vector

In [None]:
# Rows follow the order of `corpus`; columns follow the sorted vocabulary.
# Negative entries are expected: log(N / (df + 1)) drops below zero for a word
# that occurs in every document (here "the").
tfidf_matrix

array([[ 0.69314718,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.28768207,  0.        ,  0.        ,
        -0.22314355,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.22314355,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.28768207,  0.        ,
         0.        ,  0.        ,  0.28768207,  0.        ,  0.        ,
        -0.4462871 ,  0.        ],
       [ 0.        ,  0.        ,  0.69314718,  0.        ,  0.        ,
         0.69314718,  0.69314718,  0.        ,  0.        ,  0.69314718,
        -0.4462871 ,  0.69314718]])