In [35]:
import numpy as np
import collections

#Part 1
def doc_vocab(documents):
    """Creates a dictionary of all words and assigns a unique index to each word in the collection"""
    doc_vocab = {}
    wordSet = set()
    #Gets the set of unique words across all documents
    for document in documents:
        words = document.split()
        wordSet = wordSet.union(set(words))
    #Assigns a unique index to each word in the wordSet
    i = 0
    for word in wordSet:
        doc_vocab[word] = i
        i += 1
    return doc_vocab
               

In [53]:
#Part 2
def doc_term_matrix(vocab, documents):
    """Goes through each document and creates an array with the values of each word in the document indexed by vocab"""
    cols = len(vocab)
    #Initialize empty matrix
    doc_term_matrix = []
    #Go through each document. Split into words. Get the wordFreq count. 
    for document in documents:
        words = document.split()
        wordFreq = collections.Counter(words)
        #Initialize a temp empty array of zeroes
        temp = np.zeros(cols)
        #Fill in the empty array with the number of times a word appears in the document 
        #The words are indexed by the order in the vocab dictionary
        for key,value in wordFreq.items():
            index = vocab[key]
            temp[index] = value
        #Append the temp array to the document term matrix
        doc_term_matrix.append(temp)
    #Convert the doc_term_matrix from list into array and return it
    doc_term_matrix = np.array(doc_term_matrix)
    return doc_term_matrix
            
   

In [55]:
#Part 3
def tf_matrix(document_term_matrix):
    """Divides all values in DTM by the number of terms in document"""
    cols = len(document_term_matrix[0])
    tf_matrix = document_term_matrix/cols
    return tf_matrix

In [86]:
#Part 4
def tf_idf_matrix(document_term_matrix, tf_matrix):
    """First calculates the IDF of the terms, and then calculates the tf_idf matrix"""
    total_docs = len(document_term_matrix)
    #Create a boolean array for where values !=0 in the dtm. This lets us know if a term appears in a document or not
    #We sum the true values over the columns, so we know for each term, how many documents it appeared in
    
    booleanArray = (document_term_matrix != 0)
    idf = np.sum(booleanArray, axis = 0)
    
    #Once we have the number of documents that contain each term we calculate the idf
    idf = np.log10(total_docs/idf)
    
    #Once we have the idf, we calculate tf_idf
    tf_idf = tf_matrix * idf
    return tf_idf

In [87]:
#Part 5
documents = ['Julie loves me more than Linda loves me', 
             'Jane likes me more than Julie loves me', 
             'He likes basketball more than baseball']
vocab = doc_vocab(documents)
dtm = doc_term_matrix(vocab,documents)
tf = tf_matrix(dtm)
tf_idf = tf_idf_matrix(dtm, tf)

print("Below is the Vocab")
print(vocab)
print("Below is the Document Term Matrix")
print(dtm)
print("Below is the Term Frequency Matrix")
print(tf)
print("Below is the TF-IDF Matrix")
print(tf_idf)

Below is the Vocab
{'He': 0, 'Jane': 1, 'than': 2, 'baseball': 3, 'basketball': 4, 'likes': 5, 'more': 6, 'loves': 7, 'Linda': 8, 'Julie': 9, 'me': 10}
Below is the Document Term Matrix
[[ 0.  0.  1.  0.  0.  0.  1.  2.  1.  1.  2.]
 [ 0.  1.  1.  0.  0.  1.  1.  1.  0.  1.  2.]
 [ 1.  0.  1.  1.  1.  1.  1.  0.  0.  0.  0.]]
Below is the Term Frequency Matrix
[[ 0.          0.          0.09090909  0.          0.          0.
   0.09090909  0.18181818  0.09090909  0.09090909  0.18181818]
 [ 0.          0.09090909  0.09090909  0.          0.          0.09090909
   0.09090909  0.09090909  0.          0.09090909  0.18181818]
 [ 0.09090909  0.          0.09090909  0.09090909  0.09090909  0.09090909
   0.09090909  0.          0.          0.          0.        ]]
Below is the TF-IDF Matrix
[[ 0.          0.          0.          0.          0.          0.          0.
   0.03201659  0.04337466  0.0160083   0.03201659]
 [ 0.          0.04337466  0.          0.          0.          0.0160083
   0