Words that occur nearby frequently are more important than words that only appear once or twice. Yet words that are too frequent — ubiquitous, like "the" or "good" — are unimportant. How can we balance these two conflicting constraints?

In [58]:
import numpy as np
import os
import math
from functools import reduce
from collections import defaultdict

# directory where documents are stored (relative to the notebook's working directory)
documents_directory = os.path.expanduser('documents')

# Keep only regular .txt files, sorted by name, so this list matches the files
# that get_all_unique_terms reads. Stray entries such as .ipynb_checkpoints/ or
# .DS_Store would otherwise be treated as documents further down.
documents = sorted(
    name for name in os.listdir(documents_directory)
    if name.endswith('.txt')
    and os.path.isfile(os.path.join(documents_directory, name))
)

In [59]:
# gets every term (duplicates included) from a specific document
def get_all_terms_from_document(file_path, encoding=None):
    """Return all whitespace-separated tokens of a text file, in file order.

    Duplicates are kept on purpose: callers that count term occurrences need
    every token, and callers that want distinct terms can wrap the result in
    ``set``.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    list of str
        The tokens produced by ``str.split()`` (empty list for an empty file).
    """
    with open(file_path, 'r', encoding=encoding) as file:
        document = file.read()
    # split() with no arguments splits on any run of whitespace
    return document.split()

In [60]:
# collects the distinct terms found across the .txt documents in a directory
def get_all_unique_terms(directory):
    """Return the set of distinct terms over every ``.txt`` file in ``directory``.

    Files whose names do not end in ``.txt`` are skipped. Each remaining file
    is tokenised with ``get_all_terms_from_document``.
    """
    vocabulary = set()
    text_files = (name for name in os.listdir(directory) if name.endswith(".txt"))

    for name in text_files:
        tokens = get_all_terms_from_document(os.path.join(directory, name))
        vocabulary.update(tokens)

    return vocabulary

# sorted vocabulary: a term's position in this list is its row index later on
unique_terms = sorted(get_all_unique_terms(documents_directory))

In [61]:
# Take the document count from the same `documents` list that assigns column
# indices (document_to_index), instead of re-listing the directory, so the
# matrix always has exactly one column per document.
num_documents = len(documents)
num_terms = len(unique_terms)

print(f'Number of documents {num_documents}')
print(f'Number of terms {num_terms}')

# init the term_frequency matrix: one row per term, one column per document
term_frequency = np.zeros((num_terms, num_documents), dtype=float)

Number of documents 5
Number of terms 32575


We need to count how many times each term occurs in each document and store the counts in a matrix of size $t \times d$ (one row per term, one column per document):
$$
\text{term\_frequency}_{term,document} = \text{count}(term, document)
$$

In [62]:
# lookup tables that map a term / document label to its row / column index
term_to_index = dict(zip(unique_terms, range(len(unique_terms))))
document_to_index = dict(zip(documents, range(len(documents))))

# get the counts of terms in each document
def compute_term_frequency_matrix(directory, documents, tf):
    """Fill ``tf`` in place with raw term counts, one column per document.

    Parameters
    ----------
    directory : str
        Folder that contains the document files.
    documents : iterable of str
        File names (relative to ``directory``) whose terms are counted.
    tf : 2-D array
        Matrix that receives the counts. The cell
        ``tf[term_to_index[t], document_to_index[d]]`` is set to the number of
        times term ``t`` occurs in document ``d``; other cells are untouched.

    Returns
    -------
    None
        The result is written into ``tf``.

    Raises
    ------
    KeyError
        If a term or document is missing from ``term_to_index`` /
        ``document_to_index``.
    """
    for document in documents:
        file_path = os.path.join(directory, document)

        # tally how often each token occurs in this document
        word_frequencies = defaultdict(int)
        for word in get_all_terms_from_document(file_path):
            word_frequencies[word] += 1

        # write into the matrix that was passed in, not the notebook-level global
        for word, count in word_frequencies.items():
            tf[term_to_index[word], document_to_index[document]] = count
    
# fill term_frequency in place with the raw counts for every document
compute_term_frequency_matrix(documents_directory, documents, term_frequency)

In [65]:
# look up the stored value of a single term in every document

term = 'ebook'

def term_lookup(term, term_frequency):
    """Print the value stored for ``term`` in each document, one line per document.

    ``term_frequency`` is indexed as ``[term row, document column]`` using the
    notebook-level ``term_to_index`` and ``document_to_index`` maps. Values are
    truncated with ``int`` before printing. An unknown term raises ``KeyError``
    before anything is printed.
    """
    row = term_to_index[term]
    print(f'Term frequency for {term}:')
    for name in documents:
        value = int(term_frequency[row, document_to_index[name]])
        print(f'{name}: {value}')
    

# show the raw per-document counts for the chosen term
term_lookup(term, term_frequency)

Term frequency for ebook:
1342.txt: 13
145.txt: 13
1513.txt: 13
2641.txt: 6
2701.txt: 13


Now let's squash the raw frequency using log10. A word appearing 100 times in a document doesn’t make that word 100 times more likely to be relevant to the meaning of the document. We also need to do something special with counts of 0, since we can’t take the log of 0.

$$
\text{term\_frequency}_{term,document} =
\begin{cases}
1 + \log_{10}(\text{count}(term,document)) & \text{if } \text{count}(term,document) > 0 \\
0 & \text{otherwise}
\end{cases}
$$

In [67]:


# NOTE(review): this is a placeholder elementwise transform (2 * count + 1), not
# the 1 + log10(count) squash described in the text above. It appears to be a
# scratch check that transforming the whole matrix changes the lookup output.
result = 2 * term_frequency + 1

# print the chosen term before and after the transform
for matrix in (term_frequency, result):
    term_lookup(term, matrix)




def squash(x):
    """Log-scale a single raw term count.

    Implements the weighting from the formula above this cell:
    ``1 + log10(count)`` when the count is positive, and ``0`` when the count
    is zero (log of 0 is undefined).

    Parameters
    ----------
    x : int or float
        Raw count of a term in a document; expected to be >= 0.

    Returns
    -------
    int or float
        ``0`` for a zero count, otherwise ``1 + math.log10(x)``.

    Raises
    ------
    ValueError
        If ``x`` is negative (``math.log10`` rejects it).
    """
    if x == 0:
        return 0
    return 1 + math.log10(x)

Term frequency for ebook:
1342.txt: 13
145.txt: 13
1513.txt: 13
2641.txt: 6
2701.txt: 13
Term frequency for ebook:
1342.txt: 27
145.txt: 27
1513.txt: 27
2641.txt: 13
2701.txt: 27
