In [None]:
# Sanity check: if this message prints, the kernel is running and cells execute
print('I have set up the notebook and have it running, yay')

# Indexing
The following cells demonstrate, step by step, how a small index is built and queried.

In [2]:
import pandas as pd

In [3]:
# Documents to be indexed: each string is one document, and its position
# in the list is used as its document id in the cells that follow.
documents = [
             'Boats are great',
             'Sailing boats are enviromental',
             'canal boats are narrow',
             'wooden boats are more work than metal boats',
             'a wooden boat is always the prettiest'
]
# Tokenise each document on any run of whitespace.
# Plain split() (rather than split(' ')) matches how the vocabulary is built
# from doc.split() further down, so every token produced here is guaranteed to
# be a vocabulary entry, and repeated spaces never yield empty-string tokens.
document_terms = [doc.split() for doc in documents]

In [None]:
# Display the tokenised corpus: one list of terms per document
document_terms

**The long way**

In [None]:
# Build the "vocabulary": every distinct word that occurs anywhere in the corpus.
# A set is used while collecting so that repeated words are stored only once.
vocabulary = set()

for doc in documents:
  # add all words of this document in one go
  vocabulary.update(doc.split())

# Freeze the set into a list so the terms can be used as matrix columns later
vocabulary = list(vocabulary)

vocabulary

In [None]:
# CODING TIP: You can also write this in a single line
vocabulary2 = list(set(word for doc in documents for word in doc.split()))

# Let's check that the two methods lead to the same result.
# Both lists were produced from sets, and the iteration order of a set is not
# guaranteed, so compare the contents order-independently (sorted) instead of
# relying on the two lists happening to have their elements in the same order.
sorted(vocabulary) == sorted(vocabulary2)

In [None]:
# Boolean (presence/absence) matrix of the corpus:
# one row per document, one column per vocabulary term,
# 1 if the term occurs in the document and 0 otherwise.
documents_vectorized = [
    [1 if word in tokens else 0 for word in vocabulary]
    for tokens in document_terms
]

documents_vectorized

In [None]:
# CODING TIP: Alternatively, cast the boolean result of `term in doc` to an int
# (True -> 1, False -> 0) instead of spelling out the if/else.
documents_vectorized2 = [
    [int(word in tokens) for word in vocabulary]
    for tokens in document_terms
]

# Let's check that the two methods lead to the same result
documents_vectorized2 == documents_vectorized

In [None]:
# Show the boolean matrix as a pandas DataFrame for clarity:
# rows are documents (by id), columns are labelled with the vocabulary terms.
df = pd.DataFrame(data=documents_vectorized, columns=vocabulary)
df

In [10]:
# An important part of the index is a dictionary of term: doc_ids pairs. So let's create one.
# Every vocabulary term starts with an empty list of document ids.
term_occurence_dict = {term: [] for term in vocabulary}

for doc_id, doc in enumerate(document_terms):
  # dict.fromkeys(doc) drops repeated terms within one document while keeping
  # their order, so a document id is recorded at most once per term
  # (e.g. 'boats' occurs twice in document 3 but should list it only once).
  for term in dict.fromkeys(doc):
    term_occurence_dict[term].append(doc_id)

In [None]:
# Display the index: each term mapped to the ids of the documents containing it
term_occurence_dict

In [12]:
# CODING TIP: We can also use the defaultdict class from the collections module.
# defaultdict(list) creates an empty list the first time a missing key is accessed,
# so the index does not need to be pre-populated from the vocabulary.
from collections import defaultdict

term_occurence_dict = defaultdict(list)

for doc_id, doc in enumerate(document_terms):
  # dict.fromkeys(doc) drops repeated terms within one document while keeping
  # their order, so a document id is recorded at most once per term
  # (e.g. 'boats' occurs twice in document 3 but should list it only once).
  for term in dict.fromkeys(doc):
    term_occurence_dict[term].append(doc_id)

In [None]:
# Display the defaultdict-based index built in the previous cell
term_occurence_dict

In [14]:
# To find all documents containing "boats" OR "than" we no longer scan every
# document: we look each term up in the index and collect the documents it points to.
terms = ['boats', 'than']
result_docs = set()  # a set, so a document matched by several terms is kept once

for term in terms:
  for doc_id in term_occurence_dict[term]:
    result_docs.add(documents[doc_id])

result_docs = list(result_docs)

In [None]:
# Display the documents matching "boats" OR "than"
result_docs

In [None]:
# CODING TIP: it's faster and easier to get rid of duplicate integers than duplicate text.
# So de-duplicate the document ids first and only then fetch the document strings.

# Gather the ids of all documents containing any of the terms (may contain repeats)
doc_indexes = [doc_id for term in terms for doc_id in term_occurence_dict[term]]

# Retrieve each matching document once, via the set of unique ids
result_docs2 = [documents[idx] for idx in set(doc_indexes)]

# Check that both methods lead to the same answer.
# Both lists are sorted in place first so the comparison ignores ordering.
result_docs.sort()
result_docs2.sort()
result_docs == result_docs2

In [None]:
# Display the (now sorted) documents found via the de-duplicated ids
result_docs2

In [19]:
# TASK: How about if we want only documents with "boats" AND "than"?

**The short way**

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# See https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# Create the vectorizer with its default settings (no arguments are passed)
vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer to the corpus and transform the corpus in one call.
# NOTE: this rebinds `documents_vectorized`, replacing the list of lists built in "The long way".
documents_vectorized = vectorizer.fit_transform(documents)
# The result is a sparse matrix representation of the data. It is easy to turn into a
# numpy array with .toarray(); however, don't do that unless you have to, as it is costly.
documents_vectorized

In [None]:
# Terms of the fitted vectorizer; used as the column labels in the next cell.
# NOTE: this rebinds `vocabulary`, replacing the list built in "The long way".
vocabulary = vectorizer.get_feature_names_out()
vocabulary

In [None]:
# Use pandas only to visualise the result: one row per document, one column per term.
# .toarray() turns the sparse matrix into a dense numpy array, which is needed
# here for display; keep real computations on the sparse/numpy data instead.
df = pd.DataFrame(data=documents_vectorized.toarray(), columns=vocabulary)
df

In [25]:
# TASK: Do you notice anything different about the vocabulary produced by CountVectorizer compared to the vocabulary from our longer method?

In [26]:
# TASK : What other methods can be used to preprocess words (tokens) in a corpus of text documents? Describe two

In [27]:
# TASK : Write code to calculate the term frequency of the word 'boats' in the 3rd document

In [28]:
# TASK : Write code to calculate the document frequency of the word 'are'