In [1]:
# Smoke test: if this message appears below, the kernel is executing cells.
print("I have set up the notebook and have it running, yay")

I have set up the notebook and have it running, yay


# Indexing
The following cells will demonstrate how a small index works

In [2]:
import pandas as pd

In [3]:
# Documents to be indexed (the toy corpus used throughout this notebook)
documents = [
    'Boats are great',
    'Sailing boats are enviromental',
    'canal boats are narrow',
    'wooden boats are more work than metal boats',
    'a wooden boat is always the prettiest',
]

# Tokenise every document into its list of terms.
# split() with no argument splits on any run of whitespace, so repeated spaces
# never produce empty-string tokens. It is also the same tokenisation the
# vocabulary cells below use, which keeps the terms and the vocabulary in sync.
document_terms = [doc.split() for doc in documents]

In [4]:
# Inspect the tokenised corpus: one list of terms per document
document_terms

[['Boats', 'are', 'great'],
 ['Sailing', 'boats', 'are', 'enviromental'],
 ['canal', 'boats', 'are', 'narrow'],
 ['wooden', 'boats', 'are', 'more', 'work', 'than', 'metal', 'boats'],
 ['a', 'wooden', 'boat', 'is', 'always', 'the', 'prettiest']]

**The long way**

In [5]:
# Build the "vocabulary": every distinct word that occurs anywhere in the corpus.
# A set is used while collecting so that repeated words are stored only once.
vocabulary = set()

for doc in documents:
  vocabulary.update(doc.split())

# Convert to a list so the terms have a fixed position (used as column order later)
vocabulary = list(vocabulary)

vocabulary

['boats',
 'are',
 'boat',
 'work',
 'wooden',
 'more',
 'a',
 'than',
 'prettiest',
 'the',
 'enviromental',
 'is',
 'always',
 'canal',
 'narrow',
 'Sailing',
 'great',
 'Boats',
 'metal']

In [6]:
# CODING TIP: You can also write this in a single line
vocabulary2 = list({word for doc in documents for word in doc.split()})

# Let's check that the two methods lead to the same result.
# Both lists were created from sets, whose iteration order is not guaranteed,
# so compare them in sorted order instead of position by position.
sorted(vocabulary) == sorted(vocabulary2)

True

In [7]:
# Boolean matrix representation of all docs:
# one row per document, one column per vocabulary term,
# 1 when the term occurs in the document and 0 otherwise.
documents_vectorized = [
    [1 if term in doc_terms else 0 for term in vocabulary]
    for doc_terms in document_terms
]

documents_vectorized

[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
 [1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]]

In [8]:
# CODING TIP: Alternatively, cast the boolean result of `term in doc` to an int
# (int(True) is 1 and int(False) is 0)
documents_vectorized2 = [
    [int(term in doc_terms) for term in vocabulary]
    for doc_terms in document_terms
]

# Let's check that the two methods lead to the same result
documents_vectorized2 == documents_vectorized

True

In [9]:
# Show the matrix as a DataFrame for clarity: each column is labelled with its vocabulary term
df = pd.DataFrame(data=documents_vectorized, columns=vocabulary)
df

Unnamed: 0,boats,are,boat,work,wooden,more,a,than,prettiest,the,enviromental,is,always,canal,narrow,Sailing,great,Boats,metal
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
3,1,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,1,0,1,0,1,1,0,1,1,0,0,0,0,0,0


In [10]:
# An important part of the index is a dictionary of term -> doc_id pairs, so let's create one.
# Start with an empty list of doc ids for every term in the vocabulary.
term_occurence_dict = {term: [] for term in vocabulary}

# Record the doc id once per occurrence, so a term that appears twice
# in the same document gets that doc id listed twice.
for doc_id, doc in enumerate(document_terms):
  for term in doc:
    doc_ids_for_term = term_occurence_dict[term]
    doc_ids_for_term.append(doc_id)

In [11]:
# Inspect the index: each term maps to the ids of the documents it occurs in
term_occurence_dict

{'boats': [1, 2, 3, 3],
 'are': [0, 1, 2, 3],
 'boat': [4],
 'work': [3],
 'wooden': [3, 4],
 'more': [3],
 'a': [4],
 'than': [3],
 'prettiest': [4],
 'the': [4],
 'enviromental': [1],
 'is': [4],
 'always': [4],
 'canal': [2],
 'narrow': [2],
 'Sailing': [1],
 'great': [0],
 'Boats': [0],
 'metal': [3]}

In [12]:
# CODING TIP: We can also use the defaultdict class from the collections module.
# defaultdict(list) creates an empty list the first time a missing key is accessed,
# so there is no need to pre-populate the dictionary from the vocabulary.
from collections import defaultdict

term_occurence_dict = defaultdict(list)

for doc_id, doc in enumerate(document_terms):
  for term in doc:
    doc_ids_for_term = term_occurence_dict[term]
    doc_ids_for_term.append(doc_id)

In [13]:
# Inspect the defaultdict-based index: each term maps to the ids of the documents it occurs in
term_occurence_dict

defaultdict(list,
            {'Boats': [0],
             'are': [0, 1, 2, 3],
             'great': [0],
             'Sailing': [1],
             'boats': [1, 2, 3, 3],
             'enviromental': [1],
             'canal': [2],
             'narrow': [2],
             'wooden': [3, 4],
             'more': [3],
             'work': [3],
             'than': [3],
             'metal': [3],
             'a': [4],
             'boat': [4],
             'is': [4],
             'always': [4],
             'the': [4],
             'prettiest': [4]})

In [14]:
# To find all documents containing the word "boats" OR "than" we no longer have to
# scan every document: we look each term up in the index and fetch the listed documents.
terms = ['boats', 'than']
result_docs = []

for term in terms:
  result_docs.extend(documents[doc_id] for doc_id in term_occurence_dict[term])

# A document can be collected more than once (it may match several terms, or
# contain a term twice), so remove the duplicates.
result_docs = list(set(result_docs))

In [15]:
# Show the retrieved documents (the list was built from a set, so its order is arbitrary)
result_docs

['canal boats are narrow',
 'wooden boats are more work than metal boats',
 'Sailing boats are enviromental']

In [16]:
# CODING TIP: it's faster and easier to get rid of duplicate integers than duplicate text,
# so deduplicate the document ids first and only then look up the documents.
doc_indexes = []

# Get all doc indexes containing the terms
for term in terms:
  doc_indexes.extend(term_occurence_dict[term])

# Retrieve each matching document exactly once, in ascending id order,
# so the result does not depend on set iteration order.
result_docs2 = [documents[idx] for idx in sorted(set(doc_indexes))]

# Check that both methods lead to the same answer.
# sorted() returns new lists, so neither result list is modified by the comparison
# and re-running this cell always gives the same outcome.
sorted(result_docs) == sorted(result_docs2)

True

In [17]:
# Show the documents retrieved via the deduplicated doc ids
result_docs2

['Sailing boats are enviromental',
 'canal boats are narrow',
 'wooden boats are more work than metal boats']

In [18]:
# TASK: How about if we want only documents with "boats" AND "than"?

**The short way**

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Create a CountVectorizer with its default settings.
# See https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
vectorizer = CountVectorizer()

In [21]:
# Vectorise the raw documents. Note that this rebinds `documents_vectorized`,
# replacing the list-of-lists matrix built in "The long way".
documents_vectorized = vectorizer.fit_transform(documents)
# The result is a sparse matrix representation of the data. It is easy to turn into
# a numpy array with .toarray(); however, don't do that unless you have to, as it is costly.
documents_vectorized

<5x17 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [22]:
# The terms extracted by the vectorizer (used below as the column labels of the matrix).
# Note that this rebinds `vocabulary`, replacing the list built in "The long way".
vocabulary = vectorizer.get_feature_names_out()
vocabulary

array(['always', 'are', 'boat', 'boats', 'canal', 'enviromental', 'great',
       'is', 'metal', 'more', 'narrow', 'prettiest', 'sailing', 'than',
       'the', 'wooden', 'work'], dtype=object)

In [24]:
# pandas is used here only to visualise the matrix with labelled columns.
# .toarray() converts the sparse matrix to a dense array; that is fine for this
# tiny example, but avoid it on large corpora (see the cost note above fit_transform).
df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
df

Unnamed: 0,always,are,boat,boats,canal,enviromental,great,is,metal,more,narrow,prettiest,sailing,than,the,wooden,work
0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0
3,0,1,0,2,0,0,0,0,1,1,0,0,0,1,0,1,1
4,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0


In [25]:
# TASK: Do you notice anything different about the vocabulary produced by CountVectorizer compared to the vocabulary of our longer method?

In [26]:
# TASK : What other methods can be used to preprocess words (tokens) in a corpus of text documents? Describe two

In [27]:
# TASK : Write code to calculate the term frequency of the word 'boats' in the document with index 3 (i.e. the 4th document, since indexes start at 0)

In [29]:
# Term frequency = the count stored for the term in that document's row.
# Rows are labelled from 0, so row label 3 is the fourth document.
df.loc[3, 'boats']

2

In [28]:
# TASK : Write code to calculate the document frequency of the word 'are'

In [30]:
# Document frequency = the number of documents that contain the term at least once.
# The column holds per-document counts, so summing it would count every occurrence
# (and over-count a document that uses the term twice). Instead, count the rows
# whose count is greater than zero.
(df['are'] > 0).sum()

4