# Revision of the whole indexing process

![Sort-Based-Index](img/indexing-steps.png)

In [9]:
!pip install nltk




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Preprocessing:

In [11]:
from nltk.tokenize import sent_tokenize, TweetTokenizer
import nltk
nltk.download('punkt_tab')
from string import punctuation
tokenizer = TweetTokenizer()

def preprocess_document(content):
    """
    Returns a list of tokens for a document's content. 
    Tokens should not contain punctuation and should be lower-cased.
    """
    sentences = sent_tokenize(content)
    tokens = []
    for _sent in sentences:
        sent_tokens = tokenizer.tokenize(_sent)
        sent_tokens = [_tok.lower() for _tok in sent_tokens if _tok not in punctuation]
        tokens += sent_tokens
    return tokens

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [12]:
preprocess_document(open('data/mini_newsgroups/rec.autos/101629').read())[:10]

['path',
 'cantaloupe.srv.cs.cmu.edu',
 'crabapple.srv.cs.cmu.edu',
 'fs7.ece.cmu.edu',
 'europa.eng.gtefsd.com',
 'howland.reston.ans.net',
 'wupost',
 'uunet',
 'caen',
 'rphroy']

# Extract Pairs of (token, document_id) tuples 
These will eventually end up sorted by document_id.

In [1]:
from os import scandir # can be used for easier iteration of documents in a folder
# can check is_file() on the objects returned by scan_dir 
# contain whole document path, so no need to join with the directory

import os

def get_token_doc_id_pairs(category_dir):
    """
    Iteratively goes through the documents in the category_dir and constructs/returns:
    1. A list of (token, doc_id) tuples
    2. A dictionary of doc_id:doc_name
    """
    pass

In [153]:
token_docid, doc_ids = get_token_doc_id_pairs('data/mini_newsgroups/rec.autos/')

In [154]:
print(doc_ids[2])

101592


In [155]:
token_docid[:10]

[('path', 0),
 ('cantaloupe.srv.cs.cmu.edu', 0),
 ('das-news.harvard.edu', 0),
 ('ogicse', 0),
 ('uwm.edu', 0),
 ('wupost', 0),
 ('uunet', 0),
 ('world', 0),
 ('edwards', 0),
 ('from', 0)]

__Example output:__ <br>
token_docid, doc_ids = get_token_doc_id_pairs('data/mini_newsgroups/rec.autos/')<br>
print(doc_ids[2])<br>
token_docid[:10]<br>

> DirEntry '101577' <br>
>[('newsgroups', 0),
> ('rec.autos', 0),
> ('path', 0),
> ('cantaloupe.srv.cs.cmu.edu', 0),
> ('magnesium.club.cc.cmu.edu', 0),
> ('news.sei.cmu.edu', 0),
> ('fs7.ece.cmu.edu', 0),
> ('europa.eng.gtefsd.com', 0),
> ('howland.reston.ans.net', 0),
> ('ux1.cso.uiuc.edu', 0)]

# Sort by token

In [156]:
from operator import itemgetter
from collections import defaultdict
sorted_token_docid = sorted(token_docid, key=itemgetter(0))
sorted_token_docid[-10:]

[('zaphod.mps.ohio-state.edu', 97),
 ('zaphod.mps.ohio-state.edu', 98),
 ('zaphod.mps.ohio-state.edu', 99),
 ('zauberer', 57),
 ('zeolite', 40),
 ('zip', 49),
 ('zip', 49),
 ('zx', 79),
 ('zx-r', 94),
 ("|'8", 71)]

# Merge token occurrences in a single document -> (token, doc_id, term_freq) tuples

In [157]:
def merge_token_in_doc(sorted_token_docid):
    """
    Returns a list of (token, doc_id, term_freq) tuples from a sorted list of (token, doc_id) list, 
    where if a token appears n times in a doc_id, we merge it in a tuple (toke, doc_id, n).
    """
    pass

In [158]:
merged_token_in_docs = merge_token_in_doc(sorted_token_docid)
merged_token_in_docs[-10:]

[('zaphod.mps.ohio-state.edu', '96', 1),
 ('zaphod.mps.ohio-state.edu', '97', 1),
 ('zaphod.mps.ohio-state.edu', '98', 1),
 ('zaphod.mps.ohio-state.edu', '99', 1),
 ('zauberer', '57', 1),
 ('zeolite', '40', 1),
 ('zip', '49', 2),
 ('zx', '79', 1),
 ('zx-r', '94', 1),
 ("|'8", '71', 1)]

__Example output:__ <br>

merge_token_in_doc = merge_token_in_doc(sorted_token_docid) <br>
merged_tokens_in_doc[-10:] <br>

>[('zaphod.mps.ohio-state.edu', 96, 1),
 ('zaphod.mps.ohio-state.edu', 97, 1),
 ('zaphod.mps.ohio-state.edu', 98, 1),
 ('zaphod.mps.ohio-state.edu', 99, 1),
 ('zauberer', 47, 1),
 ('zeolite', 49, 1),
 ('zip', 77, 2),
 ('zx', 86, 1),
 ('zx-r', 57, 1),
 ("|'8", 70, 1)]

# Split into Dictionary and Postings (usually linked lists for each word)

In [159]:
from collections import defaultdict
dictionary = defaultdict(lambda: (0, 0)) # term : doc_freq, tot freq
postings = defaultdict(lambda: []) # term: doc_ids, doc_freq

for token, doc_id, doc_freq in merged_token_in_docs:
    dictionary[token] = (dictionary[token][0]+1, dictionary[token][0]+doc_freq)

# usually implemented as linked lists
for token, doc_id, doc_freq in merged_token_in_docs:
    postings[token].append((doc_id, doc_freq)) 

In [160]:
dictionary['zip'], dictionary['zaphod.mps.ohio-state.edu']

((1, 2), (51, 51))

In [161]:
postings['zip'], postings['zaphod.mps.ohio-state.edu'][:3]

([('49', 2)], [('3', 1), ('14', 1), ('17', 1)])

# Boolean Queries in the index

## AND query: we want to find documents which contain both 'living' and 'dead' in them.

We use a merging algorithm for conjunction queries, which simultaneously traverses the postings of the given words.

- Living: 1 -> 2 -> 5 -> 17 -> 30 -> 31 -> 44 -> 45 -> 47
- Dead: 5 -> 17 -> 44
- Intersection: 5 -> 17 -> 44

It takes __linear time__ over the number of documents the two words appear in. <br/>
It is important to have postings for each word __sorted by document id__.

In [141]:
def and_query(postings, word1, word2):
    """
    merging postings lists of two words
    """
    pass

In [144]:
doc_id = and_query(postings, 'living', 'dead')
doc_id

[36]

In [145]:
postings['living']

[(30, 1), (36, 1), (78, 1)]

In [146]:
postings['dead']

[(29, 1), (36, 1), (65, 1)]

__Example Output:__ <br>

doc_id = and_query(postings, 'living', 'dead') <br>
doc_ids[doc_id[0]] <br>

> DirEntry '102983'

## Questions:

- How about if we want to find a document containing N words? 
- What will be the execution time for queries with NOT/ OR and different combinations?
- What are the downsides of boolean queries?

## Advanced query features
* We can optimize with : processing in order of increasing freq; skip pointers
* Proximity
* Zones
* Phrase queries (bi-word indexes, positional indexes)