# Lab 3: Information Retrieval

Student: John Wu

In [65]:
import numpy as np
import sys, re, nltk, time
from collections import Counter
from operator import itemgetter

## (a) Build in-memory inverted file

The input documents are processed one by one, and each document's result is merged into an inverted file, which is stored as a dictionary. This is done by the two utility functions below.

In [51]:
def processDoc(txt, docID, vocab):
    """Tokenize one document and merge its token counts into the inverted file.

    Parameters
    ----------
    txt : str
        Raw document text; it is split into tokens by ``nltk.word_tokenize``
        with no further normalization applied here.
    docID
        Identifier stored in every posting created for this document
        (``processDocsFile`` passes an int).
    vocab : dict
        Inverted file mapping token -> list of ``(docID, count)`` postings,
        where ``count`` is how many times the token occurs in this document.
        It is updated in place.

    Returns
    -------
    tuple
        ``(vocab, counts)``: the same ``vocab`` object that was passed in and
        the ``Counter`` of token occurrences for this single document.
    """
    counts = Counter(nltk.word_tokenize(txt))
    for token, freq in counts.items():
        # setdefault creates the posting list the first time a token is seen,
        # so new and already-known tokens share the same append path.
        vocab.setdefault(token, []).append((docID, freq))
    return vocab, counts

def processDocsFile(docFile):
    """Build an in-memory inverted file from a tab-separated documents file.

    Every non-blank line must have the form ``<docID><TAB><text>``. Only the
    first tab separates the ID from the text, so tabs inside the text are kept
    as part of the document. Blank lines are skipped.

    Parameters
    ----------
    docFile : str
        Path of the documents file, opened in text mode.

    Returns
    -------
    tuple
        ``(vcb, nDocs)``: ``vcb`` maps each token to its posting list of
        ``(docID, count)`` tuples sorted by docID, and ``nDocs`` is the number
        of documents processed.

    Raises
    ------
    ValueError
        If a non-blank line has no tab, or its docID field is not an integer.
    OSError
        If ``docFile`` cannot be opened.
    """
    nDocs = 0  # number of documents processed
    vcb = dict()  # inverted file: token -> posting list

    with open(docFile, 'r') as f:
        for line in f:  # stream line by line; the file may be large
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing empty line
            # maxsplit=1: a tab inside the text must not break the unpacking
            docID, txt = line.split('\t', 1)
            vcb, _ = processDoc(txt, int(docID), vcb)  # per-doc counts unused
            nDocs += 1

    # Documents need not appear in docID order in the file, so order every
    # posting list by docID once all documents have been merged.
    for postings in vcb.values():
        postings.sort(key=itemgetter(0))

    return vcb, nDocs

Parse the TIME dataset and build the inverted file.

In [86]:
################################################################################
# Build the inverted file for the TIME collection and time the build.
# timeVcb: token -> posting list; timeDocs: number of documents processed.
# t0/t1 bracket the build with time.process_time(), i.e. CPU time of this
# process rather than wall-clock time; the difference is reported in a later cell.
fName = './data/time-documents.txt'
t0 = time.process_time()
timeVcb, timeDocs = processDocsFile(fName)
t1 = time.process_time()

__Posting List Tuples for Terms__

In [55]:
# Show at most the first 10 postings, as (docID, count) tuples, for a few
# sample terms. The lookup is an exact, case-sensitive dict access, so a term
# that never occurs in the collection raises KeyError.
terms = ['COMPUTER', 'THAILAND', 'ROCKETS']
for t in terms:
    posts = timeVcb[t][:10]
    print('%s -> %s'%(t,posts))

COMPUTER -> [(308, 1)]
THAILAND -> [(203, 1), (243, 5), (280, 14), (396, 1), (449, 1), (498, 1), (516, 1), (534, 5), (543, 12), (544, 2)]
ROCKETS -> [(27, 1), (117, 1), (186, 1), (313, 6), (404, 1), (464, 2), (495, 1), (509, 2), (545, 2)]


__Print DF and IDF__

In [56]:
# DF is taken as the length of a term's posting list: each processed document
# adds at most one posting per term (assuming no docID repeats in the input).
# NOTE(review): IDF is printed as the plain reciprocal 1/DF, not a log-scaled
# N/DF form — confirm this matches the definition the lab asks for.
for t in terms:
    df = len(timeVcb[t])
    print('%s: DF=%d, IDF=%f'%(t,df,1/df))

COMPUTER: DF=1, IDF=1.000000
THAILAND: DF=11, IDF=0.090909
ROCKETS: DF=9, IDF=0.111111


__Timing of Processing Documents__

The time is measured as CPU process time (`time.process_time()`), not wall-clock time.

In [88]:
# Elapsed CPU time, in seconds, between the t0 and t1 marks taken around the
# index build.
t = t1-t0
# divmod yields (whole minutes, leftover seconds); the seconds field must be
# the remainder, otherwise any run of a minute or more reports the total
# elapsed seconds next to the minute count.
print('Processed in %d minutes and %.3f seconds.'%divmod(t, 60))

Processed in 0 minutes and 1.823 seconds.


## (b) Document vector length