# Lab 3: Information Retrieval

Student: John Wu

In [1]:
import numpy as np
import sys, re, nltk, time, math
from collections import Counter
from operator import itemgetter

## (a) Build in-memory inverted file

The input documents are processed one by one, with each result appended to an inverted file, which is a dictionary. This is performed by the 2 utility functions below.

In [2]:
def processDoc(txt, docID, vocab):
    """Tokenize one document and merge its term counts into the inverted file.

    Parameters
    ----------
    txt : str
        Raw text of the document; tokenized with ``nltk.word_tokenize``.
    docID
        Identifier stored in every posting created for this document.
    vocab : dict
        Inverted file mapping token -> list of ``(docID, count)`` postings,
        where ``count`` is the number of times the token occurs in the
        document. It is updated in place.

    Returns
    -------
    tuple
        ``(vocab, counts)``: the same ``vocab`` object and the ``Counter`` of
        token occurrences for this document.
    """
    counts = Counter(nltk.word_tokenize(txt))  # per-document token counts
    for token, tf in counts.items():
        # setdefault creates the posting list the first time a token is seen
        vocab.setdefault(token, []).append((docID, tf))
    return vocab, counts

def processDocsFile(docFile):
    """Build an in-memory inverted file from a tab-separated documents file.

    Each non-blank line is expected to be ``<docID><TAB><text>``. Only the
    first tab separates the ID from the text, so any further tabs stay part of
    the document text. Blank lines are skipped.

    Parameters
    ----------
    docFile : str
        Path of the documents file.

    Returns
    -------
    tuple
        ``(vcb, nDocs)``: ``vcb`` maps each token to its posting list of
        ``(docID, count)`` tuples sorted by docID, and ``nDocs`` is the number
        of documents processed.

    Raises
    ------
    ValueError
        If a non-blank line has no tab separator or its docID is not an
        integer.
    """
    nDocs = 0 # count number of total docs processed
    vcb = dict() # dict for inverted file

    with open(docFile, 'r') as f:
        # NOTE: read line by line due to possibly large size
        for lineNo, line in enumerate(f, start=1):
            if not line.strip(): # tolerate blank lines (e.g. trailing newline)
                continue
            docID, sep, txt = line.partition('\t') # split on the FIRST tab only
            if not sep:
                raise ValueError('%s, line %d: expected "<docID><TAB><text>"'
                                 % (docFile, lineNo))
            vcb, _ = processDoc(txt, int(docID), vcb) # process single doc
            nDocs += 1

    # The file is no longer needed here, so sorting happens after it is closed.
    for posts in vcb.values():
        posts.sort(key=itemgetter(0)) # sort by first elem, or docID

    return vcb, nDocs

Parse the TIME dataset and build the inverted file.

In [4]:
################################################################################
# Build the inverted file for the TIME collection and time the whole build.
fName = './data/time-documents.txt'
t0 = time.perf_counter() # start of the timed region
timeInv, timeNdocs = processDocsFile(fName)
tt = time.perf_counter() - t0 # elapsed seconds for the build; printed in a later cell

__Posting List Tuples for Terms__

In [5]:
# Show at most the first 10 postings, as (docID, count) tuples, for a few sample terms.
terms = ['COMPUTER', 'THAILAND', 'ROCKETS']
for t in terms:
    posts = timeInv[t][:10] # posting lists were sorted by docID in processDocsFile
    print('%s -> %s'%(t,posts))

COMPUTER -> [(308, 1)]
THAILAND -> [(203, 1), (243, 5), (280, 14), (396, 1), (449, 1), (498, 1), (516, 1), (534, 5), (543, 12), (544, 2)]
ROCKETS -> [(27, 1), (117, 1), (186, 1), (313, 6), (404, 1), (464, 2), (495, 1), (509, 2), (545, 2)]


__Print DF and IDF__

In [6]:
# Document frequency (DF) of a term is the length of its posting list.
# NOTE(review): the value labelled "IDF" here is the plain reciprocal 1/df,
# whereas calcDocLens weights terms with log2(1.0 + nDocs/df); the two are
# different quantities.
for t in terms:
    df = len(timeInv[t])
    print('%s: DF=%d, IDF=%f'%(t,df,1/df))

COMPUTER: DF=1, IDF=1.000000
THAILAND: DF=11, IDF=0.090909
ROCKETS: DF=9, IDF=0.111111


__Timing of Processing Documents__

The time is measured with `time.perf_counter()`, i.e. it is elapsed wall-clock time rather than CPU process time.

In [7]:
# tt is the elapsed build time in seconds; split it into whole minutes and remaining seconds.
print('Processed in %d minutes and %.3f seconds.'%(tt//60,tt%60))

Processed in 0 minutes and 6.278 seconds.


## (b) Document vector length

The function below implements the algorithm provided in the assignment. Note that the IDF is computed as $\log_2(1 + N/\mathrm{df})$: adding 1 inside the logarithm ensures that the weight does not become 0 for a term that appears in every document.

In [8]:
def calcDocLens(vcb, nDocs):
    """Compute the TF-IDF vector length of every document plus per-term IDF.

    Parameters
    ----------
    vcb : dict
        Inverted file mapping term -> posting list of ``(docID, tf)`` tuples.
    nDocs : int
        Number of documents in the collection.

    Returns
    -------
    tuple
        ``(docLens, idfs)``: ``docLens`` is a ``Counter`` mapping docID to the
        square root of the summed squared ``tf*idf`` weights of that document,
        and ``idfs`` maps term -> ``(df, idf)`` with ``df = len(postings)``
        and ``idf = log2(1.0 + nDocs/df)``.
    """
    idfs = {}
    sqSums = Counter()  # keyed by docID, which need not be contiguous

    for term, posts in vcb.items():
        nPosts = len(posts)  # document frequency of the term
        idf = math.log2(1.0 + nDocs/nPosts)  # the 1.0 keeps idf > 0 when df == nDocs
        idfs[term] = (nPosts, idf)
        for docID, tf in posts:
            sqSums[docID] += (tf*idf)**2  # accumulate squared term weight

    # Vector length is the square root of the accumulated squared weights.
    docLens = Counter({docID: math.sqrt(total) for docID, total in sqSums.items()})
    return docLens, idfs

__Document Vector Lengths__

In [9]:
# Compute vector lengths and IDFs for the TIME index; both are reused by the query cells.
timeDocLens,timeIDFs = calcDocLens(timeInv, timeNdocs)
tmp = sorted(timeDocLens.items(), key=itemgetter(0))[:10] # sorted by docID
for docID,docLen in tmp: # print 10 lowest by numerical docID
    print('DocID=%d, length=%f'%(docID,docLen))

DocID=17, length=187.101712
DocID=18, length=71.207738
DocID=19, length=155.398830
DocID=20, length=75.471414
DocID=21, length=185.960048
DocID=23, length=145.982374
DocID=24, length=243.981400
DocID=25, length=73.511986
DocID=26, length=135.834620
DocID=27, length=84.270999


## (c) Query representation

In [10]:
def processQueryFile(queryFile):
    """Read a tab-separated query file into ``(queryID, term counts)`` pairs.

    Each non-blank line is expected to be ``<queryID><TAB><text>``. Only the
    first tab separates the ID from the text, so any further tabs stay part of
    the query text. Blank lines are skipped.

    Parameters
    ----------
    queryFile : str
        Path of the query file.

    Returns
    -------
    list
        One ``(qID, Counter)`` tuple per query, in file order, where the
        ``Counter`` holds the counts of the tokens produced by
        ``nltk.word_tokenize`` for the query text.

    Raises
    ------
    ValueError
        If a non-blank line has no tab separator or its query ID is not an
        integer.
    """
    with open(queryFile, 'r') as f:
        lines = f.read().splitlines()

    queries = []
    for lineNo, line in enumerate(lines, start=1):
        if not line.strip(): # tolerate blank lines (e.g. trailing newline)
            continue
        qID, sep, qTxt = line.partition('\t') # split on the FIRST tab only
        if not sep:
            raise ValueError('%s, line %d: expected "<queryID><TAB><text>"'
                             % (queryFile, lineNo))
        queries.append((int(qID), Counter(nltk.word_tokenize(qTxt))))

    return queries

In [11]:
################################################################################
# Load the TIME queries as a list of (queryID, Counter of query tokens) pairs.
fName = './data/time-queries.txt'
timeQs = processQueryFile(fName)


In [14]:
# Print the tf and idf of every term in the first query and compute the
# query's vector length from the terms that occur in the corpus.
qLen = 0
for term,tf in timeQs[0][1].items():
    if term in timeIDFs:
        df,idf = timeIDFs[term] # timeIDFs maps term -> (df, idf)
        print('%s: tf=%d, idf=%f'%(term,tf,idf))
        qLen += (tf*idf) ** 2 # accumulate squared term weight
    else:
        # terms absent from the corpus contribute nothing to the length
        print('%s: not found in Corpus'%term)
qLen = math.sqrt(qLen)
    
print('\nQuery Vector Length: %f'%qLen)

KENNEDY: tf=1, idf=3.321928
ADMINISTRATION: tf=1, idf=4.539975
PRESSURE: tf=1, idf=3.829723
ON: tf=1, idf=1.071462
NGO: tf=1, idf=4.469235
DINH: tf=1, idf=4.469235
DIEM: tf=1, idf=4.277338
TO: tf=1, idf=1.001708
STOP: tf=1, idf=3.872352
SUPPRESSING: not found in Corpus
THE: tf=1, idf=1.000000
BUDDHISTS: tf=1, idf=5.179909
.: tf=1, idf=1.000000

Query Vector Length: 12.269275


## (d) Score Documents

In [15]:
def cosineSim(qDict, invFile, idfs, docLens):
    """Score documents against one query by TF-IDF cosine similarity.

    Parameters
    ----------
    qDict : mapping
        Query term -> term frequency in the query.
    invFile : mapping
        Inverted file: term -> posting list of ``(docID, tf)`` tuples.
    idfs : mapping
        Term -> ``(df, idf)``; only the idf is used here.
    docLens : mapping
        docID -> document vector length.

    Returns
    -------
    Counter
        docID -> cosine similarity, containing only documents that share at
        least one term with the query. Query terms absent from ``invFile``
        are ignored for both the dot product and the query length.
    """
    dots = Counter()  # un-normalised dot products, keyed by docID
    qSqSum = 0  # sum of squared query term weights
    for term, qTF in qDict.items():
        if term in invFile:
            _, idf = idfs[term]
            qSqSum += (qTF*idf) ** 2
            for docID, dTF in invFile[term]:  # walk the posting list
                dots[docID] += dTF*idf * qTF*idf

    qNorm = math.sqrt(qSqSum)
    # Normalise every dot product by the document and query vector lengths.
    return Counter({docID: dot / (docLens[docID] * qNorm)
                    for docID, dot in dots.items()})

def processQueries(qs, invFile, idfs, docLens):
    """Run ``cosineSim`` for every query.

    Parameters
    ----------
    qs : list
        ``(qID, qDict)`` pairs, where ``qDict`` maps query term -> frequency.
    invFile, idfs, docLens
        Passed unchanged to ``cosineSim`` for each query.

    Returns
    -------
    list
        ``(qID, scores)`` tuples in the same order as ``qs``, where ``scores``
        is the value returned by ``cosineSim`` for that query.
    """
    return [(qID, cosineSim(qDict, invFile, idfs, docLens)) for qID, qDict in qs]

__Processing Queries and Timing__

In [31]:
# Score all TIME queries against the index and time the scoring step.
t0 = time.perf_counter()
timeQscores = processQueries(timeQs, timeInv, timeIDFs, timeDocLens)
tt = time.perf_counter() - t0 # elapsed seconds; overwrites the earlier build timing
# %d truncates tt/60 to whole minutes
print('Processed in %d minutes and %.3f seconds.'%(tt/60,tt%60))

Processed in 0 minutes and 0.506 seconds.


__Sample of Cosine Similarity Scores__

This shows the cosine similarity scores of the first query for 20 arbitrary document IDs (the first column of the output below is the document ID, even though it is labelled `QueryID`).

In [17]:
# Print the first 20 entries of the score Counter of the first query.
# NOTE(review): the keys of that Counter are document IDs (cosineSim keys its
# result by docID), so the loop variable `qID` and the 'QueryID' label in the
# printed text actually refer to documents, not queries.
for n,(qID,s) in enumerate(timeQscores[0][1].items()):
    if n>=20:
        break
    print('QueryID: %d, similarity score = %f'%(qID,s))

QueryID: 17, similarity score = 0.078335
QueryID: 21, similarity score = 0.061475
QueryID: 28, similarity score = 0.065611
QueryID: 29, similarity score = 0.071132
QueryID: 43, similarity score = 0.067917
QueryID: 45, similarity score = 0.067563
QueryID: 57, similarity score = 0.047027
QueryID: 62, similarity score = 0.088443
QueryID: 67, similarity score = 0.052492
QueryID: 70, similarity score = 0.055056
QueryID: 71, similarity score = 0.087184
QueryID: 105, similarity score = 0.057302
QueryID: 126, similarity score = 0.068939
QueryID: 163, similarity score = 0.083421
QueryID: 183, similarity score = 0.120382
QueryID: 188, similarity score = 0.075368
QueryID: 196, similarity score = 0.095314
QueryID: 204, similarity score = 0.068869
QueryID: 217, similarity score = 0.050837
QueryID: 221, similarity score = 0.075570


## (d) Ranked List
Since we're using `Counter` to store similarity scores, we can use its built-in `most_common()` method, which uses a heap (`heapq.nlargest`) to extract the top N items with the highest values.

In [18]:
def getTopNSimDocs(qID, simScore, N=50):
    """Format the N best-scoring documents of one query as output lines.

    Parameters
    ----------
    qID : int
        Query identifier written at the start of every line.
    simScore : Counter
        docID -> similarity score for this query.
    N : int, optional
        Maximum number of documents to return (default 50).

    Returns
    -------
    list of str
        Lines of the form ``'<qID> Q0 <docID> <rank> <score> jwu74\\n'``,
        ordered by descending score with ranks starting at 1.
    """
    fmt = '%d Q0 %d %d %.6f jwu74\n' # format for output file lines
    lines = []
    # most_common(N) yields the N highest-scoring (docID, score) pairs
    for rank, (docID, score) in enumerate(simScore.most_common(N), start=1):
        lines.append(fmt % (qID, docID, rank, score))
    return lines

def writeQueryResult(outName, qScores, N=50):
    """Write the ranked top-N documents of every query to a text file.

    Parameters
    ----------
    outName : str
        Path of the output file; it is opened in write mode, so an existing
        file is overwritten.
    qScores : iterable
        ``(qID, scores)`` pairs, where ``scores`` is the per-document
        similarity Counter for that query.
    N : int, optional
        Maximum number of ranked lines written per query (default 50).
    """
    with open(outName, 'w') as fh:
        for qInd,score in qScores: # loop over query results
            # Forward N: previously it was dropped, so the cutoff was always
            # getTopNSimDocs' default regardless of the caller's value.
            out = getTopNSimDocs(qInd, score, N) # get top N docs based on sim
            fh.writelines(out) # write out lines for output

Outputting query results to `time-jwu74.txt`.

In [19]:
# Write the ranked results of all TIME queries to the output file.
writeQueryResult('time-jwu74.txt', timeQscores)

In [21]:
def queryCorpus(corpusFile, queryFile, outFile):
    """Index a corpus, score a query file against it and write the rankings.

    Parameters
    ----------
    corpusFile : str
        Path of the documents file passed to ``processDocsFile``.
    queryFile : str
        Path of the query file passed to ``processQueryFile``.
    outFile : str
        Path of the results file passed to ``writeQueryResult``.

    Returns
    -------
    tuple
        ``(buildTime, queryTime)`` in seconds as measured with
        ``time.perf_counter()``. The build time covers building the inverted
        file and computing document lengths; the query time covers reading
        and scoring the queries. Writing the results is not timed.
    """
    # --- indexing phase ---
    buildStart = time.perf_counter()
    index, docCount = processDocsFile(corpusFile)
    lengths, termIdfs = calcDocLens(index, docCount)
    buildTime = time.perf_counter() - buildStart

    # --- query phase ---
    queryStart = time.perf_counter()
    queries = processQueryFile(queryFile)
    scored = processQueries(queries, index, termIdfs, lengths)
    queryTime = time.perf_counter() - queryStart

    writeQueryResult(outFile, scored)

    return buildTime, queryTime

In [25]:
# Run the full pipeline on the fire10 collection; bt/qt are the build and query times in seconds.
bt, qt = queryCorpus('./data/fire10-documents.txt', 
                     './data/fire10-queries.txt', 'fire10-jwu74.txt' )

In [33]:
# Report each timing as whole minutes plus remaining seconds.
print('Build time for fire10: %d minutes %.3f seconds'%(bt//60,bt%60))
print('Query time for fire10: %d minutes %.3f seconds'%(qt//60,qt%60))

Build time for fire10: 19 minutes 30.096 seconds
Query time for fire10: 0 minutes 43.885 seconds


In [30]:
# Scratch check of floor division (the operator used in the minute calculations above); not part of the analysis.
5//3

1