# IR Assignment

## Libraries

In [21]:
import glob
import os, sys
import numpy as np
import codecs
import nltk
from collections import defaultdict, OrderedDict
from nltk.stem import WordNetLemmatizer
import pickle
import logging
from datetime import datetime
import time
from IR import * 
index_file = 'index.pkl'
encoding = 'latin1'

## Pre-processor Class
Text preprocessing class. Responsible for term normalization using lower-casing, stop-word removal, and lemmatization.

In [5]:
class Normaliser(object):
    '''
    Normalises a single token by optional lower-casing, stop-word removal,
    optional lemmatization and an optional dictionary (whitelist) filter.
    '''

    def __init__(self, lemmatizer=None, stop_words=None, dictionary=None, lower_case=True):
        '''
        :param lemmatizer: object exposing ``lemmatize(token)``, or None to skip lemmatization
        :param stop_words: iterable of tokens to drop; None means "no stop words"
        :param dictionary: container of allowed tokens; a falsy value disables the filter
        :param lower_case: lower-case every token before any other step when True
        '''
        self.lemmatizer = lemmatizer
        # None used to crash on len(); treat it as an empty stop list instead.
        if stop_words is None:
            stop_words = []
        print("The number of stop words is %d" % (len(stop_words)))
        print("lemmatizer is " + str(self.lemmatizer))
        # A set makes the per-token membership test constant time.
        self.stop_words = set(stop_words)
        self.dictionary = dictionary
        self.lower_case = lower_case

    def normalise(self, token):
        '''
        Normalise one token.

        :param token: raw token string
        :return: the normalised token, or None when the token is a stop word
                 or is missing from the dictionary
        '''
        if self.lower_case:
            token = token.lower()
        # Stop words are matched before lemmatization, i.e. on the surface form.
        if token in self.stop_words:
            return None
        if self.lemmatizer:
            token = self.lemmatizer.lemmatize(token)
        if self.dictionary and token not in self.dictionary:
            return None
        return token


## Indexer Class
Indexer class. Responsible for generating and storing the inverted index and for searching it with queries.

In [16]:
class Indexer(object):
    '''
    In-memory inverted index with tf-idf ranked retrieval.

    Documents are tokenised with ``tokeniser.tokenize`` and, when a normaliser
    is supplied, each token is passed through ``normaliser.normalise``.
    '''

    def __init__(self, tokeniser, normaliser=None):
        '''
        :param tokeniser: object exposing ``tokenize(text)``
        :param normaliser: optional object exposing ``normalise(token)``
        '''
        self.tokeniser = tokeniser
        self.normaliser = normaliser
        # term -> PostingList (carries ``df`` and ``posts``, a list of [docID, TermDocument])
        self.inverted_index = defaultdict(PostingList)
        # total number of documents
        self.N = 0
        # docID -> Euclidean norm of the document's raw term-frequency vector
        self.document_lengths = defaultdict(float)
        # docID -> number of tokens kept after normalisation
        self.dl = defaultdict(int)
        # term -> total number of occurrences over all indexed documents
        self.dic = defaultdict(int)

    def index(self, docID, text):
        '''
        Add one document to the index.

        :param docID: identifier stored in every posting of this document
        :param text: raw document text
        :return: None
        '''
        tokens = self.tokeniser.tokenize(text)
        # Positions count only the tokens that survive normalisation.
        token_position = 0
        term_documents = defaultdict(TermDocument)
        for token in tokens:
            if self.normaliser:
                token = self.normaliser.normalise(token)
            if not token:
                continue
            term_document = term_documents[token]
            term_document.tf += 1
            term_document.positions.append(token_position)
            token_position += 1
            self.dl[docID] += 1
            self.dic[token] += 1
        # update the main index
        squared_norm = 0
        for term, term_document in term_documents.items():
            squared_norm += np.square(term_document.tf)
            posting_list = self.inverted_index[term]
            posting_list.posts.append([docID, term_document])
            posting_list.df += 1
        self.N += 1
        # Computed from this call's counts only, so an existing (already
        # square-rooted) value is never accumulated onto.
        self.document_lengths[docID] = np.sqrt(squared_norm)

    def search(self, query):
        '''
        Rank documents against a list of query terms.

        Each matching document scores sum(tf * idf) over the query terms,
        divided by its stored document length. idf is log(N / (df + 1)), which
        is zero or negative for terms that occur in (almost) every document.

        :param query: iterable of already normalised query terms
        :return: list of (docID, score) tuples sorted by descending score
        '''
        results = defaultdict(float)
        for term in query:
            # .get() instead of [] so that unknown terms do not insert empty
            # posting lists into the defaultdict-backed index.
            posting_list = self.inverted_index.get(term)
            if posting_list is None:
                continue
            idf = np.log(self.N / (posting_list.df + 1))
            for post in posting_list.posts:
                docID = post[0]
                term_document = post[1]
                results[docID] += term_document.tf * idf
        for docID in results:
            results[docID] = results[docID] / self.document_lengths[docID]
        ranked_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
        return ranked_results

    def dump(self, filename):
        '''
        Store the index as a pickle file plus a readable ``<filename>.txt`` listing.

        :param filename: path of the pickle file; the text listing gets ".txt" appended
        :return: None
        '''
        logging.info("dumping index to %s" % (filename))
        with open(filename, 'wb') as outf:
            # NOTE(review): document_lengths is stored twice. The tuple layout is
            # kept unchanged because loaders outside this cell may unpack six items.
            pickle.dump((self.inverted_index, self.document_lengths, self.N, self.document_lengths, self.dl, self.dic),
                        outf)
        with open(filename + ".txt", 'w') as outf:
            for term, posting_list in self.inverted_index.items():
                postings = "".join(
                    ":" + "(doc:" + str(post[0]) + ",tf:" + str(post[1].tf) + "), "
                    for post in posting_list.posts
                )
                # One term per line; previously all terms ran together on one line.
                outf.write(str(term) + postings + "\n")


## Query Processor Class

In [48]:
class QueryProcessor(object):
    '''
    Turns a raw query string into the list of terms used for index lookup.
    '''

    def __init__(self, tokeniser, normaliser=None):
        '''
        :param tokeniser: object exposing ``tokenize(text)``
        :param normaliser: optional object exposing ``normalise(token)``
        '''
        self.tokeniser = tokeniser
        self.normaliser = normaliser

    def process(self, query):
        '''
        Tokenise the query, normalise each token and drop the empty ones.

        :param query: raw query string
        :return: list of query terms after expansion
        '''
        kept_terms = []
        for raw_token in self.tokeniser.tokenize(query):
            term = self.normaliser.normalise(raw_token) if self.normaliser else raw_token
            if term:
                kept_terms.append(term)
        return self._expandQuery(kept_terms)

    def _expandQuery(self, query_terms):
        '''
        Expansion hook; currently returns the terms unchanged.

        :param query_terms: list of normalised terms
        :return: the same list
        '''
        return query_terms

## Code Usage
Initialize the pre-processor (normaliser), tokeniser, indexer, and query processor.

In [49]:
# Build the shared text-processing components. The indexer and the query
# processor receive the same tokeniser and normaliser, so documents and
# queries are reduced to terms in the same way.
lemmatizer = WordNetLemmatizer()

normaliser = Normaliser(
    lemmatizer=lemmatizer,
    stop_words=nltk.corpus.stopwords.words('english'),
    dictionary=None,
    lower_case=True,
)
tokeniser = NLTKWordTokenizer()
indexer = Indexer(tokeniser, normaliser=normaliser)
query_processor = QueryProcessor(tokeniser, normaliser=normaliser)


The number of stop words is 153
lemmatizer is <WordNetLemmatizer>


## Index generation

In [50]:
# Index every document found under base_dir and persist the result.
base_dir = "Data/"
# docs maps docID -> file name relative to base_dir.
docs = getfilenames(base_dir=base_dir)
docs_length = len(docs)
print("Indexing %d docs in %s" % (docs_length, base_dir))
# Start from an empty index so that re-running this cell does not append a
# second copy of every posting to the existing indexer.
indexer = Indexer(tokeniser, normaliser)
docs_processed = 0
for docID, filename in docs.items():
    filename = os.path.join(base_dir, filename)
    text = getContent(filename, encoding=encoding)
    indexer.index(docID, text)
    docs_processed += 1
indexer.dump(os.path.join(base_dir, index_file))
print("Done")

05/04/2018 12:36:08 PM dumping index to Data/index.pkl


Indexing 6 docs in Data/
Done


In [51]:
# Show the inverted index: one entry per term, listing the document id and
# term frequency of each posting.
for term, posting_list in indexer.inverted_index.items():
    posting_chunks = [
        ":" + "(doc:" + str(post[0]) + ",tf:" + str(post[1].tf) + "), "
        for post in posting_list.posts
    ]
    text = str(term) + "".join(posting_chunks) + "\n"
    print(text)

3d:(doc:0,tf:3), 

radiology:(doc:0,tf:3), 

lab:(doc:0,tf:4), :(doc:3,tf:2), :(doc:5,tf:1), 

stanford:(doc:0,tf:13), :(doc:1,tf:19), :(doc:2,tf:2), :(doc:3,tf:1), :(doc:4,tf:5), :(doc:5,tf:9), 

university:(doc:0,tf:2), :(doc:1,tf:3), :(doc:3,tf:1), :(doc:4,tf:1), :(doc:5,tf:3), 

school:(doc:0,tf:6), :(doc:1,tf:8), :(doc:4,tf:4), :(doc:5,tf:1), 

medicine:(doc:0,tf:9), :(doc:1,tf:10), :(doc:4,tf:5), 

quantitative:(doc:0,tf:4), 

imaging:(doc:0,tf:5), 

department:(doc:0,tf:3), :(doc:1,tf:9), :(doc:3,tf:2), 

search:(doc:0,tf:1), :(doc:1,tf:1), :(doc:3,tf:1), :(doc:4,tf:1), :(doc:5,tf:2), 

site:(doc:0,tf:3), :(doc:1,tf:4), :(doc:4,tf:3), :(doc:5,tf:2), 

medical:(doc:0,tf:5), :(doc:1,tf:4), :(doc:4,tf:1), 

way:(doc:0,tf:3), :(doc:1,tf:3), :(doc:4,tf:2), 

give:(doc:0,tf:3), :(doc:1,tf:3), :(doc:4,tf:2), 

find:(doc:0,tf:5), :(doc:1,tf:5), :(doc:4,tf:2), 

person:(doc:0,tf:3), :(doc:1,tf:3), :(doc:2,tf:1), :(doc:4,tf:2), 

alumnus:(doc:0,tf:2), :(doc:1,tf:6), :(doc:4,tf:1), :(doc:5

## Search query

In [52]:
def searching(base_dir, indexer, query_processor, query):
    '''
    Search the given string query in the index and report the matches as a
    boolean vector, so that results can be combined with ``|``, ``&`` and ``~``.

    :param base_dir: directory passed to ``getfilenames`` to determine the number of documents
    :param indexer: indexer obj
    :param query_processor: query processor obj
    :param query: string query
    :return: boolean numpy array with one entry per document; entry ``docID``
             is True when the indexer returned that document for the query
    '''
    # NOTE(review): the document listing is re-read on every call.
    docs = getfilenames(base_dir=base_dir)
    docs_length = len(docs)

    # An explicit bool dtype keeps the vector boolean even for an empty corpus;
    # np.array([]) would be float64 and reject the ``~`` operator.
    op = np.zeros(docs_length, dtype=bool)
    query_terms = query_processor.process(query)
    # Scores are ignored here: only membership in the result list matters.
    for docID, _score in indexer.search(query_terms):
        # assumes docIDs are integers in range(docs_length) — TODO confirm
        op[docID] = True
    return op

In [53]:
# Interactive Boolean search. A query is a sequence of terms combined with
# the upper-case operators AND, OR, NOT and parentheses; "q:" quits.
_QUERY_OPERATORS = {"(": "(", ")": ")", "OR": "|", "AND": "&", "NOT": "~"}

while True:
    query = input("Please enter query:")
    if query == "q:":
        break
    # Pad parentheses so that split() yields them as separate tokens.
    query = query.replace("(", " ( ").replace(")", " ) ")
    expression_parts = []
    term_vectors = {}
    for split_word in query.split():
        if split_word in _QUERY_OPERATORS:
            expression_parts.append(_QUERY_OPERATORS[split_word])
        else:
            # User text never reaches eval(): each term is searched here and
            # is represented in the expression by a generated placeholder.
            placeholder = "t%d" % len(term_vectors)
            term_vectors[placeholder] = searching(base_dir, indexer, query_processor, split_word)
            expression_parts.append(placeholder)
    str_eval = " ".join(expression_parts)

    try:
        # The expression holds only operators, parentheses and placeholders;
        # builtins are disabled as an additional safeguard.
        op = eval(str_eval, {"__builtins__": {}}, term_vectors)
    except (SyntaxError, TypeError, ValueError) as err:
        # A malformed query must not terminate the interactive loop.
        print("Invalid query: %s" % err)
        continue
    for docID in np.where(np.asarray(op))[0]:
        docName = docs[docID]
        print("DocName: %s" % docName)



Please enter query:medicine
DocName: Doc1.txt
DocName: Doc2.txt
DocName: Doc5.txt
Please enter query:NOT medicine
DocName: Doc3.txt
DocName: Doc4.txt
DocName: Doc6.txt
Please enter query:(Lab OR technology) AND (aeroastro OR Josef)
DocName: Doc4.txt
Please enter query:(medicine OR technology) AND (block OR design)
DocName: Doc1.txt
DocName: Doc6.txt
Please enter query:q:
