# Information Retrieval

In [3]:
# Install the two third-party packages this notebook imports (whoosh and nltk);
# --quiet keeps pip's log out of the cell output.
!pip --quiet install whoosh
!pip --quiet install nltk

Extract the dataset

In [1]:
!unzip -q /resources/data/ir_data.zip -d /resources/data

unzip:  cannot find or open /resources/data/ir_data.zip, /resources/data/ir_data.zip.zip or /resources/data/ir_data.zip.ZIP.


Compile TREC_EVAL, which we will use later to evaluate the performance of our IR system

In [2]:
# Build trec_eval inside its source directory (-C).
# Both stdout and stderr are redirected to /dev/null, so a failed build prints nothing here.
!make -s -w -C /resources/data/DSS_Fall2016_Assign1/trec_eval.8.1 > /dev/null 2>&1

Import the necessary libraries: whoosh (for IR) and os, shutil (for working with files)

In [4]:
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os, os.path
import shutil

Define constants for the paths of our dataset

In [5]:
# Root of the assignment data and of the test collection inside it.
_ASSIGNMENT_ROOT = "/resources/data/DSS_Fall2016_Assign1"
_COLLECTION_ROOT = _ASSIGNMENT_ROOT + "/lab1-q1-test-collection"

# Inputs: the documents to index, the topics (queries) and the relevance judgements.
DOCUMENTS_DIR = _COLLECTION_ROOT + "/documents"
QUER_FILE = _COLLECTION_ROOT + "/topics/air.topics"
QRELS_FILE = _COLLECTION_ROOT + "/qrels/air.qrels"

# The trec_eval executable.
TREC_EVAL = _ASSIGNMENT_ROOT + "/trec_eval.8.1/trec_eval"

# First configuration: index directory and results file.
INDEX_DIR = _COLLECTION_ROOT + "/index1"
OUTPUT_FILE = _COLLECTION_ROOT + "/myres"

# Second configuration: index directory and results file.
INDEX_DIR2 = _COLLECTION_ROOT + "/index2"
OUTPUT_FILE2 = _COLLECTION_ROOT + "/myres2"

### Building the index

In [6]:
# Schema of the first index: the file path is kept as a stored ID field so it can
# be read back from search hits; the file content is a TEXT field analyzed with
# nothing but the plain RegexTokenizer.
mySchema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=RegexTokenizer()),
)

In [7]:
# Start from an empty index directory: drop a previous one if present, then recreate it.
if os.path.isdir(INDEX_DIR):
    shutil.rmtree(INDEX_DIR)
os.makedirs(INDEX_DIR)

# Create a new index that uses mySchema inside that directory.
myIndex = index.create_in(INDEX_DIR, mySchema)

### Indexing the files

In [8]:
# First, lets review the documents in our sample dataset
!ls $DOCUMENTS_DIR

email01  email03  email05  email07  email09  email14
email02  email04  email06  email08  email10


In [9]:
# Collect the full path of every file below DOCUMENTS_DIR,
# leaving out hidden files (names that start with a dot).
filesToIndex = [
    os.path.join(dirPath, name)
    for dirPath, _subDirs, names in os.walk(DOCUMENTS_DIR)
    for name in names
    if not name.startswith('.')
]

In [10]:
# print the first 5 paths to make sure it worked
print("\n".join(filesToIndex[:5]))

/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email08
/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email09
/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email03
/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email07
/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email06


In [11]:
# count files to index
print("number of files:", len(filesToIndex))

number of files: 11


In [46]:
# open writer
# The commit below uses mergetype=CLEAR, which replaces whatever the index already
# holds with the documents added here. Re-running this cell therefore rebuilds the
# index instead of appending every document a second time.
myWriter = myIndex.writer()

try:
    # write each file to index
    for docNum, filePath in enumerate(filesToIndex):
        # read the whole file first so its handle is closed before indexing
        with open(filePath, "r") as f:
            fileContent = f.read()
        myWriter.add_document(file_path = filePath,
                              file_content = fileContent)

        # progress report on the 1st, 1001st, 2001st, ... document
        if (docNum % 1000 == 0):
            print("already indexed:", docNum+1)
except BaseException:
    # something failed: release the writer without committing a partial collection
    myWriter.cancel()
    raise
else:
    # save the index, dropping any documents committed by an earlier run
    myWriter.commit(mergetype=writing.CLEAR)
    print("done indexing.")

already indexed: 1
done indexing.


## Querying

In [13]:
# Searcher over the first index, and a parser that turns query strings into
# queries against its "file_content" field.
mySearcher = myIndex.searcher()
myQueryParser = QueryParser("file_content", schema=myIndex.schema)

In [14]:
# Parse the one-word query "item" and fetch every matching document (no limit).
sampleQuery = myQueryParser.parse("item")
sampleQueryResults = mySearcher.search(sampleQuery, limit=None)

# Show each hit as: file name, position in the ranking, score.
for rank, hit in enumerate(sampleQueryResults):
    print(os.path.basename(hit["file_path"]), rank, sampleQueryResults.score(rank))

email01 0 2.6746417187049216


### TREC_EVAL

In order to evaluate our results we will use a topic file - a list of topics we use to evaluate our IR system

In [15]:
# print the topic file
!cat $QUER_FILE

01 ducks
02 ig nobel prizes
03 mathematics
04 flowing hair
05 music
06 AIR TV


We will evaluate our results by comparing them with a set of judged results (the qrels file) using TREC_EVAL

In [16]:
# print the first 10 lines in the qrels file
!head -n 10 $QRELS_FILE

01 0 email01 0
01 0 email02 0
01 0 email03 0
01 0 email04 1
01 0 email05 1
01 0 email06 1
01 0 email07 0
01 0 email08 0
01 0 email09 0
01 0 email10 0


### Evaluation

First we build a file with our results according to the TREC_EVAL format (see the assignment PDF for more details)

In [17]:
# Load topic file - a list of topics (search phrases) used for evaluation.
# Each line has the form "<topic id> <search phrase>".
with open(QUER_FILE, "r") as topicsFile:
    topics = topicsFile.read().splitlines()

# Create the output file for our results. The `with` block closes it even if
# parsing or searching one of the topics raises.
with open(OUTPUT_FILE, "w") as outputTRECFile:
    # for each evaluated topic:
    # build a query and record the results in the file in TREC_EVAL format
    for topic in topics:
        # a blank line cannot be split into id + phrase, so skip it
        if not topic.strip():
            continue
        topic_id, topic_phrase = topic.split(" ", 1)
        topicQuery = myQueryParser.parse(topic_phrase)
        topicResults = mySearcher.search(topicQuery, limit=None)
        # one line per hit: topic id, "Q0", document name, rank, score, run tag
        for (rank, result) in enumerate(topicResults):
            score = topicResults.score(rank)
            outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), rank, score))

Now, we use TREC_EVAL to compare our results with the provided qrels file

In [18]:
!$TREC_EVAL -q $QRELS_FILE $OUTPUT_FILE

num_ret        	01	1
num_rel        	01	3
num_rel_ret    	01	1
map            	01	0.3333
R-prec         	01	0.3333
bpref          	01	0.3333
recip_rank     	01	1.0000
ircl_prn.0.00  	01	1.0000
ircl_prn.0.10  	01	1.0000
ircl_prn.0.20  	01	1.0000
ircl_prn.0.30  	01	1.0000
ircl_prn.0.40  	01	0.0000
ircl_prn.0.50  	01	0.0000
ircl_prn.0.60  	01	0.0000
ircl_prn.0.70  	01	0.0000
ircl_prn.0.80  	01	0.0000
ircl_prn.0.90  	01	0.0000
ircl_prn.1.00  	01	0.0000
P5             	01	0.2000
P10            	01	0.1000
P15            	01	0.0667
P20            	01	0.0500
P30            	01	0.0333
P100           	01	0.0100
P200           	01	0.0050
P500           	01	0.0020
P1000          	01	0.0010
num_ret        	05	1
num_rel        	05	2
num_rel_ret    	05	0
map            	05	0.0000
R-prec         	05	0.0000
bpref          	05	0.0000
recip_rank     	05	0.0000
ircl_prn.0.00  	05	0.0000
ircl_prn.0.10  	05	0.0000
ircl_prn.0.20  	05	0.0000
ircl_prn.0.30  	05	0.0000
ircl

# Evaluating different configurations

## Inspecting our index

In [19]:
# Is it empty?
print("Index is empty?", myIndex.is_empty())

# How many files indexed?
print("Number of indexed files:", myIndex.doc_count())

Index is empty? False
Number of indexed files: 11


In [20]:
# define a reader object on the index
myReader = myIndex.reader()

In [21]:
# Show the first 5 (document number, stored fields) pairs of the index.
list(myReader.iter_docs())[:5]

[(0,
  {'file_path': '/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email08'}),
 (1,
  {'file_path': '/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email09'}),
 (2,
  {'file_path': '/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email03'}),
 (3,
  {'file_path': '/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email07'}),
 (4,
  {'file_path': '/resources/data/DSS_Fall2016_Assign1/lab1-q1-test-collection/documents/email06'})]

In [22]:
# Show a 25-term slice (positions 1000 to 1024) of the terms indexed for "file_content".
list(myReader.field_terms("file_content"))[1000:1025]

['Care',
 'Carlos',
 'Carmen',
 'Carnivalesque',
 'Carolina',
 'Case',
 'Cat',
 'Catalysis',
 'Catalyst',
 'Catchers',
 'Cater',
 'Caused',
 'Caveat',
 'CbZF1d0021swQuc57kfqHt',
 'Cechetto',
 'Ceder',
 'Celebratory',
 'Center',
 'Cereal',
 'Ceremony',
 'Cerrahi',
 'Certolizumab',
 'Cervical',
 'Chair',
 'Chalfie']

In [23]:
# How many terms does the field "file_content" hold in total?
# NOTE(review): field_length() appears to report the total number of term occurrences
# summed over all documents, not the number of distinct terms - confirm against the
# Whoosh reader documentation.
print(myReader.field_length("file_content"))

29729


In [24]:
# In how many documents does each of the terms "bit", "are" and "get"
# occur in the field "file_content"?
for label, term in (("# docs with 'bit'", "bit"),
                    ("# docs with 'are'", "are"),
                    ("# docs with 'get'", "get")):
    print(label, myReader.doc_frequency("file_content", term))

# docs with 'bit' 1
# docs with 'are' 11
# docs with 'get' 6


## Text Analyzers

In [25]:
# we start with basic tokenizer
tokenizer = RegexTokenizer()
[token.text for token in tokenizer("We are going to do Text Analysis with whoosh.analysis")]

['We',
 'are',
 'going',
 'to',
 'do',
 'Text',
 'Analysis',
 'with',
 'whoosh.analysis']

In [26]:
# we might want use stemming:
stmAnalyzer = RegexTokenizer() | StemFilter()
[token.text for token in stmAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['We', 'ar', 'go', 'to', 'do', 'Text', 'Analysi', 'with', 'whoosh.analysi']

In [27]:
# We probably want to lower-case it
# so we add LowercaseFilter
stmLwrAnalyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()
[token.text for token in stmLwrAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['we', 'ar', 'go', 'to', 'do', 'text', 'analysi', 'with', 'whoosh.analysi']

In [28]:
# we probably want to ignore words like "we", "are", "with" when we index files
# so we add StopFilter to filter stop words
stmLwrStpAnalyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter()
[token.text for token in stmLwrStpAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['go', 'do', 'text', 'analysi', 'whoosh.analysi']

In [29]:
# Tokens such as "whoosh.analysis" should become "whoosh" and "analysis",
# so IntraWordFilter is added to the chain (after lower-casing, before stop-word
# removal and stemming).
stmLwrStpIntraAnalyzer = (RegexTokenizer()
                          | LowercaseFilter()
                          | IntraWordFilter()
                          | StopFilter()
                          | StemFilter())
[tok.text for tok in stmLwrStpIntraAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['go', 'do', 'text', 'analysi', 'whoosh', 'analysi']

## Evaluating the new analyzers

In [30]:
# Schema of the second index: same two fields as the first one, but the content
# is analyzed with the full lower-case / intra-word / stop-word / stemming chain.
mySchema2 = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=stmLwrStpIntraAnalyzer),
)

In [31]:
# Start from an empty index directory: drop a previous one if present, then recreate it.
if os.path.isdir(INDEX_DIR2):
    shutil.rmtree(INDEX_DIR2)
os.makedirs(INDEX_DIR2)

# Create a new index that uses mySchema2 inside that directory
# (the directory was just emptied, so there is never an existing index to open).
myIndex2 = index.create_in(INDEX_DIR2, mySchema2)

In [32]:
# open writer
# The commit below uses mergetype=CLEAR, which replaces whatever the index already
# holds with the documents added here. Re-running this cell therefore rebuilds the
# index instead of appending every document a second time.
myWriter2 = myIndex2.writer()

try:
    # write each file to index
    for docNum, filePath in enumerate(filesToIndex):
        # read the whole file first so its handle is closed before indexing
        with open(filePath, "r") as f:
            fileContent = f.read()
        myWriter2.add_document(file_path = filePath,
                               file_content = fileContent)

        # progress report on the 1st, 1001st, 2001st, ... document
        if (docNum % 1000 == 0):
            print("already indexed:", docNum+1)
except BaseException:
    # something failed: release the writer without committing a partial collection
    myWriter2.cancel()
    raise
else:
    # save the index, dropping any documents committed by an earlier run
    myWriter2.commit(mergetype=writing.CLEAR)
    print("done indexing.")

already indexed: 1
done indexing.


In [33]:
# Searcher over the second index, and a parser that turns query strings into
# queries against its "file_content" field.
mySearcher2 = myIndex2.searcher()
myQueryParser2 = QueryParser("file_content", schema=myIndex2.schema)

In [34]:
# Load topic file - a list of topics (search phrases) used for evaluation.
# Each line has the form "<topic id> <search phrase>".
with open(QUER_FILE, "r") as topicsFile:
    topics = topicsFile.read().splitlines()

# Create the output file for the second configuration's results. The `with`
# block closes it even if parsing or searching one of the topics raises.
with open(OUTPUT_FILE2, "w") as outputTRECFile2:
    # for each evaluated topic:
    # build a query and record the results in the file in TREC_EVAL format
    for topic in topics:
        # a blank line cannot be split into id + phrase, so skip it
        if not topic.strip():
            continue
        topic_id, topic_phrase = topic.split(" ", 1)
        topicQuery = myQueryParser2.parse(topic_phrase)
        topicResults = mySearcher2.search(topicQuery, limit=None)
        # one line per hit: topic id, "Q0", document name, rank, score, run tag
        for (rank, result) in enumerate(topicResults):
            score = topicResults.score(rank)
            outputTRECFile2.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), rank, score))

In [35]:
!$TREC_EVAL -q $QRELS_FILE $OUTPUT_FILE2

num_ret        	01	3
num_rel        	01	3
num_rel_ret    	01	3
map            	01	1.0000
R-prec         	01	1.0000
bpref          	01	1.0000
recip_rank     	01	1.0000
ircl_prn.0.00  	01	1.0000
ircl_prn.0.10  	01	1.0000
ircl_prn.0.20  	01	1.0000
ircl_prn.0.30  	01	1.0000
ircl_prn.0.40  	01	1.0000
ircl_prn.0.50  	01	1.0000
ircl_prn.0.60  	01	1.0000
ircl_prn.0.70  	01	1.0000
ircl_prn.0.80  	01	1.0000
ircl_prn.0.90  	01	1.0000
ircl_prn.1.00  	01	1.0000
P5             	01	0.6000
P10            	01	0.3000
P15            	01	0.2000
P20            	01	0.1500
P30            	01	0.1000
P100           	01	0.0300
P200           	01	0.0150
P500           	01	0.0060
P1000          	01	0.0030
num_ret        	02	11
num_rel        	02	8
num_rel_ret    	02	8
map            	02	0.9207
R-prec         	02	0.8750
bpref          	02	0.7500
recip_rank     	02	1.0000
ircl_prn.0.00  	02	1.0000
ircl_prn.0.10  	02	1.0000
ircl_prn.0.20  	02	1.0000
ircl_prn.0.30  	02	1.0000
irc

In [36]:
# Count the same three terms again, this time in the second index.
myReader2 = myIndex2.reader()
for label, term in (("# docs with 'bit'", "bit"),
                    ("# docs with 'are'", "are"),
                    ("# docs with 'get'", "get")):
    print(label, myReader2.doc_frequency("file_content", term))

# docs with 'bit' 11
# docs with 'are' 0
# docs with 'get' 7


## Using NLTK's stemmers and lemmatizers

In [37]:
import nltk
from nltk.stem import *

In [38]:
# download required resources
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/notebook/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [39]:
# we'll compare two stemmers and a lemmatizer
lrStem = LancasterStemmer()
sbStem = SnowballStemmer("english")
wnLemm = WordNetLemmatizer()

In [40]:
# define a list of words to compare the stemmers on
listWords = ["going", "saying", "minimize", "maximum", 
             "meeting", "files", "tries", "is", "are", "beautiful",
             "summarize", "better", "dogs", "phenomena"]

In [41]:
for word in listWords:
    print("%15s %15s %15s %15s" % (lrStem.stem(word),
                                   sbStem.stem(word),
                                   wnLemm.lemmatize(word),
                                   wnLemm.lemmatize(word, 'v')))

          going              go           going              go
            say             say          saying             say
          minim           minim        minimize        minimize
          maxim         maximum         maximum         maximum
           meet            meet         meeting            meet
            fil            file            file            file
            tri             tri             try             try
             is              is              is              be
             ar             are             are              be
         beauty          beauti       beautiful       beautiful
           summ          summar       summarize       summarize
            bet          better          better          better
            dog             dog             dog             dog
       phenomen       phenomena      phenomenon       phenomena


## Whoosh

In [42]:
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Whoosh filter that rewrites the text of every token with a caller-supplied function.

    The function is called as ``filterFunc(token_text, *args, **kwargs)`` and its
    return value replaces the token's text. The same rewrite is applied whatever
    the token's mode is, so index-time and query-time terms are transformed alike.

    Parameters:
        filterFunc: callable taking the token text as first argument and
            returning the replacement text (e.g. a stemmer's ``stem`` method).
        *args, **kwargs: extra arguments forwarded to ``filterFunc`` on every call.
    """

    # NOTE(review): class-level flag kept from the original; presumably tells Whoosh
    # that this filter changes the form of the tokens - confirm in the Whoosh docs.
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        """Two filters are equal when they are the same class wrapping the same call.

        The original signature lacked ``other``, so any comparison raised TypeError.
        The wrapped function and its extra arguments are compared too, so filters
        built around different functions do not compare equal.
        """
        return (other is not None
                and self.__class__ is other.__class__
                and self.customFilter == other.customFilter
                and self.args == other.args
                and self.kwargs == other.kwargs)

    def __call__(self, tokens):
        """Yield each incoming token after replacing its text with the function's result."""
        for t in tokens:
            # identical treatment for 'query' and 'index' mode tokens
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [43]:
# Whoosh filter for NLTK's LancasterStemmer
myFilter1 = RegexTokenizer() | CustomFilter(LancasterStemmer().stem)
[token.text for token in myFilter1("We are going to do Text Analysis with whoosh.analysis")]

['we', 'ar', 'going', 'to', 'do', 'text', 'analys', 'with', 'whoosh.analysis']

In [44]:
# Whoosh filter for NLTK's WordNetLemmatizer
myFilter2 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize)
[token.text for token in myFilter2("We are going to do Text Analysis with whoosh.analysis")]

['We',
 'are',
 'going',
 'to',
 'do',
 'Text',
 'Analysis',
 'with',
 'whoosh.analysis']

In [45]:
# Whoosh filter for NLTK's WordNetLemmatizer for verbs
myFilter3 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize, 'v')
[token.text for token in myFilter3("We are going to do Text Analysis with whoosh.analysis")]

['We', 'be', 'go', 'to', 'do', 'Text', 'Analysis', 'with', 'whoosh.analysis']

In [1]:
# Print trec_eval's help text.
# NOTE(review): the recorded output "/bin/sh: 1: -h: not found" suggests $TREC_EVAL expanded
# to nothing, i.e. this ran in a kernel where the path constants had not been defined
# (the execution count restarted at 1) - run the constants cell first; TODO confirm.
!$TREC_EVAL -h

/bin/sh: 1: -h: not found
