In [3]:
!pip --quiet install whoosh
!pip --quiet install nltk

In [1]:
# Unpack the data archive into /resources/data (-q: quiet, -d: target directory).
!unzip -q /resources/data/ir_data.zip -d /resources/data

In [2]:
# Build trec_eval inside its source directory (-C). Both stdout and stderr are
# sent to /dev/null, so a failed build produces no message in this cell.
!make -s -w -C /resources/data/DSS/trec_eval.8.1 > /dev/null 2>&1

In [4]:
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os, os.path
import shutil

In [5]:
# Directory tree whose files get indexed (walked with os.walk further down).
DOCUMENTS_DIR = "/resources/data/DSS/documents"
# On-disk location of the baseline index; removed and recreated on each run.
INDEX_DIR = "/resources/data/index1"
# Topics file: one query per line in the form "<topic id> <query text>".
QUER_FILE = "/resources/data/DSS/topics/air.topics"
# Relevance judgements handed to trec_eval.
QRELS_FILE = "/resources/data/DSS/qrels/air.qrels"
# Run file written from the baseline index and then scored by trec_eval.
OUTPUT_FILE = "/resources/data/DSS/myres"
# Path of the trec_eval executable.
TREC_EVAL = "/resources/data/DSS/trec_eval.8.1/trec_eval"
# Index directory and run file for the second analyzer configuration.
# NOTE(review): INDEX_DIR2 lives under DSS/ while INDEX_DIR does not - confirm
# the asymmetry is intended.
INDEX_DIR2 = "/resources/data/DSS/index2"
OUTPUT_FILE2 = "/resources/data/DSS/myres2"

In [6]:
# Baseline schema.
#   file_path    - ID field, stored so that a search hit can report its file.
#   file_content - TEXT field analyzed by a bare RegexTokenizer: no lowercasing,
#                  stop-word removal or stemming is chained after it.
mySchema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer()))

In [7]:
# Always start from an empty index: if a directory from a previous run exists,
# delete it together with everything inside it.
if os.path.isdir(INDEX_DIR):
    shutil.rmtree(INDEX_DIR)

os.makedirs(INDEX_DIR)

# Create a new, empty index in INDEX_DIR using the baseline schema.
myIndex = index.create_in(INDEX_DIR, mySchema)

#### indexing

In [8]:
!ls $DOCUMENTS_DIR

email01  email03  email05  email07  email09  email14
email02  email04  email06  email08  email10


In [9]:
# Build the list of full paths of every file below DOCUMENTS_DIR, skipping
# hidden files (names starting with '.').
# os.walk reports directory entries in arbitrary, filesystem-dependent order;
# sorting makes the order in which documents are later added to the indexes
# the same on every run and on every machine.
filesToIndex = sorted(
    os.path.join(root, fileName)
    for root, dirs, files in os.walk(DOCUMENTS_DIR)
    for fileName in files
    if not fileName.startswith('.')
)

In [5]:
# Add every file in filesToIndex to the baseline index: the path goes into
# file_path and the whole file text into file_content.
myWriter = writing.BufferedWriter(myIndex, period=20, limit=1000)

try:
    for filePath in filesToIndex:
        # Decode explicitly instead of relying on the locale's default
        # encoding; errors="replace" keeps a single undecodable byte in one
        # e-mail from aborting the whole indexing run.
        with open(filePath, "r", encoding="utf-8", errors="replace") as f:
            fileContent = f.read()
        # The file is already closed here; only its text is handed to whoosh.
        myWriter.add_document(file_path = filePath,
                              file_content = fileContent)

finally:
    # Close the writer even if reading or adding a document failed.
    myWriter.close()

#### querying

In [13]:
# Parser that turns query strings into queries against the file_content field,
# using the schema of the baseline index.
myQueryParser = QueryParser("file_content", schema=myIndex.schema)
# Searcher on the baseline index; it is reused by the following cells and is
# not closed anywhere in this notebook.
mySearcher = myIndex.searcher()

In [14]:
# Run a single ad-hoc query and fetch all hits (limit=None: no cut-off).
sampleQuery = myQueryParser.parse("item")
sampleQueryResults = mySearcher.search(sampleQuery, limit=None)

# One line per hit: file name, position in the ranking (0-based), score.
for rank, hit in enumerate(sampleQueryResults):
    print(os.path.basename(hit["file_path"]),
          rank,
          sampleQueryResults.score(rank))

email01 0 2.6746417187049216


#### TREC_EVAL

In [15]:
!cat $QUER_FILE

01 ducks
02 ig nobel prizes
03 mathematics
04 flowing hair
05 music
06 AIR TV


In [16]:
!head -n 10 $QRELS_FILE

01 0 email01 0
01 0 email02 0
01 0 email03 0
01 0 email04 1
01 0 email05 1
01 0 email06 1
01 0 email07 0
01 0 email08 0
01 0 email09 0
01 0 email10 0


In [17]:
# Run every topic against the baseline index and record the hits in the run
# format consumed by trec_eval, one line per hit:
#   <topic id> Q0 <file name> <rank> <score> test

# `with` closes each file even when parsing or searching raises.
with open(QUER_FILE, "r") as topicsFile:
    # Ignore blank lines (e.g. a trailing empty line): they carry no topic and
    # would make the two-way unpacking below fail.
    topics = [line for line in topicsFile.read().splitlines() if line.strip()]

with open(OUTPUT_FILE, "w") as outputTRECFile:
    for topic in topics:
        # A topic line is "<id> <query text>"; split on the first space only so
        # multi-word queries stay intact.
        topic_id, topic_phrase = topic.split(" ", 1)
        topicQuery = myQueryParser.parse(topic_phrase)
        topicResults = mySearcher.search(topicQuery, limit=None)
        # `rank` is the 0-based position of the hit in the result list.
        for rank, result in enumerate(topicResults):
            score = topicResults.score(rank)
            outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), rank, score))

#### compare results

In [18]:
!$TREC_EVAL -q $QRELS_FILE $OUTPUT_FILE

num_ret        	01	1
num_rel        	01	3
num_rel_ret    	01	1
map            	01	0.3333
R-prec         	01	0.3333
bpref          	01	0.3333
recip_rank     	01	1.0000
ircl_prn.0.00  	01	1.0000
ircl_prn.0.10  	01	1.0000
ircl_prn.0.20  	01	1.0000
ircl_prn.0.30  	01	1.0000
ircl_prn.0.40  	01	0.0000
ircl_prn.0.50  	01	0.0000
ircl_prn.0.60  	01	0.0000
ircl_prn.0.70  	01	0.0000
ircl_prn.0.80  	01	0.0000
ircl_prn.0.90  	01	0.0000
ircl_prn.1.00  	01	0.0000
P5             	01	0.2000
P10            	01	0.1000
P15            	01	0.0667
P20            	01	0.0500
P30            	01	0.0333
P100           	01	0.0100
P200           	01	0.0050
P500           	01	0.0020
P1000          	01	0.0010
num_ret        	05	1
num_rel        	05	2
num_rel_ret    	05	0
map            	05	0.0000
R-prec         	05	0.0000
bpref          	05	0.0000
recip_rank     	05	0.0000
ircl_prn.0.00  	05	0.0000
ircl_prn.0.10  	05	0.0000
ircl_prn.0.20  	05	0.0000
ircl_prn.0.30  	05	0.0000
ircl

#### try different configurations

In [20]:
# define a reader object on the index
myReader = myIndex.reader()

In [22]:
# Peek at a 25-term slice of the indexed vocabulary of file_content.
list(myReader.field_terms("file_content"))[1000:1025]

['Care',
 'Carlos',
 'Carmen',
 'Carnivalesque',
 'Carolina',
 'Case',
 'Cat',
 'Catalysis',
 'Catalyst',
 'Catchers',
 'Cater',
 'Caused',
 'Caveat',
 'CbZF1d0021swQuc57kfqHt',
 'Cechetto',
 'Ceder',
 'Celebratory',
 'Center',
 'Cereal',
 'Ceremony',
 'Cerrahi',
 'Certolizumab',
 'Cervical',
 'Chair',
 'Chalfie']

In [23]:
print(myReader.field_length("file_content"))

29729


In [24]:
# For a few probe terms, report in how many documents of the baseline index
# each one occurs.
docFrequencyProbes = (
    ("# docs with 'bit'", "bit"),
    ("# docs with 'are'", "are"),
    ("# docs with 'get'", "get"),
)
for probeLabel, probeTerm in docFrequencyProbes:
    print(probeLabel, myReader.doc_frequency("file_content", probeTerm))

# docs with 'bit' 1
# docs with 'are' 11
# docs with 'get' 6


In [29]:
# Second analyzer configuration: tokenize, lowercase, split intra-word parts,
# drop stop words, then stem. Filters run left to right in the order chained.
# NOTE(review): LowercaseFilter runs before IntraWordFilter, so any splitting
# on case transitions (e.g. "PowerShot") presumably cannot trigger - confirm
# against the whoosh IntraWordFilter docs whether this order is intended.
stmLwrStpIntraAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()
# Show the tokens the analyzer produces for a sample sentence.
[token.text for token in stmLwrStpIntraAnalyzer("i'm going to do Text Analysis with whoosh.analysis")]

['go', 'do', 'text', 'analysi', 'whoosh', 'analysi']

In [30]:
# Same two fields as the baseline schema, but file_content is analyzed with
# stmLwrStpIntraAnalyzer instead of a bare RegexTokenizer.
mySchema2 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = stmLwrStpIntraAnalyzer))

In [31]:
# Start the second index from scratch: remove a directory left over from a
# previous run, then recreate it empty.
if os.path.isdir(INDEX_DIR2):
    shutil.rmtree(INDEX_DIR2)

os.makedirs(INDEX_DIR2)

# Create the second index with the schema that uses the richer analyzer.
myIndex2 = index.create_in(INDEX_DIR2, mySchema2)

In [32]:
# Add every file in filesToIndex to the second index, printing a progress line
# for the first document and then after every further 1000 documents.
myWriter2 = writing.BufferedWriter(myIndex2, period=20, limit=1000)

try:
    for docNum, filePath in enumerate(filesToIndex):
        # Decode explicitly instead of relying on the locale's default
        # encoding; errors="replace" keeps a single undecodable byte in one
        # e-mail from aborting the whole indexing run.
        with open(filePath, "r", encoding="utf-8", errors="replace") as f:
            fileContent = f.read()
        # The file is already closed here; only its text is handed to whoosh.
        myWriter2.add_document(file_path = filePath,
                               file_content = fileContent)

        if docNum % 1000 == 0:
            print("already indexed:", docNum+1)
    print("done indexing.")

finally:
    # Close the writer even if reading or adding a document failed.
    myWriter2.close()

already indexed: 1
done indexing.


In [33]:
# Parser and searcher for the second index. The parser is built from that
# index's schema, so it is tied to the schema defined with
# stmLwrStpIntraAnalyzer rather than the baseline one.
myQueryParser2 = QueryParser("file_content", schema=myIndex2.schema)
mySearcher2 = myIndex2.searcher()

In [34]:
# Run every topic against the second index and record the hits in the run
# format consumed by trec_eval, one line per hit:
#   <topic id> Q0 <file name> <rank> <score> test

# `with` closes each file even when parsing or searching raises.
with open(QUER_FILE, "r") as topicsFile:
    # Ignore blank lines (e.g. a trailing empty line): they carry no topic and
    # would make the two-way unpacking below fail.
    topics = [line for line in topicsFile.read().splitlines() if line.strip()]

with open(OUTPUT_FILE2, "w") as outputTRECFile2:
    for topic in topics:
        # A topic line is "<id> <query text>"; split on the first space only so
        # multi-word queries stay intact.
        topic_id, topic_phrase = topic.split(" ", 1)
        topicQuery = myQueryParser2.parse(topic_phrase)
        topicResults = mySearcher2.search(topicQuery, limit=None)
        # `rank` is the 0-based position of the hit in the result list.
        for rank, result in enumerate(topicResults):
            score = topicResults.score(rank)
            outputTRECFile2.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), rank, score))

In [35]:
!$TREC_EVAL -q $QRELS_FILE $OUTPUT_FILE2

num_ret        	01	3
num_rel        	01	3
num_rel_ret    	01	3
map            	01	1.0000
R-prec         	01	1.0000
bpref          	01	1.0000
recip_rank     	01	1.0000
ircl_prn.0.00  	01	1.0000
ircl_prn.0.10  	01	1.0000
ircl_prn.0.20  	01	1.0000
ircl_prn.0.30  	01	1.0000
ircl_prn.0.40  	01	1.0000
ircl_prn.0.50  	01	1.0000
ircl_prn.0.60  	01	1.0000
ircl_prn.0.70  	01	1.0000
ircl_prn.0.80  	01	1.0000
ircl_prn.0.90  	01	1.0000
ircl_prn.1.00  	01	1.0000
P5             	01	0.6000
P10            	01	0.3000
P15            	01	0.2000
P20            	01	0.1500
P30            	01	0.1000
P100           	01	0.0300
P200           	01	0.0150
P500           	01	0.0060
P1000          	01	0.0030
num_ret        	02	11
num_rel        	02	8
num_rel_ret    	02	8
map            	02	0.9207
R-prec         	02	0.8750
bpref          	02	0.7500
recip_rank     	02	1.0000
ircl_prn.0.00  	02	1.0000
ircl_prn.0.10  	02	1.0000
ircl_prn.0.20  	02	1.0000
ircl_prn.0.30  	02	1.0000
irc

In [36]:
# Repeat the document-frequency probe on the second index so the effect of the
# richer analyzer can be compared with the baseline numbers.
myReader2 = myIndex2.reader()
docFrequencyProbes2 = (
    ("# docs with 'bit'", "bit"),
    ("# docs with 'are'", "are"),
    ("# docs with 'get'", "get"),
)
for probeLabel, probeTerm in docFrequencyProbes2:
    print(probeLabel, myReader2.doc_frequency("file_content", probeTerm))

# docs with 'bit' 11
# docs with 'are' 0
# docs with 'get' 7


#### compare to NLTK's stemmers and lemmatizers

In [37]:
import nltk
from nltk.stem import *

In [38]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/notebook/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [39]:
# The three NLTK normalizers compared below: two stemmers and one lemmatizer.
lrStem = LancasterStemmer()
# Snowball stemmer configured for English.
sbStem = SnowballStemmer("english")
# WordNet-based lemmatizer.
wnLemm = WordNetLemmatizer()

In [40]:
# Sample words fed to each stemmer/lemmatizer in the comparison table below.
listWords = ["going", "saying", "minimize", "maximum", 
             "meeting", "files", "tries", "is", "are", "beautiful",
             "summarize", "better", "dogs", "phenomena"]

In [41]:
# One row per word, four right-aligned columns of width 15:
# Lancaster stem | Snowball stem | WordNet lemma (default POS) | WordNet lemma as verb
rowFormat = "%15s %15s %15s %15s"
for word in listWords:
    lancasterStem = lrStem.stem(word)
    snowballStem = sbStem.stem(word)
    defaultLemma = wnLemm.lemmatize(word)
    verbLemma = wnLemm.lemmatize(word, 'v')
    print(rowFormat % (lancasterStem, snowballStem, defaultLemma, verbLemma))

          going              go           going              go
            say             say          saying             say
          minim           minim        minimize        minimize
          maxim         maximum         maximum         maximum
           meet            meet         meeting            meet
            fil            file            file            file
            tri             tri             try             try
             is              is              is              be
             ar             are             are              be
         beauty          beauti       beautiful       beautiful
           summ          summar       summarize       summarize
            bet          better          better          better
            dog             dog             dog             dog
       phenomen       phenomena      phenomenon       phenomena


### whoosh: wrapping NLTK stemmers and lemmatizers as analysis filters

In [42]:
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Whoosh filter that rewrites each token's text with an arbitrary callable.

    This lets any function that maps a string to a string (for example an NLTK
    stemmer's ``stem`` or a lemmatizer's ``lemmatize``) be chained into an
    analyzer with ``|``.

    Parameters
    ----------
    filterFunc : callable
        Called as ``filterFunc(text, *args, **kwargs)`` for every token; the
        return value becomes the token's new text.
    *args, **kwargs
        Extra arguments forwarded to ``filterFunc`` after the token text.
    """
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        """Return True when ``other`` is an instance of exactly this class.

        Only the class is compared; the wrapped callable and its arguments are
        not. The original definition took no ``other`` parameter, so every
        ``==`` comparison raised TypeError.
        """
        return (other is not None
                and self.__class__ is other.__class__)

    def __call__(self, tokens):
        """Yield each incoming token after replacing its text in place."""
        # The same transformation is applied whatever the token's mode, so
        # text is normalized identically at index time and at query time.
        for t in tokens:
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [43]:
# NLTK's LancasterStemmer wrapped as a whoosh filter. Despite the name,
# myFilter1 is a complete analyzer (tokenizer | filter) and is called directly
# on a sample sentence to show the tokens it produces.
myFilter1 = RegexTokenizer() | CustomFilter(LancasterStemmer().stem)
[token.text for token in myFilter1("We are going to do Text Analysis with whoosh.analysis")]

['we', 'ar', 'going', 'to', 'do', 'text', 'analys', 'with', 'whoosh.analysis']

In [44]:
# NLTK's WordNetLemmatizer wrapped as a whoosh filter, called with its default
# arguments (no part-of-speech given). No lowercasing filter is chained in.
myFilter2 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize)
[token.text for token in myFilter2("We are going to do Text Analysis with whoosh.analysis")]

['We',
 'are',
 'going',
 'to',
 'do',
 'Text',
 'Analysis',
 'with',
 'whoosh.analysis']

In [45]:
# NLTK's WordNetLemmatizer for verbs: the extra 'v' is forwarded by
# CustomFilter as the second positional argument of lemmatize() for every token.
myFilter3 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize, 'v')
[token.text for token in myFilter3("We are going to do Text Analysis with whoosh.analysis")]

['We', 'be', 'go', 'to', 'do', 'Text', 'Analysis', 'with', 'whoosh.analysis']