# Lab 2.a. Basic Indexing

## Part 1: Preparation for lab

Install whoosh, an open-source python IR system:

In [None]:
!pip --quiet install whoosh
!pip --quiet install nltk

### Upload the provided dataset DSS_Fall2016_Assign1.zip to /resources/data using "My Data"

Extract the dataset

In [None]:
!unzip -q /resources/data/DSS_Fall2016_Assign1.zip -d /resources/data

Compile TREC_EVAL, which we will use later to evaluate our performance

In [None]:
# Build trec_eval inside its source directory (-C); -s silences the command echo
# and -w prints the directory being entered/left.
# Both stdout and stderr are redirected to /dev/null, so a failed build is not
# reported by this cell.
!make -s -w -C /resources/data/DSS_Fall2016_Assign1/trec_eval.8.1 > /dev/null 2>&1

## Part 2: Building the Index

Import necessary libraries: whoosh(for IR) and os,shutil(for working with files)

In [None]:
import os, os.path
import shutil
from itertools import islice

from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser

Define constants for the paths of our dataset

In [None]:
# Everything used in this lab lives under the extracted assignment archive;
# the test collection is one directory below it.
ASSIGNMENT_ROOT = "/resources/data/DSS_Fall2016_Assign1"
COLLECTION_ROOT = ASSIGNMENT_ROOT + "/lab1-q1-test-collection"

# inputs: documents to index, evaluation topics and relevance judgements
DOCUMENTS_DIR = COLLECTION_ROOT + "/documents"
INDEX_DIR = COLLECTION_ROOT + "/index1"
QUER_FILE = COLLECTION_ROOT + "/topics/air.topics"
QRELS_FILE = COLLECTION_ROOT + "/qrels/air.qrels"
OUTPUT_FILE = COLLECTION_ROOT + "/myres"

# the trec_eval executable
TREC_EVAL = ASSIGNMENT_ROOT + "/trec_eval.8.1/trec_eval"

# second index / result file, used in Lab 2.b
INDEX_DIR2 = COLLECTION_ROOT + "/index2"
OUTPUT_FILE2 = COLLECTION_ROOT + "/myres2"

### Building the index

In [None]:
# Schema of the first index:
#   file_path    - ID field, stored so it can be read back from a search hit
#   file_content - text field tokenized with a plain RegexTokenizer
pathField = ID(stored=True)
contentField = TEXT(analyzer=RegexTokenizer())
mySchema = Schema(file_path=pathField, file_content=contentField)

In [None]:
# Start from a clean slate: drop an index directory left over from an earlier
# run, recreate it, and build a new empty index with mySchema inside it.
staleIndexPresent = os.path.isdir(INDEX_DIR)
if staleIndexPresent:
    shutil.rmtree(INDEX_DIR)
os.makedirs(INDEX_DIR)

myIndex = index.create_in(INDEX_DIR, mySchema)

### Indexing the files

In [None]:
# First, lets review the documents in our sample dataset
!ls $DOCUMENTS_DIR

In [None]:
# Collect the full path of every file below DOCUMENTS_DIR (walking into
# sub-directories), leaving out files whose name starts with a dot.
filesToIndex = [
    os.path.join(dirPath, name)
    for dirPath, _subDirs, names in os.walk(DOCUMENTS_DIR)
    for name in names
    if not name.startswith('.')
]

In [None]:
# sanity check: show the first 5 collected paths, one per line
print(*filesToIndex[:5], sep="\n")

In [None]:
# how many files are about to be indexed?
numFilesToIndex = len(filesToIndex)
print("number of files:", numFilesToIndex)

In [None]:
# Writer that buffers added documents; period/limit are passed straight to
# Whoosh's BufferedWriter (presumably seconds between commits and max buffered
# documents — see the Whoosh writing docs).
myWriter = writing.BufferedWriter(myIndex, period=20, limit=1000)

try:
    # add every collected file as one document: its path plus its full text
    for position, path in enumerate(filesToIndex):
        with open(path, "r") as handle:
            text = handle.read()
            myWriter.add_document(file_path=path, file_content=text)

            # progress line on documents 1, 1001, 2001, ...
            if position % 1000 == 0:
                print("already indexed:", position + 1)
    print("done indexing.")

finally:
    # closing the writer saves the index, even if indexing failed part-way
    myWriter.close()

## Part 3: Querying

In [None]:
# a searcher over the index we just built, and a parser that turns query
# strings into queries on the "file_content" field
mySearcher = myIndex.searcher()
myQueryParser = QueryParser("file_content", schema=myIndex.schema)

In [None]:
# try the index out with the one-word query "item", asking for all hits
sampleQuery = myQueryParser.parse("item")
sampleQueryResults = mySearcher.search(sampleQuery, limit=None)

# one output line per hit: file name, 0-based position in the result list, score
for rank, hit in enumerate(sampleQueryResults):
    hitName = os.path.basename(hit["file_path"])
    hitScore = sampleQueryResults.score(rank)
    print(hitName, rank, hitScore)

## Part 4: Evaluate our results using TREC_EVAL

### TREC_EVAL

In order to evaluate our results we will use a topic file - a list of topics we use to evaluate our IR system

In [None]:
# print the topic file
!cat $QUER_FILE

We will compare our results with a set of judged results (the qrels file) using TREC_EVAL

In [None]:
# print the first 10 lines in the qrels file
!head -n 10 $QRELS_FILE

### Evaluation

First we build a file with our results according to the TREC_EVAL format (see assignment PDF for more details)

In [None]:
# Load the topic file - a list of topics (search phrases) used for evaluation.
# Each line is "<topic id> <search phrase>".
# `with` closes the file even if reading raises.
with open(QUER_FILE, "r") as topicsFile:
    topics = topicsFile.read().splitlines()

# For each evaluated topic: build a query and record every hit in the output
# file in TREC_EVAL format. `with` guarantees the results file is flushed and
# closed even if parsing or searching fails part-way through.
with open(OUTPUT_FILE, "w") as outputTRECFile:
    for topic in topics:
        # skip blank lines instead of crashing on the unpack below
        if not topic.strip():
            continue
        topic_id, topic_phrase = topic.split(" ", 1)
        topicQuery = myQueryParser.parse(topic_phrase)
        topicResults = mySearcher.search(topicQuery, limit=None)
        for (docnum, result) in enumerate(topicResults):
            score = topicResults.score(docnum)
            # columns: topic id, literal "Q0", document name, rank, score, run tag
            outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

Now, we use TREC_EVAL to compare our results with the provided qrels file

In [None]:
!$TREC_EVAL -q $QRELS_FILE $OUTPUT_FILE

# Lab 2.b. Evaluating different configurations

## Inspecting our index

In [None]:
# Quick sanity checks on the index built in Lab 2.a

# does it contain anything at all?
indexIsEmpty = myIndex.is_empty()
print("Index is empty?", indexIsEmpty)

# how many documents were indexed?
indexedFileCount = myIndex.doc_count()
print("Number of indexed files:", indexedFileCount)

In [None]:
# define a reader object on the index
# (the following cells use it to look at stored documents, indexed terms
#  and document frequencies)
myReader = myIndex.reader()

In [None]:
# print first 5 indexed documents as (docnum, stored fields) pairs
# islice stops after five entries instead of building a list of every
# document in the index just to slice it
list(islice(myReader.iter_docs(), 5))

In [None]:
# list indexed terms number 1000..1024 for field "file_content"
# islice skips ahead and stops at 1025 instead of building a list of every
# term in the field just to slice it
list(islice(myReader.field_terms("file_content"), 1000, 1025))

In [None]:
# how many terms do we have?
# NOTE(review): per the Whoosh reader documentation field_length() is the total
# number of terms in the field (occurrences summed over all documents), not the
# number of distinct terms — confirm against the installed Whoosh version.
print(myReader.field_length("file_content"))

In [None]:
# how many documents contain the words "bit", "are" and "get"
#   in the field "file_content"?
for word in ("bit", "are", "get"):
    print("# docs with '%s'" % word, myReader.doc_frequency("file_content", word))

## Text Analyzers

In [None]:
# simplest possible analysis: a bare tokenizer with no filters
tokenizer = RegexTokenizer()
plainTokens = tokenizer("We are going to do Text Analysis with whoosh.analysis")
[tok.text for tok in plainTokens]

In [None]:
# same tokenizer, now followed by a stemming filter
stmAnalyzer = RegexTokenizer() | StemFilter()
stemmedTokens = stmAnalyzer("We are going to do Text Analysis with whoosh.analysis")
[tok.text for tok in stemmedTokens]

In [None]:
# lower-case every token before stemming it:
# LowercaseFilter goes between the tokenizer and the stemmer
stmLwrAnalyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()
loweredTokens = stmLwrAnalyzer("We are going to do Text Analysis with whoosh.analysis")
[tok.text for tok in loweredTokens]

In [None]:
# words like "we", "are", "with" should not be indexed at all:
# StopFilter (placed after lower-casing, before stemming) drops stop words
stmLwrStpAnalyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter()
keptTokens = stmLwrStpAnalyzer("We are going to do Text Analysis with whoosh.analysis")
[tok.text for tok in keptTokens]

In [None]:
# compound tokens such as "whoosh.analysis" should become "whoosh" and "analysis":
# IntraWordFilter is added right after lower-casing to split them
stmLwrStpIntraAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()
splitTokens = stmLwrStpIntraAnalyzer("We are going to do Text Analysis with whoosh.analysis")
[tok.text for tok in splitTokens]

## Evaluating the new analyzers

In [None]:
# Schema of the second index: same two fields as the first one, but the text
# field is analyzed with the full lower-case / intra-word / stop-word / stemming chain
pathField2 = ID(stored=True)
contentField2 = TEXT(analyzer=stmLwrStpIntraAnalyzer)
mySchema2 = Schema(file_path=pathField2, file_content=contentField2)

In [None]:
# Start from a clean slate: drop a second-index directory left over from an
# earlier run, recreate it, and build a new empty index with mySchema2 in it.
staleIndex2Present = os.path.isdir(INDEX_DIR2)
if staleIndex2Present:
    shutil.rmtree(INDEX_DIR2)
os.makedirs(INDEX_DIR2)

myIndex2 = index.create_in(INDEX_DIR2, mySchema2)

In [None]:
# Writer for the second index; period/limit are passed straight to Whoosh's
# BufferedWriter (presumably seconds between commits and max buffered
# documents — see the Whoosh writing docs).
myWriter2 = writing.BufferedWriter(myIndex2, period=20, limit=1000)

try:
    # add every collected file as one document: its path plus its full text
    for position, path in enumerate(filesToIndex):
        with open(path, "r") as handle:
            text = handle.read()
            myWriter2.add_document(file_path=path, file_content=text)

            # progress line on documents 1, 1001, 2001, ...
            if position % 1000 == 0:
                print("already indexed:", position + 1)
    print("done indexing.")

finally:
    # closing the writer saves the index, even if indexing failed part-way
    myWriter2.close()

In [None]:
# a searcher over the second index, and a parser that turns query strings
# into queries on its "file_content" field
mySearcher2 = myIndex2.searcher()
myQueryParser2 = QueryParser("file_content", schema=myIndex2.schema)

In [None]:
# Load the topic file - a list of topics (search phrases) used for evaluation.
# Each line is "<topic id> <search phrase>".
# `with` closes the file even if reading raises.
with open(QUER_FILE, "r") as topicsFile:
    topics = topicsFile.read().splitlines()

# For each evaluated topic: build a query against the second index and record
# every hit in the output file in TREC_EVAL format. `with` guarantees the
# results file is flushed and closed even if parsing or searching fails.
with open(OUTPUT_FILE2, "w") as outputTRECFile2:
    for topic in topics:
        # skip blank lines instead of crashing on the unpack below
        if not topic.strip():
            continue
        topic_id, topic_phrase = topic.split(" ", 1)
        topicQuery = myQueryParser2.parse(topic_phrase)
        topicResults = mySearcher2.search(topicQuery, limit=None)
        for (docnum, result) in enumerate(topicResults):
            score = topicResults.score(docnum)
            # columns: topic id, literal "Q0", document name, rank, score, run tag
            outputTRECFile2.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

In [None]:
!$TREC_EVAL -q $QRELS_FILE $OUTPUT_FILE2

In [None]:
# count the same three words again, this time in the second index
myReader2 = myIndex2.reader()
for word in ("bit", "are", "get"):
    print("# docs with '%s'" % word, myReader2.doc_frequency("file_content", word))

**Can you explain the differences?**

## Using NLTK's stemmers and lemmatizers

In [None]:
import nltk
from nltk.stem import *

In [None]:
# download required resources
nltk.download("wordnet")
# NOTE(review): newer NLTK releases may also need the "omw-1.4" resource before
# WordNet can be loaded; nltk is installed unpinned above, so fetch it too.
# Confirm against the installed NLTK version.
nltk.download("omw-1.4")

In [None]:
# the three candidates for the comparison: one lemmatizer and two stemmers
wnLemm = WordNetLemmatizer()
sbStem = SnowballStemmer("english")
lrStem = LancasterStemmer()

In [None]:
# words on which the stemmers and the lemmatizer are compared
listWords = ("going saying minimize maximum "
             "meeting files tries is are beautiful "
             "summarize better dogs phenomena").split()

In [None]:
# one row per word, four right-aligned columns:
# Lancaster stem, Snowball stem, lemma (default part of speech), lemma as a verb
rowFormat = "%15s %15s %15s %15s"
for word in listWords:
    columns = (lrStem.stem(word),
               sbStem.stem(word),
               wnLemm.lemmatize(word),
               wnLemm.lemmatize(word, 'v'))
    print(rowFormat % columns)

## How to use NLTK stemmers / lemmatizers in Whoosh

In [None]:
# Don't change this! Use it as-is in your code
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Whoosh filter that rewrites each token's text with a caller-supplied function.

    Parameters
    ----------
    filterFunc : callable
        Called as ``filterFunc(token_text, *args, **kwargs)``; its return value
        replaces the token's text (e.g. an NLTK stemmer's ``stem`` method).
    *args, **kwargs
        Extra arguments forwarded to ``filterFunc`` on every call.
    """

    # presumably marks the filter as one that changes token text morphologically,
    # like stemming — confirm against the Whoosh analysis docs
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        """Return True when ``other`` is an instance of exactly this class.

        The wrapped function and its arguments are not compared.
        """
        return bool(other) and self.__class__ is other.__class__

    def __call__(self, tokens):
        """Yield every incoming token after replacing its text in place."""
        for t in tokens:
            # The same transformation is applied whether the token comes from
            # the query parser (t.mode == 'query') or from the indexer.
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [None]:
# Example1: wrap NLTK's LancasterStemmer in a Whoosh filter
lancasterStem = LancasterStemmer().stem
myFilter1 = RegexTokenizer() | CustomFilter(lancasterStem)
[tok.text for tok in myFilter1("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# Example2: wrap NLTK's WordNetLemmatizer in a Whoosh filter
wordnetLemmatize = WordNetLemmatizer().lemmatize
myFilter2 = RegexTokenizer() | CustomFilter(wordnetLemmatize)
[tok.text for tok in myFilter2("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# Example3: same lemmatizer, with 'v' forwarded so words are lemmatized as verbs
verbLemmatize = WordNetLemmatizer().lemmatize
myFilter3 = RegexTokenizer() | CustomFilter(verbLemmatize, 'v')
[tok.text for tok in myFilter3("We are going to do Text Analysis with whoosh.analysis")]

You can now use myFilter1/2/3 as part of your Schema

**You can find details of other NLTK Stemmers and Lemmatizers here:**

http://www.nltk.org/api/nltk.stem.html