# Getting started

In [1]:
from whoosh.index import create_in
from whoosh.fields import *

In [2]:
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

In [4]:
print(schema)

<Schema: ['content', 'path', 'title']>


Make sure you have the directory "indexdir" created beforehand in the folder where you start the notebook

In [6]:
ix = create_in("indexdir", schema)
writer = ix.writer()

In [7]:
writer.add_document(title=u"First document", path=u"/a",content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", path=u"/b",content=u"The second one is even more interesting!")
writer.commit()

In [8]:
from whoosh.qparser import QueryParser

the query we will use now is "first"

In [9]:
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    print(results[0], results[0].score)

<Hit {'path': '/a', 'title': 'First document'}> 1.047619047619048


# Custom ranking functions

In [11]:
from whoosh import scoring

In [12]:
w = scoring.TF_IDF()

In [13]:
with ix.searcher(weighting =w) as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    print(results[0], results[0].score)

<Hit {'path': '/a', 'title': 'First document'}> 1.0


You can define a custom scoring function too. pos_score_fn computes a score for a given document using only one field. Here the score is based on the first occurence (position) of the query term.

In [14]:
def pos_score_fn(searcher, fieldname, text, matcher):
    poses = matcher.value_as("positions")
    return 1.0 / (poses[0] + 1)

pos_weighting = scoring.FunctionWeighting(pos_score_fn)

In [15]:
with ix.searcher(weighting =pos_weighting) as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    print(results[0], results[0].score)

<Hit {'path': '/a', 'title': 'First document'}> 0.25


# Indexing a collection and computing metrics

In [45]:
import csv

In [None]:
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    for row in csv_reader:
        yield [unicode(cell, 'utf-8') for cell in row]

In [46]:
def read_file(file_path, delimiter='\t'):
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='|', quoting=csv.QUOTE_MINIMAL)
        doc_list = []
        for row in reader:
            try:
                doc_list.append((row[0],row[1], row[2].replace('\n',' ')))
                print((row[0],row[1], row[2].replace('\n',' ')))
            except Exception as e:
                print(e);
                print((row[0],row[1], row[2].replace('\n',' ')))

    return doc_list

In [44]:
try:
    doc_list = read_file("collection.tsv")
except Exception as e:
    print(e)

('п»їclueweb09-en0010-79-02218', '15.8423805237', 'Create your Own Trivia Quiz! Don\'t just play, contribute! Developing your own trivia quizzes is the most fun and intellectually challenging part in being a Masters of Trivia Contributor... You can select from thousands of categories and topics to write for. Is your passion "baseball", or more specifically the "Boston Red Sox"? Drill down our category tree and find the topic that fits your interest best. Then start creating your game. Remember to assign a clear and descriptive title to your trivia quiz. People use our search function in a logical way. For example, if you are writing a quiz about "african elephants", entitle your quiz "African Elephants" or "The African Elephant Quiz". But if your quiz is about "african elephant poaching", your title should reflect the content of your quiz as faithfully as possible. Hence, "African Elephant Poaching" or "Poaching of African Elephants" would be appropriate titles.  Try to capitalize the 

In [24]:
print(len(doc_list),'\n', doc_list[0])

NameError: name 'doc_list' is not defined

In [21]:
schema = Schema(id=ID(stored=True), content=TEXT)
ix = create_in("cw_index", schema)
writer = ix.writer()

In [22]:
for doc in doc_list[:1000]:
    writer.add_document(id=doc[0],content=doc[2])
writer.commit()

NameError: name 'doc_list' is not defined

In [32]:
query_str =  "403b"
result_list = []

In [34]:
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(query_str)
    results = searcher.search(query, limit=None)
    print("Results found:", len(results))
    for result in results:
        print(result['id'], result.score)
        result_list.append(result['id'])

Results found: 288
﻿﻿clueweb09-en0010-82-12593 4.68235845867756
clueweb09-en0002-19-09466 4.606890901190984
clueweb09-en0010-82-12588 4.576296584299555
clueweb09-en0010-82-12589 4.574809817919019
clueweb09-en0010-82-12587 4.483470812852932
clueweb09-en0009-84-33862 4.450273110200792
clueweb09-en0008-24-06210 4.341685424381006
clueweb09-en0000-14-03360 4.234154273566411
clueweb09-en0004-01-03541 4.194615271966087
clueweb09-en0004-80-00508 4.193528868778884
clueweb09-en0008-41-10509 4.1769417838009195
clueweb09-en0009-52-27808 4.111156204215051
clueweb09-en0009-82-05607 4.097154045886272
clueweb09-en0007-55-00718 4.065995269601156
clueweb09-en0001-41-08510 4.035306840104864
clueweb09-en0010-82-12518 3.9562982164117613
clueweb09-en0007-55-00719 3.9464327968911745
clueweb09-en0008-24-06121 3.913863776352913
clueweb09-en0009-94-31189 3.8456964156242117
clueweb09-en0005-82-07314 3.813692823434889
clueweb09-en0008-62-39243 3.746270626291283
clueweb09-en0003-88-24169 3.738143499793505
clueweb0

In [35]:
def read_qrels(file_path, delimiter=' '):
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='|', quoting=csv.QUOTE_MINIMAL)
        qrels = {}
        for row in reader:
            qrels[row[0]] = int(row[1])

    return qrels

In [36]:
qrels_hash = read_qrels("403b-qrels.csv")

In [48]:
def precision(doc_list, qrels, k=10):
    f = lambda x: qrels[x] if x in qrels else 0
    vals = list(map(lambda q: 1 if q>0 else 0, map(f, doc_list[:k])))
    print(vals)
    return sum(vals)/k

In [49]:
precision(result_list, qrels_hash, k=15)

[0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1]


0.5333333333333333

In [52]:
result_list = []
with ix.searcher(weighting =w) as searcher:
    query = QueryParser("content", ix.schema).parse(query_str)
    results = searcher.search(query, limit=None)
    print("Results found:", len(results))
    for result in results:
        print(result['id'], result.score)
        result_list.append(result['id'])

Results found: 288
﻿﻿clueweb09-en0010-82-12593 105.34244377087613
clueweb09-en0010-82-12589 73.96384349870026
clueweb09-en0010-82-12588 60.51587195348203
clueweb09-en0010-82-12518 60.51587195348203
clueweb09-en0010-82-12587 56.03321477174262
clueweb09-en0002-19-09466 44.8265718173941
clueweb09-en0004-37-02167 40.34391463565469
clueweb09-en0004-01-03541 35.86125745391528
clueweb09-en0009-84-33862 26.89594309043646
clueweb09-en0007-18-30966 24.654614499566755
clueweb09-en0003-93-27146 20.171957317827346
clueweb09-en0011-52-03669 20.171957317827346
clueweb09-en0004-80-00508 17.93062872695764
clueweb09-en0003-95-21017 15.689300136087935
clueweb09-en0008-41-10509 11.206642954348524
clueweb09-en0011-67-00065 11.206642954348524
clueweb09-en0009-79-14364 11.206642954348524
clueweb09-en0007-55-00719 8.96531436347882
clueweb09-en0008-24-06210 8.96531436347882
clueweb09-en0010-82-12592 8.96531436347882
clueweb09-en0007-18-30977 8.96531436347882
clueweb09-en0003-88-24170 8.96531436347882
clueweb09

In [53]:
precision(result_list, qrels_hash, k=15)

[0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]


0.4666666666666667