**Exercise 1:**  

Make sure you understand the search algorithm described in the lecture notes, in both its slow and its efficient versions. Describe
the number of sums you need to perform in the slow and the quick versions for the following toy example, with a vocabulary of size 4 and four documents:

- $q = 0,1,1,0$

- document-term matrix:
<center>


|        | t1  | t2  | t3  | t4  |
|--------|-----|-----|-----|-----|
| **d1** | 1.2 | 0.0 | 0.0 | 0.0 |
| **d2** | 0.7 | 0.3 | 1.5 | 0.1 |
| **d3** | 0.0 | 0.0 | 0.0 | 0.7 |
| **d4** | 2.0 | 0.0 | 0.0 | 0.0 |

</center>

---

In [None]:
# Toy example from Exercise 1: query vector over the 4-term vocabulary.
q = [0, 1, 1, 0]

# Dense document-term weight matrix: one row per document (d1..d4),
# one column per term (t1..t4).
DMat = [
    [1.2, 0, 0, 0],
    [0.7, 0.3, 1.5, 0.1],
    [0, 0, 0, 0.7],
    [2, 0, 0, 0],
]

In [None]:
## Slow version
# Dense scan: every document row is paired with every query position, so one
# sum is counted per (document, term) pair regardless of the stored values.

count = sum(1 for _row in DMat for _position in q)

print(count)

16


In [None]:
## Fast version
# Term-major layout: DMatT[j] is the column of DMat for term j, holding one
# entry per document.
DMatT = [list(column) for column in zip(*DMat)]

# Only terms with a nonzero query weight are visited; each visited column
# contributes one sum per entry (zeros included, because the column is dense).
# NOTE(review): a posting list that stores only nonzero weights would need
# fewer sums here - confirm which count the exercise expects.
count = sum(len(DMatT[j]) for j, q_weight in enumerate(q) if q_weight != 0)

print(count)

8


**Exercise 2:**

Implement the quick version; run both the slow and the quick versions and report their running times (as a reference, on my old laptop it takes around 5m30s to run the slow version in the code above). Make sure both versions return the same answer. Note that you will need to build an inverted index in order to implement the efficient version, as explained in class; building it may take time, but this is done once for all queries and can be done "off-line".

---

In [None]:
from elasticsearch.helpers import scan
from pprint import pprint
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index, analyzer, tokenizer
from elasticsearch.exceptions import NotFoundError
import tqdm
import numpy as np

In [None]:
# Connect to the ElasticSearch server on localhost; `client` is the handle the
# rest of the notebook uses for every request.
client = Elasticsearch("http://localhost:9200", request_timeout=1000)

# Handle for the 'arxiv' index, configured with a single shard.
my_ind = Index('arxiv', using=client)
my_ind.settings(number_of_shards=1)

try:
    # drop the index if it already exists so the cell can be re-run from scratch
    my_ind.delete()
except NotFoundError:
    # nothing to drop on the first run
    pass

# create it
my_ind.create()

# create new analyzer: letter tokenizer followed by lowercase, asciifolding,
# stop and snowball token filters; naming it 'default' presumably makes it the
# index-wide default analyzer - confirm against the ElasticSearch docs
my_analyzer = analyzer('default',
    type='custom',
    tokenizer=tokenizer('letter'),
    filter=['lowercase', 'asciifolding', 'stop', 'snowball']
)

# close to update analyzer to custom `my_analyzer`, then save and reopen;
# the order of these four calls matters
my_ind.close()
my_ind.analyzer(my_analyzer)
my_ind.save()
my_ind.open()

# Index the arXiv corpus into 'arxiv' with the helper script (IPython magic, not plain Python).
# NOTE(review): the --path value is machine-specific. The index settings echoed by the script list
# only the 'snowball' filter, so it presumably recreates the index and overrides the analyzer
# configured above - confirm.
%run -i IndexFilesPreprocess.py --path="/Users/lauragarcialopez/Documents/uni/4o/4A/CAI/Labs/Lab 03/arxiv" --index=arxiv --token='letter' --filter='snowball'

Indexing 58103 files
Reading files ...
Index settings= {'arxiv': {'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'arxiv', 'creation_date': '1697190074277', 'analysis': {'analyzer': {'default': {'filter': ['snowball'], 'type': 'custom', 'tokenizer': 'letter'}}}, 'number_of_replicas': '1', 'uuid': 'c_wu2SHWR6eYuiBELFOZ7w', 'version': {'created': '8100099'}}}}}
Indexing ...


In [None]:
import math

def encode_doc(doc_id):
    """Return the tf-idf weights of one indexed document.

    The term vector of field 'text' is requested from the 'arxiv' index and
    each term is weighted as

        (term_freq / max term_freq in the document) * log2(D / doc_freq)

    where D is the ``doc_count`` reported in the field statistics.

    Args:
        doc_id: ElasticSearch ``_id`` of the document.

    Returns:
        dict mapping term -> weight; empty when the document has no 'text'
        term vector or the vector holds no terms.
    """
    tv = client.termvectors(index='arxiv', id=doc_id, fields=['text'], term_statistics=True, positions=False)

    term_vectors = tv['term_vectors']
    # Guard BEFORE touching the field: some documents may have no 'text' field.
    if 'text' not in term_vectors:
        return {}

    text_vector = term_vectors['text']
    terms = text_vector['terms']
    if not terms:
        return {}

    D = text_vector['field_statistics']['doc_count']

    # The maximum frequency must be known before any weight is computed;
    # updating it inside the weighting loop would normalise early terms by a
    # partial maximum and make the weights depend on iteration order.
    maxf = max(info['term_freq'] for info in terms.values())

    return {
        word: (info['term_freq'] / maxf) * math.log2(D / info['doc_freq'])
        for word, info in terms.items()
    }

def scalar_product(doc1, doc2):
    """Return the dot product of two sparse vectors stored as term -> weight dicts.

    Only terms present in both dicts contribute; the result is 0 when the
    dicts share no term.
    """
    total = 0
    for term, weight in doc1.items():
        if term in doc2:
            total += weight * doc2[term]
    return total

def my_norm(doc):
    """Return the Euclidean (L2) norm of a term -> weight dict."""
    squared_length = scalar_product(doc, doc)
    return math.sqrt(squared_length)

In [None]:
## SLOW VERSION
def SlowSearch(query, r):
    sims = dict()

    l2query  = np.sqrt(len(query.split()))  # l2 of query assuming 0-1 vector representation

    # get nr. of docs; just for the progress bar
    ndocs = int(client.cat.count(index='arxiv', format = "json")[0]['count'])

    # scan through docs, compute cosine sim between query and each doc
    for s in tqdm.tqdm(scan(client, index='arxiv', query={"query" : {"match_all": {}}}), total=ndocs):
        docid = s['_source']['path']   # use path as id
        weights = encode_doc(s['_id'])   # gets weights as a python dict of term -> weight
        sims[docid] = 0.0
        for w in query.split():  # gets terms as a list
            if w in weights.keys():    # probably need to do something fancier to make sure that word is in vocabulary etc.
                sims[docid] += weights[w]   # accumulates if w in current doc
        # normalize sim
        sims[docid] /= (l2query*my_norm(weights))
    # now sort by cosine similarity
    sorted_answer = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_answer[:r]

In [None]:
def inverted_file():
    """Build the inverted file (posting lists) for the whole 'arxiv' index.

    Every document is encoded with `encode_doc`, and each of its weights is
    divided by the document's L2 norm. A query then only has to add up the
    stored values and divide by its own norm to obtain a cosine similarity.

    Returns:
        dict mapping term -> list of (path, normalised weight) pairs, one
        pair per document that contains the term.
    """
    PLists = dict()
    # get nr. of docs; just for the progress bar
    ndocs = int(client.cat.count(index='arxiv', format="json")[0]['count'])

    for s in tqdm.tqdm(scan(client, index='arxiv', query={"query": {"match_all": {}}}), total=ndocs):
        docid = s['_source']['path']
        weights = encode_doc(s['_id'])
        n_weights = my_norm(weights)

        if n_weights == 0:
            # No terms, or only zero weights: nothing useful to post, and
            # normalising would divide by zero.
            continue

        for w, weight in weights.items():
            # setdefault creates the list on first sight of a term. A bare
            # try/except would also swallow unrelated errors and then
            # overwrite an existing posting list.
            PLists.setdefault(w, []).append((docid, weight / n_weights))

    return PLists

# Build the inverted file once; the FastSearch calls further down receive it as their PL argument.
PostLists = inverted_file()


100%|██████████| 58103/58103 [06:30<00:00, 148.73it/s]


In [None]:
def FastSearch(query, r, PL):
    """Rank documents against `query` by cosine similarity using posting lists.

    Only the posting lists of the query words are traversed, so documents
    that contain none of the words are never visited.

    Args:
        query: whitespace-separated query words, used exactly as typed.
        r: maximum number of results to return.
        PL: dict mapping term -> list of (docid, weight) pairs, where each
            weight is already divided by its document's L2 norm.

    Returns:
        dict mapping docid -> similarity for the best `r` documents, inserted
        in decreasing order of similarity. Empty when the query has no words
        or none of them appears in `PL`.
    """
    terms = query.split()
    if not terms:
        # An empty query has zero norm; there is nothing to rank.
        return {}

    l2query = np.sqrt(len(terms))  # l2 of query assuming 0-1 vector representation

    sims = {}
    for w in terms:
        # A word that is not in the index has an empty posting list.
        for docid, weight in PL.get(w, ()):
            sims[docid] = sims.get(docid, 0.0) + weight

    # Dividing by the positive constant l2query does not change the order,
    # so rank on the raw sums and normalise only the entries that are kept.
    ranked = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)
    return {docid: score / l2query for docid, score in ranked[:r]}

In [None]:
# Reference ranking: top 10 documents from the exhaustive scan over the whole index.
slow_answer = SlowSearch('computer magic', 10)

100%|██████████| 58103/58103 [07:11<00:00, 134.56it/s]


In [None]:
# Same query answered from the precomputed posting lists.
fast_answer = FastSearch('computer magic', 10, PostLists)

In [None]:
# FastSearch returns a rank-ordered dict while SlowSearch returns a list of
# (docid, score) pairs, so bring both into the same shape before comparing.
fast_ranked = list(fast_answer.items())

# zip stops at the shorter ranking, so fewer than 10 results cannot raise.
for fast_hit, slow_hit in zip(fast_ranked, slow_answer):
    same_doc = fast_hit[0] == slow_hit[0]
    # The two versions divide in a different order, so the scores may differ
    # in the last bits; compare with a tolerance instead of exact equality.
    same_score = math.isclose(fast_hit[1], slow_hit[1], rel_tol=1e-9, abs_tol=1e-12)
    if not (same_doc and same_score):
        print(fast_hit)
        print(slow_hit)

('/Users/lauragarcialopez/Documents/uni/4o/4A/CAI/Labs/Lab 03/arxiv/quant-ph.updates.on.arXiv.org/000650', 0.2882522830471679)
('/Users/lauragarcialopez/Documents/uni/4o/4A/CAI/Labs/Lab 03/arxiv/quant-ph.updates.on.arXiv.org/000650', 0.2882522830471678)


**Exercise 3:**

Compare the results for a few sample queries that you get from your quick version and ElasticSearch search. Do you get similar results? Which is faster?

---

In [None]:
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q

def ESSearch(my_query, r):
    """Run `my_query` through ElasticSearch's own search on the 'arxiv' index.

    Each whitespace-separated word becomes a `query_string` query and the
    per-word queries are combined with `&`.

    Args:
        my_query: whitespace-separated query words.
        r: maximum number of hits to retrieve.

    Returns:
        dict mapping document path -> ElasticSearch score, in the order the
        hits are returned. Empty when the query contains no words.
    """
    qsplit = my_query.split()
    if not qsplit:
        # No words to search for; qsplit[0] below would raise IndexError.
        return dict()

    s = Search(using=client, index='arxiv')

    q = Q('query_string', query=qsplit[0])
    for w in qsplit[1:]:
        q = q & Q('query_string', query=w)

    s = s.query(q)

    answer = dict()
    for a in s[0:r].execute():  # only returns a specific number of results
        answer[a.path] = a.meta.score

    return answer

In [None]:
def print_rank(r, answer1, answer2):
    """Print two rankings side by side, one rank per line.

    Each line shows the rank followed by the last six characters of the
    document key at that rank in `answer1` and in `answer2`. At most `r`
    lines are printed, and never more than the shorter ranking provides.
    """
    left_keys = answer1.keys()
    right_keys = answer2.keys()
    rows = min(r, len(left_keys), len(right_keys))

    print(" ", "  ES  ", " FAST ")
    # Pairing with range(rows) caps the number of printed lines.
    for k, (left, right) in zip(range(rows), zip(left_keys, right_keys)):
        print(f"{k} {left[-6:]} {right[-6:]}")

In [None]:
# Top 10 hits for the sample query according to ElasticSearch's own scoring.
answer_ES = ESSearch('computer magic', 10)

In [None]:
# Top 10 hits for the same query from the posting-list search.
answer_fast = FastSearch('computer magic', 10, PostLists)

In [None]:
# Show both rankings side by side (last six characters of each document path).
print_rank(10, answer_ES, answer_fast)

    ES    FAST 
0 000677 001475
1 000650 000650
2 001475 000265
3 000992 000955
4 001477 002825
5 013376 000677
6 006074 000255
7 001652 000896
8 001630 003482
9 000521 004798


In [None]:
# Second sample query: ElasticSearch ranking vs. posting-list ranking.
answer_ES = ESSearch('star brown', 10)
answer_fast = FastSearch('star brown', 10, PostLists)

print_rank(10, answer_ES, answer_fast)

    ES    FAST 
0 011045 011045
1 010049 000616
2 009349 001260
3 012590 010049
4 008869 008869
5 012600 002067
6 000616 008734
7 004880 012006
8 004320 012511
9 003245 012118


In [None]:
# Third sample query, three words. print_rank prints only as many rows as the
# shorter of the two answers contains.
answer_ES = ESSearch('planet star Newton', 10)
answer_fast = FastSearch('planet star Newton', 10, PostLists)

print_rank(10, answer_ES, answer_fast)

    ES    FAST 
0 002772 002147
1 001639 003164
2 012669 017650
3 005541 007419
4 014669 001236
5 014695 006959


IndexError: list index out of range