# Session 3 - User Relevance Feedback

## 1 Document Relevance

In this session we are going to implement pseudo user relevance feedback on top of ElasticSearch.

One feature of the ElasticSearch query results that we have not used yet is the score, which measures the relevance of a document with respect to the terms of a query.

You have the script `SearchIndexWeights.py`, which allows searching for keywords in an index just like we do in any search engine (like Google Search or Bing).

This script returns a limited number of hits and also shows the score of each document (the documents are sorted by their score).

**Read the first section** of the session documentation and play a little bit with the documents that you have in the `news` index.

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q


# Client created with no arguments; the Search object is bound to the 'news' index
client = Elasticsearch()
s = Search(using=client, index='news')


# query_string query on a single keyword
q = Q('query_string',query='toronto')  # Feel free to change the word

s = s.query(q)
# The [0:3] slice limits the request to the first three hits
response = s[0:3].execute()
for r in response:  # iterates only over the hits that were requested above
    # Show the document id, its relevance score, its path and the first 50 characters of its text
    print('ID= %s SCORE=%s' % (r.meta.id,  r.meta.score))
    print('PATH= %s' % r.path)
    print('TEXT: %s' % r.text[:50])
    print('-----------------------------------------------------------------')

: 

***

## 2 Rocchio's Rule


To implement relevance feedback we are going to use Rocchio's rule. We are going to extend the query over a number of iterations using the terms of the most relevant documents that are retrieved.

As described in the session documentation, you will need to write a script that, given a query, repeats the following steps a number ($nrounds$) of times:

1. Obtain the $k$ most relevant documents
2. Compute a new query using the current query and the terms of the $k$ documents

Rocchio's rule involves computing the following:

$$Query' = \alpha \times Query + \beta \times \frac{d_1 + d_2 + \cdots + d_k}{k}$$

So we have different parameters to play with:

1. The number of rounds ($nrounds$)
2. The number of relevant documents ($k$)
3. The parameters of the Rocchio's rule ($\alpha$ and $\beta$)
4. The number of terms in the recomputed query ($R$)

**Read the documentation** and pay special attention to how you have to build the query that you pass to ElasticSearch to include the weights computed by Rocchio's rule.

Keep in mind that some of the elements you need for this part are functions that you already programmed as part of the previous session's assignment.

**Pay attention** to the documentation that you have to deliver for this session.



In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError

import argparse

from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q
import numpy as np
from elasticsearch.exceptions import NotFoundError
from elasticsearch_dsl import Search
from elasticsearch.client import CatClient
from elasticsearch_dsl.query import Q
from elasticsearch import Elasticsearch

import matplotlib.pyplot as plt

import operator
import argparse
__author__ = 'walter'

def doc_count(client, index):
    """
    Counts the documents stored in an index

    :param client: client object handed to CatClient
    :param index: name of the index to count
    :return: the 'count' field of the first row of the cat count response, as an int
    """
    cat_api = CatClient(client)
    rows = cat_api.count(index=[index], format='json')
    first_row = rows[0]
    return int(first_row['count'])

def document_term_vector(client, index, id):
    """
    Fetches the term vector of the 'text' field of a document and returns its statistics
    as two lists of (term, count) pairs, both sorted by term.
    The first list holds the frequency of each term in the document, the second one the
    number of documents that contain the term

    :param client: object exposing the ``termvectors`` call
    :param index: index that holds the document
    :param id: identifier of the document
    :return: (term frequencies, document frequencies); two empty lists when the response
             has no 'text' term vector
    """
    response = client.termvectors(index=index, id=id, fields=['text'],
                                  positions=False, term_statistics=True)

    vectors = response['term_vectors']
    if 'text' not in vectors:
        return [], []

    terms = vectors['text']['terms']
    term_freqs = sorted((term, stats['term_freq']) for term, stats in terms.items())
    doc_freqs = sorted((term, stats['doc_freq']) for term, stats in terms.items())
    return term_freqs, doc_freqs

def toTFIDF(client, index, file_id):
    """
    Returns the normalized tf-idf weights of a document

    For every term of the document the weight is
    (term frequency / highest term frequency in the document) * log2(documents in the index / document frequency),
    and the resulting vector is passed through normalize().

    :param client: client used to fetch the term vector and the document count
    :param index: index that holds the document
    :param file_id: identifier of the document
    :return: dictionary mapping each term to its normalized weight; an empty dictionary
             when the document has no terms
    """

    # Get the frequency of each term in the document, and the number of documents
    # that contain the term
    file_tv, file_df = document_term_vector(client, index, file_id)

    # A document without any indexed term has nothing to weigh; max() below would
    # raise ValueError on the empty sequence
    if not file_tv:
        return {}

    max_freq = max(f for _, f in file_tv)

    dcount = doc_count(client, index)

    tfidfw = {}
    # Both lists are sorted by term, so zipping them pairs each term with its own statistics
    for (term, w), (_, df) in zip(file_tv, file_df):
        idfi = np.log2(dcount / df)
        tfdi = w / max_freq
        tfidfw[term] = tfdi * idfi

    return normalize(tfidfw)

def normalize(document):
    """
    Scales a term-weight vector to unit Euclidean (L2) length

    :param document: dictionary mapping each term to its weight
    :return: new dictionary with every weight divided by the L2 norm of the vector;
             a vector whose norm is zero (empty, or all weights zero) is returned as an
             unchanged copy so that no division by zero takes place
    """
    # The L2 norm is the square root of the sum of the *squared* weights
    norm = float(np.sqrt(sum(weight * weight for weight in document.values())))
    if norm == 0:
        return dict(document)
    return {term: weight / norm for term, weight in document.items()}

def search_file_by_path(client, index, path):
    """
    Looks up a document through its path and returns its id

    :param client: client used to run the search
    :param index: index to search in
    :param path: value matched against the 'path' field
    :return: meta id of the first hit
    :raises NameError: when the search returns no hits
    """
    path_query = Q('match', path=path)
    result = Search(using=client, index=index).query(path_query).execute()

    for hit in result:
        # the first hit wins; any further hits are ignored
        return hit.meta.id
    raise NameError(f'File [{path}] not found')

#
def get_dictionary_from_query(query):
    """
    Turns a list of query words into a normalized {term: weight} dictionary

    Each element is either a bare term, which gets weight 1.0, or 'term^weight'.
    When a term appears more than once the last occurrence wins.

    :param query: iterable of query strings
    :return: the result of normalize() applied to the parsed weights
    """
    weights = {}
    for token in query:
        if '^' not in token:
            weights[token] = 1.0
            continue
        term, boost = token.split('^')
        weights[term] = float(boost)
    return normalize(weights)

def get_query_from_dictionary(theDict):
    """
    Builds the list of 'term^weight' query strings for a {term: weight} dictionary

    :param theDict: dictionary mapping each term to a numeric weight
    :return: list with one 'term^weight' string per entry, in the dictionary's order
    """
    # str() switches to scientific notation for very small or very large floats
    # (e.g. '1e-05'), which is not a plain decimal number as expected after '^' in the
    # query_string syntax; render the weight positionally instead. For ordinary
    # magnitudes the text is the same as str() would give (0.8 -> '0.8', 2.0 -> '2.0').
    return [
        f"{term}^{np.format_float_positional(float(weight), trim='0')}"
        for term, weight in theDict.items()
    ]

# Convergence trace: round number -> |sum of the merged Rocchio weights - sum of the current query weights|.
# Defined at module level so that it exists even when the script body below does not run.
nrounds_study = {}
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--index', default=None, help='Index to search')
    parser.add_argument('--k', default=10, type=int, help='Number of documents to return')
    parser.add_argument('--beta', default=1, type=float, help="beta coefficient of Rocchio's rule")
    parser.add_argument('--alpha', default=2, type=float, help="Alpha coefficient of Rocchio's rule")
    parser.add_argument('--R', default=4, type=int, help="Number of R most important terms of a document to use in document fusion")
    parser.add_argument('--nrounds', default=100, type=int, help="Number of times Rocchio's law is applied to the original query")
    parser.add_argument('--query', default=None, nargs=argparse.REMAINDER, help='List of words to search')
    args = parser.parse_args()

    index = args.index
    k = args.k
    beta = args.beta
    alpha = args.alpha
    R = args.R
    nrounds = args.nrounds
    query = args.query

    try:
        client = Elasticsearch()
        s = Search(using=client, index=index)

        # `--query` followed by no words yields an empty list; treat it like a missing
        # query instead of failing on query[0]
        if query:
            for iteration in range(nrounds):
                # Every query word must match: AND the per-word query_string queries together
                q = Q('query_string', query=query[0])
                for word in query[1:]:
                    q &= Q('query_string', query=word)

                print(query)
                # Search.query() ANDs the new query onto whatever the Search already holds,
                # so each round is derived from the untouched base search `s`; rebinding `s`
                # would make every round also require the queries of all previous rounds.
                response = s.query(q)[0:k].execute()

                dict_query = get_dictionary_from_query(query)
                x0 = sum(dict_query.values())

                # Sum the tf-idf vectors of the retrieved documents term by term
                merged_documents = {}
                ndocs = 0
                for r in response:
                    file_tw = toTFIDF(client, index, r.meta.id)  # tf-idf
                    for term, weight in file_tw.items():
                        merged_documents[term] = merged_documents.get(term, 0) + weight
                    ndocs += 1
                    print(f'ID= {r.meta.id} SCORE={r.meta.score}')
                    print(f'PATH= {r.path}')
                    print(f'TEXT: {r.text[:50]}')
                    print(f'ITERATION: {iteration}')
                    print('-----------------------------------------------------------------')

                # Rocchio's rule: alpha * query + beta * (d_1 + ... + d_n) / n, where n is the
                # number of documents actually retrieved (it can be smaller than k).
                # ndocs is at least 1 whenever merged_documents has entries.
                new_query = {term: weight * alpha for term, weight in dict_query.items()}
                for term, weight in merged_documents.items():
                    new_query[term] = new_query.get(term, 0) + weight * beta / ndocs

                # Rank all candidate terms by weight, highest first
                ranked = sorted(new_query.items(), key=operator.itemgetter(1), reverse=True)
                x1 = sum(weight for _, weight in ranked)
                nrounds_study[iteration] = abs(x1 - x0)

                # Keep only the R heaviest terms and renormalize them into the next round's query
                dict_query = dict(ranked[:R])
                query = get_query_from_dictionary(normalize(dict_query))
                print (f"{response.hits.total['value']} Documents")

        else:
            print('No query parameters passed')


    except NotFoundError:
        print(f'Index {index} does not exists')



: 

## Nrounds plots

In [None]:
            
        # x: round index, y: the |sum of merged weights - sum of query weights| value stored for that round.
        # The dict views are materialised into lists before being handed to matplotlib.
        plt.plot(list(nrounds_study.keys()), list(nrounds_study.values()))
        plt.xlabel('Rocchio round')
        plt.ylabel('|sum(new weights) - sum(query weights)|')
        plt.title('Change in total query weight per round')
        plt.show()

: 