# Information Retrieval I #
## Assignment 2: retrieval models [100 points + 10 bonus points] ##
**TA**: Christophe Van Gysel (cvangysel@uva.nl; C3.258B, Science Park 904)

**Secondary TAs**: Harrie Oosterhuis, Nikos Voskarides

In [None]:
import pyndri
import collections
import io
import logging
import sys
import time
import pickle
import math
from itertools import islice
import subprocess
import random
import re
from operator import itemgetter

In [None]:
# Open the index stored under 'index/' through pyndri.
index = pyndri.Index('index/')
# Vocabulary lookups in both directions; the third returned value is not used.
token2id, id2token, _ = index.get_dictionary()

### Parsing the query file
You can parse the query file (`ap_88_89/topics_title`) using the following snippet:

In [None]:
def parse_topics(file_or_files,
                 max_topics=sys.maxsize, delimiter=';'):
    """Parse topic files into an ordered ``{topic_id: terms}`` mapping.

    Each non-blank line must look like ``<topic_id><delimiter><terms>``;
    only the first delimiter splits the line.

    Parameters:
        - file_or_files: an open text file, or a list/tuple/iterable of
            open text files.
        - max_topics: stop after this many distinct topics. ``None`` and
            ``0`` both mean "no limit".
        - delimiter: separator between the topic id and its terms.

    Returns:
        collections.OrderedDict mapping topic id (str) to terms (str), in
        the order in which topics were first seen. When a topic id is
        repeated with different terms, an error is logged and the last
        occurrence wins.
    """
    # Check for None first; comparing None with an int raises TypeError.
    assert max_topics is None or max_topics >= 0

    topics = collections.OrderedDict()

    # An open file is itself iterable, so it must be recognised before the
    # generic "iterable of files" case; otherwise it would be expanded into
    # its lines and fail the IOBase check below.
    if isinstance(file_or_files, io.IOBase):
        file_or_files = [file_or_files]
    elif not isinstance(file_or_files, (list, tuple)):
        if hasattr(file_or_files, '__iter__'):
            file_or_files = list(file_or_files)
        else:
            file_or_files = [file_or_files]

    limit = max_topics if max_topics else None

    for f in file_or_files:
        assert isinstance(f, io.IOBase)

        for line in f:
            assert isinstance(line, str)

            line = line.strip()

            if not line:
                continue

            topic_id, terms = line.split(delimiter, 1)

            if topic_id in topics and (topics[topic_id] != terms):
                logging.error('Duplicate topic "%s" (%s vs. %s).',
                              topic_id,
                              topics[topic_id],
                              terms)

            topics[topic_id] = terms

            # Return instead of break so that the limit also stops the
            # remaining files from being read.
            if limit is not None and len(topics) >= limit:
                return topics

    return topics

## Setups

In [None]:
# Document count: the index's maximum document id minus its base id.
n = index.maximum_document()-index.document_base()

def get_rid_of_zeros(n):
    """Return the token-id lists of documents 1..n without non-positive ids.

    Element ``k`` of the result belongs to document id ``k + 1``; only
    token ids greater than zero are kept.
    """
    return [
        [token_id for token_id in index.document(doc_id)[1] if token_id > 0]
        for doc_id in range(1, n + 1)
    ]

# Filtered token-id list per document; position k holds document id k + 1.
collection = get_rid_of_zeros(n)
# Optional on-disk cache of the result:
# pickle.dump(collection, open("./results/collection.p", "wb"))
# collection = pickle.load(open( "./results/collection.p", "rb"))

def get_collection_length():
    """Return the total number of tokens over the first ``n`` documents."""
    return sum(len(collection[position]) for position in range(n))

# Total token count of the filtered collection (denominator of background_prob).
col_len = get_collection_length()

def get_unique_collection(n):
    """Return, per document in ``collection``, a list of its distinct token ids.

    The ``n`` argument is accepted but not used; every document in
    ``collection`` is processed.
    """
    return [list(set(doc)) for doc in collection]

# Distinct token ids per document, aligned with `collection`.
unique_words_docs = get_unique_collection(n)
# Optional on-disk cache of the result:
# pickle.dump(unique_words_docs, open("./results/unique_words_docs.p", "wb"))
# unique_words_docs = pickle.load(open( "./results/unique_words_docs.p", "rb"))

**Inverted Index List**

In [None]:
def get_inverted_index():
    """Build postings for every term id that occurs in a topic title.

    Returns:
        (inverted_list, query_list) where ``inverted_list`` maps a term id
        to ``[document_ids, total_occurrences]`` and ``query_list`` is the
        list of ``(topic_id, topic_text)`` pairs read from the topics file.
    """
    with open('./ap_88_89/topics_title', 'r') as f_topics:
        query_list = list(parse_topics([f_topics]).items())

    inverted_list = {}
    total_queries = len(query_list)

    for nr, query in enumerate(query_list, start=1):
        if nr % 10 == 0:
            print('\r',str(nr)+'/'+str(total_queries))

        # Map the query text to term ids; unknown tokens map to 0.
        term_ids = [token2id.get(token, 0) for token in index.tokenize(query[1])]

        for qti in term_ids:
            # Skip unknown tokens and terms whose postings already exist.
            if qti <= 0 or qti in inverted_list:
                continue

            postings = []
            tot_count = 0
            for i in range(n):
                occurrences = collection[i].count(qti)
                if occurrences > 0:
                    tot_count += occurrences
                    external_id = index.document(i+1)[0]
                    postings.append(index.document_ids([external_id])[0][1])
            inverted_list[qti] = [postings, tot_count]

    return inverted_list, query_list

# Build the postings for all query terms and keep the parsed topic list.
inverted_list, query_list = get_inverted_index()
# Optional on-disk cache of both results:
# pickle.dump(inverted_list, open("./results/inverted_index.p", "wb"))
# pickle.dump(query_list, open("./results/query_list.p", "wb"))
# query_list = pickle.load(open( "./results/query_list.p", "rb"))
# inverted_list = pickle.load(open( "./results/inverted_index.p", "rb"))

**More setups**

In [None]:
# Distinct query ids in the validation judgements (first space-separated
# field of every line). The file handle name is rebound to the id list.
with open('./ap_88_89/qrel_validation', 'r') as val_queries:
    val_queries = list({line.split(' ')[0] for line in val_queries})

# Distinct query ids in the test judgements.
with open('./ap_88_89/qrel_test', 'r') as test_queries_:
    test_queries = list({line.split(' ')[0] for line in test_queries_})

# {qid: qstring, qid: qstring...} for every topic in the title file.
queries_dict = {}
with open('./ap_88_89/topics_title', 'r') as f_topics:
    queries_dict.update(parse_topics([f_topics]))
        
def make_dict_format(dic):
    """Turn a ``{doc: score}`` mapping into a tuple of ``(score, doc)`` pairs.

    The pairs follow the iteration order of ``dic``.
    """
    return tuple((score, doc) for doc, score in dic.items())

def get_query_docs(queryset):
    """Collect, per query, the documents that contain at least one query term.

    Parameters:
        - queryset: iterable of query ids; each id is looked up in
            ``queries_dict`` via ``str(query_id)``.

    Returns:
        dict mapping each query id to a list of document ids, without
        duplicates, in the order in which the postings of the query terms
        list them.
    """
    query_docus = {}
    nr = 0
    for query_id in queryset:
        nr += 1
        if nr % 5 == 0:
            print('doc', nr)
        query_tokens = index.tokenize(queries_dict[str(query_id)])
        query_id_tokens = [token2id.get(query_token, 0) for query_token in query_tokens]
        query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]

        relev_documents = []
        # Membership is tested on a set: testing against the growing list
        # made de-duplication quadratic in the number of candidates.
        seen = set()
        for token in query_id_tokens:
            for doc in inverted_list[token][0]:
                if doc not in seen:
                    seen.add(doc)
                    relev_documents.append(doc)

        query_docus[query_id] = relev_documents
    return query_docus

# Candidate documents per validation query.
query_val_docs = get_query_docs(val_queries)
# pickle.dump(query_val_docs, open("./results/query_val_docs.p", "wb"))

# Candidate documents per test query.
query_test_docs = get_query_docs(test_queries)
# pickle.dump(query_test_docs, open("./results/query_test_docs.p", "wb"))

# Optional reload of both mappings from disk:
# query_val_docs = pickle.load(open( "./results/query_val_docs.p", "rb"))
# query_test_docs = pickle.load(open( "./results/query_test_docs.p", "rb"))

def idf(t):
    """Return ``log(n) - log(df)`` where ``df`` is the length of t's postings list."""
    document_frequency = len(inverted_list[t][0])
    return math.log(n) - math.log(document_frequency)

def background_prob(w):
    """Return the collection frequency of ``w`` divided by the collection length.

    Terms without an entry in ``inverted_list`` get probability 0.
    """
    tf_w_C = inverted_list[w][-1] if w in inverted_list else 0
    return tf_w_C/float(col_len)

# Retrieval Models

## TF-IDF  (Vector-space)

In [None]:
def tf_idf(t, d):
    """Return ``log(1 + tf) * idf(t)`` for term ``t`` in ``collection[d]``."""
    term_frequency = collection[d].count(t)
    dampened_tf = math.log(1 + term_frequency)
    return dampened_tf * idf(t)

def score_TFIDF(q,d):
    """Sum the TF-IDF weights of all query terms for one document.

    Parameters:
        - q: list of query term ids; repeated terms contribute repeatedly.
        - d: 1-based document id (``collection`` is indexed with ``d - 1``).

    Returns:
        The accumulated TF-IDF score.
    """
    doc_position = d - 1
    score = 0
    for word in q:
        score += tf_idf(word, doc_position)
    return score

def get_TFIDF_scores():
    """Score the candidate documents of every test query with TF-IDF.

    Rankings with fewer than 1000 documents are padded with randomly drawn
    non-empty documents, which receive score 0.

    Returns:
        dict mapping ``str(query_id)`` to a tuple of
        ``(score, external_document_id)`` pairs.
    """
    TFIDF_dict = {}

    nr = 0
    for query_id in test_queries:
        r = {}
        print('\r',str(nr)+'/'+str(len(test_queries)), end=" ")
        query_tokens = index.tokenize(queries_dict[str(query_id)])
        query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
        query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]

        for d in query_test_docs[query_id]:
            d_id = str(index.document(d)[0])
            r[d_id] = score_TFIDF(query_id_tokens, d)

        while len(r) < 1000:
            # Keep the sampled integer id separate from the external id
            # string: the integer is needed to index `collection`.
            i = random.randrange(1,n)
            if len(collection[i-1]) == 0:
                continue  # empty documents are never used as padding
            j = str(index.document(i)[0])
            if j not in r:
                r[j] = 0

        TFIDF_dict[str(query_id)] = make_dict_format(r)
        nr += 1

    return TFIDF_dict

# TF-IDF rankings for all test queries.
TFIDF_scores = get_TFIDF_scores()
# Optional on-disk cache of the result:
# pickle.dump(TFIDF_scores, open("./results/tfidf_results.p", "wb"))
# TFIDF_scores = pickle.load(open( "./results/tfidf_results.p", "rb"))

def get_top_tfidf():
    """Return the 1000 best TF-IDF documents per query as internal ids.

    Returns:
        dict mapping ``int(query_id)`` to a list of document ids, ordered
        by decreasing TF-IDF score.
    """
    tfidf_top_1000_docs = {}
    for query_id, scored_docs in TFIDF_scores.items():
        best = sorted(scored_docs, key=itemgetter(0), reverse=True)[:1000]
        # Translate each external document id back to the index's id.
        tfidf_top_1000_docs[int(query_id)] = [
            index.document_ids([entry[1]])[0][1] for entry in best
        ]
    return tfidf_top_1000_docs

# Top-1000 TF-IDF documents per test query.
tfidf_top_1000_docs = get_top_tfidf()
# Optional on-disk cache of the result:
# pickle.dump(tfidf_top_1000_docs, open("./results/tfidf_top.p", "wb"))
# tfidf_top_1000_docs = pickle.load(open( "./results/tfidf_top.p", "rb"))

## BM25 (Probabilistic)

In [None]:
def average_length():
    """Return the mean number of tokens over the first ``n`` documents."""
    total_tokens = sum(len(collection[position]) for position in range(n))
    return total_tokens/float(n)

# Average document length, used for BM25 length normalisation.
l_av = average_length()

def BM25(t,d):
    """Return the BM25 weight of term ``t`` in ``collection[d]``.

    Uses k1 = 1.2 and b = 0.75; the document length is normalised by the
    average length ``l_av`` and the result is multiplied by ``idf(t)``.
    """
    k1 = 1.2
    b = 0.75
    tf = collection[d].count(t)
    length_norm = (1-b)+b*(len(collection[d])/l_av)
    saturated_tf = ((k1+1)*tf) / (k1*length_norm+tf)
    return saturated_tf*idf(t)

def score_BM25(q,d):
    """Sum the BM25 weights of the distinct query terms for one document.

    Parameters:
        - q: list of query term ids; each distinct term is counted once.
        - d: 1-based document id (``collection`` is indexed with ``d - 1``).

    Returns:
        The accumulated BM25 score.
    """
    doc_position = d - 1
    score = 0
    for word in set(q):
        score += BM25(word, doc_position)
    return score

def get_BM25_scores():
    """Score the candidate documents of every test query with BM25.

    Rankings with fewer than 1000 documents are padded with randomly drawn
    non-empty documents, which receive score 0.

    Returns:
        dict mapping ``str(query_id)`` to a tuple of
        ``(score, external_document_id)`` pairs.
    """
    BM25_dict = {}

    nr = 0
    for query_id in test_queries:
        r = {}
        print('\r',str(nr)+'/'+str(len(test_queries)), end=" ")
        query_tokens = index.tokenize(queries_dict[str(query_id)])
        query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
        query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]
        for d in query_test_docs[query_id]:
            d_id = str(index.document(d)[0])
            r[d_id] = score_BM25(query_id_tokens, d)

        while len(r) < 1000:
            # Keep the sampled integer id separate from the external id
            # string: the integer is needed to index `collection`.
            i = random.randrange(1,n)
            if len(collection[i-1]) == 0:
                continue  # empty documents are never used as padding
            j = str(index.document(i)[0])
            if j not in r:
                r[j] = 0

        BM25_dict[str(query_id)] = make_dict_format(r)
        nr += 1

    return BM25_dict

# BM25 rankings for all test queries.
BM25_scores = get_BM25_scores()
# Optional on-disk cache of the result:
# pickle.dump(BM25_scores, open("./results/bm25_results.p", "wb"))
# BM25_scores = pickle.load(open( "./results/bm25_results.p", "rb"))

## Jelinek-Mercer


In [None]:
def jelinek_mercer(lamb, d, w):
    """Return the Jelinek-Mercer smoothed probability of ``w`` in ``collection[d]``.

    The document estimate is weighted by ``lamb`` and the collection
    estimate by ``1 - lamb``. Raises ZeroDivisionError for an empty document.
    """
    doc = collection[d]
    doc_model = doc.count(w)/len(doc)
    collection_model = background_prob(w)
    return (lamb * doc_model) + ((1-lamb) * collection_model)

def jelinek_score(lamb,query,d):
    """Return the query log-likelihood of ``collection[d]`` under Jelinek-Mercer."""
    log_likelihood = 0
    for term in query:
        term_probability = jelinek_mercer(lamb, d, term)
        log_likelihood += math.log(term_probability)
    return log_likelihood

def get_jelinek_scores(lamb, type_set):
    """Score candidate documents per query with Jelinek-Mercer smoothing.

    Parameters:
        - lamb: interpolation weight of the document model.
        - type_set: ``"val"`` selects the validation queries; any other
            value selects the test queries.

    Rankings with fewer than 1000 documents are padded with randomly drawn
    non-empty documents, scored with the same model.

    Returns:
        dict mapping ``str(query_id)`` to a tuple of
        ``(score, external_document_id)`` pairs.
    """
    print('\r','lamb:',lamb)
    jelinek_dict = {}
    nr = 0

    if type_set == "val":
        queries = val_queries
        query_docs = query_val_docs
    else:
        queries = test_queries
        query_docs = query_test_docs

    for query_id in queries:
        r = {}
        print('\r',str(nr)+'/'+str(len(queries)), end=" ")
        query_tokens = index.tokenize(queries_dict[str(query_id)])
        query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
        query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]

        for d in query_docs[query_id]:
            d_id = str(index.document(d)[0])
            r[d_id] = jelinek_score(lamb, query_id_tokens, d-1)

        while len(r) < 1000:
            i = random.randrange(1,n)
            if len(collection[i-1]) == 0:
                # Empty documents cannot be scored (division by their
                # length); draw another one.
                continue
            j = str(index.document(i)[0])
            if j not in r:
                r[j] = jelinek_score(lamb, query_id_tokens, i-1)

        jelinek_dict[str(query_id)] = make_dict_format(r)
        nr += 1

    return jelinek_dict

# Validation-set rankings for lambda = 0.1 ... 0.9.
jelinek_scores_0_1 = get_jelinek_scores(0.1, 'val')
jelinek_scores_0_2 = get_jelinek_scores(0.2, 'val')
jelinek_scores_0_3 = get_jelinek_scores(0.3, 'val')
jelinek_scores_0_4 = get_jelinek_scores(0.4, 'val')
jelinek_scores_0_5 = get_jelinek_scores(0.5, 'val')
jelinek_scores_0_6 = get_jelinek_scores(0.6, 'val')
jelinek_scores_0_7 = get_jelinek_scores(0.7, 'val')
jelinek_scores_0_8 = get_jelinek_scores(0.8, 'val')
jelinek_scores_0_9 = get_jelinek_scores(0.9, 'val')
# Optional on-disk cache of the rankings (dump, then reload):
# pickle.dump(jelinek_scores_0_1, open("./results/jelinek_scores_0_1.p", "wb"))
# pickle.dump(jelinek_scores_0_2, open("./results/jelinek_scores_0_2.p", "wb"))
# pickle.dump(jelinek_scores_0_3, open("./results/jelinek_scores_0_3.p", "wb"))
# pickle.dump(jelinek_scores_0_4, open("./results/jelinek_scores_0_4.p", "wb"))
# pickle.dump(jelinek_scores_0_5, open("./results/jelinek_scores_0_5.p", "wb"))
# pickle.dump(jelinek_scores_0_6, open("./results/jelinek_scores_0_6.p", "wb"))
# pickle.dump(jelinek_scores_0_7, open("./results/jelinek_scores_0_7.p", "wb"))
# pickle.dump(jelinek_scores_0_8, open("./results/jelinek_scores_0_8.p", "wb"))
# pickle.dump(jelinek_scores_0_9, open("./results/jelinek_scores_0_9.p", "wb"))
# jelinek_scores_0_1 = pickle.load(open( "./results/jelinek_scores_0_1.p", "rb"))
# jelinek_scores_0_2 = pickle.load(open( "./results/jelinek_scores_0_2.p", "rb"))
# jelinek_scores_0_3 = pickle.load(open( "./results/jelinek_scores_0_3.p", "rb"))
# jelinek_scores_0_4 = pickle.load(open( "./results/jelinek_scores_0_4.p", "rb"))
# jelinek_scores_0_5 = pickle.load(open( "./results/jelinek_scores_0_5.p", "rb"))
# jelinek_scores_0_6 = pickle.load(open( "./results/jelinek_scores_0_6.p", "rb"))
# jelinek_scores_0_7 = pickle.load(open( "./results/jelinek_scores_0_7.p", "rb"))
# jelinek_scores_0_8 = pickle.load(open( "./results/jelinek_scores_0_8.p", "rb"))
# jelinek_scores_0_9 = pickle.load(open( "./results/jelinek_scores_0_9.p", "rb"))

## Dirichlet Prior

In [None]:
def dirichlet_prior(mu,d,w):
    """Return the Dirichlet-smoothed probability of ``w`` in ``collection[d]``.

    The document estimate is weighted by ``|d| / (|d| + mu)`` and the
    collection estimate by ``mu / (mu + |d|)``. Raises ZeroDivisionError
    for an empty document.
    """
    doc_len = len(collection[d])
    doc_weight = doc_len/(doc_len+mu)
    doc_model = collection[d].count(w)/doc_len
    prior_weight = mu/(mu+doc_len)
    return (doc_weight * doc_model) + (prior_weight * background_prob(w))

def dirichlet_score(mu,query,d):
    """Return the query log-likelihood of ``collection[d]`` under Dirichlet smoothing."""
    log_likelihood = 0
    for term in query:
        term_probability = dirichlet_prior(mu, d, term)
        log_likelihood += math.log(term_probability)
    return log_likelihood

def get_dirichlet_scores(mu, type_set):
    """Score candidate documents per query with Dirichlet-prior smoothing.

    Parameters:
        - mu: Dirichlet prior parameter.
        - type_set: ``"val"`` selects the validation queries; any other
            value selects the test queries.

    Rankings with fewer than 1000 documents are padded with randomly drawn
    non-empty documents, scored with the same model.

    Returns:
        dict mapping ``str(query_id)`` to a tuple of
        ``(score, external_document_id)`` pairs.
    """
    print('\r','mu:',mu)
    dirichlet_dict = {}
    nr = 0

    if type_set == "val":
        queries = val_queries
        query_docs = query_val_docs
    else:
        queries = test_queries
        query_docs = query_test_docs

    for query_id in queries:
        r = {}
        print('\r',str(nr)+'/'+str(len(queries)), end=" ")
        query_tokens = index.tokenize(queries_dict[str(query_id)])
        query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
        query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]

        for d in query_docs[query_id]:
            d_id = str(index.document(d)[0])
            r[d_id] = dirichlet_score(mu, query_id_tokens, d-1)

        while len(r) < 1000:
            i = random.randrange(1,n)
            if len(collection[i-1]) == 0:
                # Empty documents cannot be scored (division by their
                # length); draw another one.
                continue
            j = str(index.document(i)[0])
            if j not in r:
                r[j] = dirichlet_score(mu, query_id_tokens, i-1)

        dirichlet_dict[str(query_id)] = make_dict_format(r)
        nr += 1

    return dirichlet_dict

# Validation-set rankings for mu = 500, 1000, 1500 and 2000.
dirichlet_scores_500 = get_dirichlet_scores(500, "val")
dirichlet_scores_1000 = get_dirichlet_scores(1000, "val")
dirichlet_scores_1500 = get_dirichlet_scores(1500, "val")
dirichlet_scores_2000 = get_dirichlet_scores(2000, "val")
# Optional on-disk cache of the rankings (dump, then reload):
# pickle.dump(dirichlet_scores_500, open("./results/dirichlet_scores_500.p", "wb"))
# pickle.dump(dirichlet_scores_1000, open("./results/dirichlet_scores_1000.p", "wb"))
# pickle.dump(dirichlet_scores_1500, open("./results/dirichlet_scores_1500.p", "wb"))
# pickle.dump(dirichlet_scores_2000, open("./results/dirichlet_scores_2000.p", "wb"))
# dirichlet_scores_500 = pickle.load(open( "./results/dirichlet_scores_500.p", "rb"))
# dirichlet_scores_1000 = pickle.load(open( "./results/dirichlet_scores_1000.p", "rb"))
# dirichlet_scores_1500 = pickle.load(open( "./results/dirichlet_scores_1500.p", "rb"))
# dirichlet_scores_2000 = pickle.load(open( "./results/dirichlet_scores_2000.p", "rb"))


## Absolute Discounting

In [None]:
def absolute_discounting(delta, d, w):
    """Return the absolute-discounting probability of ``w`` in ``collection[d]``.

    ``delta`` is subtracted from the term count (floored at 0); the mass
    ``delta * |unique terms| / |d|`` is given to the collection estimate.
    Raises ZeroDivisionError for an empty document.
    """
    doc_len = len(collection[d])
    discounted_tf = max(collection[d].count(w)-delta, 0)
    reserved_mass = (delta * len(unique_words_docs[d]))/doc_len
    return (discounted_tf/doc_len) + (reserved_mass * (background_prob(w)))

def AD_score(delta,query,d):
    """Return the query log-likelihood of ``collection[d]`` under absolute discounting."""
    log_likelihood = 0
    for term in query:
        term_probability = absolute_discounting(delta, d, term)
        log_likelihood += math.log(term_probability)
    return log_likelihood

def get_AD_scores(delta, type_set):
    """Score candidate documents per query with absolute discounting.

    Parameters:
        - delta: discount subtracted from every term count.
        - type_set: ``"val"`` selects the validation queries; any other
            value selects the test queries.

    Rankings with fewer than 1000 documents are padded with randomly drawn
    non-empty documents, scored with the same model.

    Returns:
        dict mapping ``str(query_id)`` to a tuple of
        ``(score, external_document_id)`` pairs.
    """
    print('\r','delta:',delta)
    AD_dict = {}
    nr = 1

    if type_set == "val":
        queries = val_queries
        query_docs = query_val_docs
    else:
        queries = test_queries
        query_docs = query_test_docs

    for query_id in queries:
        r = {}
        print('\r',str(nr)+'/'+str(len(queries)), end=" ")
        query_tokens = index.tokenize(queries_dict[str(query_id)])
        query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
        query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]

        for d in query_docs[query_id]:
            d_id = str(index.document(d)[0])
            r[d_id] = AD_score(delta, query_id_tokens, d-1)

        while len(r) < 1000:
            i = random.randrange(1,n)
            if len(collection[i-1]) == 0:
                # Empty documents cannot be scored (division by their
                # length); draw another one.
                continue
            j = str(index.document(i)[0])
            if j not in r:
                r[j] = AD_score(delta, query_id_tokens, i-1)

        AD_dict[str(query_id)] = make_dict_format(r)
        nr += 1

    return AD_dict

# Validation-set rankings for delta = 0.1 ... 0.9.
AD_scores_0_1 = get_AD_scores(0.1,"val")
AD_scores_0_2 = get_AD_scores(0.2,"val")
AD_scores_0_3 = get_AD_scores(0.3,"val")
AD_scores_0_4 = get_AD_scores(0.4,"val")
AD_scores_0_5 = get_AD_scores(0.5,"val")
AD_scores_0_6 = get_AD_scores(0.6,"val")
AD_scores_0_7 = get_AD_scores(0.7,"val")
AD_scores_0_8 = get_AD_scores(0.8,"val")
AD_scores_0_9 = get_AD_scores(0.9,"val")
# Optional on-disk cache of the rankings (dump, then reload):
# pickle.dump(AD_scores_0_1, open("./results/AD_scores_0_1.p", "wb"))
# pickle.dump(AD_scores_0_2, open("./results/AD_scores_0_2.p", "wb"))
# pickle.dump(AD_scores_0_3, open("./results/AD_scores_0_3.p", "wb"))
# pickle.dump(AD_scores_0_4, open("./results/AD_scores_0_4.p", "wb"))
# pickle.dump(AD_scores_0_5, open("./results/AD_scores_0_5.p", "wb"))
# pickle.dump(AD_scores_0_6, open("./results/AD_scores_0_6.p", "wb"))
# pickle.dump(AD_scores_0_7, open("./results/AD_scores_0_7.p", "wb"))
# pickle.dump(AD_scores_0_8, open("./results/AD_scores_0_8.p", "wb"))
# pickle.dump(AD_scores_0_9, open("./results/AD_scores_0_9.p", "wb"))
# AD_scores_0_1 = pickle.load(open( "./results/AD_scores_0_1.p", "rb"))
# AD_scores_0_2 = pickle.load(open( "./results/AD_scores_0_2.p", "rb"))
# AD_scores_0_3 = pickle.load(open( "./results/AD_scores_0_3.p", "rb"))
# AD_scores_0_4 = pickle.load(open( "./results/AD_scores_0_4.p", "rb"))
# AD_scores_0_5 = pickle.load(open( "./results/AD_scores_0_5.p", "rb"))
# AD_scores_0_6 = pickle.load(open( "./results/AD_scores_0_6.p", "rb"))
# AD_scores_0_7 = pickle.load(open( "./results/AD_scores_0_7.p", "rb"))
# AD_scores_0_8 = pickle.load(open( "./results/AD_scores_0_8.p", "rb"))
# AD_scores_0_9 = pickle.load(open( "./results/AD_scores_0_9.p", "rb"))

# Hyperparameter optimisation

Write run, create output and analyse output functions:

In [None]:
def write_run(model_name, data, out_f,
              max_objects_per_query=sys.maxsize,
              skip_sorting=False):
    """
    Serialise a run to an output stream, one ranked object per line.
    Parameters:
        - model_name: identifier of the run, written on every line.
        - data: dictionary mapping topic_id to a list or tuple of
            (relevance, object_id) pairs; object ids are str or bytes.
            Topics with an empty ranking are skipped with a warning.
        - out_f: output file stream.
        - max_objects_per_query: cut-off for number of objects per query.
        - skip_sorting: when true, the pairs are written in the given
            order instead of being sorted in decreasing order.
    """
    apply_cutoff = max_objects_per_query < sys.maxsize

    for subject_id, object_assesments in data.items():
        if not object_assesments:
            logging.warning('Received empty ranking for %s; ignoring.',
                            subject_id)
            continue

        # Only the first pair is probed: its object id must be text or bytes.
        first_object_id = object_assesments[0][1]
        assert isinstance(first_object_id, (str, bytes))

        ranking = object_assesments
        if not skip_sorting:
            ranking = sorted(ranking, reverse=True)
        if apply_cutoff:
            ranking = ranking[:max_objects_per_query]

        if isinstance(subject_id, bytes):
            topic = subject_id.decode('utf8')
        else:
            topic = subject_id

        # Ranks are 1-based in the output.
        for rank, (relevance, object_id) in enumerate(ranking, start=1):
            if isinstance(object_id, bytes):
                object_id = object_id.decode('utf8')

            out_f.write(
                '{subject} Q0 {object} {rank} {relevance} '
                '{model_name}\n'.format(
                    subject=topic,
                    object=object_id,
                    rank=rank,
                    relevance=relevance,
                    model_name=model_name))
            
# The following writes the run to standard output.
# In your code, you should write the runs to local
# storage in order to pass them to trec_eval.
# write_run(
#     model_name="PLM",
#     data=PLM_scores,
#     out_f=open("results/PLM_scores.run", "w"),
#     max_objects_per_query=1000)

# Field extractor used by analyse_output. In this raw string `\\t` is a
# literal backslash followed by the letter t, so the pattern captures a run
# of characters that are not space, backslash or 't', followed by a literal
# backslash and any number of 't's.
# NOTE(review): this appears to target the escaped text that str(bytes)
# yields in create_output (tabs rendered as the two characters "\t") - confirm.
r = re.compile(r'([^ \\t]*)\\t*')

def create_output(type_set, filename):
    """Run trec_eval on ``runfiles/<filename>`` and return its output as text.

    ``type_set == 'test'`` evaluates against the test judgements; any other
    value evaluates against the validation judgements. The returned string
    is ``str()`` applied to the raw bytes of the command output.
    """
    if type_set == 'test':
        prefix = "trec_eval -m all_trec -q ap_88_89/qrel_test "
    else:
        prefix = "trec_eval -m all_trec -q ap_88_89/qrel_validation "

    raw_output = subprocess.check_output(prefix + "runfiles/" + filename,
                                         shell = True)
    return str(raw_output)

def analyse_output(output, title):
    """Extract four measures from the text returned by create_output.

    Parameters:
        - output: evaluation output as a single string.
        - title: label that is returned unchanged with the results.

    Returns:
        ``[title, measure_results]`` where ``measure_results`` maps each
        measure name to ``(measure_all, measure_list)``. ``measure_all`` is
        the string extracted from the chunk containing "tall" (0 when no
        such chunk is found); ``measure_list`` holds the floats extracted
        from all other matching chunks.
    """
    # NDCG@10, Mean Average Precision (MAP@1000), Precision@5 and Recall@1000.
    measure_results = {}
    # Each entry is [measure name, substrings that disqualify a chunk]; the
    # exclusions keep e.g. "ndcg_cut_100" from matching "ndcg_cut_10".
    measures = ["ndcg_cut_10", ["100"]], ["map_cut_1000",[]], ["P_5", ["500", "relative"]], ["recall_1000",[]]
    for measure in measures:
        measure_list = []
        measure_all = 0
        # NOTE(review): split() cuts on whitespace, not on lines, so the
        # chunk holding a measure name may carry the value of the preceding
        # output line - verify against real trec_eval output.
        for line in output.split():
            if measure[0] in line:
                clean = True
                for restriction in measure[1]:
                    if restriction in line:
                        clean = False
                if clean:
                    # "tall" presumably matches the escaped "\tall" of the
                    # all-queries summary row - TODO confirm.
                    if "tall" in line:
                        measure_all = r.findall(line)[-1]
                    else:
                        measure_list.append(float(r.findall(line)[-1]))
        measure_results[measure[0]] = measure_all, measure_list

    return [title, measure_results]

## Dirichlet mu optimisation:

In [None]:
# Evaluate the four Dirichlet runs on the validation judgements
# ('validation' takes the non-'test' branch of create_output).
# NOTE(review): the .run files under runfiles/ must exist before this cell;
# no write_run call that creates them is visible here.
output500 = create_output('validation', 'dirichlet_scores_500.run')
measure_results500 = analyse_output(output500, "Dirichlet mu=500")
output1000 = create_output('validation', 'dirichlet_scores_1000.run')
measure_results1000 = analyse_output(output1000, "Dirichlet mu=1000")
output1500 = create_output('validation', 'dirichlet_scores_1500.run')
measure_results1500 = analyse_output(output1500, "Dirichlet mu=1500")
output2000 = create_output('validation', 'dirichlet_scores_2000.run')
measure_results2000 = analyse_output(output2000, "Dirichlet mu=2000")
dirichlet_measures = [measure_results500, measure_results1000, measure_results1500, measure_results2000]

# Print the all-queries value of every measure for each mu.
for param in dirichlet_measures:
    print(param[0])
    for key,value in param[1].items():
        print(str(key)+':', value[0])
    print('')

**-> Dirichlet mu NDCG winner: 1000**


**-> Dirichlet overall winner: 2000**

## Jelinek lambda optimisation:

In [None]:
# Evaluate the nine Jelinek-Mercer runs on the validation judgements.
outl01 = create_output('validation', 'jelinek_scores_0_1.run')
outl02 = create_output('validation', 'jelinek_scores_0_2.run')
outl03 = create_output('validation', 'jelinek_scores_0_3.run')
outl04 = create_output('validation', 'jelinek_scores_0_4.run')
outl05 = create_output('validation', 'jelinek_scores_0_5.run')
outl06 = create_output('validation', 'jelinek_scores_0_6.run')
outl07 = create_output('validation', 'jelinek_scores_0_7.run')
outl08 = create_output('validation', 'jelinek_scores_0_8.run')
outl09 = create_output('validation', 'jelinek_scores_0_9.run')
# Extract the measures of interest from each evaluation output.
resl01 = analyse_output(outl01, "Jelinek lamb=0.1")
resl02 = analyse_output(outl02, "Jelinek lamb=0.2")
resl03 = analyse_output(outl03, "Jelinek lamb=0.3")
resl04 = analyse_output(outl04, "Jelinek lamb=0.4")
resl05 = analyse_output(outl05, "Jelinek lamb=0.5")
resl06 = analyse_output(outl06, "Jelinek lamb=0.6")
resl07 = analyse_output(outl07, "Jelinek lamb=0.7")
resl08 = analyse_output(outl08, "Jelinek lamb=0.8")
resl09 = analyse_output(outl09, "Jelinek lamb=0.9")
jelinek_measures = [resl01, resl02, resl03, resl04, resl05, resl06, resl07, resl08, resl09]

In [None]:
# Print the all-queries value of every measure for each lambda.
for param in jelinek_measures:
    print(param[0])
    for key,value in param[1].items():
        print(str(key)+':', value[0])
    print('')

**-> Jelinek lambda winner: 0.1**

## Absolute Discounting delta optimisation:

In [None]:
# Evaluate the nine absolute-discounting runs on the validation judgements.
outd01 = create_output('validation', 'AD_scores_0_1.run')
outd02 = create_output('validation', 'AD_scores_0_2.run')
outd03 = create_output('validation', 'AD_scores_0_3.run')
outd04 = create_output('validation', 'AD_scores_0_4.run')
outd05 = create_output('validation', 'AD_scores_0_5.run')
outd06 = create_output('validation', 'AD_scores_0_6.run')
outd07 = create_output('validation', 'AD_scores_0_7.run')
outd08 = create_output('validation', 'AD_scores_0_8.run')
outd09 = create_output('validation', 'AD_scores_0_9.run')
# Extract the measures of interest from each evaluation output.
resd01 = analyse_output(outd01, "AD delta=0.1")
resd02 = analyse_output(outd02, "AD delta=0.2")
resd03 = analyse_output(outd03, "AD delta=0.3")
resd04 = analyse_output(outd04, "AD delta=0.4")
resd05 = analyse_output(outd05, "AD delta=0.5")
resd06 = analyse_output(outd06, "AD delta=0.6")
resd07 = analyse_output(outd07, "AD delta=0.7")
resd08 = analyse_output(outd08, "AD delta=0.8")
# Must read the AD output (outd09); this line previously analysed the
# Jelinek-Mercer output outl09 under the AD delta=0.9 label.
resd09 = analyse_output(outd09, "AD delta=0.9")
AD_measures = [resd01, resd02, resd03, resd04, resd05, resd06, resd07, resd08, resd09]

In [None]:
# Print the all-queries value of every measure for each delta.
for param in AD_measures:
    print(param[0])
    for key,value in param[1].items():
        print(str(key)+':', value[0])
    print('')

**-> AD delta winner: 0.8**

## Now run the winning parameters on the Test Set

In [None]:
# Jelinek-Mercer on the test queries.
# NOTE(review): the validation summary in this notebook names lambda = 0.1
# as the winner, but 0.2 is used here - confirm which one is intended.
jelinek_scores_test = get_jelinek_scores(0.2, 'test')
# pickle.dump(jelinek_scores_test, open("./results/jelinek_scores_test.p", "wb"))
# jelinek_scores_test = pickle.load(open( "./results/jelinek_scores_test.p", "rb"))
outl02t = create_output('test', 'jelinek_scores_test.run')
resl02t = analyse_output(outl02t, "Jelinek lamb=0.2")
print(resl02t[0])
for key,value in resl02t[1].items():
    print(str(key)+':', value[0])
print('')

# Dirichlet prior on the test queries with mu = 2000.
dirichlet_scores_test = get_dirichlet_scores(2000, "test")
# pickle.dump(dirichlet_scores_test, open("./results/dirichlet_scores_test.p", "wb"))
# dirichlet_scores_test = pickle.load(open( "./results/dirichlet_scores_test.p", "rb"))
output2000t = create_output('test', 'dirichlet_scores_test.run')
measure_results2000t = analyse_output(output2000t, "Dirichlet mu=2000")
print(measure_results2000t[0])
for key,value in measure_results2000t[1].items():
    print(str(key)+':', value[0])
print('')

# Absolute discounting on the test queries with delta = 0.8.
AD_scores_test = get_AD_scores(0.8,"test")
# pickle.dump(AD_scores_test, open("./results/AD_scores_test.p", "wb"))
# AD_scores_test = pickle.load(open( "./results/AD_scores_test.p", "rb"))
outd08t = create_output('test', 'AD_scores_test.run')
resd08t = analyse_output(outd08t, "AD delta=0.8")
print(resd08t[0])
for key,value in resd08t[1].items():
    print(str(key)+':', value[0])
print('')

## Positional Language Models on Test Set with mu for Dirichlet = 1000

In [None]:
def get_query_terms():
    """Return the distinct positive term ids of all topics, in first-seen order."""
    query_terms = []
    for entry in query_list:
        # entry[1] is the topic text; unknown tokens map to id 0.
        candidate_ids = [token2id.get(token, 0) for token in index.tokenize(entry[1])]
        for term_id in candidate_ids:
            if term_id > 0 and term_id not in query_terms:
                query_terms.append(term_id)
    return query_terms

# Keep the result: the positional pre-processing below reads `query_terms`,
# which was never defined while the return value was discarded.
query_terms = get_query_terms()

def get_q_docs():
    """Record, per document, the positions at which query terms occur.

    Returns:
        A list aligned with ``collection``; each element is a list of
        ``[term_id, position]`` pairs, in increasing position order, for
        every token of that document that is a query term.
    """
    # Computed locally (as a set) so the function does not depend on a
    # module-level `query_terms` and membership tests are constant-time.
    terms = set(get_query_terms())

    positions_per_doc = []
    for nr, doc in enumerate(collection, start=1):
        if nr % 1000 == 0:
            print(nr)
        positions_per_doc.append(
            [[token, position] for position, token in enumerate(doc)
             if token in terms])
    return positions_per_doc

# The builder was defined as `q_docs` but invoked as `get_q_docs`; it now
# carries the name the call site uses, and `q_docs` holds only the result.
q_docs = get_q_docs()
# pickle.dump(q_docs, open("./results/query_docs.p", "wb"))
# q_docs = pickle.load(open( "./results/query_docs.p", "rb"))

def kernel_gaussian(sigma,i,j):
    """Return the Gaussian kernel ``exp(-(i - j)^2 / (2 * sigma^2))``."""
    negative_squared_distance = -1*((i-j)**2)
    spread = 2*(sigma**2)
    return math.exp(negative_squared_distance/spread)
    
def kernel_triangle(sigma,i,j):
    """Return the triangle kernel ``1 - |i - j| / sigma`` within ``sigma``, else 0.

    The absolute distance is used so that the kernel is symmetric; with the
    signed difference, positions ``j > i`` were never cut off and produced
    values above 1.
    """
    distance = abs(i-j)
    if distance <= sigma:
        return 1-(distance/sigma)
    return 0.0

def kernel_cosine(sigma,i,j):
    """Return the cosine (Hamming) kernel within ``sigma`` of ``i``, else 0.

    Within range the value is ``0.5 * (1 + cos(|i - j| * pi / sigma))``.
    The absolute distance is used for the range test; with the signed
    difference, positions ``j > i`` were never cut off.
    """
    distance = abs(i-j)
    if distance <= sigma:
        return 0.5*(1+math.cos((distance*math.pi)/sigma))
    return 0.0

def kernel_circle(sigma,i,j):
    """Return the circle kernel ``sqrt(1 - (|i - j| / sigma)^2)`` within ``sigma``, else 0.

    The absolute distance is used for the range test; with the signed
    difference, positions ``j > i + sigma`` reached the square root with a
    negative argument and raised ValueError.
    """
    distance = abs(i-j)
    if distance <= sigma:
        return math.sqrt(1-((distance/sigma)**2))
    return 0.0

def kernel_passage(sigma,i,j):
    """Return 1.0 when ``|i - j| <= sigma`` and 0.0 otherwise.

    The absolute distance is used; with the signed difference every
    position ``j > i`` counted as inside the passage.
    """
    if abs(i-j) <= sigma:
        return 1.0
    return 0.0
    
def c(w,j,d):
    """Return 1 when position ``j`` of document ``d`` holds term ``w``, else 0."""
    return 1 if w == d[j] else 0
    
def c_prime(w,i,d):
    """Return the count of ``w`` propagated to position ``i`` of document ``d``.

    Every recorded occurrence of ``w`` in ``q_docs[d]`` contributes a
    Gaussian kernel weight (sigma = 50) of its distance to ``i``.
    """
    propagated = 0
    for occurrence in q_docs[d]:
        if occurrence[0] != w:
            continue
        propagated += kernel_gaussian(50, i, occurrence[1])
    return propagated

def get_all_zs():
    """Return, per position, the summed Gaussian kernel mass over all positions.

    Positions range over the length of the longest document in
    ``collection``; the kernel uses sigma = 50.
    """
    max_len = max((len(doc) for doc in collection), default=0)

    Z = []
    for i in range(max_len):
        mass = 0
        for j in range(max_len):
            mass += kernel_gaussian(50, i, j)
        Z.append(mass)
    return Z

# Per-position kernel normalisers, used as the denominator term in PLM.
Zs = get_all_zs()

def PLM(mu,w,i,d):
    """Return the Dirichlet-smoothed positional probability of ``w`` at position ``i``.

    Computed as ``(c_prime + mu * background_prob(w)) / (Zs[i] + mu)``.
    """
    smoothed_count = c_prime(w,i,d) + (mu * background_prob(w))
    normaliser = Zs[i] + mu
    return smoothed_count / normaliser

def PLM_score(q, d, mu=1000):
    """Return the best positional score of document ``d`` for query ``q``.

    For every position the negative sum of ``p(w|q) * log(p(w|q) / PLM)``
    over the query terms is computed; the maximum over all positions is
    returned.

    Parameters:
        - q: list of query term ids.
        - d: 0-based position of the document in ``collection``.
        - mu: Dirichlet prior passed to ``PLM``; defaults to the value
            that used to be hard-coded.

    Returns:
        The maximum score, or -100000 when the document has no positions.
    """
    # The query model does not depend on the position, so it is built once.
    # Terms without postings are left out.
    q_len = float(len(q))
    query_model = [(word, q.count(word)/q_len)
                   for word in q if inverted_list[word][0]]

    max_score = -100000
    for i in range(len(collection[d])):
        scores = []
        for word, p_query in query_model:
            scores.append(p_query * (math.log(p_query/PLM(mu,word,i,d))))
        position_score = -sum(scores)
        if position_score > max_score:
            max_score = position_score
    return max_score

def get_PLM_scores(mu, type_set):
    """Score candidate documents per query with the positional language model.

    Parameters:
        - mu: accepted for symmetry with the other scorers.
            NOTE: it is not forwarded to ``PLM_score``, so scoring uses
            PLM_score's own mu (1000).
        - type_set: ``"val"`` selects the validation queries; any other
            value selects the test queries.

    Rankings with fewer than 1000 documents are padded with randomly drawn
    non-empty documents, scored with the same model. A rough time estimate
    based on the previous query is printed per query.

    Returns:
        dict mapping ``str(query_id)`` to a tuple of
        ``(score, external_document_id)`` pairs.
    """
    PLM_dict = {}
    secdoc = 0  # seconds per document measured on the previous query
    nr = 0

    if type_set == "val":
        queries = val_queries
        query_docs = query_val_docs
    else:
        queries = test_queries
        query_docs = query_test_docs

    for query_id in queries:
        nr +=1
        start = time.time()
        r = {}
        if len(query_docs[query_id]) < 1000:
            nrdocs = 1000
        else: nrdocs = len(query_docs[query_id])
        print('\r','Q'+str(query_id), 'estimated time: '+ str(round((secdoc*nrdocs)/60.0,2)), 'min,', nrdocs, 'documents', '\t'+str(nr)+'/'+str(len(queries)), end=" ")
        query_tokens = index.tokenize(queries_dict[str(query_id)])
        query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
        query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]

        for d in query_docs[query_id]:
            d_id = str(index.document(d)[0])
            r[d_id] = PLM_score(query_id_tokens, d-1)

        while len(r) < 1000:
            i = random.randrange(1,n)
            if len(collection[i-1]) == 0:
                # Empty documents are never used as padding; the former
                # `r[j] = r[j]` raised KeyError for them.
                continue
            j = str(index.document(i)[0])
            if j not in r:
                r[j] = PLM_score(query_id_tokens, i-1)

        PLM_dict[str(query_id)] = make_dict_format(r)
        secdoc = (time.time()-start)/float(nrdocs)

    return PLM_dict

# Positional language model rankings for the test queries.
PLM_scores = get_PLM_scores(2000, 'test')
# Optional on-disk cache of the result.
# NOTE(review): the dump and load lines below name different files
# (plm_results.p vs. PLM_scores.p) - align them before use.
# pickle.dump(PLM_scores, open("./results/plm_results.p", "wb"))
# PLM_scores = pickle.load(open( "./results/PLM_scores.p", "rb"))