Скопируем функцию оценки качества ранжирования.

In [306]:
def print_standings(groundtruth_file = 'qrel_clean', answer_file = 'qrel_nnovik', should_log=True):
    """Score a ranking against relevance judgements.

    Both files contain one whitespace-separated ``query_id doc_id`` pair of
    integers per line; blank lines are ignored. The order of the lines in
    ``answer_file`` is taken as the ranking order of each query's results.

    All metrics are averaged over the queries present in ``answer_file``.
    A query that has no entry in ``groundtruth_file`` contributes zero to
    every metric instead of raising ``KeyError``.

    Args:
        groundtruth_file: path to the relevance judgements.
        answer_file: path to the ranking being evaluated.
        should_log: when true, print the metrics to stdout.

    Returns:
        Tuple ``(mean precision, mean recall, F-measure, MAP@10)``. The
        F-measure is computed from the two means and is 0.0 when both are 0.
        All four values are 0.0 when ``answer_file`` contains no pairs.

    Raises:
        ValueError: if a non-blank line is not exactly two integers.
    """
    def read_pairs(path):
        # Context manager guarantees the handle is closed even on a parse error.
        with open(path) as pairs_file:
            for line in pairs_file:
                fields = line.split()
                if not fields:
                    continue
                qid, did = [int(x) for x in fields]
                yield qid, did

    q2reld = {}
    for qid, did in read_pairs(groundtruth_file):
        q2reld.setdefault(qid, set()).add(did)

    q2retrd = {}
    for qid, did in read_pairs(answer_file):
        q2retrd.setdefault(qid, []).append(did)

    N = len(q2retrd)
    no_relevant = frozenset()

    if N == 0:
        precision = recall = 0.0
    else:
        hits = {q: len(q2reld.get(q, no_relevant).intersection(docs))
                for q, docs in q2retrd.items()}
        precision = sum([hits[q] * 1.0 / len(q2retrd[q]) for q in q2retrd]) / N
        recall = sum([hits[q] * 1.0 / len(q2reld[q]) if q in q2reld else 0.0
                      for q in q2retrd]) / N

    # The harmonic mean is undefined when both components are zero.
    if precision + recall > 0:
        f_measure = 2 * precision * recall / (precision + recall)
    else:
        f_measure = 0.0

    if should_log:
        print("Mean precision: {}\nMean recall: {}\nMean F-measure: {}"\
              .format(precision, recall, f_measure))

    # MAP@10: average precision over the first ten results of every query.
    ap_sum = 0.0
    for q, docs in q2retrd.items():
        relevant = q2reld.get(q, no_relevant)
        if not relevant:
            continue
        top = docs[:10]
        hits_so_far = 0
        ap = 0.0
        for rank, did in enumerate(top, start=1):
            if did in relevant:
                hits_so_far += 1
                # Precision at this rank, counted only at relevant positions.
                ap += hits_so_far / float(rank)
        ap_sum += ap / min(len(top), len(relevant))
    mean_ap = ap_sum / N if N else 0.0

    if should_log:
        print("MAP@10: {}".format(mean_ap))

    return (precision, recall, f_measure, mean_ap)

Напишем вспомогательные функции для извлечения информации о документах и запросах:

In [307]:
import re

class Data(object):
    """One parsed document record: numeric index, header text and annotation text."""

    def __init__(self, index = 0, header = "", annotation = ""):
        self.index, self.header, self.annotation = index, header, annotation

    def __str__(self):
        # Render all three fields in a constructor-like form for debugging.
        fields = (self.index, self.header, self.annotation)
        return "Data(index = %d, header = %s, annotation = %s)" % fields

class Query(object):
    """One parsed query record: numeric index and the query text."""

    def __init__(self, index = 0, query = ""):
        self.index, self.query = index, query

    def __str__(self):
        # Render both fields in a constructor-like form for debugging.
        fields = (self.index, self.query)
        return "Query(index = %d, query = %s)" % fields
    
def read_until_new_section(data_file):
    """Collect the text of the current section of a record file.

    Lines are read from ``data_file`` until a section marker (a line that
    starts with ``.I``, ``.T``, ``.A``, ``.B`` or ``.W``) or the end of the
    file is reached. Blank lines inside the section are skipped.

    Args:
        data_file: an open text file positioned just after a section marker.

    Returns:
        Tuple ``(text, marker_line)``: the section's lines, stripped and
        joined with single spaces, and the stripped marker line that ended
        the section (``""`` if the file ended first).
    """
    parts = []
    while True:
        raw_line = data_file.readline()
        # readline() returns "" only at EOF; a blank line is "\n".
        if raw_line == "":
            return " ".join(parts), ""

        line = raw_line.strip()
        # The dot is escaped so that only a literal "." starts a marker, and
        # the marker letter must be followed by whitespace or end of line.
        if re.match(r"\.[ITABW](?:\s|$)", line) is not None:
            return " ".join(parts), line
        if line:
            parts.append(line)
        
def parse_data_file(filename):
    """Parse a document collection file into a list of ``Data`` records.

    A record starts at a ``.I <number>`` line. Its ``.T`` section becomes
    ``Data.header`` and its ``.W`` section becomes ``Data.annotation``;
    any other line outside those sections is ignored.

    Args:
        filename: path to the collection file.

    Returns:
        List of ``Data`` objects in file order; empty if the file contains
        no ``.I`` line.

    Raises:
        ValueError: if a ``.T`` or ``.W`` section appears before the first
            ``.I`` line, or the ``.I`` line has no integer index.
    """
    data_list = []
    data = None

    with open(filename) as data_f:
        # Marker line handed back by the section reader; it still has to be
        # dispatched, so it is processed before anything new is read.
        pending = ""
        while True:
            if pending:
                line, pending = pending, ""
            else:
                raw_line = data_f.readline()
                if raw_line == "":  # real EOF; a blank line would be "\n"
                    break
                line = raw_line.strip()

            if line.startswith(".I"):
                if data is not None:
                    data_list.append(data)
                data = Data()
                data.index = int(line[2:])
            elif line.startswith((".T", ".W")):
                if data is None:
                    raise ValueError("section %r found before the first .I line" % line)
                text, pending = read_until_new_section(data_f)
                if line.startswith(".T"):
                    data.header = text
                else:
                    data.annotation = text

    # Flush the last record; nothing to flush when the file had no records.
    if data is not None:
        data_list.append(data)

    return data_list

def parse_query_file(filename):
    """Parse a query file into a list of ``Query`` records.

    A record starts at a ``.I <number>`` line and its ``.W`` section becomes
    ``Query.query``; any other line outside that section is ignored.

    Args:
        filename: path to the query file.

    Returns:
        List of ``Query`` objects in file order; empty if the file contains
        no ``.I`` line.

    Raises:
        ValueError: if a ``.W`` section appears before the first ``.I``
            line, or the ``.I`` line has no integer index.
    """
    query_list = []
    query = None

    with open(filename) as data_f:
        # Marker line handed back by the section reader; it still has to be
        # dispatched, so it is processed before anything new is read.
        pending = ""
        while True:
            if pending:
                line, pending = pending, ""
            else:
                raw_line = data_f.readline()
                if raw_line == "":  # real EOF; a blank line would be "\n"
                    break
                line = raw_line.strip()

            if line.startswith(".I"):
                if query is not None:
                    query_list.append(query)
                query = Query()
                query.index = int(line[2:])
            elif line.startswith(".W"):
                if query is None:
                    raise ValueError("section %r found before the first .I line" % line)
                query.query, pending = read_until_new_section(data_f)

    # Flush the last record; nothing to flush when the file had no records.
    if query is not None:
        query_list.append(query)

    return query_list

А также напишем вспомогательную функцию ***text_to_tokens()*** для разбора текста на лексемы:

In [330]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from nltk.corpus import stopwords

# Shared NLP helpers, created once at module level.
lemmatizer = WordNetLemmatizer()                                # WordNet-based lemmatizer
stemmer1 = PorterStemmer()                                      # Porter stemmer, kept for comparison
stemmer2 = SnowballStemmer('english', ignore_stopwords=False)   # Snowball stemmer (the author found it better than Porter)
stop_words = set(stopwords.words('english'))                    # English stop words, as a set for fast membership tests

from nltk.tag import pos_tag
import re
import string

def tokenize(text):
    """Split ``text`` into lower-cased tokens and drop punctuation tokens.

    Tokens come from ``word_tokenize``; each one is stripped and lower-cased.

    Returns:
        List of the remaining tokens in their original order.
    """
    kept = []
    for raw_token in word_tokenize(text):
        token = raw_token.strip().lower()
        # Substring test against the punctuation constant: it removes single
        # punctuation characters, the empty string, and any multi-character
        # run that happens to occur inside ``string.punctuation``.
        if token not in string.punctuation:
            kept.append(token)
    return kept

def lemmatize_all(tokens, lemmatizer):
    """Lazily yield the lemma of every token, guided by its part-of-speech tag.

    Tags come from ``pos_tag``. Tags starting with ``NN``, ``VB`` and ``JJ``
    select the ``'n'``, ``'v'`` and ``'a'`` lemmatizer modes respectively;
    every other tag uses the lemmatizer's default mode.

    Args:
        tokens: sequence of token strings.
        lemmatizer: object with a ``lemmatize(word, pos=...)`` method.
    """
    tag_prefix_to_pos = (("NN", 'n'), ("VB", 'v'), ("JJ", 'a'))

    for token, tag in pos_tag(tokens):
        lowered = token.lower()
        # First matching prefix wins; None means "no explicit mode".
        pos = next((mode for prefix, mode in tag_prefix_to_pos if tag.startswith(prefix)), None)
        if pos is None:
            yield lemmatizer.lemmatize(lowered)
        else:
            yield lemmatizer.lemmatize(lowered, pos=pos)

# Lemmatisation is combined with stemming because the results are slightly better.
# The pipeline is straightforward:
# 1) split the text into tokens with the tokenizer;
# 2) lemmatise every token according to its part of speech;
# 3) remove stop words;
# 4) polish the result with stemming.
def text_to_tokens(text):
    """Turn raw text into the list of normalised index terms.

    Returns:
        List of stemmed lemmas, in text order, with stop words removed.
    """
    lemmas = lemmatize_all(tokenize(text), lemmatizer)
    # The stop-word test is applied to the lemma, before stemming.
    return [stemmer2.stem(lemma) for lemma in lemmas if lemma not in stop_words]

Класс ___CorpusIndex___ используется для хранения инвертированного индекса и работы с ним:

In [353]:
from nltk import Text
import itertools
import operator
from collections import Counter
import textwrap

from math import log


def find_indexes(word, list_of_pages):
    """Return the 1-based positions of the pages that contain ``word``.

    Args:
        word: the item to look for.
        list_of_pages: sequence of containers supporting the ``in`` test.

    Returns:
        Ascending list of 1-based page numbers.
    """
    return [position
            for position, page in enumerate(list_of_pages, start=1)
            if word in page]

def calculate_frequency(list_of_pages, docs_length):
    """Pair every document length with the term counts of that document.

    Args:
        list_of_pages: sequence of token lists, one per document.
        docs_length: sequence of document lengths, aligned with the pages.

    Returns:
        List of ``(doc_len, {token: count})`` tuples; it is as long as the
        shorter of the two inputs.
    """
    result = []
    for doc_len, page in zip(docs_length, list_of_pages):
        term_counts = dict(Counter(page))
        result.append((doc_len, term_counts))
    return result
    
class CorpusIndex:
    """Inverted index over a corpus plus per-document term statistics.

    Documents are numbered from 1 in corpus order.

    Instances can be built from raw texts (``from_corpus``), restored from a
    file written by ``save`` (``from_disk``), or constructed directly.
    """

    # Section headers of the text format shared by save() and from_disk().
    _LEMMAS_HEADER = "Unique lemmas:"
    _INDEX_HEADER = "Inverted index:"
    _FREQUENCY_HEADER = "Words frequency:"

    @classmethod
    def from_corpus(cls, corpus):
        """Build a CorpusIndex from an iterable of texts.

        The document length stored for every text is ``len(text)``, i.e. the
        length of the raw string, not the number of tokens.
        """
        corpus = list(corpus)
        docs_length = [len(text) for text in corpus]
        average_length = sum(docs_length) / len(docs_length) if docs_length else 0.0
        pages_list = [text_to_tokens(text) for text in corpus]

        # Single pass over the documents; doc ids are appended in ascending
        # order, so every postings list ends up sorted.
        postings = {}
        for doc_id, page in enumerate(pages_list, start=1):
            for lemma in set(page):
                postings.setdefault(lemma, []).append(doc_id)

        unique_lemms = sorted(postings)
        inverted_index = [(lemma, postings[lemma]) for lemma in unique_lemms]
        words_frequency = calculate_frequency(pages_list, docs_length)
        return cls(unique_lemms, inverted_index, words_frequency, average_length)

    @staticmethod
    def _parse_frequency_token(token):
        """Parse one ``word(count)`` token into ``(word, count)``."""
        # Split on the LAST "(" so a lemma containing "(" still parses.
        word, _, freq = token.rpartition("(")
        freq = int(freq[:-1])
        return word, freq

    @classmethod
    def from_disk(cls, path):
        """Restore an index from a file produced by ``save``.

        The average document length is not stored in the file; it is
        recomputed from the stored document lengths.

        Raises:
            ValueError: if a line inside a section is malformed.
        """
        headers = (cls._LEMMAS_HEADER, cls._INDEX_HEADER, cls._FREQUENCY_HEADER)
        unique_lemms = []
        inverted_index = []
        words_frequency = []
        section = None

        with open(path, mode="r") as input_file:
            for raw_line in input_file:
                line = raw_line.strip()
                if not line:
                    # Blank lines only separate sections.
                    continue
                if line in headers:
                    section = line
                elif section == cls._LEMMAS_HEADER:
                    unique_lemms.extend(line.split(", "))
                elif section == cls._INDEX_HEADER:
                    word, _, doc_ids = line.rpartition(": ")
                    inverted_index.append((word, [int(x) for x in doc_ids.split()]))
                elif section == cls._FREQUENCY_HEADER:
                    amount, _, freqs = line.partition(":")
                    freqs = freqs.strip()
                    # A document may have a non-zero length and still hold no
                    # indexed lemmas, so test the text rather than the length.
                    if freqs:
                        freqs = dict(cls._parse_frequency_token(token)
                                     for token in freqs.split(", "))
                    else:
                        freqs = dict()
                    words_frequency.append((int(amount), freqs))

        return cls(unique_lemms, inverted_index, words_frequency)

    def __init__(self, unique_lemms, inverted_index, words_frequency, average_length=None):
        """Store the index data.

        Args:
            unique_lemms: list of all lemmas.
            inverted_index: iterable of ``(lemma, [doc_id, ...])`` pairs, or
                anything else accepted by ``dict()``.
            words_frequency: list with one ``(doc_len, {lemma: count})``
                tuple per document, in document order.
            average_length: average document length; when omitted it is the
                mean of the ``doc_len`` values (0.0 for an empty list).
        """
        self.unique_lemms = unique_lemms
        self.inverted_index = dict(inverted_index)
        self.words_frequency = words_frequency
        if average_length is None:
            lengths = [doc_len for doc_len, _ in words_frequency]
            average_length = sum(lengths) / len(lengths) if lengths else 0.0
        self.average_doc_len = average_length

    def __eq__(self, other):
        # average_doc_len is deliberately not compared, as before.
        # Defining __eq__ without __hash__ leaves instances unhashable.
        if not isinstance(other, CorpusIndex):
            return NotImplemented
        return self.unique_lemms == other.unique_lemms \
                and self.inverted_index == other.inverted_index \
                and self.words_frequency == other.words_frequency
        
    def __str__(self):
        return textwrap.dedent("""
        CorpusIndex(
            Unique lemmas: %s
            Inverted index: %s
            Words frequency: %s            
        )
        """ % (self.unique_lemms, self.inverted_index, self.words_frequency))

    def save(self, path):
        """Write the index to ``path`` in the text format read by ``from_disk``."""
        with open(path, mode="w") as output_file:
            output_file.write(self._LEMMAS_HEADER + "\n")
            output_file.write(", ".join(self.unique_lemms))
            output_file.write("\n\n")
            output_file.write(self._INDEX_HEADER + "\n")
            output_file.write("\n".join("%s: %s" % (word, " ".join(str(i) for i in indexes))
                                        for (word, indexes) in self.inverted_index.items()))
            output_file.write("\n\n")
            output_file.write(self._FREQUENCY_HEADER + "\n")
            output_file.write("\n".join("%d: %s" % (words_amount, ", ".join("%s(%d)" % pair for pair in freq.items()))
                                        for (words_amount, freq) in self.words_frequency))
            output_file.write("\n")

    def find(self, lemma):
        """Return the list of doc ids containing ``lemma`` ([] if unknown)."""
        result = self.inverted_index.get(lemma)
        return result if result is not None else []

    def lemma_freq(self, lemma, doc_id):
        """Return how many times ``lemma`` occurs in document ``doc_id`` (1-based)."""
        doc_len, freqs = self.words_frequency[doc_id - 1]
        if doc_len == 0:
            return 0
        hits = freqs.get(lemma)
        return 0 if hits is None else hits
        
    def search_in_index(self, query, rsv_func):
        """Rank the documents that share at least one lemma with ``query``.

        Args:
            query: raw query text; it is normalised with ``text_to_tokens``.
            rsv_func: callable ``(index, doc_id, query_tokens) -> score``.

        Returns:
            List of doc ids ordered by descending score.
        """
        query_tokens = text_to_tokens(query)
        match_docIds = set(itertools.chain.from_iterable(self.find(token) for token in query_tokens))

        rsv_scores = {docId: rsv_func(self, docId, query_tokens) for docId in match_docIds}
        # Ascending stable sort followed by reverse(): kept as-is so that the
        # relative order of equally scored documents does not change.
        sorted_rsv_scores = sorted(rsv_scores.items(), key=operator.itemgetter(1))
        sorted_rsv_scores.reverse()

        return [docId for docId, _ in sorted_rsv_scores]
    
    def get_avg_index_len(self):
        """Return the mean postings-list length (0.0 for an empty index)."""
        if not self.inverted_index:
            return 0.0
        index_len_sum = sum(len(doc_ids) for doc_ids in self.inverted_index.values())
        return index_len_sum / len(self.inverted_index)
    
    def get_max_index_len(self):
        """Return the longest postings-list length (0 for an empty index)."""
        return max((len(doc_ids) for doc_ids in self.inverted_index.values()), default=0)
    
    def print_statistics(self):
        """Print the dictionary size and postings-list length statistics."""
        dict_len = len(self.unique_lemms)
        print("Length of the dictionary: %s" % dict_len)
        print("Average list of word's positions length: %s" % self.get_avg_index_len())
        print("Max list of word's positions length: %s" % self.get_max_index_len())

Добавим функцию для создания различных версий RSV(q,d) функции:

In [332]:
def idf(corpusIndex, lemma):
    """Smoothed inverse document frequency of ``lemma``.

    Computes ``log(1 + (N - Nt + 0.5) / (Nt + 0.5))`` where ``N`` is the
    number of entries in ``corpusIndex.words_frequency`` and ``Nt`` is the
    length of the lemma's postings list (0 when the lemma is not indexed).
    """
    postings = corpusIndex.inverted_index.get(lemma)
    Nt = len(postings) if postings is not None else 0
    N = len(corpusIndex.words_frequency)
    return log(1.0 + (N - Nt + 0.5) / (Nt + 0.5))

def idf_simple(corpusIndex, lemma):
    """Plain inverse document frequency ``log(N / Nt)`` of ``lemma``.

    ``N`` is the number of entries in ``corpusIndex.words_frequency`` and
    ``Nt`` is the length of the lemma's postings list.

    Returns:
        ``log(N / Nt)``, or 0.0 when the lemma occurs in no document (or the
        index is empty). The formula is undefined there, and such a lemma
        cannot match any document anyway.
    """
    indexes = corpusIndex.inverted_index.get(lemma)
    Nt = len(indexes) if indexes is not None else 0
    N = len(corpusIndex.words_frequency)
    # Guard both the division by zero (Nt == 0) and log(0) (N == 0).
    if Nt == 0 or N == 0:
        return 0.0
    return log(N / Nt)

def construct_rsv_func(k1, b,
                       idf_func = idf, 
                       norm_rsv = False,
                       use_tfq = False, k2 = 100):
    """Build a BM25-style scoring function RSV(q, d).

    Args:
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.
        idf_func: callable ``(corpusIndex, lemma) -> float`` used as the
            term weight.
        norm_rsv: when true, divide the score by the sum of ``idf_func``
            over the distinct query lemmas that occur in the index.
        use_tfq: when true, multiply every term by the query-frequency
            factor ``(k2 + 1) * f_tq / (k2 + f_tq)``.
        k2: query-frequency saturation parameter; used only with ``use_tfq``.

    Returns:
        Function ``rsv(corpusIndex, doc_id, query_lemmas) -> float`` where
        ``doc_id`` is 1-based and ``query_lemmas`` is a list of lemmas.
    """
    def rsv(corpusIndex, doc_id, query_lemmas):
        score, idf_sum = 0.0, 0.0
        Ld = float(corpusIndex.words_frequency[doc_id - 1][0])
        _L_ = corpusIndex.average_doc_len
        for lemma in set(query_lemmas):
            # Normalise with the same IDF that weights the score. Lemmas
            # absent from the index are skipped: they cannot add to any
            # document's score and some IDF variants are undefined for them.
            if norm_rsv and corpusIndex.inverted_index.get(lemma):
                idf_sum += idf_func(corpusIndex, lemma)

            f_td = corpusIndex.lemma_freq(lemma, doc_id)
            if f_td == 0:
                continue

            addition = idf_func(corpusIndex, lemma) * f_td * (k1 + 1) / (k1 * ((1 - b) + b * Ld / _L_) + f_td)
            if use_tfq:
                f_tq = query_lemmas.count(lemma)
                addition *= (k2 + 1) * f_tq / (k2 + f_tq)
            score += addition

        # A zero normaliser leaves the score as it is instead of dividing by zero.
        if norm_rsv and idf_sum != 0:
            score /= idf_sum

        return score
    
    return rsv

Функция для замера оценок получившегося ранжирования:

In [354]:
def test_ranking(corpus, rsv_func=construct_rsv_func(k1 = 1.2, b = 0.75), extra_log_info=None, logging=True):
    """Run every query of ``cran.qry`` against ``corpus`` and score the ranking.

    The top ten results of each query are written to ``qrel_nnovik`` and the
    file is then evaluated with ``print_standings``.

    Args:
        corpus: index object providing ``search_in_index(query, rsv_func)``.
        rsv_func: scoring function passed through to the index.
        extra_log_info: label of the run; currently unused. It has a default
            because a parameter without one may not follow ``rsv_func``.
        logging: forwarded to ``print_standings`` as ``should_log``.

    Returns:
        The tuple returned by ``print_standings``.
    """
    query_list = parse_query_file("cran.qry")

    with open("qrel_nnovik", mode="w") as results_f:
        # Queries are written under their 1-based position in the file, not
        # under Query.index (presumably the numbering the judgements file
        # uses -- confirm against qrel_clean).
        for position, query in enumerate(query_list, start=1):
            for docId in corpus.search_in_index(query.query, rsv_func)[:10]:
                results_f.write("%s %s\n" % (position, docId))

    return print_standings(should_log = logging)

Протестируем вначале наш индекс вместе с самой базовой BM25:

In [355]:
# Baseline BM25 (k1 = 1.2, b = 0.75), evaluated on two indexes:
# one built from document headers and one built from annotations.
data_list = parse_data_file("cran.all.1400")
rsv = construct_rsv_func(k1 = 1.2, b = 0.75)

# Index built from the headers only.
headers_list = (doc.header for doc in data_list)
corpus = CorpusIndex.from_corpus(headers_list)
print("*** Ranking for k1 = 1.2, b = 0.75 (headers) ***")
ranking = test_ranking(corpus, rsv, "headers")
corpus.print_statistics()
print("\n")

# Index built from the annotations.
annotations_list = (doc.annotation for doc in data_list)
corpus = CorpusIndex.from_corpus(annotations_list)
print("*** Ranking for k1 = 1.2, b = 0.75 (annotations) ***")
ranking = test_ranking(corpus, rsv, "annotations")
corpus.print_statistics()

*** Ranking for k1 = 1.2, b = 0.75 (headers) ***
Mean precision: 0.24666666666666678
Mean recall: 0.3609997558574267
Mean F-measure: 0.2930772645786639
MAP@10: 0.28114384115786245
Length of the dictionary: 1523
Average list of word's positions length: 7.115561391989495
Max list of word's positions length: 358


*** Ranking for k1 = 1.2, b = 0.75 (annotations) ***
Mean precision: 0.2942222222222223
Mean recall: 0.42721996967640197
Mean F-measure: 0.3484620396960764
MAP@10: 0.3662481089555164
Length of the dictionary: 6851
Average list of word's positions length: 12.559772296015181
Max list of word's positions length: 713


In [356]:
# Rebuild the index used by all later experiments from the annotations.
# (Feeding `doc.header` instead would index the headers.)
data_list = parse_data_file("cran.all.1400")
annotations_list = (doc.annotation for doc in data_list)
corpus = CorpusIndex.from_corpus(annotations_list)

В дальнейшем будем использовать инвертированный индекс, построенный по аннотациям (не по заголовкам).

In [337]:
from itertools import product

# Grid search over the BM25 parameters: k1 in 1.2..2.0 and b in 0.0..1.0,
# both with a step of 0.1.
range_k1 = [step / 100.0 for step in range(120, 201, 10)]
range_b  = [step / 100.0 for step in range(0, 101, 10)]

# Each record is ((k1, b), score); the empty tuple means "nothing found yet".
best_f_measure = ((), 0.0)
best_map_10 = ((), 0.0)
for (k1, b) in product(range_k1, range_b):
    rsv = construct_rsv_func(k1, b)
    ranking = test_ranking(corpus, rsv, "grid search", False)
    if ranking[2] > best_f_measure[1]:
        best_f_measure = ((k1, b), ranking[2])
    if ranking[3] > best_map_10[1]:
        best_map_10 = ((k1, b), ranking[3])

# Use the % operator: print() does not interpolate its extra arguments, so
# passing the value as a second argument printed a literal "%s".
print("Best f-measure: %s" % str(best_f_measure))
print("Best MAP@10: %s" % str(best_map_10))

Best f-measure: %s ((1.9, 0.7), 0.3560820379208156)
Best MAP@10: %s ((2.0, 0.7), 0.37541323031270102)


Из полученных результатов видно, что наилучшая ___f-measure___ достигается при $k1 = 1.9, b = 0.7$. 

Такой $k1(=1.9)$ говорит о том, что оптимальный ранг сильно зависит от того, насколько часто терм запроса встречается в документах.

Такой $b(=0.7)$ говорит о том, что есть корреляция между весом терма запроса для данного документа и длиной документа. То есть, вес терма запроса для данного документа существенно зависит от длины документа.

In [338]:
# BM25 parameters with the best F-measure found by the grid search above.
best_k1, best_b = 1.9, 0.7

In [339]:
# Same BM25 parameters, but with the plain log(N / Nt) IDF.
rsv = construct_rsv_func(k1=best_k1, b=best_b, idf_func=idf_simple)
ranking = test_ranking(corpus, rsv, "New idf")

Mean precision: 0.30177777777777776
Mean recall: 0.4345521020216618
Mean F-measure: 0.3561940681056661
MAP@10: 0.3744629405671733


При достаточно большом кол-ве текстов две формулы вычисления $IDF$ отличаются минимально. 
Поэтому и результат поменялся незначительно.

In [343]:
# Same BM25 parameters, with the score normalised by the query's IDF sum.
rsv = construct_rsv_func(k1=best_k1, b=best_b, norm_rsv=True)
ranking = test_ranking(corpus, rsv, "Norm RSV")

Mean precision: 0.3017777777777778
Mean recall: 0.4342187686883285
Mean F-measure: 0.3560820379208156
MAP@10: 0.37417475714565657


Поскольку сумма IDF термов запроса одинакова для всех RSV(q, d), то нормирование по этой сумме не должно повлиять на результат (однако, меняет его минимально).

In [341]:
# Sweep the query-term-frequency parameter k2 with k1 and b fixed at their
# best values.
k2_values = [0, 5, 10, 50, 100, 500, 1000]

for _k2 in k2_values:
    rsv = construct_rsv_func(best_k1, best_b, use_tfq = True, k2 = _k2)
    print("*** Ranking for k2 = %s (annotations) ***" % _k2)
    ranking = test_ranking(corpus, rsv, "Use TFQ", True)
    if ranking[2] > best_f_measure[1]:
        # Record the parameters of THIS run. The loop variables k1 and b left
        # over from the grid search hold its last combination, not the values
        # used here, and k2 has to be stored to know which setting won.
        best_f_measure = ((best_k1, best_b, _k2), ranking[2])

*** Ranking for k2 = 0 (annotations) ***
Mean precision: 0.3017777777777778
Mean recall: 0.4342187686883285
Mean F-measure: 0.3560820379208156
MAP@10: 0.37417475714565657
*** Ranking for k2 = 5 (annotations) ***
Mean precision: 0.3031111111111111
Mean recall: 0.4371427706123304
Mean F-measure: 0.3579929377904864
MAP@10: 0.37369987682315736
*** Ranking for k2 = 10 (annotations) ***
Mean precision: 0.30266666666666664
Mean recall: 0.4368958570320835
Mean F-measure: 0.35760009056986974
MAP@10: 0.37282727247277503
*** Ranking for k2 = 50 (annotations) ***
Mean precision: 0.30266666666666664
Mean recall: 0.4368958570320835
Mean F-measure: 0.35760009056986974
MAP@10: 0.3737362230900591
*** Ranking for k2 = 100 (annotations) ***
Mean precision: 0.30266666666666664
Mean recall: 0.4368958570320835
Mean F-measure: 0.35760009056986974
MAP@10: 0.3736447327902354
*** Ranking for k2 = 500 (annotations) ***
Mean precision: 0.30266666666666664
Mean recall: 0.4368958570320835
Mean F-measure: 0.35760009

k2 = 5 оказался наилучшим вариантом для улучшения ___f-measure___. Это означает, что:
1. у нас имеются запросы с повторяющимися термами;
2. если терм запроса повторяется несколько раз, то ранг данного документа должен быть немного увеличен для лучших результатов.