In [122]:
import json

class WikiPage(object):
    """A scraped wiki page: URL, title, body text and numeric index."""

    def __init__(self, url="", title="", info="", index=-1):
        # The index may arrive as a string; it is always stored as an int.
        self.index = int(index)
        self.url, self.title, self.info = url, title, info

    def __str__(self):
        # Only the first 100 characters of the body are included.
        preview = self.info[:100]
        template = "WikiPage(url = [ %s ], title = %s, info = %s, index = %d)"
        return template % (self.url, self.title, preview, self.index)

class Query(object):
    """A search query together with its numeric identifier."""

    def __init__(self, index = 0, query = ""):
        self.query = query
        self.index = index

    def __str__(self):
        fields = (self.index, self.query)
        return "Query(index = %d, query = %s)" % fields


def parse_data_file(filename = 'scraped_data_utf8.json'):
    """Load scraped wiki pages from a JSON-lines file.

    Every non-blank line must be a JSON object with the keys ``url``,
    ``title``, ``info`` and ``index``.  Blank lines are skipped.

    Args:
        filename: path of the JSON-lines file; it is read as UTF-8.

    Returns:
        dict mapping page index (int) to ``WikiPage``.  Every index from 0
        up to the largest one found that has no record is filled with an
        empty placeholder ``WikiPage``.  An empty file yields an empty dict.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required keys.
    """
    page_dict = dict()

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as json_wiki_file:
        for line in json_wiki_file:
            line = line.strip()
            if not line:
                continue
            node = json.loads(line)
            page = WikiPage(node['url'], node['title'], node['info'], node['index'])
            page_dict[page.index] = page

    if not page_dict:
        # Nothing was read, so there is no largest index and no gap to fill.
        return page_dict

    # Fill the gaps so callers can address pages by consecutive index.
    max_index = max(page_dict)
    for ind in range(max_index):
        if ind not in page_dict:
            page_dict[ind] = WikiPage(index = ind)

    return page_dict

def parse_query_file(filename = 'qid.csv'):
    """Load search queries from a ``<index>,<query text>`` file.

    Each non-blank line is split on its first comma only, so the query
    text itself may contain commas.  Blank lines are skipped.

    Args:
        filename: path of the query file; it is read as UTF-8
            (NOTE(review): assumes the query file shares the UTF-8
            encoding of the scraped data -- confirm).

    Returns:
        list of ``Query`` objects in file order.

    Raises:
        ValueError: if a non-blank line has no comma or a non-integer index.
    """
    query_list = []

    with open(filename, encoding="utf-8") as query_file:
        for line in query_file:
            line = line.strip()
            if not line:
                continue
            index, query = line.split(",", 1)
            query_list.append(Query(int(index), query))

    return query_list

In [123]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from nltk.corpus import stopwords

# Earlier experiments, kept disabled for reference:
#lemmatizer = WordNetLemmatizer()             # Simple lemmatizer
#stemmer1 = PorterStemmer()                   # Snowball better than Porter
# Module-level Snowball stemmer for Russian; used by text_to_tokens().
stemmer2 = SnowballStemmer('russian')
# Russian stop words held in a set; text_to_tokens() tests membership against it.
stop_words = set(stopwords.words('russian'))

from nltk.tag import pos_tag
import re
import string

def tokenize(text):
    """Split ``text`` into stripped, lower-cased tokens.

    Every character that is neither a word character nor whitespace is
    first replaced by a space, then the text is split with NLTK's
    ``word_tokenize``.  A token is dropped when it occurs as a substring
    of ``string.punctuation`` (this covers the empty string and ``_``).
    """
    cleaned = re.sub(r'[^\w\s]', ' ', text)

    kept = []
    for raw_token in word_tokenize(cleaned):
        token = raw_token.strip().lower()
        if token not in string.punctuation:
            kept.append(token)
    return kept

# Text normalisation pipeline.  The original note said lemmatisation was
# combined with stemming because it gave slightly better results, but the
# lemmatisation step is currently disabled.  What runs is:
# 1) split the text into tokens with tokenize()
# 2) drop stop words
# 3) polish what is left with the Snowball stemmer
def text_to_tokens(text):
    """Return the stemmed, stop-word-free tokens of ``text`` in order."""
    return [stemmer2.stem(token)
            for token in tokenize(text)
            if token not in stop_words]

In [124]:
from nltk import Text
import itertools
import operator
from collections import Counter
import textwrap

from math import log


def find_indexes(word, list_of_pages):
    """Return the 1-based positions of the pages that contain ``word``.

    ``list_of_pages`` is a sequence of token containers; the result is in
    ascending order.
    """
    return [position
            for position, page in enumerate(list_of_pages, start=1)
            if word in page]

def calculate_frequency(list_of_pages, docs_length):
    """Pair every document length with the token counts of its page.

    Returns a list of ``(doc_len, {token: count})`` tuples, one for each
    position that ``docs_length`` and ``list_of_pages`` have in common.
    """
    stats = []
    for doc_len, page in zip(docs_length, list_of_pages):
        counts = Counter(page)
        stats.append((doc_len, dict(counts)))
    return stats
    
class CorpusIndex:
    """Inverted index plus per-document token statistics for a corpus.

    Attributes:
        unique_lemms: list of every distinct token in the corpus.
        inverted_index: dict mapping a token to the list of 1-based ids of
            the documents that contain it.
        words_frequency: list with one ``(doc_length, {token: count})`` pair
            per document; document ``i`` (1-based) sits at position ``i - 1``.
        average_doc_len: mean of the stored document lengths.
    """

    @staticmethod
    def from_corpus(corpus):
        """Build a CorpusIndex from an iterable of texts.

        Progress messages are printed after each stage.  The stored
        document length is ``len(text)`` of the raw text, not the number
        of tokens (NOTE(review): confirm this is the intended BM25 length).
        An empty corpus produces an empty index with average length 0.0.
        """
        corpus = list(corpus)
        print("Corpus: Ready")
        docs_length = [len(x) for x in corpus]
        print("Docs length: Ready")
        average_length = sum(docs_length) / len(docs_length) if docs_length else 0.0
        print("Average length: Ready")
        pages_list = [text_to_tokens(text) for text in corpus]
        print("Pages List: Ready")
        unique_lemms = sorted(set(itertools.chain.from_iterable(pages_list)))
        print("unique_lemms: Ready")
        # Single pass over the corpus instead of one full scan per lemma.
        # Documents are visited in order, so every posting list is ascending.
        postings = {lemma: [] for lemma in unique_lemms}
        for doc_id, page in enumerate(pages_list, start=1):
            for lemma in set(page):
                postings[lemma].append(doc_id)
        # Keep the pairs in sorted-lemma order so save() output is stable.
        inverted_index = [(lemma, postings[lemma]) for lemma in unique_lemms]
        print("inverted_index: Ready")
        words_frequency = calculate_frequency(pages_list, docs_length)
        print("words_frequency: Ready")
        return CorpusIndex(unique_lemms, inverted_index, words_frequency, average_length)

    @staticmethod
    def _parse_frequency_token(token):
        """Parse one ``word(count)`` token into a ``(word, count)`` pair."""
        word, _, freq = token.rpartition("(")
        return word, int(freq.rstrip(")"))

    @staticmethod
    def from_disk(path):
        """Load an index previously written by ``save``.

        The file is read as UTF-8 and parsed section by section, keyed on
        the three header lines; blank lines are ignored.  The average
        document length is not stored in the file and is recomputed from
        the per-document lengths.
        """
        headers = ("Unique lemmas:", "Inverted index:", "Words frequency:")
        unique_lemms = []
        inverted_index = []
        words_frequency = []
        section = None

        with open(path, mode="r", encoding="utf-8") as input_file:
            for raw_line in input_file:
                # Strip only the newline: a frequency line may end in ": ".
                line = raw_line.rstrip("\n")
                if line in headers:
                    section = line
                elif not line:
                    continue
                elif section == "Unique lemmas:":
                    unique_lemms = line.split(", ")
                elif section == "Inverted index:":
                    word, _, indexes = line.partition(": ")
                    inverted_index.append((word, [int(x) for x in indexes.split()]))
                elif section == "Words frequency:":
                    amount, _, freqs = line.partition(": ")
                    # A document can have a non-zero length and no tokens.
                    tokens = freqs.split(", ") if freqs else []
                    words_frequency.append(
                        (int(amount),
                         dict(CorpusIndex._parse_frequency_token(tok) for tok in tokens)))

        return CorpusIndex(unique_lemms, inverted_index, words_frequency)

    def __init__(self, unique_lemms, inverted_index, words_frequency, average_length=None):
        """Create an index from already computed parts.

        Args:
            unique_lemms: list of distinct tokens.
            inverted_index: iterable of ``(token, [doc ids])`` pairs or a
                mapping; it is stored as a dict.
            words_frequency: list of ``(doc_length, {token: count})``.
            average_length: mean document length.  When omitted it is
                derived from ``words_frequency`` (0.0 for no documents).
        """
        self.unique_lemms = unique_lemms
        self.inverted_index = dict(inverted_index)
        self.words_frequency = words_frequency
        if average_length is None:
            lengths = [doc_len for doc_len, _ in words_frequency]
            average_length = sum(lengths) / len(lengths) if lengths else 0.0
        self.average_doc_len = average_length

    def __eq__(self, other):
        # Let Python fall back to its default for unrelated types.
        if not isinstance(other, CorpusIndex):
            return NotImplemented
        return self.unique_lemms == other.unique_lemms \
                and self.inverted_index == other.inverted_index \
                and self.words_frequency == other.words_frequency

    def __str__(self):
        return textwrap.dedent("""
        CorpusIndex(
            Unique lemmas: %s
            Inverted index: %s
            Words frequency: %s            
        )
        """ % (self.unique_lemms, self.inverted_index, self.words_frequency))

    def save(self, path):
        """Write the index to ``path`` as UTF-8 text readable by ``from_disk``."""
        with open(path, mode="w", encoding="utf-8") as output_file:
            output_file.write("Unique lemmas:\n")
            output_file.write(", ".join(self.unique_lemms))
            output_file.write("\n\n")
            output_file.write("Inverted index:\n")
            output_file.write("\n".join(
                "%s: %s" % (word, " ".join(str(i) for i in indexes))
                for (word, indexes) in self.inverted_index.items()))
            output_file.write("\n\n")
            output_file.write("Words frequency:\n")
            output_file.write("\n".join(
                "%d: %s" % (words_amount, ", ".join("%s(%d)" % pair for pair in freq.items()))
                for (words_amount, freq) in self.words_frequency))
            output_file.write("\n")

    def find(self, lemma):
        """Return the document ids containing ``lemma`` ([] if unknown)."""
        result = self.inverted_index.get(lemma)
        return result if result is not None else []

    def lemma_freq(self, lemma, doc_id):
        """Return how often ``lemma`` occurs in document ``doc_id`` (1-based)."""
        doc_len, freqs = self.words_frequency[doc_id - 1]
        if doc_len == 0:
            return 0
        hits = freqs.get(lemma)
        return 0 if hits is None else hits

    def search_in_index(self, query, rsv_func):
        """Return ids of documents matching ``query``, best score first.

        A document matches when it contains at least one query token.
        ``rsv_func(index, doc_id, query_tokens)`` supplies the score.
        """
        query_tokens = text_to_tokens(query)

        match_doc_ids = set(itertools.chain.from_iterable(
            self.find(token) for token in query_tokens))

        rsv_scores = {doc_id: rsv_func(self, doc_id, query_tokens)
                      for doc_id in match_doc_ids}
        # Ascending sort followed by reverse() keeps the original ordering
        # of equal scores; sorted(..., reverse=True) would order ties differently.
        sorted_rsv_scores = sorted(rsv_scores.items(), key=operator.itemgetter(1))
        sorted_rsv_scores.reverse()

        return [doc_id for doc_id, _ in sorted_rsv_scores]

    def get_avg_index_len(self):
        """Return the mean posting-list length (0.0 for an empty index)."""
        if not self.inverted_index:
            return 0.0
        index_len_sum = sum(len(ids) for ids in self.inverted_index.values())
        return index_len_sum / len(self.inverted_index)

    def get_max_index_len(self):
        """Return the longest posting-list length (0 for an empty index)."""
        return max((len(ids) for ids in self.inverted_index.values()), default=0)

    def print_statistics(self):
        """Print dictionary size and posting-list length statistics."""
        dict_len = len(self.unique_lemms)
        print("Length of the dictionary: %s" % dict_len)
        print("Average list of word's positions length: %s" % self.get_avg_index_len())
        print("Max list of word's positions length: %s" % self.get_max_index_len())

In [125]:
def idf(corpusIndex, lemma):
    """Return ``log(1 + (N - Nt + 0.5) / (Nt + 0.5))`` for ``lemma``.

    ``N`` is the number of entries in ``corpusIndex.words_frequency`` and
    ``Nt`` the length of the lemma's posting list in
    ``corpusIndex.inverted_index`` (0 when the lemma is absent).
    """
    postings = corpusIndex.inverted_index.get(lemma)
    Nt = 0 if postings is None else len(postings)
    N = len(corpusIndex.words_frequency)
    ratio = (N - Nt + 0.5) / (Nt + 0.5)
    return log(1.0 + ratio)

def idf_simple(corpusIndex, lemma):
    """Return the plain inverse document frequency ``log(N / Nt)``.

    ``N`` is the number of entries in ``corpusIndex.words_frequency`` and
    ``Nt`` the length of the lemma's posting list in
    ``corpusIndex.inverted_index``.

    A lemma that occurs in no document yields 0.0 instead of dividing by
    zero, so it contributes nothing to a score.
    """
    postings = corpusIndex.inverted_index.get(lemma)
    Nt = len(postings) if postings is not None else 0
    if Nt == 0:
        return 0.0
    N = len(corpusIndex.words_frequency)
    return log(N / Nt)

def construct_rsv_func(k1, b,
                       idf_func = idf,
                       norm_rsv = False,
                       use_tfq = False, k2 = 100):
    """Build an Okapi BM25-style scoring function.

    Args:
        k1: term-frequency saturation parameter.
        b: document-length normalisation strength.
        idf_func: ``f(corpusIndex, lemma)`` giving the weight of a lemma.
        norm_rsv: when true, divide the score by the summed idf of the
            distinct query lemmas.
        use_tfq: when true, also weight each term by its frequency in the
            query, saturated with ``k2``.
        k2: query term-frequency saturation parameter.

    Returns:
        ``rsv(corpusIndex, doc_id, query_lemmas)`` returning a float score
        for the 1-based ``doc_id``.
    """
    def rsv(corpusIndex, doc_id, query_lemmas):
        score, idf_sum = 0.0, 0.0
        Ld = float(corpusIndex.words_frequency[doc_id - 1][0])
        _L_ = corpusIndex.average_doc_len
        for lemma in set(query_lemmas):
            if norm_rsv:
                # NOTE(review): the normaliser always uses the module-level
                # idf, even when a different idf_func does the scoring --
                # confirm whether idf_func was intended here.
                idf_sum += idf(corpusIndex, lemma)

            f_td = corpusIndex.lemma_freq(lemma, doc_id)
            if f_td == 0:
                # The lemma is absent from this document: no contribution.
                continue

            addition = idf_func(corpusIndex, lemma) * f_td * (k1 + 1) / (k1 * ((1 - b) + b * Ld / _L_) + f_td)
            if use_tfq:
                f_tq = query_lemmas.count(lemma)
                addition *= (k2 + 1) * f_tq / (k2 + f_tq)
            score += addition

        # idf_sum stays 0 for an empty query; skip the division then.
        if norm_rsv and idf_sum:
            score /= idf_sum

        return score

    return rsv

In [126]:
def test_ranking(corpus, rsv_func=construct_rsv_func(k1 = 1.2, b = 0.75), extra_log_info = "", logging=True,
                 output_path="nnovik_full_data", top_k=3):
    """Rank every query from the query file and write the best hits to disk.

    One line ``<query index>,<page>,<page>,...`` is written per query.
    ``search_in_index`` returns 1-based document ids, so 1 is subtracted
    from each to get back the 0-based page index.

    Args:
        corpus: index object providing ``search_in_index(query, rsv_func)``.
        rsv_func: scoring function passed on to ``search_in_index``.
        extra_log_info: currently unused; kept for interface compatibility.
        logging: currently unused; kept for interface compatibility.
        output_path: file that receives the results (overwritten).
        top_k: number of best documents kept per query.
    """
    query_list = parse_query_file()

    with open(output_path, mode="w") as results_f:
        for q in query_list:
            top_doc_ids = corpus.search_in_index(q.query, rsv_func)[:top_k]
            page_indexes = ",".join(str(doc_id - 1) for doc_id in top_doc_ids)
            results_f.write("%d,%s\n" % (q.index, page_indexes))

In [127]:
# Load the scraped pages; parse_data_file() returns a dict keyed by page index
# with gaps below the largest index filled by empty placeholder pages.
data_dict = parse_data_file()
# Scoring function with k1 = 1.2, b = 0.75 (the same values test_ranking uses by default).
rsv = construct_rsv_func(k1 = 1.2, b = 0.75)

# Collect the text to index in page-index order, so that the 1-based document
# id inside the index equals page index + 1. Only titles are indexed; the
# page body is deliberately left out (see the disabled concatenation below).
text_list = []
for ind in range(len(data_dict)):
    x = data_dict[ind]
    text_list.append(
        x.title #+ " " + x.info
    )
# Build the in-memory index; progress messages are printed while it runs.
corpus = CorpusIndex.from_corpus(text_list)
#corpus.save("test")

Corpus: Ready
Docs length: Ready
Average length: Ready
Pages List: Ready
unique_lemms: Ready
inverted_index: Ready
words_frequency: Ready


In [130]:
# Rank every query against the title-only index with the default scoring
# function of test_ranking and write the results to "nnovik_full_data".
print("*** Ranking for k1 = 1.2, b = 0.75 (headers) ***")
test_ranking(corpus)

*** Ranking for k1 = 1.2, b = 0.75 (headers) ***


In [131]:
from ml_metrics import mapk

def getList(filename):
    """Read a comma-separated file and return its rows without the first column.

    Each line becomes a list of strings holding every field after the
    first one.  Only the line terminator is removed, so the last line is
    read correctly even when the file does not end with a newline.
    """
    with open(filename) as lines:
        return [line.rstrip("\n").split(',')[1:] for line in lines]
    
# Score the results file written by test_ranking against train_data.csv with
# ml_metrics.mapk at k = 3; both files are read with the first column dropped.
mapk(getList('train_data.csv'), getList('nnovik_full_data'),3)

0.41677777777777769

In [89]:
# Write the index built above to the file "test" in CorpusIndex's text format.
corpus.save("test")