# 2 задание по информационному поиску
---
## Inverted index

In [114]:
!git clone https://github.com/d0rj/RusLit.git

fatal: destination path 'RusLit' already exists and is not an empty directory.


In [115]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [96]:
import nltk
import numpy as np
import re
import unicodedata

from pymorphy2 import MorphAnalyzer

In [22]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## *Realisation*

In [116]:
class Index:
    def __init__(self):
        self.index = {}
        self.morph = MorphAnalyzer()
        self.doc_token_count = {}
        self.doc_count = 0

    def add_to_index(self, filepath):
        with open(filepath, 'r') as fh:
            content = fh.readlines()
        tokens = []
        for row in content:
            for token in nltk.word_tokenize(re.sub(r'[^\w\s]','',unicodedata.normalize("NFKC",row))):
                tokens.append(token.lower())

        self.doc_token_count[filepath] = len(tokens)

        for position, token in enumerate(tokens):
            lemma = self.morph.normal_forms(token)[0]
            if lemma not in self.index.keys():
                self.index[lemma] = {}
            if token not in self.index[lemma].keys():
                self.index[lemma][token] = {}
            if filepath not in self.index[lemma][token].keys():
                self.index[lemma][token][filepath] = []

            self.index[lemma][token][filepath].append(position)

        self.doc_count += 1

    def find(self, *words):
        general_list = [] # general соответствует normal_form
        form_presion_list = []

        for word in words:
            word = word.lower()
            word_lemma = self.morph.normal_forms(word)[0]
            if word_lemma in self.index.keys():
                general_list.append([ self.index[word_lemma][form].keys() for form in self.index[word_lemma] ])
                for form in self.index[word_lemma].keys():
                    if form == word:
                        form_presion_list.append(self.index[word_lemma][form].keys())


        general_dict = {}
        for doc_list_list in general_list:
            for doc_list in doc_list_list:
                for doc_name in doc_list:
                    if doc_name not in general_dict.keys():
                        general_dict[doc_name] = 1
                    else:
                        general_dict[doc_name] += 1

        form_presion_dict = {}
        for doc_list in form_presion_list:
            for doc_name in doc_list:
                if doc_name not in form_presion_dict.keys():
                    form_presion_dict[doc_name] = 1
                else:
                    form_presion_dict[doc_name] += 1

        general_ranking = sorted( list(general_dict.keys()),
                                  key=lambda x: general_dict[x],
                                  reverse=True )
        form_precision_ranking = sorted( list(form_presion_dict.keys()),
                                         key=lambda x: form_presion_dict[x],
                                         reverse=True )

        # Отсортированные имена файлов по совпадению формы и общей встречаимости
        form_doc_mask_ranking = form_precision_ranking
        for doc_name in general_ranking:
            if doc_name not in form_doc_mask_ranking:
                form_doc_mask_ranking.append(doc_name)


        # TF-IDF ранжирование c точностью до формы
        doc_word_tfidf = {}
        for doc_name in form_doc_mask_ranking:
            doc_word_tfidf[doc_name] = []
            for word in words:
                word = word.lower()
                word_lemma = self.morph.normal_forms(word)[0]
                word_in_doc = 0
                for form in self.index.get(word_lemma, {}).keys():
                    for position in self.index[word_lemma][form].get(doc_name, []):
                        word_in_doc += 1

                doc_word_tfidf[doc_name].append(
                    ( word_in_doc / self.doc_token_count[doc_name] )
                    /
                    ( np.log10( self.doc_count / len(form_doc_mask_ranking) ) + 1e-10)
                )

        tfidf_ranking = sorted( list(doc_word_tfidf.keys()),
                                         key=lambda x: doc_word_tfidf[x],
                                         reverse=True )
        tfidf_doc_mask_ranking = []
        for tfidf_doc_rank, form_doc_rank in zip(tfidf_ranking, form_doc_mask_ranking):
            if tfidf_doc_rank not in tfidf_doc_mask_ranking:
                tfidf_doc_mask_ranking.append(tfidf_doc_rank)
            if form_doc_rank not in form_doc_mask_ranking:
                tfidf_doc_mask_ranking.append(form_doc_rank)

        # Формирование финальной выдачи
        result = []
        for doc_name in tfidf_doc_mask_ranking:
            for word in words:
                word = word.lower()
                word_lemma = self.morph.normal_forms(word)[0]
                positions = []
                for form in self.index.get(word_lemma, {}).keys():
                    positions += self.index[word_lemma][form].get(doc_name, [])
                positions = list(set(positions))
                result.append( (doc_name, word, positions) )

        return result

## *Testing*

In [117]:
idx = Index()
idx.add_to_index("RusLit/prose/Pushkin/Капитанская дочка.txt")
idx.add_to_index("RusLit/prose/Pushkin/История Пугачёва.txt")
idx.add_to_index("RusLit/prose/Pushkin/Дубровский.txt")

In [118]:
results = idx.find('Привет', 'Пугачёв', 'приветствую', 'яицких', 'История', 'Пугачева')

output_str = ''
for doc_name, word, positions in results:
    output_str += f'[{doc_name}] - {word} : {str(positions)}\n'

print(output_str)

[RusLit/prose/Pushkin/История Пугачёва.txt] - привет : []
[RusLit/prose/Pushkin/История Пугачёва.txt] - пугачёв : [8196, 4, 22540, 14350, 16399, 20498, 23, 38937, 30748, 32797, 36894, 22559, 12330, 20523, 49, 14386, 8245, 34869, 28727, 12346, 30783, 32836, 32847, 22613, 8287, 12391, 28775, 106, 34926, 18546, 28790, 26750, 18560, 34951, 26760, 30859, 26767, 18579, 26772, 22677, 28822, 14487, 12442, 12452, 28837, 22697, 28842, 34986, 10415, 8367, 32946, 18611, 18613, 184, 16581, 30919, 14539, 20686, 8401, 35029, 12502, 22747, 22752, 26848, 30944, 35054, 22771, 12538, 28922, 22782, 35070, 37120, 28930, 33027, 8459, 37138, 28950, 35102, 31007, 8484, 22824, 28974, 31024, 35120, 12600, 26936, 10561, 35138, 22853, 37189, 33106, 10584, 12636, 20837, 8550, 33127, 10598, 35180, 8557, 37245, 33153, 22920, 37259, 35219, 22932, 33177, 20895, 29097, 27055, 8627, 29109, 35254, 35261, 37310, 29123, 20936, 12748, 27085, 33229, 8666, 33250, 27110, 12783, 20979, 23032, 35320, 29181, 27138, 21010, 37414, 