In [58]:
from zipfile import ZIP_DEFLATED, ZipFile
import io

# Read every text document stored in the zip archive.
def read_texts(path='../task2/texts.zip'):
    """Load all UTF-8 text files contained in a zip archive.

    Args:
        path: Location of the archive. Defaults to '../task2/texts.zip',
            the path that used to be hard-coded here.

    Returns:
        list[str]: Decoded contents of every file entry, in archive order.

    Directory entries are skipped; otherwise each one would be returned as
    an empty document and shift the numbering of the real texts.
    """
    data = []
    # No compression argument: it only affects writing, not reading.
    with ZipFile(path, 'r') as zipFile:
        for info in zipFile.infolist():
            if info.is_dir():
                continue
            # Open by ZipInfo so the exact entry being iterated is read.
            with io.TextIOWrapper(zipFile.open(info), encoding="utf-8") as f:
                data.append(f.read())
    print(f'posts length = {len(data)}')
    return data

In [59]:
from nltk.corpus import stopwords
import string

# Build the stop-word list used to filter tokens.
def read_stop_words():
    """Return the NLTK Russian stop words followed by each character of
    ``string.punctuation`` as a separate entry.
    """
    stop_words = stopwords.words("russian")
    # Extending with a string adds its characters one by one.
    stop_words.extend(string.punctuation)
    return stop_words

In [60]:
from nltk import word_tokenize

# Tokenize a text and keep only the meaningful alphabetic tokens.
def get_tokens(text, stop_words):
    """Split ``text`` into lowercase alphabetic tokens, dropping stop words.

    Args:
        text: Raw text. Hyphens are replaced by spaces before tokenizing,
            so hyphenated words are split into their parts.
        stop_words: Iterable of strings to exclude (compared after
            lowercasing the token).

    Returns:
        list[str]: Lowercased tokens, in order of appearance, that are not
        stop words and consist solely of alphabetic characters.
    """
    # A set gives constant-time membership checks; scanning a list for
    # every token is needlessly linear in the number of stop words.
    excluded = set(stop_words)
    tokens = []
    for raw_token in word_tokenize(text.replace("-", " "), language="russian"):
        token = raw_token.lower()
        # isalpha() is already False for an empty string, so no separate
        # emptiness check is needed.
        if token not in excluded and token.isalpha():
            tokens.append(token)
    return tokens

In [61]:
import pymorphy2

# Module-level morphological analyzer instance; get_lemma reads this global.
morph = pymorphy2.MorphAnalyzer()

# Reduce a word form to its normal (dictionary) form.
def get_lemma(token):
    """Return the ``normal_form`` of the first parse the analyzer yields
    for ``token``.
    """
    parses = morph.parse(token)
    first_parse = parses[0]
    return first_parse.normal_form

In [62]:
# Build an inverted index (term -> document numbers) from the texts.
def get_inverted_index(texts):
    """Map every lemma to the 1-based numbers of the texts containing it.

    Args:
        texts: Iterable of raw text documents; the first one is document 1.

    Returns:
        dict: Lemmas in sorted key order, each mapped to a list of document
        numbers. A document appears at most once per lemma.
    """
    stop_words = read_stop_words()
    postings = {}
    for doc_number, text in enumerate(texts, start=1):
        # A dict used as an ordered set: one entry per distinct lemma.
        distinct_lemmas = {}
        for token in get_tokens(text, stop_words):
            distinct_lemmas[get_lemma(token)] = None
        for lemma in distinct_lemmas:
            postings.setdefault(lemma, []).append(doc_number)
    return {term: postings[term] for term in sorted(postings)}

In [63]:
import json

# Persist the index to a JSON file.
def write_index(index, path='index.json'):
    """Serialize ``index`` as indented, human-readable JSON.

    Args:
        index: JSON-serializable object (here: term -> list of document
            numbers).
        path: Destination file. Defaults to 'index.json', the name that
            used to be hard-coded here.

    The file is written as UTF-8 with ``ensure_ascii=False`` so that
    non-ASCII terms are stored as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(path, 'w', encoding='utf8') as outfile:
        json.dump(index, outfile, indent=4, ensure_ascii=False)

In [64]:
# Load a previously saved index.
def read_index(path='index.json'):
    """Read a JSON index from disk.

    Args:
        path: File to read. Defaults to 'index.json', the name that used
            to be hard-coded here.

    Returns:
        The object decoded from the UTF-8 JSON file.
    """
    with open(path, 'r', encoding='utf8') as file:
        return json.load(file)

In [65]:
# Build the inverted index for the texts and save it to a file.
# Running this cell reads the text archive and (over)writes the index file.
inverted_index = get_inverted_index(read_texts())        
write_index(inverted_index)

posts length = 100
