In [11]:
from zipfile import ZIP_DEFLATED, ZipFile
import io


def read_texts(path='../task2/texts.zip'):
    """Read every file stored in a zip archive as UTF-8 text.

    Args:
        path: Location of the archive. Defaults to the corpus path used by
            this notebook.

    Returns:
        list[str]: Contents of each file, in archive order.

    Side effects:
        Prints the number of texts that were read.
    """
    data = []
    with ZipFile(path, 'r', ZIP_DEFLATED) as zipFile:
        for info in zipFile.infolist():
            # Directory entries carry no text; reading them would add empty
            # documents and shift the doc_N numbering built from this list.
            if info.is_dir():
                continue
            with io.TextIOWrapper(zipFile.open(info), encoding="utf-8") as f:
                data.append(f.read())
    print(f'posts length = {len(data)}')
    return data

In [12]:
from nltk.corpus import stopwords
import string


def read_stop_words():
    """Return the NLTK Russian stop-word list extended with ASCII punctuation.

    Returns:
        list[str]: The stop words followed by every character of
        ``string.punctuation``.
    """
    stop_words = stopwords.words("russian")
    # Extending a list with a string appends its characters one by one.
    stop_words.extend(string.punctuation)
    return stop_words

In [13]:
from nltk import word_tokenize


def get_tokens(text, stop_words):
    """Split a text into lower-cased alphabetic tokens, dropping stop words.

    Hyphens are replaced with spaces before tokenizing, so hyphenated
    compounds are split into their parts.

    Args:
        text: Raw document text.
        stop_words: Iterable of lower-case strings to exclude.

    Returns:
        list[str]: Tokens in document order.
    """
    # A set gives O(1) membership tests instead of scanning the list per token.
    stop_set = set(stop_words)
    lowered = (word.lower() for word in
               word_tokenize(text.replace("-", " "), language="russian"))
    # str.isalpha() is False for "", so no separate emptiness check is needed.
    return [token for token in lowered
            if token not in stop_set and token.isalpha()]

In [14]:
import pymorphy2

# Module-level analyzer instance; get_lemma reads it as a global.
morph = pymorphy2.MorphAnalyzer()


def get_lemma(token):
    """Return the normal form of a word.

    Uses the first parse returned by the module-level ``morph`` analyzer
    (presumably the most probable one; confirm against the pymorphy2 docs).

    Args:
        token: A single word.

    Returns:
        str: The ``normal_form`` of the first parse.
    """
    parses = morph.parse(token)
    first_parse = parses[0]
    return first_parse.normal_form

In [15]:
def get_inverted_index(texts):
    """Build an inverted index of lemmas over the given texts.

    Args:
        texts: Sequence of raw document strings. Document ``n`` (1-based) is
            keyed as ``"doc_n"``.

    Returns:
        dict: ``{lemma: {"doc_n": {"count": occurrences}}}`` with lemmas in
        sorted order.
    """
    stop_words = read_stop_words()
    terms = {}
    for position, text in enumerate(texts):
        doc_id = f"doc_{position + 1}"
        for word in get_tokens(text, stop_words):
            lemma = get_lemma(word)
            # Create the posting list / per-document entry on first sight.
            postings = terms.setdefault(lemma, {})
            entry = postings.setdefault(doc_id, {})
            entry["count"] = entry.get("count", 0) + 1
    return dict(sorted(terms.items()))

In [16]:
import json


def write_index(index, path='index.json'):
    """Serialize the inverted index to a UTF-8 JSON file.

    Args:
        index: JSON-serializable index structure.
        path: Destination file. Defaults to ``index.json`` in the working
            directory.
    """
    with open(path, 'w', encoding='utf8') as outfile:
        # ensure_ascii=False keeps non-ASCII terms readable in the file.
        json.dump(index, outfile, indent=4, ensure_ascii=False)


def read_index(path='index.json'):
    """Load the inverted index from a UTF-8 JSON file.

    Args:
        path: Source file. Defaults to ``index.json`` in the working
            directory.

    Returns:
        The deserialized JSON content.
    """
    with open(path, 'r', encoding='utf8') as file:
        return json.load(file)


In [17]:
def get_doc_terms_count(texts):
    """Count the indexed terms in each document.

    Args:
        texts: Sequence of raw document strings. Document ``n`` (1-based) is
            keyed as ``"doc_n"``.

    Returns:
        dict[str, int]: ``{"doc_n": number_of_tokens}``, one entry per text.
    """
    stop_words = read_stop_words()
    docs_count = {}
    for i, text in enumerate(texts):
        # get_lemma maps each token to exactly one lemma, so the lemma count
        # equals the token count; lemmatizing here only added morphological
        # analysis work without changing the result.
        docs_count[f"doc_{i + 1}"] = len(get_tokens(text, stop_words))
    return docs_count

In [18]:
def write_doc_terms_count(docs_count, path='doc_count.json'):
    """Serialize the per-document term counts to a UTF-8 JSON file.

    Args:
        docs_count: JSON-serializable mapping of document id to term count.
        path: Destination file. Defaults to ``doc_count.json`` in the working
            directory.
    """
    with open(path, 'w', encoding='utf8') as outfile:
        json.dump(docs_count, outfile, indent=4, ensure_ascii=False)
        
def read_doc_terms_count(path='doc_count.json'):
    """Load the per-document term counts from a UTF-8 JSON file.

    Args:
        path: Source file. Defaults to ``doc_count.json`` in the working
            directory.

    Returns:
        The deserialized JSON content.
    """
    with open(path, 'r', encoding='utf8') as file:
        return json.load(file)

In [19]:
def compute_tfs(index, terms_count):
    """Attach a term frequency to every (term, document) entry.

    Args:
        index: ``{term: {doc_id: {"count": n, ...}}}``; modified in place.
        terms_count: ``{doc_id: total number of terms in that document}``.

    Returns:
        The same ``index`` object, with ``"tf"`` set on every entry.
    """
    for postings in index.values():
        for doc_id, stats in postings.items():
            stats["tf"] = compute_tf(stats["count"], terms_count[doc_id])
    return index

def compute_tf(term_count, all_terms_count):
    """Return ``term_count / all_terms_count`` rounded to 6 decimal places."""
    total = float(all_terms_count)
    share = term_count / total
    return round(share, 6)

In [27]:
import math

def compute_idfs(index, D):
    """Attach an inverse document frequency to every (term, document) entry.

    The idf depends only on the term, so the same value is stored on each of
    the term's document entries.

    Args:
        index: ``{term: {doc_id: {...}}}``; modified in place.
        D: Total number of documents.

    Returns:
        The same ``index`` object, with ``"idf"`` set on every entry.
    """
    for postings in index.values():
        # Number of documents that contain this term.
        idf = compute_idf(D, len(postings))
        for stats in postings.values():
            stats["idf"] = idf
    return index


def compute_idf(all_docs, docs_count):
    """Return ``log10(all_docs / docs_count)`` rounded to 6 decimal places."""
    ratio = all_docs / float(docs_count)
    return round(math.log10(ratio), 6)

In [21]:
def compute_tfs_idfs(index):
    """Attach a tf-idf weight to every (term, document) entry.

    Args:
        index: ``{term: {doc_id: {"tf": ..., "idf": ...}}}``; modified in
            place.

    Returns:
        The same ``index`` object, with ``"tf-idf"`` set on every entry.
    """
    for postings in index.values():
        for stats in postings.values():
            stats["tf-idf"] = compute_tf_idf(stats["tf"], stats["idf"])
    return index


def compute_tf_idf(tf, idf):
    """Return the product ``tf * idf`` rounded to 6 decimal places."""
    weight = tf * idf
    return round(weight, 6)

In [22]:
def write_terms_with_tf_idf(index, json_path='tf_idf.json', txt_path='tf_idf.txt'):
    """Write tf, idf and tf-idf values as JSON and as a plain-text listing.

    The text file holds one line with the term, followed by one line per
    document of the form ``<doc_id> tf=<tf> idf=<idf> tf-idf=<tf-idf>``.

    Args:
        index: ``{term: {doc_id: {"tf": ..., "idf": ..., "tf-idf": ...}}}``.
        json_path: Destination of the JSON dump. Defaults to ``tf_idf.json``.
        txt_path: Destination of the text listing. Defaults to ``tf_idf.txt``.
    """
    with open(json_path, 'w', encoding='utf8') as outfile:
        json.dump(index, outfile, indent=4, ensure_ascii=False)
    with open(txt_path, "w", encoding="utf-8") as file:
        for key, term in index.items():
            file.write(f'{key}\n')
            for doc_key, doc in term.items():
                # NOTE(review): this line used to be prefixed with
                # `"" * len(key)`, which is always the empty string (likely a
                # typo for " "). The no-op is removed and the output is kept
                # flush-left so the file format does not change.
                file.write(f'{doc_key} tf={doc["tf"]} idf={doc["idf"]} tf-idf={doc["tf-idf"]}\n')

In [24]:
# Build the inverted index from the texts and count the words in each text
texts = read_texts()
inverted_index = get_inverted_index(texts)
docs_terms_count = get_doc_terms_count(texts)
# Persist both results so later cells can reload them
write_index(inverted_index)
write_doc_terms_count(docs_terms_count)

posts length = 100


In [28]:
# Reload the persisted index and per-document term counts
index = read_index()
doc_terms_count = read_doc_terms_count()
# D is the number of texts. get_doc_terms_count stores one entry per text, so
# the size of the loaded mapping is the corpus size; deriving it here keeps D
# correct when the corpus changes (it was previously hard-coded to 100).
D = len(doc_terms_count)
# Compute tf, idf and tf-idf, then write the results
index = compute_tfs(index, doc_terms_count)
index = compute_idfs(index, D)
index = compute_tfs_idfs(index)
write_terms_with_tf_idf(index)