In [1]:
import math
import os
from collections import Counter, OrderedDict

In [20]:
# Directory (relative to the working directory) that is listed to find the input documents.
parsed_sites_directory = 'parsed_sites'


def tf(word, doc_word_counts):
    """Return the term frequency of ``word`` within a single document.

    Args:
        word: Term to look up.
        doc_word_counts: Mapping of term -> occurrence count for one document.

    Returns:
        The count of ``word`` divided by the sum of all counts, or ``0.0``
        when the word is absent from the mapping or the mapping holds no
        occurrences at all.
    """
    total_terms = sum(doc_word_counts.values())
    if not total_terms:
        # Empty document: nothing to divide by, and no term carries any weight.
        return 0.0
    # .get() so an unseen word scores 0 instead of raising KeyError.
    return doc_word_counts.get(word, 0) / total_terms


def n_containing(word, docs_contents):
    """Count the documents in ``docs_contents`` that contain ``word``.

    Args:
        word: Term to search for.
        docs_contents: Mapping of document name -> container of that
            document's terms (membership is tested with ``in``).

    Returns:
        The number of documents whose term container includes ``word``.
    """
    matches = 0
    for doc_terms in docs_contents.values():
        if word in doc_terms:
            matches += 1
    return matches


def idf(word, docs_contents):
    """Return the inverse document frequency of ``word`` over a corpus.

    Computed as ``log(N / n)`` (natural logarithm), where ``N`` is the number
    of documents and ``n`` is the number of documents containing ``word``.

    Args:
        word: Term to score.
        docs_contents: Mapping of document name -> container of that
            document's terms.

    Returns:
        The IDF value as a float. Returns ``0.0`` when no document contains
        ``word`` (which includes the empty-corpus case), so that the function
        does not raise ``ZeroDivisionError``.
    """
    docs_with_word = n_containing(word, docs_contents)
    if docs_with_word == 0:
        # The ratio is undefined here; 0.0 keeps any tf * idf product at zero.
        return 0.0
    return math.log(len(docs_contents) / docs_with_word)


def tfidf(word, doc_word_counts, docs_contents):
    """Return the TF-IDF score of ``word`` for one document in a corpus.

    Args:
        word: Term to score.
        doc_word_counts: Mapping of term -> occurrence count for the document.
        docs_contents: Mapping of document name -> that document's terms,
            covering the whole corpus.

    Returns:
        The product of ``tf(word, doc_word_counts)`` and
        ``idf(word, docs_contents)``.
    """
    term_frequency = tf(word, doc_word_counts)
    inverse_doc_frequency = idf(word, docs_contents)
    return term_frequency * inverse_doc_frequency

In [26]:
# Only directory entries whose names start with 'http' are treated as documents.
docs = {fname for fname in os.listdir(parsed_sites_directory) if fname.startswith('http')}
docs_contents = {}

# Iterate in sorted order: a set's iteration order can differ between kernel
# restarts, which would reorder the printed report from run to run.
for doc in sorted(docs):
    doc_path = os.path.join(parsed_sites_directory, doc)
    # Explicit encoding so the text decodes the same way regardless of the locale default.
    with open(doc_path, encoding='utf-8') as fp:
        # Each line is treated as one token; blank lines are skipped so that
        # the empty string is never counted as a word.
        tokens = (line.strip() for line in fp)
        docs_contents[doc] = dict(Counter(token for token in tokens if token))

# Maximum number of highest-scoring words printed per document.
top_words_limit = 500

for doc, doc_word_counts in docs_contents.items():
    print('Top words in document: {}'.format(doc))
    scores = {word: tfidf(word, doc_word_counts, docs_contents) for word in doc_word_counts}
    # Highest score first; ties keep their original (insertion) order.
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:top_words_limit]:
        print('\tWord: {}\tTF-IDF: {}'.format(word, round(score, 5)))

Top words in document: http:www.cmaf.ru
	Word: скрытие	TD-IDF: 0.01052
	Word: enter	TD-IDF: 0.01041
	Word: автоинформатор	TD-IDF: 0.00757
	Word: цмвс	TD-IDF: 0.00693
	Word: удалять	TD-IDF: 0.00433
	Word: влево	TD-IDF: 0.00362
	Word: мышка	TD-IDF: 0.00362
	Word: вооружённый	TD-IDF: 0.00355
	Word: поворот	TD-IDF: 0.00335
	Word: кнопка	TD-IDF: 0.00332
	Word: контейнер	TD-IDF: 0.00324
	Word: вдв	TD-IDF: 0.00323
	Word: ввод	TD-IDF: 0.0031
	Word: прекращать	TD-IDF: 0.00275
	Word: десантный	TD-IDF: 0.00263
	Word: вт	TD-IDF: 0.0024
	Word: бронекатер	TD-IDF: 0.00235
	Word: бункер	TD-IDF: 0.0022
	Word: нажать	TD-IDF: 0.00218
	Word: пн	TD-IDF: 0.00208
	Word: воздушно	TD-IDF: 0.00193
	Word: бк	TD-IDF: 0.00183
	Word: фондовый	TD-IDF: 0.00177
	Word: армия	TD-IDF: 0.00167
	Word: задний	TD-IDF: 0.00166
	Word: отправка	TD-IDF: 0.00159
	Word: кинофононегативный	TD-IDF: 0.00152
	Word: стрелка	TD-IDF: 0.00147
	Word: факс	TD-IDF: 0.0014
	Word: контент	TD-IDF: 0.00137
	Word: демьян	TD-IDF: 0.00135
	Word: эк