In [27]:
import math
import os
from collections import Counter, OrderedDict

In [28]:
# Directory scanned for document files: entries whose names start with 'http'
# are opened and read line by line by the loading code in this notebook.
parsed_sites_directory = 'parsed_sites_bigrams'


def tf(word, doc_word_counts):
    """Return the relative frequency of ``word`` within a single document.

    Args:
        word: The term to look up.
        doc_word_counts: Mapping of term -> number of occurrences in the
            document.

    Returns:
        The count of ``word`` divided by the sum of all counts. Returns
        ``0.0`` when the word is not in the mapping or when the document
        has no terms at all, instead of raising ``KeyError`` /
        ``ZeroDivisionError``.
    """
    total_terms = sum(doc_word_counts.values())
    if not total_terms:
        # Empty document: every term has frequency zero.
        return 0.0
    return doc_word_counts.get(word, 0) / total_terms


def n_containing(word, docs_contents):
    """Count how many documents contain ``word``.

    Args:
        word: The term to search for.
        docs_contents: Mapping of document name -> container of that
            document's terms (membership is tested with ``in``).

    Returns:
        The number of documents whose term container includes ``word``.
    """
    matching_docs = 0
    for doc_terms in docs_contents.values():
        if word in doc_terms:
            matching_docs += 1
    return matching_docs


def idf(word, docs_contents):
    """Return the inverse document frequency of ``word`` over the corpus.

    Computed as ``log(number of documents / number of documents containing
    the word)`` using the natural logarithm.

    Args:
        word: The term to score.
        docs_contents: Mapping of document name -> container of that
            document's terms.

    Returns:
        The IDF value as a float. Returns ``0.0`` when no document contains
        the word (which includes an empty corpus) rather than raising
        ``ZeroDivisionError``; such a term has zero term frequency
        everywhere, so its TF-IDF is zero regardless.
    """
    docs_with_word = n_containing(word, docs_contents)
    if docs_with_word == 0:
        # Avoid dividing by zero for terms unseen in the corpus.
        return 0.0
    return math.log(len(docs_contents) / docs_with_word)


def tfidf(word, doc_word_counts, docs_contents):
    """Return the TF-IDF score of ``word`` for one document.

    Args:
        word: The term to score.
        doc_word_counts: Mapping of term -> occurrence count for the
            document being scored.
        docs_contents: Mapping of document name -> term container for every
            document in the corpus.

    Returns:
        The product of the term frequency in the document and the inverse
        document frequency across the corpus.
    """
    term_frequency = tf(word, doc_word_counts)
    inverse_doc_frequency = idf(word, docs_contents)
    return term_frequency * inverse_doc_frequency

In [29]:
# Documents are the directory entries whose names start with 'http'.
docs = {fname for fname in os.listdir(parsed_sites_directory) if fname.startswith('http')}
docs_contents = {}

# Number of highest-scoring terms printed for each document.
TOP_WORDS_PER_DOC = 500

# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, which would reshuffle the report on every re-run.
for doc in sorted(docs):
    doc_path = os.path.join(parsed_sites_directory, doc)
    # Explicit encoding so decoding does not depend on the platform default
    # (assumes the files were written as UTF-8 - adjust if they were not).
    with open(doc_path, encoding='utf-8') as fp:
        # Each line is treated as one term; count occurrences per document.
        docs_contents[doc] = dict(Counter(line.strip() for line in fp))

for doc, doc_word_counts in docs_contents.items():
    print('Top words in document: {}'.format(doc))
    scores = {word: tfidf(word, doc_word_counts, docs_contents) for word in doc_word_counts}
    # Highest score first.
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:TOP_WORDS_PER_DOC]:
        print('\tWord: {}\tTD-IDF: {}'.format(word, round(score, 5)))

Top words in document: http:www.cmaf.ru
	Word: ['армия', 'страница']	TD-IDF: 0.01069
	Word: ['контактный', 'новость']	TD-IDF: 0.01058
	Word: ['контейнер', 'стиль']	TD-IDF: 0.01058
	Word: ['контейнер', 'форма']	TD-IDF: 0.01058
	Word: ['поль', 'стиль']	TD-IDF: 0.01058
	Word: ['ввод', 'поль']	TD-IDF: 0.01058
	Word: ['ввод', 'текст']	TD-IDF: 0.01058
	Word: ['кнопка', 'текст']	TD-IDF: 0.01058
	Word: ['кнопка', 'отправка']	TD-IDF: 0.01058
	Word: ['отправка', 'форма']	TD-IDF: 0.01058
	Word: ['стрелка', 'форма']	TD-IDF: 0.01058
	Word: ['влево', 'стрелка']	TD-IDF: 0.01058
	Word: ['влево', 'удалять']	TD-IDF: 0.01058
	Word: ['дополнительный', 'удалять']	TD-IDF: 0.01058
	Word: ['кнопка', 'ряд']	TD-IDF: 0.01058
	Word: ['искать', 'кнопка']	TD-IDF: 0.01058
	Word: ['автоинформатор', 'страница']	TD-IDF: 0.01058
	Word: ['автоинформатор', 'экскурсионный']	TD-IDF: 0.01058
	Word: ['бюро', 'кроме']	TD-IDF: 0.01058
	Word: ['вт', 'секретарь']	TD-IDF: 0.01058
	Word: ['режим', 'факс']	TD-IDF: 0.01058
	Word: ['к