In [None]:
from __future__ import division

In [None]:
import re
import os

import numpy as np
import pandas as pd

from collections import defaultdict, namedtuple
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from datasketch import MinHashLSHForest, MinHash

In [None]:
# Relative path of the directory that holds the article text files.
FOLDER = "../documents/"
# One parsed article: its file name plus the word-count mapping built by read_article.
StemmedDocument = namedtuple("StemmedDocument", ["name", "word_counts"])

In [None]:
def get_filenames(folder):
    """Return the sorted names of the regular files directly inside ``folder``.

    Sub-directories (and anything else ``os.path.isfile`` rejects) are skipped.
    Only the bare names are returned, not full paths.
    """
    filenames = []
    for entry in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, entry)):
            filenames.append(entry)
    filenames.sort()
    return filenames

  
def read_article(folder, name):
    """Read one article and count its stemmed, non-stop-word tokens.

    Each whitespace-separated token is lower-cased, stripped of leading and
    trailing characters outside ``a-z``, dropped if it is empty or an English
    stop word, and finally stemmed with the Porter stemmer.

    Parameters
    ----------
    folder : str
        Directory that contains the article.
    name : str
        File name of the article inside ``folder``.

    Returns
    -------
    dict
        Maps each UTF-8 encoded stem to the number of times it occurs.
    """
    content = defaultdict(int)
    # A set makes the per-token membership test O(1); NLTK returns a list.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    # Use the ``folder`` argument rather than the module-level FOLDER constant,
    # so the function reads from the directory the caller asked for.
    with open(os.path.join(folder, name), "r") as article:
        for line in article:
            for word in line.split():
                word = word.lower()
                # Trim non-letter characters from both ends only; characters
                # inside the token (e.g. apostrophes, hyphens) are kept.
                word = re.sub("^[^a-z]*|[^a-z]*$", "", word)
                if word and word not in stop_words:
                    word = stemmer.stem(word)
                    # Keys are stored encoded; the MinHash code elsewhere in
                    # this notebook passes them to MinHash.update, which is
                    # presumably why they are encoded here (confirm).
                    word = word.encode("utf-8")
                    content[word] += 1
    return dict(content)


def read_documents(folder):
    """Load every non-summary article in ``folder`` as a ``StemmedDocument``.

    Files are visited in the order returned by ``get_filenames``; any file whose
    name starts with ``"summary"`` is skipped. Each remaining file is parsed
    with ``read_article`` and paired with its file name.
    """
    return [
        StemmedDocument(fname, read_article(folder, fname))
        for fname in get_filenames(folder)
        if not fname.startswith("summary")
    ]

In [None]:
# Parse every non-summary article under FOLDER into StemmedDocument tuples.
documents = read_documents(FOLDER)

In [None]:
# Sanity check: show the first parsed document (file name and word counts).
documents[0]

In [None]:
def create_document_term_matrix(documents):
    """Build an L2-normalised, sublinear TF-IDF matrix from word counts.

    Parameters
    ----------
    documents : iterable
        Objects exposing ``name`` and ``word_counts`` (a term -> count mapping),
        e.g. ``StemmedDocument`` tuples.

    Returns
    -------
    term_matrix
        TF-IDF matrix as returned by ``TfidfTransformer.fit_transform``, with
        one row per document and one column per term.
    document_titles : list
        Document names, in row order.
    terms : list
        Term of each column, in column order.
    """
    # Materialise the inputs once so both passes below see the same sequence.
    documents = list(documents)
    vectorizer = DictVectorizer(dtype=int, sparse=True)
    count_matrix = vectorizer.fit_transform([doc.word_counts for doc in documents])
    # ``get_feature_names()`` was removed in scikit-learn 1.2. The fitted
    # ``feature_names_`` attribute holds the same names in old and new versions.
    terms = list(vectorizer.feature_names_)
    transformer = TfidfTransformer(norm="l2", sublinear_tf=True)
    term_matrix = transformer.fit_transform(count_matrix)
    document_titles = [doc.name for doc in documents]
    return term_matrix, document_titles, terms

In [None]:
# TF-IDF matrix plus its row labels (document names) and column labels (terms).
term_matrix, labels, words = create_document_term_matrix(documents)

In [None]:
# Display the matrix repr as a quick check of the result.
term_matrix

In [None]:
# Rows of term_matrix are L2-normalised, so the product with its transpose is
# the pairwise cosine similarity between documents. ``.dot`` and ``.toarray()``
# are used instead of ``*`` and ``.A``: they mean the same for scipy sparse
# matrices but also stay correct for the newer sparse array types.
similarity_matrix = pd.DataFrame(term_matrix.dot(term_matrix.T).toarray(), columns=labels, index=labels)

In [None]:
# Display the document-by-document similarity table.
similarity_matrix

In [None]:
def find_top_k_similar_documents(similarity_matrix, document_name, k):
    """Return the ``k`` highest similarity scores for ``document_name``.

    The row labelled ``document_name`` is taken from ``similarity_matrix``,
    the column with the same label is excluded, and the remaining scores are
    sorted in descending order. The result is a Series indexed by the other
    documents' labels, truncated to the first ``k`` entries.
    """
    is_other_document = similarity_matrix.columns != document_name
    scores = similarity_matrix.loc[document_name, is_other_document]
    ranked = scores.sort_values(ascending=False)
    return ranked[:k]

In [None]:
# Top 5 documents most similar to uk_5.txt according to the similarity table.
find_top_k_similar_documents(similarity_matrix, "uk_5.txt", 5)

In [None]:
class LSHForest(object):
    """Helper around datasketch's ``MinHashLSHForest`` for document lookup.

    Documents are ``(name, word_counts)`` pairs. Each one is reduced to a
    MinHash signature over its words and stored in the forest under its name.
    """

    def __init__(self, nr_permutations):
        """Remember the number of permutations used for every MinHash/forest."""
        self._nr_permutations = nr_permutations

    def _build_minhash(self, word_counts):
        """Create the MinHash signature for a ``word -> count`` mapping.

        Every word with a positive count is fed to the MinHash exactly once.
        A MinHash keeps the minimum hash value per permutation, so repeating
        an update with the same word ``count`` times cannot change the
        signature; it only costs time. Words with a non-positive count are
        skipped, as a ``range(count)`` loop over them would have done.
        """
        minhash = MinHash(num_perm=self._nr_permutations)
        for word, count in word_counts.items():
            if count > 0:
                minhash.update(word)
        return minhash

    def build_lsh_forest(self, documents):
        """Index ``documents`` and return the ready-to-query forest.

        ``documents`` is an iterable of ``(document_name, word_counts)`` pairs.
        The forest's ``index()`` is called before returning.
        """
        forest = MinHashLSHForest(num_perm=self._nr_permutations)
        for document_name, word_counts in documents:
            forest.add(document_name, self._build_minhash(word_counts))
        forest.index()
        return forest

    def _query_indices_of_most_similar_documents(self, forest, words, k):
        """Query ``forest`` with the signature of ``words`` for ``k`` results.

        Despite the method name, the forest returns the keys that were passed
        to ``forest.add`` (document names here), not positional indices.
        """
        return forest.query(self._build_minhash(words), k)

    def get_top_k_most_similar_documents(self, forest, documents, query_document, k):
        """Return the forest's top-``k`` result for the document named ``query_document``.

        The word counts are looked up in ``documents`` by name; if the name is
        not found the query runs on an empty word mapping. Results are not
        filtered, so if the query document itself was added to ``forest`` it
        may appear in the result.
        """
        words = get_words_of_document(query_document, documents)
        top_k_keys = self._query_indices_of_most_similar_documents(forest, words, k)
        return top_k_keys


def get_words_of_document(query_document, documents):
    """Look up the word mapping of the document named ``query_document``.

    ``documents`` is an iterable of ``(name, words)`` pairs. The words of the
    first pair whose name matches are returned; an empty dict is returned when
    no document has that name.
    """
    matching_words = (words for name, words in documents if name == query_document)
    return next(matching_words, {})

In [None]:
# Use 32 MinHash permutations for every document signature and for the forest.
lsh_forest = LSHForest(nr_permutations=32)

In [None]:
# Add every parsed document to a MinHash LSH forest and index it for querying.
forest = lsh_forest.build_lsh_forest(documents)

In [None]:
# Query the forest with uk_5.txt's own signature, asking for 5 results.
# Results are not filtered, so uk_5.txt itself may be among them.
approximate_neighbours = lsh_forest.get_top_k_most_similar_documents(forest, documents, "uk_5.txt", 5)

In [None]:
# Display the keys returned by the forest query.
approximate_neighbours