In [1]:
import nltk
import datasketch

In [2]:
# Build the demo corpus: six short English idioms, one document per string.
corpus = [
    "The early bird gets the worm.",
    "In the heat of the moment.",
    "Cross that bridge when you come to it.",
    "Don't put all your eggs in one basket.",
    "The ball is in your court.",
    "Your guess is as good as mine.",
]
corpus

['The early bird gets the worm.',
 'In the heat of the moment.',
 'Cross that bridge when you come to it.',
 "Don't put all your eggs in one basket.",
 'The ball is in your court.',
 'Your guess is as good as mine.']

In [3]:
# Fetch the NLTK 'punkt' resource; the call reports "already up-to-date" and
# returns True when the package is present locally (see the output below).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' when tokenizing --
# confirm against the installed NLTK version.

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maxen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Split every document in the corpus into word and punctuation tokens.
word_tokens = list(map(nltk.word_tokenize, corpus))
word_tokens

[['The', 'early', 'bird', 'gets', 'the', 'worm', '.'],
 ['In', 'the', 'heat', 'of', 'the', 'moment', '.'],
 ['Cross', 'that', 'bridge', 'when', 'you', 'come', 'to', 'it', '.'],
 ['Do', "n't", 'put', 'all', 'your', 'eggs', 'in', 'one', 'basket', '.'],
 ['The', 'ball', 'is', 'in', 'your', 'court', '.'],
 ['Your', 'guess', 'is', 'as', 'good', 'as', 'mine', '.']]

In [5]:
# Preview the 3-gram shingles (over word tokens) of every document.
# The idea: documents that share shingles are similar.
# On a small corpus 3-grams are fine, but ~8-10 is more usual in practice.

# Flatten (document index, shingle) pairs lazily, then print one pair per line.
shingle_rows = (
    (index, n_gram)
    for index, doc_tokens in enumerate(word_tokens)
    for n_gram in nltk.ngrams(doc_tokens, 3)
)
for index, n_gram in shingle_rows:
    print(index, n_gram)

0 ('The', 'early', 'bird')
0 ('early', 'bird', 'gets')
0 ('bird', 'gets', 'the')
0 ('gets', 'the', 'worm')
0 ('the', 'worm', '.')
1 ('In', 'the', 'heat')
1 ('the', 'heat', 'of')
1 ('heat', 'of', 'the')
1 ('of', 'the', 'moment')
1 ('the', 'moment', '.')
2 ('Cross', 'that', 'bridge')
2 ('that', 'bridge', 'when')
2 ('bridge', 'when', 'you')
2 ('when', 'you', 'come')
2 ('you', 'come', 'to')
2 ('come', 'to', 'it')
2 ('to', 'it', '.')
3 ('Do', "n't", 'put')
3 ("n't", 'put', 'all')
3 ('put', 'all', 'your')
3 ('all', 'your', 'eggs')
3 ('your', 'eggs', 'in')
3 ('eggs', 'in', 'one')
3 ('in', 'one', 'basket')
3 ('one', 'basket', '.')
4 ('The', 'ball', 'is')
4 ('ball', 'is', 'in')
4 ('is', 'in', 'your')
4 ('in', 'your', 'court')
4 ('your', 'court', '.')
5 ('Your', 'guess', 'is')
5 ('guess', 'is', 'as')
5 ('is', 'as', 'good')
5 ('as', 'good', 'as')
5 ('good', 'as', 'mine')
5 ('as', 'mine', '.')


In [6]:
def check_similarities(corpus, jaccard_threshold=0.5, num_perm=128, shingle_size=3):
    """Print, for every document, the documents MinHash LSH considers similar.

    Each document is cut into overlapping shingles, the shingles are folded
    into a MinHash signature, and every signature is inserted into a single
    ``datasketch.MinHashLSH`` index. The index is then queried once per
    document and the matching document indexes are printed.

    Note that ``nltk.ngrams`` walks whatever sequence it is handed: when the
    documents are plain strings the shingles are *character* n-grams, not the
    word-token n-grams previewed earlier in the notebook.

    Args:
        corpus: Sequence of documents. A document's position in the sequence
            is used as its key in the LSH index.
        jaccard_threshold: Threshold handed to ``MinHashLSH``; the higher the
            threshold, the more similar two documents must be to match.
        num_perm: Number of permutations used for both the MinHash signatures
            and the LSH index (the two must agree).
        shingle_size: Length of each shingle. Defaults to 3, the value that
            used to be hard-coded.

    Returns:
        None. The corpus, the parameters and one ``<index> : [<matches>]``
        line per document are written to stdout.

    Raises:
        ValueError: If ``shingle_size`` is smaller than 1.
    """
    if shingle_size < 1:
        raise ValueError("shingle_size must be a positive integer")

    print(corpus)
    print(jaccard_threshold, num_perm)

    # LSH index that buckets signatures so near-duplicates collide.
    # Keyword arguments keep the call unambiguous.
    min_hash_lsh = datasketch.MinHashLSH(threshold=jaccard_threshold, num_perm=num_perm)

    # Document index -> MinHash signature, kept so each one can be queried later.
    min_hashes = {}

    for index, text in enumerate(corpus):
        min_hash = datasketch.MinHash(num_perm=num_perm)

        # Fold every shingle of the document into its signature.
        for n_gram in nltk.ngrams(text, shingle_size):
            min_hash.update("".join(n_gram).encode("utf-8"))

        min_hash_lsh.insert(index, min_hash)
        min_hashes[index] = min_hash

    # Every document matches at least itself; sort so the printed output is
    # reproducible regardless of the order the index returns its candidates.
    for index, min_hash in min_hashes.items():
        print(index, ":", sorted(min_hash_lsh.query(min_hash)))

In [7]:
 # Baseline run on the original corpus, with the threshold (0.5) and the
 # permutation count (128) spelled out; these equal the function's defaults.
 check_similarities(corpus, 0.5, 128)

['The early bird gets the worm.', 'In the heat of the moment.', 'Cross that bridge when you come to it.', "Don't put all your eggs in one basket.", 'The ball is in your court.', 'Your guess is as good as mine.']
0.5 128
0 : [0]
1 : [1]
2 : [2]
3 : [3]
4 : [4]
5 : [5]


In [8]:
# Second corpus: the original idioms plus two near-duplicate sentences
# ("...bird dog gets the birds." and "The tennis ball...").
corpus2 = [
    "The early bird gets the worm.",
    "The early bird dog gets the birds.",
    "In the heat of the moment.",
    "Cross that bridge when you come to it.",
    "Don't put all your eggs in one basket.",
    "The ball is in your court.",
    "The tennis ball is in your court.",
    "Your guess is as good as mine.",
]

In [9]:
# Same check on the corpus with near-duplicate sentences, using the default
# threshold and permutation count.
check_similarities(corpus2)

['The early bird gets the worm.', 'The early bird dog gets the birds.', 'In the heat of the moment.', 'Cross that bridge when you come to it.', "Don't put all your eggs in one basket.", 'The ball is in your court.', 'The tennis ball is in your court.', 'Your guess is as good as mine.']
0.5 128
0 : [0, 1]
1 : [0, 1]
2 : [2]
3 : [3]
4 : [4]
5 : [5, 6]
6 : [5, 6]
7 : [7]
