In [1]:
import warnings
from pprint import pprint

import numpy as np
import pandas as pd
from tqdm import tqdm

from docsim.lsh import MinHashLSH
from docsim.minhash import MinHash
import docsim.utils

In [2]:
# Suppress all warnings (every category, every module) from this point on.
# NOTE(review): a blanket ignore also hides pandas deprecation and
# copy-related warnings raised by the cells below — consider narrowing it.
warnings.simplefilter("ignore")

In [3]:
# Register tqdm with pandas; the `.progress_map` calls in the cells below
# rely on this having been run first.
tqdm.pandas()

### Set parameters

In [4]:
# Tunable parameters for the whole notebook, set in a single statement:
#   n - shingle size (handed to generate_ngrams)
#   k - number of permutations (handed to MinHash as `num_hashes`)
#   b - number of bands (handed to MinHashLSH as `num_bands`)
n, k, b = 3, 100, 25

### Load and prepare data

In [5]:
# Load the question titles, drop rows without a usable title, and add a
# lowercase copy of each title in a new `text` column.
#
# The steps are written as one non-mutating chain because
# `df['title'].replace(..., inplace=True)` operates on a column selection:
# pandas does not guarantee that such a call writes back into `df`, and it
# is a no-op under Copy-on-Write. Assigning the cleaned column explicitly
# makes the cleanup reliable and keeps the cell idempotent when re-run.
df = (
    pd.read_csv('../dataset/datascience-stackoverflow-questions.csv')
    # Treat empty-string titles as missing so the dropna below removes them too
    .assign(title=lambda frame: frame['title'].replace('', np.nan))
    .dropna(subset=['title'])
    # Convert to lowercase
    .assign(text=lambda frame: frame['title'].str.lower())
)

### Convert to shingles

In [6]:
# Break up text into shingles: each lowercase title is split on whitespace
# and the resulting tokens are passed to generate_ngrams together with n.
df['shingles'] = df['text'].progress_map(
    lambda text: docsim.utils.generate_ngrams(text.split(), n)
)

# Drop texts that were too short for the provided shingle size.
# The index is rebuilt afterwards so that row labels equal row positions:
# the following cells hand the rows to MinHashLSH as a plain list and look
# results up with `.iloc`, i.e. purely by position, and gaps left in the
# index by the filtering would make labels and positions disagree.
df = df[df['shingles'].str.len() != 0].reset_index(drop=True)

100%|██████████| 24363/24363 [00:00<00:00, 168028.72it/s]


### Compute the MinHash signatures

In [7]:
# Create the MinHash object, configured with k hash functions
minhash = MinHash(num_hashes=k)

# One signature per document, computed from that document's shingles
df['signature'] = df['shingles'].progress_map(minhash.signature)

# Stack the per-document signatures and transpose the result, so that each
# document's signature ends up along the second axis of the matrix
sig_matrix = np.transpose(np.array(df['signature'].tolist()))

100%|██████████| 24172/24172 [00:19<00:00, 1215.82it/s]


### Build the LSH index and collect candidates

In [8]:
# Create the MinHashLSH object from the shingled documents, their
# signature matrix and the configured number of bands
minhash_lsh = MinHashLSH(
    documents=df['shingles'].tolist(),
    signatures=sig_matrix,
    num_bands=b,
)

# Populate the LSH index
minhash_lsh.build()

# Candidates for every document, as exposed by the index.
# (The later cells index this object by document and take len() of each entry.)
doc_candidates = minhash_lsh.doc_candidates

### Random example

In [9]:
# Keep every document whose candidate collection holds more than one entry.
# NOTE(review): if a document is not listed among its own candidates, `> 1`
# also skips documents with exactly one similar document — confirm intent.
has_similars = [
    doc_idx
    for doc_idx in doc_candidates
    if len(doc_candidates[doc_idx]) > 1
]

In [10]:
# Select a random document among those that have similar documents.
# A locally seeded generator is used so that Restart & Run All shows the
# same example every time; change SEED to inspect a different document.
SEED = 42
rng = np.random.default_rng(SEED)

# int() turns the NumPy integer returned by choice() into a plain Python int
# before it is used as a lookup key and as a positional index.
rndm_doc_idx = int(rng.choice(has_similars))
rndm_doc_similars = list(doc_candidates[rndm_doc_idx])

In [11]:
# Original (non-lowercased) title of the selected document, looked up by position
rndm_doc = df.iloc[rndm_doc_idx]['title']

# Original titles of all its candidate documents, also looked up by position
sim_docs = df['title'].iloc[rndm_doc_similars]

In [12]:
# Show the title of the randomly selected document
print("Random Document: \n", rndm_doc)

Random Document: 
 tsfresh: how to predict class


In [13]:
# Show the titles of the candidate documents found for the selected document;
# pprint puts one title per line, wrapping long ones.
print("Similar Documents: \n")
pprint(sim_docs.tolist())

Similar Documents: 

['How to predict user next purchase items',
 'How to predict an outcome within a specific time window?',
 "How to predict customer's next purchase",
 'how to predict content based demand',
 'How to predict Estimated Time for Arrival given only trajectory data and '
 'time?',
 'How to predict the dealer whether pick up the goods next month?',
 'Deep advantage learning: how to predict the value',
 'How to predict weather?',
 'How to predict probabilities in xgboost?',
 'How to predict consumer purchase in next 6 months?',
 'How to predict constant failing of equipment',
 'How to Predict Employee count of businesses using Keras classifiers',
 'How to predict ETA using Regression?',
 'how to predict an image using saved model',
 'How to predict the value in KNN?']
