In [1]:
import warnings
from pprint import pprint

import numpy as np
import pandas as pd
from tqdm import tqdm

from lsh import MinHashLSH
from minhash import MinHash
import utils

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Register tqdm's progress-bar hooks with pandas; the `progress_map` calls
# in the later cells rely on this having run first.
tqdm.pandas()

### Set parameters

In [4]:
# Pipeline settings, in order:
#   n - shingle size
#   k - number of permutations
#   b - number of bands
n, k, b = 2, 100, 25

### Load and prepare data

In [5]:
# Load the raw questions
df = pd.read_csv('dataset/datascience-stackoverflow-questions.csv')

# Mark empty titles as missing, then drop the rows without a title.
# The replaced column is assigned back to `df` instead of calling
# `replace(..., inplace=True)` on the `df['title']` selection: that chained
# in-place form acts on a temporary object and is not guaranteed to update
# `df` itself (it is a no-op under pandas Copy-on-Write).
df['title'] = df['title'].replace('', np.nan)
df.dropna(subset=['title'], inplace=True)

# Lowercased copy of the title; the shingling step works on this column
df['text'] = df['title'].str.lower()

### Convert to shingles

In [7]:
# Break up each text into shingles of n whitespace-separated tokens
df['shingles'] = df['text'].progress_map(lambda text: utils.generate_ngrams(text.split(), n))

# Drop texts that were too short for the provided shingle size.
# `.copy()` makes the filtered frame an independent object, so adding columns
# to it in later cells does not hit pandas' SettingWithCopyWarning path.
df = df[df['shingles'].str.len() != 0].copy()

100%|██████████| 24363/24363 [00:00<00:00, 163856.50it/s]


### Compute the minhash signatures

In [10]:
# Initialize minhash object with the configured number of permutations.
# The parameter is the lowercase `k` set in the parameters cell; an uppercase
# `K` is not defined in the visible cells and raises NameError on a fresh kernel.
minhash = MinHash(num_hashes=k)

# Compute the signatures on all examples
df['signature'] = df['shingles'].progress_map(minhash.signature)

# Put all signatures into a matrix. After the transpose each document is one
# column (assumes `minhash.signature` returns a fixed-length sequence per
# document - TODO confirm against the MinHash implementation).
sig_matrix = np.array(df['signature'].tolist()).T

100%|██████████| 24359/24359 [00:23<00:00, 1043.94it/s]


### Build the LSH index and collect candidates

In [12]:
# Set up the MinHashLSH object from the shingled documents, the signature
# matrix and the configured number of bands
minhash_lsh = MinHashLSH(
    documents=df['shingles'].tolist(),
    signatures=sig_matrix,
    num_bands=b,
)

# Build the LSH index
minhash_lsh.build()

# Candidates for every document, as exposed by the index object
doc_candidates = minhash_lsh.doc_candidates

### Random example

In [72]:
# Keep only the documents whose candidate collection holds more than one entry
has_similars = [
    doc_idx
    for doc_idx in doc_candidates
    if len(doc_candidates[doc_idx]) > 1
]

In [73]:
# Draw one document at random from those that have candidates.
# NOTE(review): no seed is set in the visible cells, so the pick differs per run.
(rndm_doc_idx,) = np.random.choice(has_similars, size=1)

# Materialize its candidates as a list so they can be used for positional indexing
rndm_doc_similars = list(doc_candidates[rndm_doc_idx])

In [74]:
# Positional (iloc) lookups: title of the picked document, then the titles
# of all of its candidates
rndm_doc = df.iloc[rndm_doc_idx]['title']
sim_docs = df['title'].iloc[rndm_doc_similars]

In [75]:
# Show the title of the randomly picked document
print("Random Document: \n", rndm_doc)

Random Document: 
 Time series modelling


In [76]:
# Header line, then the candidate titles pretty-printed as a plain Python list
print("Similar Documents: \n")
pprint(list(sim_docs))

Similar Documents: 

['Irregular time series classification',
 'Time series prediction',
 'Time series classification',
 'Methods for analyzing multiple time series',
 'Time Series segmentation',
 'Time Series Forecasting',
 'Categorical Multivariate Time Series',
 'Classifying time series data that overlap',
 'Alignment of time series',
 'time series plot',
 'Classification of a time series data',
 'Time series regression',
 'Visualizing Time Series Data',
 'Time series decomposition',
 'Multivariate time series classification',
 'Multivariate Time Series Binary Classification']
