In [1]:
# download dataset from here https://www.kaggle.com/benhamner/nips-papers?select=papers.csv

import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHEnsemble
from spacy.lang.en import English
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [2]:
df = pd.read_csv('papers.csv')

In [3]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


### Tokenizer, lemmatize, remove punctuation and stopwords

In [4]:
nlp = English()
tokenizer = nlp.Defaults.create_tokenizer(nlp)
sample_text = "Gemini Man review: Double Will Smith can't save hackneyed spy flick U.S.A"
tokens = tokenizer(sample_text)

In [5]:
def spacy_tokenizer_lemmatizer(text):
    tokens = tokenizer(text)    
    lemma_list = []
    for token in tokens:
        if token.is_stop is False:
            lemma_list.append(token.lemma_)
    return(lemma_list)

In [6]:
def pipelinize(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            return [function(i) for i in list_or_series]
        else: # if it's not active, just pass it right back
            return list_or_series
    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})

In [7]:
def preprocessor_final(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', '', text)
        text = re.sub('[\W]+', '', text.lower())
        return [i for i in text if i]
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            return_list.append(temp_text)
        return([i for i in return_list if i])
    else:
        pass

In [8]:
spacy_estimators = [('tokenizer', pipelinize(spacy_tokenizer_lemmatizer)),
                    ('preprocessor', pipelinize(preprocessor_final))]
spacy_pipe = Pipeline(spacy_estimators)
#spacy_pipe.transform([sample_text])

### Run Locality sensitive hashing

In [9]:
permutations = 128
threshold = 0.8
num_recommendations = 1

In [14]:
def get_lsh(df, perms, threshold):
    start_time = time.time()
    minhash = []
    for text in df['text']:
        tokens = set(spacy_pipe.transform([text])[0])
        m = MinHash(num_perm=perms)
        for s in tokens:
            try:
                m.update(s.encode('utf8'))
            except Exception:
                print(s)
                break
        minhash.append(m)
    
    text_lengths = list(df['text'].str.len())
    lsh = MinHashLSHEnsemble(threshold=threshold, num_perm=perms, num_part=32)
    lsh.index([(str(i), m, text_lengths[i]) for i, m in enumerate(minhash)])
    print(f'Took {time.time() - start_time}s to populate forest')
    return lsh

In [42]:
def predict(text, database, perms, lsh):
    tokens = set(spacy_pipe.transform([text])[0])
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
    idx_array = np.array(list(lsh.query(m, len(tokens))))
    if len(idx_array) == 0:
        return None
    result = database.iloc[idx_array]['title']
    return result

In [43]:
title = df['text'].iloc[5]
result = predict(title, df, permutations, lsh)
print(result)

4099    CCD Neural Network Processors for Pattern Reco...
3755    Unsupervised Classifiers, Mutual Information a...
808     Speech Recognition: Statistical and Neural Inf...
2248    Reconfigurable Neural Net Chip with 32K Connec...
4407                Visual Grammars and Their Neural Nets
                              ...                        
6639    Deconvolutional Paragraph Representation Learning
6374    Optimal Sample Complexity of M-wise Data for T...
6735    Teaching Machines to Describe Images with Natu...
6336             DPSCREEN: Dynamic Personalized Screening
6875    Simple and Scalable Predictive Uncertainty Est...
Name: title, Length: 659, dtype: object
