## Document Search
## LSH : Local Sensitive Hashing 


    - more efficient version of k-nearest neighbors using locality sensitive hashing
    - You can use this to find the document search

In [1]:
import nltk
import string
import re
import warnings
import pandas as pd
import numpy as np
import pickle
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples, stopwords


warnings.filterwarnings('ignore')

## Utilities

In [2]:
tokenizer = TweetTokenizer()
stop_words = stopwords.words('english') 

In [3]:
def process_tweet(tweet):
    
    # Remove URLS
    clean_tweet = re.sub('https?:[\/a-zA-Z.0-9]+','',tweet)
    
    #Convert @<something to <USR>
    clean_tweet = re.sub('@[a-zA-Z0-9_-]+','<USR>',clean_tweet)
    
    # Remove '#' tags
    clean_tweet = re.sub('#','', clean_tweet)
    
    # strip and lower
    tokens = tokenizer.tokenize(clean_tweet)
    tokens = [token.lower() for token in tokens if token not in stop_words and token not in string.punctuation]
    
    return tokens


def get_document_embedding(tweet, en_embedding):
    
    doc_embedding = np.zeros(300)
    
    for word in process_tweet(tweet):
        if word in en_embedding:
            doc_embedding += en_embedding[word]
    return doc_embedding

def get_document_vecs(all_docs, en_embedding):
    
    index2doc_dict = {}
    document_vec_list = []
    
    for i,doc in enumerate(all_docs):
        doc_embedding = get_document_embedding(doc, en_embedding)
        index2doc_dict[i] = doc_embedding
        document_vec_list.append(doc_embedding)
    
    document_vec_matrix = np.vstack(document_vec_list)
    
    return document_vec_matrix, index2doc_dict


def cosine_similarity(a,b):
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    dot = np.dot(a,b)
    score = dot/(norm_a*norm_b)
    
    if pd.isnull(score):
        score = 0
    return score

def nearnest_neighbors(a, candidates):
    scores = []
    
    for b in candidates:
        scores.append(cosine_similarity(a,b))
    
    idx = np.argmax(scores)
    return idx

## Load Data

In [4]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = positive_tweets + negative_tweets

## Load subset embedding
en_embeddings = pickle.load(open('./subset_embedding/en_embeddings.p','rb'))
len(en_embeddings)

6370

In [5]:
document_vecs, ind2Tweet = get_document_vecs(all_tweets, en_embeddings)
print(f"length of dictionary {len(ind2Tweet)}")
print(f"shape of document_vecs {document_vecs.shape}")

length of dictionary 10000
shape of document_vecs (10000, 300)


## Most Similar tweets

    - To find out similar tweet, we will check all the tweets

In [14]:
my_tweet = 'i am sad'
process_tweet(my_tweet)
tweet_embedding = get_document_embedding(my_tweet, en_embeddings)

idx = nearnest_neighbors(tweet_embedding, document_vecs)
print(f"Tweet : {my_tweet}")
print(f"Similiar Tweet : {all_tweets[idx]}")

Tweet : i am sad
Similiar Tweet : @hanbined sad pray for me :(((


## Most Similar tweets using LSH(Local Sensitive hashing)

    - Instead of looking all vectors(tweets), you can just search a subset to find its neighbnors.
    - You can divide the vector space into regions and search within one region for nearest neighbors of a given vector.
    
    
  #### Choosing the number of planes

* Each plane divides the space to $2$ parts.
* So $n$ planes divide the space into $2^{n}$ hash buckets.
* We want to organize 10,000 document vectors into buckets so that every bucket has about $~16$ planes.
* For that we need $\frac{10000}{16}=625$ buckets.
* We're interested in $n$, number of planes, so that $2^{n}= 625$. Now, we can calculate $n=\log_{2}625 = 9.29 \approx 10$.

In [18]:
N_VECS = len(all_tweets)     # number of vectors
N_DIMS = len(ind2Tweet[0])    # dim of vectors
N_PLANES = 10
N_UNIVERSES = 25

In [19]:
np.random.seed(0)
planes_l = [np.random.normal(size=(N_DIMS, N_PLANES)) for _ in range(N_UNIVERSES)]

In [27]:
def nearest_neighbor(v, candidates, k=1):
   
    similarity_l = []

    for row in candidates:
        cos_similarity = cosine_similarity(v,row)

        similarity_l.append(cos_similarity)
        
    sorted_ids = np.argsort(similarity_l)
    k_idx = sorted_ids[-k:]
    return k_idx

In [28]:
def hash_value_of_vector(v, planes):

    dot_product = np.dot(v,planes)    
    sign_of_dot_product = np.sign(dot_product)
    
    h = sign_of_dot_product>=0
    h = np.squeeze(h)

    hash_value = 0
    n_planes = planes.shape[1]
    
    for i in range(n_planes):
        hash_value += np.power(2,i)*h[i]
    hash_value = int(hash_value)

    return hash_value

In [29]:
def make_hash_table(vecs, planes):
   
    num_of_planes = planes.shape[1]

    num_buckets = 2**num_of_planes

    hash_table = {i:[] for i in range(num_buckets)}

    id_table = {i:[] for i in range(num_buckets)}

    for i, v in enumerate(vecs):

        h = hash_value_of_vector(v,planes)
        hash_table[h].append(v)

        id_table[h].append(i)

    return hash_table, id_table

In [30]:
# Creating the hashtables
hash_tables = []
id_tables = []
for universe_id in range(N_UNIVERSES):  # there are 25 hashes
    print('working on hash universe #:', universe_id)
    planes = planes_l[universe_id]
    hash_table, id_table = make_hash_table(document_vecs, planes)
    hash_tables.append(hash_table)
    id_tables.append(id_table)

working on hash universe #: 0
working on hash universe #: 1
working on hash universe #: 2
working on hash universe #: 3
working on hash universe #: 4
working on hash universe #: 5
working on hash universe #: 6
working on hash universe #: 7
working on hash universe #: 8
working on hash universe #: 9
working on hash universe #: 10
working on hash universe #: 11
working on hash universe #: 12
working on hash universe #: 13
working on hash universe #: 14
working on hash universe #: 15
working on hash universe #: 16
working on hash universe #: 17
working on hash universe #: 18
working on hash universe #: 19
working on hash universe #: 20
working on hash universe #: 21
working on hash universe #: 22
working on hash universe #: 23
working on hash universe #: 24


In [31]:
def approximate_knn(doc_id, v, planes_l, k=1, num_universes_to_use=N_UNIVERSES):

    vecs_to_consider_l = list()

    ids_to_consider_l = list()

    ids_to_consider_set = set()

    for universe_id in range(num_universes_to_use):

        planes = planes_l[universe_id]
        hash_value = hash_value_of_vector(v, planes)
        hash_table = hash_tables[universe_id]
        document_vectors_l = hash_table[hash_value]
        id_table = id_tables[universe_id]
        new_ids_to_consider = id_table[hash_value]

        if doc_id in new_ids_to_consider:
            new_ids_to_consider.remove(doc_id)
            print(f"removed doc_id {doc_id} of input vector from new_ids_to_search")

        for i, new_id in enumerate(new_ids_to_consider):

            if new_id not in ids_to_consider_set:
                document_vector_at_i = document_vectors_l[i]
                vecs_to_consider_l.append(document_vector_at_i)
                ids_to_consider_l.append(new_id)
                ids_to_consider_set.add(new_id)

    print("Fast considering %d vecs" % len(vecs_to_consider_l))

    vecs_to_consider_arr = np.array(vecs_to_consider_l)

    nearest_neighbor_idx_l = nearest_neighbor(v, vecs_to_consider_arr, k=k)
    print(nearest_neighbor_idx_l)
    print(ids_to_consider_l)
    nearest_neighbor_ids = [ids_to_consider_l[idx] for idx in nearest_neighbor_idx_l]

    return nearest_neighbor_ids

In [32]:
#document_vecs, ind2Tweet
doc_id = 0
doc_to_search = all_tweets[doc_id]
vec_to_search = document_vecs[doc_id]

In [33]:
nearest_neighbor_ids = approximate_knn(doc_id, vec_to_search, planes_l, k=3, num_universes_to_use=5)

removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
removed doc_id 0 of input vector from new_ids_to_search
Fast considering 186 vecs
[82 16  0]
[51, 105, 154, 195, 253, 615, 1111, 1959, 2176, 2240, 3319, 3715, 4050, 6338, 8497, 9168, 1803, 2858, 1063, 1657, 1789, 2366, 2598, 2773, 3280, 4736, 5642, 7023, 7985, 8359, 8556, 9296, 10, 26, 63, 69, 113, 164, 209, 272, 288, 604, 911, 1153, 1494, 1783, 2031, 2367, 2554, 2799, 2872, 2905, 3018, 3068, 3608, 3734, 3784, 3962, 4369, 4463, 4748, 4955, 5037, 5318, 5803, 5848, 5923, 6442, 6814, 6824, 6994, 7310, 7987, 8090, 8846, 9077, 9584, 9630, 9739, 9, 122, 124, 268, 332, 360, 532, 542, 555, 657, 705, 741, 829, 1117, 1215, 1332, 1384, 1619, 1720, 1912, 2034, 2172, 2355, 2411, 2414, 2480, 2606, 2718, 2750, 2940, 2945, 2991, 3079, 3251, 3338, 3437, 3498, 3685, 4023, 4086, 4349,

In [35]:
print(f"Nearest neighbors for document {doc_id}")
print(f"Document contents: {doc_to_search}")
print("")

for neighbor_id in nearest_neighbor_ids:
    print(f"Nearest neighbor at document id {neighbor_id}")
    print(f"document contents: {all_tweets[neighbor_id]}")

Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 268
document contents: @imarpita it was great talking to you :D
Nearest neighbor at document id 1803
document contents: @americascup Do you have stage times for Portsmouth live? :) x
Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)
