In [1]:
import pandas as pd

from datasketch import WeightedMinHashGenerator
from datasketch import MinHashLSHForest

In [51]:
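# One-time setup: uncomment to install datasketch (prefer `%pip` so it targets this kernel's environment).
#%pip install datasketch --user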

In [52]:
# Read the TF-IDF dataframe (one row per product).
# NOTE(review): this input is named 'category_2-5' while the files written later are
# named 'category_1-5' - confirm which category range is intended.
# Only load pickles from a trusted pipeline: unpickling can execute arbitrary code.
df_tfidf = pd.read_pickle('data/final/tfidfs/df_tfidf_category_2-5.pkl')

In [53]:
# Derive each row's MinHash key by prefixing its doc_id with 'm' (m0, m1, ...).
df_tfidf['Minhash_id'] = df_tfidf['doc_id'].map('m{}'.format)

In [54]:
# Preview the first two rows to check the new 'Minhash_id' column.
df_tfidf.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,Category2,SubCategory2,Minhash_id
0,0,"[942411, 177151, 64816, 133116, 1128577, 22797...","[31663, 10456, 23415, 8984, 3499, 11863, 5346,...","[0.12998190993696945, 0.31411559559438207, 0.3...",291285,1426_cat,1589_subcat,m0
1,1,"[1128577, 1249027, 820477, 72034, 912996, 4046...","[3499, 6766, 28250, 25483, 30830, 16813, 14118...","[0.15023119822454753, 0.13586146435157057, 0.2...",103012,561_cat,1483_subcat,m1


In [55]:
# Same two-row preview again; the frame has not changed since the previous cell.
df_tfidf.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,Category2,SubCategory2,Minhash_id
0,0,"[942411, 177151, 64816, 133116, 1128577, 22797...","[31663, 10456, 23415, 8984, 3499, 11863, 5346,...","[0.12998190993696945, 0.31411559559438207, 0.3...",291285,1426_cat,1589_subcat,m0
1,1,"[1128577, 1249027, 820477, 72034, 912996, 4046...","[3499, 6766, 28250, 25483, 30830, 16813, 14118...","[0.15023119822454753, 0.13586146435157057, 0.2...",103012,561_cat,1483_subcat,m1


## - Create the Weighted MinHash for each product

In [56]:
# Length of the dense TF-IDF vector; every term_id is used as an index into it.
TFIDF_DIM = 46150

# One shared generator so every product is hashed with identical parameters.
# sample_size=128 is the same value passed as num_perm to the MinHashLSHForest
# built later in this notebook.
mg = WeightedMinHashGenerator(dim=TFIDF_DIM, sample_size=128, seed=12)


def create_minhash(doc):
    """Build the weighted MinHash of one product from its sparse TF-IDF terms.

    Parameters
    ----------
    doc : mapping or pandas.Series
        Row providing 'term_id' (sequence of indices into the dense vector) and
        'tfidf' (sequence of weights, paired with 'term_id' by position).

    Returns
    -------
    The object returned by ``mg.minhash`` for the dense weight vector
    (a ``datasketch`` WeightedMinHash).

    Notes
    -----
    A (term_id, weight) pair that cannot be placed in the vector (index out of
    range, negative index, non-integer index, or no weight at that position) is
    reported on stdout and skipped, so one bad term does not abort the whole run.
    """
    term_ids = doc['term_id']
    tfidfs = doc['tfidf']

    # Dense vector with one slot per vocabulary term; absent terms keep weight 0.
    weights = [0.0] * TFIDF_DIM

    for position, term_id in enumerate(term_ids):
        try:
            if term_id < 0:
                # A negative index would wrap around and silently overwrite the
                # weight of an unrelated term, so treat it as out of range.
                raise IndexError('negative term_id')
            weights[term_id] = tfidfs[position]
        except (IndexError, KeyError, TypeError) as e:
            print('Error 1: ', e)
            print('term_id', term_id)

    return mg.minhash(weights)

In [57]:
%%time

# Hash every product: each row is passed to create_minhash (axis=1) and the
# resulting object is stored in the 'Minhash' column.
df_tfidf['Minhash'] = df_tfidf.apply(create_minhash, axis=1)

Wall time: 26min 31s


In [58]:
# Rebuild the 'Minhash_id' key column ('m' + doc_id, i.e. m0, m1, ...).
# NOTE(review): an identical assignment already ran in an earlier cell, so this looks redundant.
df_tfidf['Minhash_id'] = df_tfidf['doc_id'].map('m{}'.format)

In [59]:
# Preview two rows to confirm the new 'Minhash' column holds the hash objects.
df_tfidf.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,Category2,SubCategory2,Minhash_id,Minhash
0,0,"[942411, 177151, 64816, 133116, 1128577, 22797...","[31663, 10456, 23415, 8984, 3499, 11863, 5346,...","[0.12998190993696945, 0.31411559559438207, 0.3...",291285,1426_cat,1589_subcat,m0,<datasketch.weighted_minhash.WeightedMinHash o...
1,1,"[1128577, 1249027, 820477, 72034, 912996, 4046...","[3499, 6766, 28250, 25483, 30830, 16813, 14118...","[0.15023119822454753, 0.13586146435157057, 0.2...",103012,561_cat,1483_subcat,m1,<datasketch.weighted_minhash.WeightedMinHash o...


In [60]:
# Alias for the 'Minhash' column. Despite the name this is a pandas Series
# (it keeps df_tfidf's index), not a Python list.
minhash_list = df_tfidf['Minhash']

In [61]:
# Number of MinHashes, i.e. one per row of df_tfidf.
len(minhash_list)

28241

In [62]:
# Persist the frame including the 'Minhash' column; the next section reloads this file.
# NOTE(review): the output is named 'category_1-5' although the TF-IDF input was
# 'category_2-5' - confirm the naming.
df_tfidf.to_pickle('data/final/minhashes/df_minhash_category_1-5.pkl')

## - Create the MinhashLSH forest

In [2]:
# Reload the saved MinHash frame so this section does not need the hashing cells above.
# Only load pickles from a trusted pipeline: unpickling can execute arbitrary code.
df_tfidf = pd.read_pickle('data/final/minhashes/df_minhash_category_1-5.pkl')
# The 'Minhash' column as a pandas Series, one entry per product row.
minhash_list = df_tfidf['Minhash']

In [3]:
# Create a MinHash LSH Forest with the same num_perm parameter
# (128, the sample_size given to the WeightedMinHashGenerator).
forest = MinHashLSHForest(num_perm=128)

In [4]:
# Index every product's MinHash under its own 'Minhash_id' ('m' + doc_id), so the
# keys returned by forest.query() map straight back to rows of df_tfidf.
# Using the stored id instead of a running counter keeps the keys correct even if
# the frame was filtered or reordered before this cell runs.
for minhash_id, minhash in zip(df_tfidf['Minhash_id'], minhash_list):
    # Add minhash into the index
    forest.add(minhash_id, minhash)

In [5]:
# IMPORTANT: must call index() otherwise the keys won't be searchable.
# Run it after the forest.add() loop above and before any forest.query().
forest.index()

In [6]:
# Spot-check that a couple of keys are present in the forest.
for key in ("m2", "m3"):
    print(key in forest)

True
True


## - Make recommendations

In [7]:
def make_recs(doc_id, k=10):
    """Return the forest keys of the products most similar to ``doc_id``.

    Parameters
    ----------
    doc_id
        Label used to look up the product's MinHash in ``minhash_list``.
    k : int, default 10
        Number of keys to request from the forest.

    Returns
    -------
    list
        Keys ('m<n>' strings) returned by ``forest.query``.

    Notes
    -----
    NOTE(review): the queried product's own MinHash is also stored in the forest,
    so its own key may appear in the result - confirm whether callers should
    filter it out.
    """
    query = minhash_list[doc_id]

    # Ask the forest for the k keys whose MinHashes are closest to the query.
    return forest.query(query, k)

In [8]:
%%time

# Compute the recommendation list for every product and store it in 'recs'.
df_tfidf['recs'] = df_tfidf['doc_id'].apply(make_recs)

Wall time: 1min 24s


In [9]:
# Preview two rows to inspect the new 'recs' column (lists of forest keys).
df_tfidf.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,Category2,SubCategory2,Minhash_id,Minhash,recs
0,0,"[942411, 177151, 64816, 133116, 1128577, 22797...","[31663, 10456, 23415, 8984, 3499, 11863, 5346,...","[0.12998190993696945, 0.31411559559438207, 0.3...",291285,1426_cat,1589_subcat,m0,<datasketch.weighted_minhash.WeightedMinHash o...,"[m1410, m15433, m15949, m5758, m11035, m9664, ..."
1,1,"[1128577, 1249027, 820477, 72034, 912996, 4046...","[3499, 6766, 28250, 25483, 30830, 16813, 14118...","[0.15023119822454753, 0.13586146435157057, 0.2...",103012,561_cat,1483_subcat,m1,<datasketch.weighted_minhash.WeightedMinHash o...,"[m11393, m3882, m12858, m8269, m18731, m12519,..."


In [10]:
# (rows, columns) of the final frame.
df_tfidf.shape

(28241, 10)

In [11]:
# Persist the frame, now including the 'recs' column, for downstream use.
df_tfidf.to_pickle('data/final/recommendations/df_recos_category_1-5_new.pkl')