In [1]:
import pandas as pd

from datasketch import WeightedMinHashGenerator
from datasketch import MinHashLSHForest

In [5]:
# Read the TF-IDF dataframe (one row per product) from its pickle file.
df_tfidf = pd.read_pickle('data/tfidfs/df_tfidf_brand_0-0.pkl')

In [7]:
# Derive the key used for each product's MinHash: the letter "m" followed by
# the row's doc_id (m0, m1, ...).
df_tfidf['Minhash_id'] = df_tfidf['doc_id'].apply(lambda doc_id: 'm' + str(doc_id))

In [8]:
# Preview the first two rows to check the new Minhash_id column.
df_tfidf.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,brand_name2,Category2,SubCategory2,Minhash_id
0,0,"[samsung_brand, galaxy, s10, 128gb, dual, mobi...","[28661, 19239, 27889, 1587, 14268, 22300, 22299]","[0.0, 0.3732597244316061, 0.557812813771961, 0...",2155531584,samsung_brand,mobiles_cat,mobile_phone_subcat,m0
1,1,"[samsung_brand, galaxy, 128gb, dual, mobiles_c...","[28661, 19239, 1587, 14268, 22300, 22299, 8666...","[0.0, 0.3162877488300745, 0.30075305304392586,...",2155621215,samsung_brand,mobiles_cat,mobile_phone_subcat,m1


## - Create the Weighted MinHash for each product

In [9]:
# Shared weighted-MinHash generator used by create_minhash.
#   dim=35405       length of the dense TF-IDF vector passed to mg.minhash
#                   (presumably the vocabulary size -- confirm against the TF-IDF step)
#   sample_size=128 equals the num_perm=128 used for the MinHashLSHForest later on
#   seed=12         fixed seed so the generated signatures are reproducible
mg = WeightedMinHashGenerator(dim=35405, sample_size=128, seed=12)
def create_minhash(doc):
    """Build the weighted MinHash signature for one product row.

    The sparse ``(term_id, tfidf)`` pairs of the row are scattered into a
    dense weight vector, which is then hashed with the module-level
    generator ``mg``.

    Parameters
    ----------
    doc : mapping (e.g. one DataFrame row)
        Must provide ``'term_id'`` (integer positions in the dense vector)
        and ``'tfidf'`` (one weight per term id, in the same order).

    Returns
    -------
    The object returned by ``mg.minhash`` for the dense vector.

    Notes
    -----
    Problems with individual terms are reported with ``print`` and the term
    is skipped, so one bad term does not abort the whole row.
    """
    term_ids = doc['term_id']
    tfidfs = doc['tfidf']

    # Size the vector from the generator itself so the two can never drift
    # apart (the generator rejects vectors whose length differs from its dim).
    dim = mg.dim
    tfidf_list = [0] * dim

    if len(term_ids) != len(tfidfs):
        # zip() below stops at the shorter sequence; make the truncation visible.
        print('Error 1: ', 'term_id/tfidf length mismatch: %d vs %d'
              % (len(term_ids), len(tfidfs)))

    for term_id, weight in zip(term_ids, tfidfs):
        try:
            if term_id < 0:
                # A negative index would silently wrap around and overwrite
                # the weight of an unrelated term at the end of the vector.
                raise IndexError('negative term_id')
            tfidf_list[term_id] = weight
        except (IndexError, TypeError) as e:
            # Only the errors a bad term id can cause are handled here;
            # anything else is a real bug and is allowed to propagate.
            print('Error 1: ', e)
            print('term_id', term_id)

    return mg.minhash(tfidf_list)

In [10]:
%%time

# Compute the weighted MinHash signature of every product (one call per row)
# and keep it in a new 'Minhash' column.
df_tfidf['Minhash'] = df_tfidf.apply(create_minhash, axis=1)

TF-IDF length is:  35405
Wall time: 53.9 ms


In [7]:
# Create an extra column with the MinHash id (m0, m1, ...).
# This repeats the earlier Minhash_id cell verbatim; it recomputes the same
# column from doc_id, so running it again does not change the frame.
df_tfidf['Minhash_id'] = df_tfidf['doc_id'].apply(lambda x: 'm'+str(x))

In [8]:
# Preview the first two rows, now including the Minhash column.
df_tfidf.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,brand_name2,Category2,SubCategory2,Minhash_id,Minhash
0,0,"[samsung_brand, galaxy, s10, 128gb, dual, mobi...","[28661, 19239, 27889, 1587, 14268, 22300, 22299]","[0.0, 0.3732597244316061, 0.557812813771961, 0...",2155531584,samsung_brand,mobiles_cat,mobile_phone_subcat,m0,<datasketch.weighted_minhash.WeightedMinHash o...
1,1,"[samsung_brand, galaxy, 128gb, dual, mobiles_c...","[28661, 19239, 1587, 14268, 22300, 22299, 8666...","[0.0, 0.3162877488300745, 0.30075305304392586,...",2155621215,samsung_brand,mobiles_cat,mobile_phone_subcat,m1,<datasketch.weighted_minhash.WeightedMinHash o...


In [5]:
# Despite the name this is the 'Minhash' column (a pandas Series), not a list.
# make_recs looks signatures up with minhash_list[doc_id]; that presumably
# relies on the frame's index matching doc_id -- confirm if the frame is ever
# filtered or re-sorted.
minhash_list = df_tfidf['Minhash']

In [6]:
# Number of signatures (one per row of df_tfidf).
len(minhash_list)

29541

In [11]:
# Persist the frame, including the Minhash objects, so the signatures do not
# have to be recomputed.
df_tfidf.to_pickle('data/minhashes/df_minhash_brand_0-0.pkl') # df_tfidf_brand_0-5

In [2]:
# Load a previously saved frame that already contains the Minhash column.
df_tfidf = pd.read_pickle('data/minhashes/df_minhash_simple.pkl')
# df_tfidf now refers to a different frame, so refresh minhash_list as well.
# Otherwise the forest and the queries would still use the signatures of the
# frame loaded earlier while recommendations are computed for this one.
minhash_list = df_tfidf['Minhash']

## - Create the MinhashLSH forest

In [3]:
# Create a MinHash LSH Forest. num_perm=128 equals the sample_size=128 given
# to the WeightedMinHashGenerator that produced the signatures.
forest = MinHashLSHForest(num_perm=128)

In [7]:
# Register every signature in the forest under the key "m<position>", where
# position is the signature's 0-based place in minhash_list.
for position, signature in enumerate(minhash_list):
    forest.add("m" + str(position), signature)

In [8]:
# IMPORTANT: index() must be called after adding keys; until then the keys
# are not searchable.
forest.index()

In [9]:
# Spot-check that a couple of the keys added above are present in the forest.
for key in ("m2", "m3"):
    print(key in forest)

True
True


## - Make recommendations

In [10]:
def make_recs(doc_id, n_recs):
    """Return up to ``n_recs`` forest keys most similar to one product.

    ``doc_id`` selects the product's signature from the module-level
    ``minhash_list``; that signature is used to query the module-level
    ``forest``. The result of ``forest.query`` is returned unchanged.
    """
    # Look up the signature and hand it straight to the forest.
    return forest.query(minhash_list[doc_id], n_recs)

In [11]:
%%time

# Query the forest once per product and store the 10 returned keys per row.
df_tfidf['recs'] = df_tfidf['doc_id'].apply(lambda doc_id: make_recs(doc_id, 10))
# Variant with 30 keys per product (disabled):
#df_tfidf['recs_30'] = df_tfidf['doc_id'].apply(lambda x: make_recs(x, 30))

Wall time: 1min 33s


## - Finalize the dataset

In [19]:
# Build a long table with one row per (product, recommended key).
# 1) the recs lists, indexed by product_id
recs_by_product = df_tfidf[['product_id', 'recs']].set_index('product_id')['recs']
# 2) spread each list over columns, then stack the columns back into rows
recs_stacked = recs_by_product.apply(pd.Series).stack()
# 3) drop the position-in-list index level and store the keys as strings
recs_stacked = recs_stacked.reset_index(level=-1, drop=True).astype(str)
# 4) turn product_id back into a regular column and name both columns
df_recs = recs_stacked.reset_index()
df_recs.columns = ['product_id', 'rec_m_id']

In [20]:
# Attach brand / category / subcategory of every recommended product by
# matching the recommended key against the Minhash_id of the main frame.
rec_attributes = df_tfidf[['Minhash_id', 'brand_name2', 'Category2', 'SubCategory2']]
df_recs = df_recs.merge(
    rec_attributes,
    how='left',
    left_on='rec_m_id',
    right_on='Minhash_id',
)

In [21]:
# Collapse back to one row per product: each attribute becomes the list of
# values found among that product's recommendations.
# The columns are selected with a list (double brackets): selecting several
# columns from a groupby with a bare tuple of keys was deprecated in pandas
# 1.0 and raises in pandas 2.0.
df_recs = df_recs.groupby(['product_id'], as_index=False)[['brand_name2', 'Category2', 'SubCategory2']].agg(lambda x: list(x))
# rename columns
df_recs.columns = ['product_id', 'Brands', 'Categories', 'Subcategories']

In [22]:
# Join the per-product recommendation lists onto the main frame; the left
# join keeps every product of df_tfidf.
df_recs2 = df_tfidf.merge(df_recs, on='product_id', how='left')

In [23]:
# For each list column, add a column holding the number of distinct values
# (unique brands, categories and subcategories among the recommendations).
for list_col, count_col in (
    ('Brands', 'N_Brands'),
    ('Categories', 'N_Categories'),
    ('Subcategories', 'N_Subcategories'),
):
    df_recs2[count_col] = df_recs2[list_col].apply(lambda values: len(set(values)))

In [24]:
# Preview the final frame: original columns plus recs, the attribute lists
# and the three unique-value counts.
df_recs2.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,brand_name2,Category2,SubCategory2,Minhash_id,Minhash,recs,recs_30,Brands,Categories,Subcategories,N_Brands,N_Categories,N_Subcategories
0,0,"[samsung_brand, galaxy, s10, 128gb, dual, mobi...","[28661, 19239, 27889, 1587, 14268, 22300, 22299]","[0.3460158140661532, 0.3732597244316061, 0.557...",2155531584,samsung_brand,mobiles_cat,mobile_phone_subcat,m0,<datasketch.weighted_minhash.WeightedMinHash o...,"[m14, m3, m22, m16, m0, m12, m1, m15, m17, m23]","[m9, m16, m208, m7, m63, m65, m22, m0, m12, m1...","[samsung_brand, samsung_brand, oneplus_brand, ...","[mobiles_cat, mobiles_cat, mobiles_cat, mobile...","[mobile_phone_subcat, mobile_phone_subcat, mob...",7,1,1
1,1,"[samsung_brand, galaxy, 128gb, dual, mobiles_c...","[28661, 19239, 1587, 14268, 22300, 22299, 8666...","[0.2932021745910131, 0.3162877488300745, 0.300...",2155621215,samsung_brand,mobiles_cat,mobile_phone_subcat,m1,<datasketch.weighted_minhash.WeightedMinHash o...,"[m3, m22, m0, m50, m12, m1, m17, m23, m210, m125]","[m208, m63, m65, m22, m0, m12, m165, m137, m20...","[oneplus_brand, huawei_brand, huawei_brand, sa...","[mobiles_cat, mobiles_cat, mobiles_cat, mobile...","[mobile_phone_subcat, mobile_phone_subcat, mob...",11,1,1


In [31]:
# Length of the list stored in the first row at column position 11.
# A single positional .iloc[row, col] lookup is used: the chained form
# .iloc[0][11] relied on an integer key falling back to a position on a
# label-indexed row, which pandas has deprecated.
# NOTE(review): in the recorded run position 11 appears to have been the
# 'recs_30' column (output 30). With the recs_30 line commented out a
# different column sits at that position -- confirm which column is meant.
len(df_recs2.iloc[0, 11])

30

In [18]:
# Save the final frame (products plus their recommendations and summary
# columns) under data/recommendations.
df_recs2.to_pickle('data/recommendations/df_recos_simple_10.pkl')