In [1]:
import pandas as pd
from random import randint, sample

from datasketch import WeightedMinHashGenerator
from datasketch import MinHashLSHForest

In [84]:
# data/recommendations
# Load the precomputed recommendations table. Judging by the preview below, each row
# is one product with its MinHash signature, a list of 10 recommendations (`recs`)
# and a list of 30 recommendations (`recs_30`).
# NOTE(review): unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
df_recs = pd.read_pickle('data/recommendations/df_recos_simple_30.pkl')

In [85]:
# Record how many recommendations each product actually received in both lists.
# NOTE(review): the leading 'Ν' of these column names appears to be a non-ASCII
# look-alike of the Latin 'N' - copy the names instead of retyping them.
df_recs['Ν_recs_1'] = df_recs['recs'].map(len)
df_recs['Ν_recs_2'] = df_recs['recs_30'].map(len)

In [86]:
# Keep only the products whose recommendation lists are complete:
# exactly 10 entries in `recs` and exactly 30 entries in `recs_30`.
has_full_lists = (df_recs['Ν_recs_1'] == 10) & (df_recs['Ν_recs_2'] == 30)
df_recs = df_recs[has_full_lists]

In [105]:
df_recs.shape

(28972, 20)

In [6]:
df_recs.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,brand_name2,Category2,SubCategory2,Minhash_id,Minhash,recs,recs_30,Brands,Categories,Subcategories,N_Brands,N_Categories,N_Subcategories,Ν_recs_1,Ν_recs_2
0,0,"[samsung_brand, galaxy, s10, 128gb, dual, mobi...","[28661, 19239, 27889, 1587, 14268, 22300, 22299]","[0.3460158140661532, 0.3732597244316061, 0.557...",2155531584,samsung_brand,mobiles_cat,mobile_phone_subcat,m0,<datasketch.weighted_minhash.WeightedMinHash o...,"[m14, m3, m22, m16, m0, m12, m1, m15, m17, m23]","[m9, m16, m208, m7, m63, m65, m22, m0, m12, m1...","[samsung_brand, samsung_brand, samsung_brand, ...","[mobiles_cat, mobiles_cat, mobiles_cat, mobile...","[mobile_phone_subcat, mobile_phone_subcat, mob...",1,1,1,10,30
1,1,"[samsung_brand, galaxy, 128gb, dual, mobiles_c...","[28661, 19239, 1587, 14268, 22300, 22299, 8666...","[0.2932021745910131, 0.3162877488300745, 0.300...",2155621215,samsung_brand,mobiles_cat,mobile_phone_subcat,m1,<datasketch.weighted_minhash.WeightedMinHash o...,"[m3, m22, m0, m50, m12, m1, m17, m23, m210, m125]","[m208, m63, m65, m22, m0, m12, m165, m137, m20...","[samsung_brand, samsung_brand, samsung_brand, ...","[mobiles_cat, mobiles_cat, mobiles_cat, mobile...","[mobile_phone_subcat, mobile_phone_subcat, mob...",4,1,1,10,30


## Create the MinHash LSH Forest

In [138]:
def make_recs(doc_id, query, n_recs, minhash_list, num_perm=128):
    """Rank the given MinHash signatures against ``query`` with an LSH Forest.

    A fresh ``MinHashLSHForest`` is built on every call from ``minhash_list``;
    each signature is keyed by its position in that iterable.

    Parameters
    ----------
    doc_id
        Not used by the lookup. Kept only so existing callers keep working.
    query
        MinHash signature to search with.
    n_recs : int
        Maximum number of keys to retrieve.
    minhash_list : iterable
        MinHash signatures to index. They must have been created with the same
        ``num_perm`` as the forest.
    num_perm : int, default 128
        Number of permutations the forest is configured with.

    Returns
    -------
    list of int
        Positions (into ``minhash_list``) of the approximate top ``n_recs``
        signatures most similar to ``query``. May hold fewer than ``n_recs``
        entries.
    """
    forest = MinHashLSHForest(num_perm=num_perm)

    # Key every signature by its position so the caller can map results back
    # to whatever parallel list of identifiers it keeps.
    for position, minhash in enumerate(minhash_list):
        forest.add(position, minhash)

    # IMPORTANT: must call index() otherwise the keys won't be searchable
    forest.index()

    return forest.query(query, n_recs)

In [139]:
def second_page(x):
    """Simulate a click from a product page onto one of its recommendations.

    The row ``x`` is the first product page. One of its ``recs`` (excluding the
    product itself) is picked at random as the clicked, second-page product.
    The 30 recommendations of that clicked product are then re-ranked against
    the first product's MinHash signature and up to 10 of them are returned.

    Parameters
    ----------
    x : pandas.Series
        A row of ``df_recs`` providing ``doc_id``, ``Minhash``, ``Minhash_id``
        and ``recs``.

    Returns
    -------
    list or None
        ``[clicked_rec_m, final_recs]`` where ``clicked_rec_m`` is the Minhash
        id of the clicked product and ``final_recs`` is a list of Minhash ids.
        ``None`` when no recommendation can be produced (nothing to click on,
        the clicked product is missing from ``df_recs``, or the lookup fails).

    Notes
    -----
    Reads the module-level ``df_recs`` and draws from Python's ``random``
    module, so results differ between runs unless that module is seeded.
    """
    try:
        first_product_doc_id = x['doc_id']
        first_product_minhash = x['Minhash']

        # Work on a copy: the list object is shared with the DataFrame cell, so
        # removing from it in place would silently edit df_recs['recs'].
        clickable_recs = list(x['recs'])
        if x['Minhash_id'] in clickable_recs:
            clickable_recs.remove(x['Minhash_id'])
        if not clickable_recs:
            return None

        # Draw uniformly from every remaining recommendation; a fixed index
        # range would make the first (and possibly last) entry unclickable.
        clicked_rec_m = sample(clickable_recs, 1)[0]

        # Look up the clicked product, i.e. the product of the second page.
        clicked_rows = df_recs[df_recs['Minhash_id'] == clicked_rec_m]
        if clicked_rows.empty:
            return None
        # Select by column name rather than by position so that adding or
        # reordering columns cannot silently pick the wrong field.
        second_product_recs_30 = clicked_rows.iloc[0]['recs_30']

        # Signatures and ids of the 30 recs, taken from one filtered frame so
        # the two lists are guaranteed to be aligned position by position.
        candidates = df_recs.loc[
            df_recs['Minhash_id'].isin(list(second_product_recs_30)),
            ['Minhash_id', 'Minhash'],
        ]
        minhash_list = candidates['Minhash'].tolist()
        minhash_ids_list = candidates['Minhash_id'].tolist()

        # make_recs returns positions into minhash_list; translate them to ids.
        recs_10 = make_recs(first_product_doc_id, first_product_minhash, 10, minhash_list)
        final_recs = [minhash_ids_list[rec] for rec in recs_10]

        return [clicked_rec_m, final_recs]
    except Exception:
        # Best effort: a row that cannot be processed yields None and is dropped
        # later. `Exception` (not a bare except) keeps Ctrl+C / kernel
        # interrupts working during the long-running apply.
        return None

In [160]:
# Draw a fixed-seed sample of first-page products so the subset is reproducible
# across kernel restarts; cap the size so a smaller input frame does not raise.
# Note: the click simulation applied to this sample draws from Python's `random`
# module, which this seed does not control.
df_recs2 = df_recs.sample(n=min(10000, len(df_recs)), random_state=42)

In [161]:
#df_recs2 = df_recs[100:101]

In [162]:
%%time
df_recs2['second_page_product-recs'] = df_recs2.apply(lambda x: second_page(x), axis=1)

Wall time: 4min 14s


In [164]:
# Discard the rows for which the second-page simulation produced no result.
df_recs2 = df_recs2[df_recs2['second_page_product-recs'].notna()]

In [165]:
df_recs2.shape

(9957, 21)

In [166]:
# Split each [clicked product id, recommended ids] pair into its own column.
visit_pairs = df_recs2['second_page_product-recs']
df_recs2['second_page_product'] = visit_pairs.map(lambda pair: pair[0])
df_recs2['second_page_recs'] = visit_pairs.map(lambda pair: pair[1])

In [167]:
#df_recs = df_recs.dropna(subset = ['second_page_recs', 'second_page_product'])

In [168]:
df_recs2.head(2)

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,brand_name2,Category2,SubCategory2,Minhash_id,Minhash,...,Categories,Subcategories,N_Brands,N_Categories,N_Subcategories,Ν_recs_1,Ν_recs_2,second_page_product-recs,second_page_product,second_page_recs
9773,9773,"[15, laptop_pc_cat, laptop_cases_subcat, camel...","[1874, 21257, 21256, 12170, 8896, 21248]","[0.2372059371686326, 0.12173721374972295, 0.23...",2155467405,camel active_brand,laptop_pc_cat,laptop_cases_subcat,m9773,<datasketch.weighted_minhash.WeightedMinHash o...,...,"[laptop_pc_cat, laptop_pc_cat, laptop_pc_cat, ...","[laptop_cases_subcat, laptop_cases_subcat, lap...",3,1,2,10,30,"[m9768, [m9229, m9759, m9762, m9764, m9766, m9...",m9768,"[m9229, m9759, m9762, m9764, m9766, m9768, m97..."
10710,10710,"[10, pro, 2012, 13, 6000mah, power_brand, batt...","[754, 25683, 2684, 1622, 6467, 25613, 10632, 2...","[0.09660005906353689, 0.14397453962361087, 0.2...",26029598,2-power_brand,laptop_pc_cat,laptop_battery_subcat,m10710,<datasketch.weighted_minhash.WeightedMinHash o...,...,"[laptop_pc_cat, laptop_pc_cat, laptop_pc_cat, ...","[laptop_battery_subcat, laptop_battery_subcat,...",1,1,1,10,30,"[m10919, [m10919, m11074, m11585]]",m10919,"[m10919, m11074, m11585]"


In [179]:
# Show the second-page recommendations of the first sampled product.
# Selected by column name: an integer key on a label-indexed row is fragile
# (it breaks when columns are added or reordered) and deprecated in recent pandas.
df_recs2['second_page_recs'].iloc[0]

['m9229',
 'm9759',
 'm9762',
 'm9764',
 'm9766',
 'm9768',
 'm9769',
 'm9773',
 'm9962',
 'm10479']

In [194]:
df_recs2.second_page_product.nunique()

7560

In [199]:
# Keep one recommendation list per clicked (second-page) product, indexed by
# that product's id; the first occurrence wins when a product was clicked twice.
recs_by_product = (
    df_recs2[['second_page_product', 'second_page_recs']]
    .drop_duplicates(subset=['second_page_product'], keep='first')
    .set_index('second_page_product')['second_page_recs']
)
# Spread every list over columns, then stack the columns back into rows so that
# each (product, recommended id) pair ends up on a row of its own.
long_recs = recs_by_product.apply(pd.Series).stack()
long_recs = long_recs.reset_index(level=-1, drop=True).astype(str)
df_recs3 = long_recs.reset_index()
# Name the two resulting columns.
df_recs3.columns = ['second_page_product_m', 'rec_m_id']

In [200]:
df_recs3.shape

(73034, 2)

In [201]:
df_recs3.head(1)

Unnamed: 0,second_page_product_m,rec_m_id
0,m9768,m9229


In [202]:
# Attach brand, category and sub-category to every recommended product by
# looking its Minhash id up in the full recommendations table.
product_attributes = df_recs[['Minhash_id', 'brand_name2', 'Category2', 'SubCategory2']]
df_recs3 = pd.merge(
    df_recs3,
    product_attributes,
    how='left',
    left_on='rec_m_id',
    right_on='Minhash_id',
)

In [203]:
df_recs3.shape

(73034, 6)

In [204]:
df_recs3.head(1)

Unnamed: 0,second_page_product_m,rec_m_id,Minhash_id,brand_name2,Category2,SubCategory2
0,m9768,m9229,m9229,case logic_brand,laptop_pc_cat,laptop_cases_subcat


In [205]:
df_recs3.second_page_product_m.nunique()

7560

In [206]:
# For every second-page product, gather the brand / category / sub-category of
# each of its recommendations into one list per attribute.
# The columns are selected with a list: picking several columns with a bare
# tuple (`grouped['a', 'b']`) was deprecated and no longer works in pandas >= 2.0.
df_recs3 = (
    df_recs3
    .groupby('second_page_product_m', as_index=False)[['brand_name2', 'Category2', 'SubCategory2']]
    .agg(list)
    # Rename by mapping rather than by position so a change in column order
    # cannot mislabel the lists.
    .rename(columns={
        'brand_name2': 'Brands',
        'Category2': 'Categories',
        'SubCategory2': 'Subcategories',
    })
)

In [207]:
df_recs3.shape

(7560, 4)

In [208]:
# Show the recommended brands of the first product. Selected by column name:
# an integer key on a label-indexed row is fragile and deprecated in recent pandas.
df_recs3['Brands'].iloc[0]

['samsung_brand',
 'samsung_brand',
 'samsung_brand',
 'samsung_brand',
 'samsung_brand',
 'xiaomi_brand',
 'cubot_brand',
 'meizu_brand',
 'meizu_brand',
 'meizu_brand']

In [209]:
# Add three columns holding the number of distinct brands, categories and
# sub-categories found among each product's recommendations.
for list_column, count_column in (
    ('Brands', 'N_Brands'),
    ('Categories', 'N_Categories'),
    ('Subcategories', 'N_Subcategories'),
):
    df_recs3[count_column] = df_recs3[list_column].map(lambda values: len(set(values)))

In [210]:
df_recs3.head(2)

Unnamed: 0,second_page_product_m,Brands,Categories,Subcategories,N_Brands,N_Categories,N_Subcategories
0,m10,"[samsung_brand, samsung_brand, samsung_brand, ...","[mobiles_cat, mobiles_cat, mobiles_cat, mobile...","[mobile_phone_subcat, mobile_phone_subcat, mob...",4,1,1
1,m100,"[nokia_brand, nokia_brand, nokia_brand, nokia_...","[mobiles_cat, mobiles_cat, mobiles_cat, mobile...","[mobile_phone_subcat, mobile_phone_subcat, mob...",1,1,2


In [211]:
# Show the recommended brands of the first product. Selected by column name:
# an integer key on a label-indexed row is fragile and deprecated in recent pandas.
df_recs3['Brands'].iloc[0]

['samsung_brand',
 'samsung_brand',
 'samsung_brand',
 'samsung_brand',
 'samsung_brand',
 'xiaomi_brand',
 'cubot_brand',
 'meizu_brand',
 'meizu_brand',
 'meizu_brand']

In [212]:
df_recs3.describe()

Unnamed: 0,N_Brands,N_Categories,N_Subcategories
count,7560.0,7560.0,7560.0
mean,2.403571,1.115079,1.216534
std,1.895529,0.373745,0.555973
min,1.0,1.0,1.0
25%,1.0,1.0,1.0
50%,1.0,1.0,1.0
75%,3.0,1.0,1.0
max,10.0,4.0,8.0


In [174]:
df_recs3.describe()

Unnamed: 0,N_Brands,N_Categories,N_Subcategories
count,7560.0,7560.0,7560.0
mean,2.538624,1.121958,1.232407
std,2.056508,0.383127,0.577693
min,1.0,1.0,1.0
25%,1.0,1.0,1.0
50%,2.0,1.0,1.0
75%,4.0,1.0,1.0
max,15.0,4.0,8.0
