In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# Show full cell contents and up to 500 columns when displaying dataframes.
# `None` is the "no truncation" value for max_colwidth on pandas >= 1.0, where the
# legacy -1 is deprecated (and rejected by later releases). Older releases, such as
# the 0.25.1 reported further down in this notebook, only accept -1, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)

In [2]:
# show the pandas version of the environment this notebook is running in
pd.__version__

'0.25.1'

In [3]:
# load the product details table from its pickle file
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
product_details_path = 'data/final/product_details/df_product_details_retailrocket.pkl'
df = pd.read_pickle(product_details_path)

In [4]:
# peek at the first two product rows
df.head(2)

Unnamed: 0,itemid,Title,SubCategory,Category,Category2,SubCategory2,Title_lc2
0,291285,942411 177151 64816 133116 1128577 227976 1196328 603188 1128577 227976 1196328 177151 64816 737007 992688 727274 1273256 574225 567967 269926 348137 786181 709553 901561 1080764 824998 111677 675499 113294 610517 398124 280445,1589,1426,1426_cat,1589_subcat,942411 177151 64816 133116 1128577 227976 1196328 603188 1128577 227976 1196328 177151 64816 737007 992688 727274 1273256 574225 567967 269926 348137 786181 709553 901561 1080764 824998 111677 675499 113294 610517 398124 280445 1426_cat 1589_subcat
1,103012,1249027 820477 72034 912996 1128577 404632 30603 1009622 1128577 780582 904144 716033 820477 375545 n72000.000 590873 611917 606886 1128577 954367 631756 7681 353870 1324984 145012 568015 737771 237874 1249027 1271914 1058996 836265,1483,561,561_cat,1483_subcat,1249027 820477 72034 912996 1128577 404632 30603 1009622 1128577 780582 904144 716033 820477 375545 n72000.000 590873 611917 606886 1128577 954367 631756 7681 353870 1324984 145012 568015 737771 237874 1249027 1271914 1058996 836265 561_cat 1483_subcat


In [5]:
# (rows, columns) of the product table
df.shape

(28241, 7)

In [6]:
# number of distinct preprocessed titles (title tokens plus category/subcategory tokens)
df['Title_lc2'].nunique()

23261

In [7]:
#df.url.nunique()

In [8]:
# number of distinct raw titles
df['Title'].nunique()

23244

In [9]:
# collect every preprocessed product title into a plain Python list,
# one entry per row of df and in row order
corpus = df['Title_lc2'].tolist()

In [10]:
# the number of products: the corpus holds one title per row of df
len(corpus)

28241

In [11]:
# Convert the above corpus to a matrix of TF-IDF features
# (one row per product title, one column per term).
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# The vectorizer is used with its default settings. With those defaults a token such as
# 'n72000.000' is split into the two terms 'n72000' and '000', while '561_cat' stays
# a single term (compare df.head above with the grouped output further down).
tf = TfidfVectorizer() 
tfidf_matrix = tf.fit_transform(corpus)

In [12]:
# get a list of the unique terms in the corpus, positioned by term id
# (the column index of the term in tfidf_matrix).
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on releases that predate it.
# The result is kept as a plain list so later cells see the same type either way.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()

In [34]:
# number of unique terms in the fitted vocabulary
len(feature_names)

37626

In [54]:
# transform the sparse matrix to a list of dicts
# each dict corresponds to one term of one product title (one non-zero matrix cell)
#
# The matrix is walked once in COO (row, column, value) form instead of slicing every
# row and then looking up each cell individually, which is far slower on a corpus of
# this size. Converting the CSR matrix returned by fit_transform to COO keeps the
# stored entry order, so the records come out in the same order as a row-by-row loop.
coo = tfidf_matrix.tocoo()
tfidf_list = [
    {'doc_id': doc, 'term_id': i, 'term': feature_names[i], 'tfidf': s}
    for doc, i, s in zip(coo.row.tolist(), coo.col.tolist(), coo.data.tolist())
    if s != 0  # skip explicitly stored zeros, exactly as nonzero() would
]

In [55]:
# build the long-format TF-IDF table: one row per (doc_id, term) record
df_tfidf = pd.DataFrame(data=tfidf_list)
df_tfidf.head(n=5)

Unnamed: 0,doc_id,term_id,term,tfidf
0,0,31663,942411,0.129982
1,0,10456,177151,0.314116
2,0,23415,64816,0.331849
3,0,8984,133116,0.166381
4,0,3499,1128577,0.088479


## Set the weights for category and subcategory terms

In [56]:
# multipliers applied to the TF-IDF score of category and subcategory terms
# (every other term keeps a weight of 1)
category_weight, subcategory_weight = 3.0, 1.0

In [57]:
# distinct category tokens (the '<id>_cat' terms appended to each title)
df['Category2'].unique()

array(['1426_cat', '561_cat', '955_cat', '1606_cat', '105_cat',
       '1308_cat'], dtype=object)

In [58]:
# distinct subcategory tokens (the '<id>_subcat' terms appended to each title)
df['SubCategory2'].unique()

array(['1589_subcat', '1483_subcat', '1221_subcat', '1051_subcat',
       '646_subcat', '1163_subcat', '1421_subcat', '1192_subcat',
       '589_subcat', '84_subcat', '683_subcat', '1049_subcat',
       '671_subcat', '707_subcat', '292_subcat', '704_subcat',
       '618_subcat', '1279_subcat', '858_subcat', '744_subcat',
       '1650_subcat', '1528_subcat', '452_subcat', '626_subcat',
       '1213_subcat', '51_subcat', '411_subcat', '520_subcat',
       '987_subcat', '1222_subcat', '1048_subcat', '144_subcat',
       '445_subcat', '182_subcat', '694_subcat', '478_subcat',
       '222_subcat'], dtype=object)

In [59]:
# build a term -> weight lookup table for the 2 kinds of weighted terms
# (category tokens and subcategory tokens)

category_terms = df['Category2'].unique().tolist()
df_category_weights = pd.DataFrame({'term': category_terms}).assign(weight=category_weight)

subcategory_terms = df['SubCategory2'].unique().tolist()
df_subcategory_weights = pd.DataFrame({'term': subcategory_terms}).assign(weight=subcategory_weight)

# stack the two tables; each part keeps its own 0-based index
df_weights = pd.concat([df_category_weights, df_subcategory_weights])

df_weights

Unnamed: 0,term,weight
0,1426_cat,3.0
1,561_cat,3.0
2,955_cat,3.0
3,1606_cat,3.0
4,105_cat,3.0
5,1308_cat,3.0
0,1589_subcat,1.0
1,1483_subcat,1.0
2,1221_subcat,1.0
3,1051_subcat,1.0


In [60]:
# attach a weight to every (doc, term) row by looking the term up in df_weights
# - any 'weight' column left over from an earlier run of this cell is dropped first,
#   so re-running does not produce 'weight_x' / 'weight_y' columns
# - validate='many_to_one' raises if a term appears more than once in df_weights,
#   which would otherwise silently duplicate rows
df_tfidf = (
    df_tfidf
    .drop(columns='weight', errors='ignore')
    .merge(df_weights, on='term', how='left', validate='many_to_one')
)
# terms that are neither a category nor a subcategory token get a weight of 1;
# only the 'weight' column is filled so missing values elsewhere are not masked
df_tfidf['weight'] = df_tfidf['weight'].fillna(1.0)

In [61]:
# scale every TF-IDF score by the weight of its term
# NOTE: this overwrites 'tfidf' in place, so running the cell twice applies the weight twice
df_tfidf['tfidf'] = df_tfidf['tfidf'].mul(df_tfidf['weight'])

In [62]:
# # create a column with the weight of each term
# df_tfidf['weight'] = df_tfidf['term'].apply(lambda x: 0.5 if x in df.Category2.unique()
#                                                       else 1 if x in df.SubCategory2.unique()
#                                                       else 1)

# df_tfidf['tfidf'] = df_tfidf['tfidf'] * df_tfidf['weight']

In [63]:
# inspect the first 40 rows after the weights have been applied
df_tfidf.head(40)

Unnamed: 0,doc_id,term_id,term,tfidf,weight
0,0,31663,942411,0.129982,1.0
1,0,10456,177151,0.314116,1.0
2,0,23415,64816,0.331849,1.0
3,0,8984,133116,0.166381,1.0
4,0,3499,1128577,0.088479,1.0
5,0,11863,227976,0.325922,1.0
6,0,5346,1196328,0.477446,1.0
7,0,22145,603188,0.127043,1.0
8,0,25984,737007,0.194714,1.0
9,0,32994,992688,0.10802,1.0


In [64]:
# (rows, columns): one row per (doc_id, term) pair
df_tfidf.shape

(641813, 5)

In [65]:
# print the largest term id in use; term ids are 0-based positions in feature_names,
# so this should equal len(feature_names) - 1
print(df_tfidf['term_id'].max())

37625


In [66]:
# collapse to one row per product (doc_id): the terms, term ids and weighted scores
# of a product each become a list, all three lists sharing the same order.
# The columns are selected with a list; selecting with a bare tuple of keys
# (['term','term_id','tfidf']) is deprecated since pandas 1.0 and fails on pandas 2.x.
df_tfidf2 = (
    df_tfidf
    .groupby(['doc_id'], as_index=False)[['term', 'term_id', 'tfidf']]
    .agg(lambda x: list(x))
)

In [67]:
# attach each product's itemid by looking it up through doc_id, which is the row
# position in df that the title was taken from. Unlike assigning the whole itemid
# column positionally, this stays correct (and does not raise a length mismatch)
# if some product produced no terms and is therefore missing from df_tfidf2.
df_tfidf2['product_id'] = df['itemid'].values[df_tfidf2['doc_id'].values]

In [68]:
# list the columns available on the product table
df.columns

Index(['itemid', 'Title', 'SubCategory', 'Category', 'Category2',
       'SubCategory2', 'Title_lc2'],
      dtype='object')

In [69]:
#df_tfidf2['brand_name2'] = df['brand_name2'].values.tolist()
# carry the category / subcategory tokens over from df, matched through doc_id
# (the row position in df) rather than assuming both frames have identical
# length and row order
doc_positions = df_tfidf2['doc_id'].values
df_tfidf2['Category2'] = df['Category2'].values[doc_positions]
df_tfidf2['SubCategory2'] = df['SubCategory2'].values[doc_positions]

In [70]:
# preview the per-product table: list-valued term columns plus product metadata
df_tfidf2.head()

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,Category2,SubCategory2
0,0,"[942411, 177151, 64816, 133116, 1128577, 227976, 1196328, 603188, 737007, 992688, 727274, 1273256, 574225, 567967, 269926, 348137, 786181, 709553, 901561, 1080764, 824998, 111677, 675499, 113294, 610517, 398124, 280445, 1426_cat, 1589_subcat]","[31663, 10456, 23415, 8984, 3499, 11863, 5346, 22145, 25984, 32994, 25688, 7402, 21407, 21212, 13098, 15324, 27358, 25165, 30509, 2236, 28381, 3209, 24164, 3609, 22356, 16655, 13421, 9505, 9942]","[0.12998190993696945, 0.31411559559438207, 0.3318485535463801, 0.1663811840422672, 0.08847851735646069, 0.32592184633757765, 0.47744564996757105, 0.12704286169825435, 0.19471384746900658, 0.10801964530083273, 0.17824390315847807, 0.15263081092034866, 0.14538297348031426, 0.12084264525640605, 0.08150056833466746, 0.1314107728586312, 0.12664114348837363, 0.14669714382824947, 0.08312671608639156, 0.1170503100535749, 0.1212154472238916, 0.1610592875683539, 0.18160059086122965, 0.10823035620944084, 0.07128726460658918, 0.17532163247290203, 0.1583145733637088, 0.18982128145162225, 0.1257051696211118]",291285,1426_cat,1589_subcat
1,1,"[1128577, 1249027, 820477, 72034, 912996, 404632, 30603, 1009622, 780582, 904144, 716033, 375545, n72000, 000, 590873, 611917, 606886, 954367, 631756, 7681, 353870, 1324984, 145012, 568015, 737771, 237874, 1271914, 1058996, 836265, 561_cat, 1483_subcat]","[3499, 6766, 28250, 25483, 30830, 16813, 14118, 259, 27209, 30574, 25361, 16072, 37004, 0, 21825, 22392, 22239, 31939, 22962, 26845, 15476, 8829, 9565, 21214, 26007, 12163, 7363, 1640, 28671, 21068, 9652]","[0.15023119822454753, 0.13586146435157057, 0.21581577555207096, 0.2702246610242224, 0.23171943157820035, 0.17957410572479401, 0.045714367178264104, 0.21397447291814148, 0.23171943157820035, 0.14853007177421412, 0.13458474893323466, 0.11174909616532461, 0.24209956697118193, 0.043076809262938436, 0.20660964965106415, 0.11190696242421452, 0.23171943157820035, 0.2347347437041046, 0.20769919977783446, 0.2347347437041046, 0.09674226102065646, 0.14723901348544238, 0.2347347437041046, 0.16415807588211642, 0.20883718996420297, 0.05268385533995571, 0.12907777632241438, 0.15012350997739726, 0.25984452563124083, 0.20063256205119553, 0.07632483818835384]",103012,561_cat,1483_subcat
2,2,"[1128577, 280445, 1426_cat, 000, 640322, 181883, 1301287, 639502, 118989, 295202, n14892000, 224903, 1156162, 653611, 293352, 676627, 1128176, 644974, 280874, 1136148, 1297729, 771211, 1096082, 1099873, 1293405, 360092, 260172, 1002405, 1244358, 348954, 1248387, 1221_subcat]","[3499, 13421, 9505, 0, 23210, 10596, 8172, 23186, 5159, 13842, 33675, 11776, 4287, 23561, 13782, 24196, 3488, 23334, 13438, 3695, 8075, 26937, 2643, 2756, 7959, 15654, 12825, 54, 6625, 15352, 6755, 6011]","[0.08193137565791431, 0.14659977551530845, 0.17577508284663762, 0.03523910762454509, 0.05966143459478799, 0.4018805220223715, 0.3246967277542709, 0.09024786780712316, 0.3707123558030387, 0.345671815710489, 0.22105806060319197, 0.06974520748263421, 0.16365770363330046, 0.06866794247092618, 0.1393405498527515, 0.1532316651761802, 0.13178639773534773, 0.16816269840516443, 0.10290177383129365, 0.1665509329626627, 0.05993643779033952, 0.13713402515357187, 0.10071754036875175, 0.13851103092932696, 0.16052611812180168, 0.16734131411569103, 0.14659977551530845, 0.08921872939495847, 0.07143549273752325, 0.15884981452392957, 0.22105806060319197, 0.07287212085563155]",77282,1426_cat,1221_subcat
3,3,"[1249027, 30603, 561_cat, 1483_subcat, 1192659, 800315, 677200, 1322949, 157939, 243924, 625815, 744982, 673876, 1051562, 226915, 2212, 1215254, 646928, 39986, 312815, 610340, 493696, 1277301, 952750, 794061, 1184709, 1322984, 318178, 718248, 750206]","[6766, 14118, 21068, 9652, 5222, 27727, 24216, 8770, 9908, 12342, 22785, 26219, 24116, 1436, 11822, 11660, 5857, 23386, 16697, 14331, 22349, 19236, 7507, 31904, 27570, 5019, 8772, 14494, 25436, 26362]","[0.15447995522822583, 0.0519790761029235, 0.22812730122494546, 0.08678441412664034, 0.4215682079856946, 0.3072563197799937, 0.20376414857932798, 0.3072563197799937, 0.12948777298971562, 0.13355122358594593, 0.13044332826869345, 0.17776502654915252, 0.13678010795499715, 0.14132283815908597, 0.18642423100214023, 0.1784540260971147, 0.11197379408012251, 0.14500191445487448, 0.1500561105400557, 0.0897485284426923, 0.09600289551690344, 0.12700363345683152, 0.19845697374216525, 0.18881154132607902, 0.25277027003314384, 0.11809865726913646, 0.1483339445462787, 0.1680193763777346, 0.1047452716992514, 0.24496140801809857]",76201,561_cat,1483_subcat
4,4,"[30603, 000, 237874, 639502, 653611, 977762, 1115724, 1094324, 73682, 1320974, 132820, 141646, 960054, 600483, 887448, 174342, 987250, 782561, 1240856, 1010547, 290981, 16970, 1241026, n48, n36, 249416, 376717, 703408, 664908, 6299, 246175, 955_cat, 1051_subcat]","[14118, 0, 12163, 23186, 23561, 32598, 3187, 2595, 25979, 8717, 8908, 9468, 32085, 22061, 30133, 10374, 32842, 27267, 6530, 284, 13719, 10252, 6534, 35031, 34452, 12496, 16101, 24978, 23884, 22897, 12412, 31989, 1453]","[0.05613405062660716, 0.15868594106766154, 0.0646920954046163, 0.06773283433142428, 0.1030733353435728, 0.18455580384610568, 0.34000379472493014, 0.3318170137484436, 0.3318170137484436, 0.13693153651346948, 0.1498536391275005, 0.15653312424605717, 0.15542179290684166, 0.14885860534350248, 0.14613500368064936, 0.14566802311241375, 0.1605273096197946, 0.1536345575629337, 0.1501438169237605, 0.15683222930670174, 0.1489520250399315, 0.12801596542400495, 0.1472671597270944, 0.32325603132689723, 0.12777654534275087, 0.12994879235791468, 0.164947877874277, 0.09444013315386518, 0.15713420765878713, 0.14430731462404675, 0.1833476508985319, 0.27556997842304903, 0.09810661126855265]",420271,955_cat,1051_subcat


In [71]:
# (rows, columns): one row per product
df_tfidf2.shape

(28241, 7)

In [72]:
# persist the per-product TF-IDF table as a pickle file
# NOTE(review): the '3-0' in the file name presumably encodes category_weight = 3.0;
# rename the file if the weight is changed — TODO confirm the naming convention
tfidf_output_path = 'data/final/tfidfs/df_tfidf_category_3-0.pkl'
df_tfidf2.to_pickle(tfidf_output_path)