In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

# print non truncated column info in pandas dataframe
# `None` is the supported "no limit" value for max_colwidth (pandas >= 1.0);
# the legacy -1 sentinel was deprecated in pandas 1.0 and is rejected by 2.x.
pd.set_option('display.max_colwidth', None)
# show up to 500 columns before pandas starts eliding them
pd.set_option('display.max_columns', 500)

In [2]:
# read the dataset
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only load
# pickle files that come from a trusted source.
df = pd.read_pickle('data/product_details/df_preproc.pkl')

In [3]:
# preview the first two rows to check the loaded columns
df.head(2)

Unnamed: 0,product_id,url,Title,Category2,SubCategory2,brand_name2,Title_lc2
0,2155531584,https://www.bestprice.gr/item/2155531584/samsung-galaxy-s10-128gb-dual.html,Samsung Galaxy S10+ 128GB Dual,mobiles_cat,mobile_phone_subcat,samsung_brand,samsung_brand galaxy s10+ 128gb dual mobiles_cat mobile_phone_subcat
1,2155621215,https://www.bestprice.gr/item/2155621215/samsung-galaxy-a70-6gb-128gb-dual.html,Samsung Galaxy A70 6GB 128GB Dual,mobiles_cat,mobile_phone_subcat,samsung_brand,samsung_brand galaxy a70 6gb 128gb dual mobiles_cat mobile_phone_subcat


In [4]:
# (rows, columns) of the loaded dataset
df.shape

(29541, 7)

In [5]:
# number of distinct preprocessed titles (compare with the row count above)
df['Title_lc2'].nunique()

28293

In [6]:
# number of distinct product urls
df['url'].nunique()

29541

In [7]:
# number of distinct raw product titles
df['Title'].nunique()

28290

In [8]:
# collect every preprocessed product title into a plain Python list,
# keeping the row order of df
corpus = df['Title_lc2'].tolist()

In [9]:
# the number of products (the corpus holds one title per row of df)
len(corpus)

29541

In [10]:
# Fit a TF-IDF vectorizer (default settings) on the corpus and encode every
# title as one row of a matrix of TF-IDF features.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(raw_documents=corpus)

In [11]:
# get a list of the unique terms in the corpus
# (the position of a term in this list is its column index in tfidf_matrix)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back to the old name on releases
# that do not have it yet. `.tolist()` keeps `feature_names` a plain list.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out().tolist()
else:
    feature_names = tf.get_feature_names()

In [12]:
# vocabulary size: number of distinct terms found by the vectorizer
len(feature_names)

35405

In [13]:
# peek at a small slice of the vocabulary
feature_names[90:100]

['0035ng',
 '00360100',
 '003618',
 '0037x615',
 '003846',
 '003847',
 '0038ng',
 '0038x112',
 '0038x888',
 '003901']

In [14]:
# transform the sparse matrix to a list of dicts
# each dict corresponds to one term of one product title:
#   doc_id  - row of the matrix (position of the title in `corpus`)
#   term_id - column of the matrix (position of the term in `feature_names`)
#   term    - the term itself
#   tfidf   - the stored TF-IDF score
#
# The COO view exposes parallel row / column / value arrays, so every stored
# entry is read in a single pass instead of slicing the matrix once per row
# and indexing it once per term, which is very slow on a sparse matrix.
# Going through CSR first keeps the entries grouped row by row.
coo = tfidf_matrix.tocsr().tocoo()
tfidf_list = [
    {'doc_id': doc, 'term_id': i, 'term': feature_names[i], 'tfidf': s}
    for doc, i, s in zip(coo.row.tolist(), coo.col.tolist(), coo.data.tolist())
    if s != 0  # ignore explicitly stored zeros, as .nonzero() would
]

In [15]:
# build a long-format dataframe from the list of dicts:
# one row per (doc_id, term) pair, one column per dict key
df_tfidf = pd.DataFrame(data=tfidf_list)
df_tfidf.head(n=5)

Unnamed: 0,doc_id,term_id,term,tfidf
0,0,28661,samsung_brand,0.346016
1,0,19239,galaxy,0.37326
2,0,27889,s10,0.557813
3,0,1587,128gb,0.354927
4,0,14268,dual,0.401626


## -- Set the weights for brand, category, subcategory

In [16]:
# multipliers applied to the tf-idf score of brand, category and
# subcategory terms respectively
brand_weight, category_weight, subcategory_weight = 0.0, 1.0, 1.0

In [17]:
# create a df for the weights of the 3 kinds of terms
# Each distinct brand / category / subcategory value becomes one row holding
# the term and the weight configured for its kind.

df_brand_weights = pd.DataFrame({'term': list(df.brand_name2.unique()),
                                 'weight': brand_weight})

df_category_weights = pd.DataFrame({'term': list(df.Category2.unique()),
                                    'weight': category_weight})

df_subcategory_weights = pd.DataFrame({'term': list(df.SubCategory2.unique()),
                                       'weight': subcategory_weight})

# Stack the three tables into a single term -> weight lookup table.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating
# the 0-based labels of each part.
# NOTE(review): a term that appears in more than one of the three columns
# would occur twice here and duplicate rows in a later merge on 'term' -
# confirm the three vocabularies are disjoint.
df_weights = pd.concat([df_brand_weights, df_category_weights, df_subcategory_weights],
                       axis=0, ignore_index=True)

df_weights

Unnamed: 0,term,weight
0,samsung_brand,0.0
1,xiaomi_brand,0.0
2,huawei_brand,0.0
3,apple_brand,0.0
4,nokia_brand,0.0
...,...,...
18,compact_subcat,1.0
19,dslr_subcat,1.0
20,analog_subcat,1.0
21,photograph_cases_subcat,1.0


In [18]:
# merge the main 'df_tfidf' with the above df
# Drop a 'weight' column left over from a previous run of this cell first, so
# that re-running it does not produce 'weight_x' / 'weight_y' columns.
df_tfidf = (df_tfidf
            .drop(columns='weight', errors='ignore')
            .merge(df_weights, on='term', how='left'))
# set the weight of all the other terms to 1
# (fill only the 'weight' column rather than every NaN in the frame)
df_tfidf['weight'] = df_tfidf['weight'].fillna(1.0)

In [19]:
# inspect the merged frame: every row now carries the weight of its term
df_tfidf.head(12)

Unnamed: 0,doc_id,term_id,term,tfidf,weight
0,0,28661,samsung_brand,0.346016,0.0
1,0,19239,galaxy,0.37326,1.0
2,0,27889,s10,0.557813,1.0
3,0,1587,128gb,0.354927,1.0
4,0,14268,dual,0.401626,1.0
5,0,22300,mobiles_cat,0.187396,1.0
6,0,22299,mobile_phone_subcat,0.327722,1.0
7,1,28661,samsung_brand,0.293202,0.0
8,1,19239,galaxy,0.316288,1.0
9,1,1587,128gb,0.300753,1.0


In [20]:
# scale every tf-idf score by the weight of its term
# (overwrites 'tfidf' in place, so running this cell again applies the
# weights a second time)
df_tfidf['tfidf'] = df_tfidf['tfidf'].mul(df_tfidf['weight'])

In [21]:
# inspect the frame again after the weights have been applied to 'tfidf'
df_tfidf.head(12)

Unnamed: 0,doc_id,term_id,term,tfidf,weight
0,0,28661,samsung_brand,0.0,0.0
1,0,19239,galaxy,0.37326,1.0
2,0,27889,s10,0.557813,1.0
3,0,1587,128gb,0.354927,1.0
4,0,14268,dual,0.401626,1.0
5,0,22300,mobiles_cat,0.187396,1.0
6,0,22299,mobile_phone_subcat,0.327722,1.0
7,1,28661,samsung_brand,0.0,0.0
8,1,19239,galaxy,0.316288,1.0
9,1,1587,128gb,0.300753,1.0


In [22]:
# (rows, columns): one row per (product, term) pair
df_tfidf.shape

(298104, 5)

In [23]:
# highest term id present in the frame; term ids are 0-based positions in
# feature_names, so this equals len(feature_names) - 1 when the last
# vocabulary term occurs in at least one title
print(df_tfidf['term_id'].max())

35404


In [24]:
# groupby each product to a row and convert the rest of the columns to lists
# The columns are selected with a list (double brackets): indexing a groupby
# with a bare tuple of names was deprecated in pandas 1.0 and raises in 2.x.
# Groups come out sorted by doc_id and each list keeps the row order of
# df_tfidf, so the three lists of a product stay aligned element by element.
df_tfidf2 = (df_tfidf
             .groupby(['doc_id'], as_index=False)[['term', 'term_id', 'tfidf']]
             .agg(list))

In [25]:
# add the product id, brand, category and subcategory of every product
# doc_id is the position of the title in `corpus`, which was built from df in
# row order, so it is also the row position of the product in df. Looking the
# metadata up by doc_id keeps the columns aligned even if df_tfidf2 does not
# hold exactly one row per df row (e.g. a title that produced no terms),
# instead of relying on both frames having the same length and order.
meta_cols = ['product_id', 'brand_name2', 'Category2', 'SubCategory2']
doc_positions = df_tfidf2['doc_id'].values
for col in meta_cols:
    df_tfidf2[col] = df[col].values[doc_positions]

In [26]:
# preview the per-product frame (lists of terms / term ids / scores plus metadata)
df_tfidf2.head()

Unnamed: 0,doc_id,term,term_id,tfidf,product_id,brand_name2,Category2,SubCategory2
0,0,"[samsung_brand, galaxy, s10, 128gb, dual, mobiles_cat, mobile_phone_subcat]","[28661, 19239, 27889, 1587, 14268, 22300, 22299]","[0.0, 0.3732597244316061, 0.557812813771961, 0.35492680989503356, 0.4016258522753795, 0.18739622639932324, 0.3277215099109148]",2155531584,samsung_brand,mobiles_cat,mobile_phone_subcat
1,1,"[samsung_brand, galaxy, 128gb, dual, mobiles_cat, mobile_phone_subcat, a70, 6gb]","[28661, 19239, 1587, 14268, 22300, 22299, 8666, 6913]","[0.0, 0.3162877488300745, 0.30075305304392586, 0.340324252453377, 0.1587932656740559, 0.27770019594468515, 0.6095408653184142, 0.3658534986680371]",2155621215,samsung_brand,mobiles_cat,mobile_phone_subcat
2,2,"[samsung_brand, galaxy, dual, mobiles_cat, mobile_phone_subcat, note, 10, 12gb, 256gb]","[28661, 19239, 14268, 22300, 22299, 22898, 754, 1600, 3289]","[0.0, 0.3342614341187884, 0.3596638602388183, 0.1678170112783347, 0.2934810661964935, 0.4725855288044311, 0.18646888268201767, 0.47143468747698397, 0.261234456911427]",2155742926,samsung_brand,mobiles_cat,mobile_phone_subcat
3,3,"[samsung_brand, galaxy, s10, 128gb, dual, mobiles_cat, mobile_phone_subcat]","[28661, 19239, 27889, 1587, 14268, 22300, 22299]","[0.0, 0.3732597244316061, 0.557812813771961, 0.35492680989503356, 0.4016258522753795, 0.18739622639932324, 0.3277215099109148]",2155531583,samsung_brand,mobiles_cat,mobile_phone_subcat
4,4,"[samsung_brand, galaxy, 128gb, dual, mobiles_cat, mobile_phone_subcat, s10e]","[28661, 19239, 1587, 14268, 22300, 22299, 27919]","[0.0, 0.3297602425424082, 0.3135638357286398, 0.354820597532478, 0.16555717379654225, 0.28952902636134675, 0.6799667840565736]",2155531585,samsung_brand,mobiles_cat,mobile_phone_subcat


In [27]:
# (rows, columns): one row per doc_id
df_tfidf2.shape

(29541, 8)

In [28]:
# persist the per-product tf-idf frame as a pickle file
# NOTE(review): the "brand_0-0" suffix presumably mirrors brand_weight = 0.0 -
# confirm, and rename the file when the weight is changed.
df_tfidf2.to_pickle(path='data/tfidfs/df_tfidf_brand_0-0.pkl')