# Setting up 

In [1]:
import pandas as pd
import json
from functions import display_rows
import sys,os,os.path
import math


# Root directory of the week-4 material. The `%env` line exports it as an
# environment variable and the assignment keeps the same value as a Python
# string; the path literal is written twice, so edit both lines together.
%env WEEK4 /workspace/search_with_machine_learning_course/week4
WEEK4='/workspace/search_with_machine_learning_course/week4'

env: WEEK4=/workspace/search_with_machine_learning_course/week4


In [2]:
# Dump categories, 10 level deep, dump everything
!python $WEEK4/dump_category_depth_maps.py --output=/workspace/search_with_machine_learning_course/week4/data/category_parent_maps.csv

category_parent_maps_df = pd.read_csv(f'{WEEK4}/data/category_parent_maps.csv').sort_values(by=['depth'], ascending=True)
category_parent_maps_df

Number of categories: 4625
Writting data to /workspace/search_with_machine_learning_course/week4/data/category_parent_maps.csv...


Unnamed: 0,id,name,parent_id,parent_name,depth
0,cat00000,Best Buy,,,1
27,abcat0100000,TV & Home Theater,cat00000,Best Buy,2
423,pcmcat245100050028,Office,cat00000,Best Buy,2
296,abcat0800000,Mobile Phones,cat00000,Best Buy,2
45,abcat0300000,"Car, Marine & GPS",cat00000,Best Buy,2
...,...,...,...,...,...
300,abcat0404002,Compact Flash,pcmcat225800050009,Memory Cards,8
301,abcat0404003,Memory Stick,pcmcat225800050009,Memory Cards,8
302,abcat0404004,Secure Digital,pcmcat225800050009,Memory Cards,8
303,abcat0404005,XD Memory Cards,pcmcat225800050009,Memory Cards,8


In [3]:
# Load the training queries together with their normalized form.
# The CSV read below is the --output of the preprocessing command on the next
# line; that step can take a while, so it is kept commented out.
#!python prep_train_csv.py --input=/workspace/datasets/train.csv --output=/workspace/datasets/train_prep_week4.csv
train_df = pd.read_csv('/workspace/datasets/train_prep_week4.csv')

In [4]:
# Number of distinct normalized queries seen for each category.
unique_queries_per_category = (
    train_df
    .groupby(['category'])['query_normalized']
    .nunique()
    .reset_index()
    .rename(columns={'query_normalized': 'query_count'})
)

display(unique_queries_per_category)

# Attach the per-category query count and the category metadata (name, parent,
# depth) to every training row; both joins are inner, so unmatched rows drop out.
queries_df_with_query_count = (
    train_df
    .merge(unique_queries_per_category, on='category', how='inner')
    .merge(category_parent_maps_df, left_on='category', right_on='id', how='inner')
)

queries_df_with_query_count.head(5)

Unnamed: 0,category,query_count
0,,42
1,abcat0011000,6
2,abcat0020000,1
3,abcat0020004,3
4,abcat0031100,33
...,...,...
1535,pcmcat96200050043,18
1536,pcmcat96200050046,58
1537,pcmcat96200050047,27
1538,pcmcat96200050052,1


Unnamed: 0.1,Unnamed: 0,user,sku,category,query,query_normalized,query_count,id,name,parent_id,parent_name,depth
0,0,000000df17cd56a5df4a94074e133c9d4739fae3,2125233,abcat0101001,Televisiones Panasonic 50 pulgadas,television panason 50 pulgada,11437,abcat0101001,All Flat-Panel TVs,abcat0101000,TVs,4
1,1,000001928162247ffaf63185cd8b2a244c78e7c6,2009324,abcat0101001,Sharp,sharp,11437,abcat0101001,All Flat-Panel TVs,abcat0101000,TVs,4
2,3,000017f79c2b5da56721f22f9fdd726b13daf8e8,2877125,abcat0101001,rca,rca,11437,abcat0101001,All Flat-Panel TVs,abcat0101000,TVs,4
3,19,0000c4e9d7075985d1020c456e7ce36f83f834eb,2126065,abcat0101001,Samsung 40,samsung 40,11437,abcat0101001,All Flat-Panel TVs,abcat0101000,TVs,4
4,40,00017f7beeac02736c0ce7bf1e75f3010939b34e,1831054,abcat0101001,lcd tv,lcd tv,11437,abcat0101001,All Flat-Panel TVs,abcat0101000,TVs,4


In [7]:
class Node:
    """Tree node holding an arbitrary payload and a list of child nodes."""

    def __init__(self, data):
        # The payload is stored as given (not copied); every node starts
        # without children.
        self.data = data
        self.children = []

def build_tree():
    """Build the category tree from the notebook's category and query-count frames.

    Reads two module-level DataFrames: ``category_parent_maps_df`` (columns
    ``id``, ``name`` and ``parent_id``) and ``unique_queries_per_category``
    (columns ``category`` and ``query_count``).

    Categories whose parent id is not present in the category frame are
    reported with a printed message and left unattached.

    Returns:
        tuple: ``(root, cat_nodes)``. ``cat_nodes`` maps each category id to
        its ``Node``; ``root`` is the node without a parent id (the last such
        row when there are several, ``None`` when there is none).
    """
    # Index the counts once instead of filtering the whole frame for every
    # category row. The first occurrence of a category wins.
    query_counts = {}
    category_counts = zip(
        unique_queries_per_category['category'],
        unique_queries_per_category['query_count'],
    )
    for category, query_count in category_counts:
        # Store plain ints so that repr(node.data) stays parseable by
        # ast.literal_eval no matter how NumPy prints its scalars.
        query_counts.setdefault(category, int(query_count))

    cat_nodes = {}
    root = None

    category_rows = category_parent_maps_df[['id', 'name', 'parent_id']].itertuples(
        index=False, name=None
    )
    for category_id, name, parent_id_raw in category_rows:
        # A missing parent (NaN in the CSV) marks the root category.
        parent_id = None if pd.isna(parent_id_raw) else parent_id_raw

        node = Node({
            'id': category_id,
            'name': name,
            'count': query_counts.get(category_id, 0),
            'parent_id': parent_id,
        })
        cat_nodes[category_id] = node

        if parent_id is None:
            root = node

    # Second pass: every node exists now, so children can be attached
    # regardless of the order of the rows.
    for node in cat_nodes.values():
        parent_id = node.data['parent_id']
        if parent_id is None:
            continue

        parent = cat_nodes.get(parent_id)
        if parent is None:
            print(f'Parent ID not found: {parent_id!r}')
        else:
            parent.children.append(node)

    return (root, cat_nodes)


def print_node(node: Node, depth):
    """Print one line describing `node`, indented by two ' . ' markers per level."""
    indent = ' . ' * (2 * depth)
    info = node.data
    print(f"{indent} |- {info['id']} - {info['name']} ({info['count']})")

def walk(node: Node, depth = 0):
    """Print `node` followed by its whole subtree, parents before children."""
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped (and printed) in their original order.
    pending = [(node, depth)]
    while pending:
        current, level = pending.pop()
        print_node(current, level)
        pending.extend((child, level + 1) for child in reversed(current.children))

def update_query_count(node: Node, depth = 0):
    """Aggregate query counts bottom-up and return the total for `node`.

    A node without children keeps its own count and returns it. For any other
    node, ``node.data['count']`` is overwritten with the sum of its children's
    totals; the value it held before is discarded.

    `depth` is only forwarded to the recursive calls.
    """
    if not node.children:
        # leaf node
        return node.data['count']

    subtree_total = sum(
        update_query_count(child, depth + 1) for child in node.children
    )
    node.data['count'] = subtree_total
    return subtree_total

def try_rollup(node: "Node", threshold: int, cat_nodes: dict):
    """Return the nearest category, starting at `node`, with enough queries.

    Walks from `node` towards the root through ``data['parent_id']`` and
    returns the first node whose ``data['count']`` is at least `threshold`.

    Args:
        node: Node to start from; returned unchanged when it already meets the
            threshold.
        threshold: Minimum count a node needs to be accepted.
        cat_nodes: Mapping from category id to node, used to resolve parents.

    Returns:
        The accepted node. When no ancestor reaches the threshold, the topmost
        node that could be reached (the root, or the last node whose parent is
        missing from `cat_nodes`) is returned instead of raising ``KeyError``.
    """
    current = node

    # A chain without a cycle visits the start node plus at most
    # len(cat_nodes) ancestors, so this bound only triggers on cyclic data.
    for _ in range(len(cat_nodes) + 1):
        if current.data['count'] >= threshold:
            return current

        parent = cat_nodes.get(current.data['parent_id'])
        if parent is None:
            # Root reached (parent_id is None) or the parent is unknown:
            # there is nothing further to roll up into.
            return current

        current = parent

    # infinite loop protection
    print('quiting try_rollup, too many tries...')
    return current

        

# if necessary, roll leaf nodes up to the desired node
def build_rollup_map(node: Node, threshold, cat_nodes: dict, rollup_map = None, depth = 0):
    """Map every leaf below `node` to the category it should be rolled up into.

    Args:
        node: Root of the subtree to process.
        threshold: Minimum count passed on to ``try_rollup``.
        cat_nodes: Mapping from category id to node, passed on to ``try_rollup``.
        rollup_map: Dictionary to fill. A fresh one is created when omitted, so
            separate calls no longer share (and accumulate into) one default.
        depth: Current recursion depth; only forwarded to the recursive calls.

    Returns:
        dict: leaf category id -> ``{'from': leaf_node, 'to': target_node}``.
    """
    if rollup_map is None:
        rollup_map = {}

    if not node.children:
        # leaf node
        rollup_map[node.data['id']] = {
            'from': node,
            'to': try_rollup(node, threshold, cat_nodes),
        }

    for child in node.children:
        build_rollup_map(child, threshold, cat_nodes, rollup_map, depth + 1)

    return rollup_map

root, cat_nodes = build_tree()

print(f'Number of categories processed: {len(cat_nodes)}')

# Two categories looked up by id.
test_node = cat_nodes['pcmcat139900050002']
speaker_node = cat_nodes['pcmcat143200050022']

# Propagate the query counts from the leaves up to the root.
update_query_count(root)

# Leaves with fewer queries than this are mapped to an ancestor.
ROLLUP_IF_LESS_THAN = 1000
rollup_maps = build_rollup_map(root, ROLLUP_IF_LESS_THAN, cat_nodes)

# One row per leaf category: where it comes from and where it rolls up to.
rollup_columns = ['from_category_id', 'from_category_name', 'count', 'to_category_id', 'to_category_name']
rollup_data = [
    [
        entry['from'].data['id'],
        entry['from'].data['name'],
        entry['from'].data['count'],
        entry['to'].data['id'],
        entry['to'].data['name'],
    ]
    for entry in rollup_maps.values()
]

rollup_df = pd.DataFrame(rollup_data, columns=rollup_columns)

rollup_df.nunique()

Parent ID not found: 'pcmcat161200050031'
Parent ID not found: 'pcmcat161200050031'
Parent ID not found: 'pcmcat161200050031'
Parent ID not found: 'pcmcat156300050010'
Parent ID not found: 'pcmcat156300050010'
Parent ID not found: 'pcmcat156300050010'
Parent ID not found: 'pcmcat156300050010'
Parent ID not found: 'pcmcat156300050010'
Parent ID not found: 'pcmcat245700050019'
Parent ID not found: 'pcmcat245700050019'
Parent ID not found: 'pcmcat245700050019'
Parent ID not found: 'pcmcat245700050019'
Parent ID not found: 'pcmcat245700050019'
Parent ID not found: 'pcmcat245700050019'
Parent ID not found: 'pcmcat220700050008'
Parent ID not found: 'pcmcat241300050031'
Parent ID not found: 'pcmcat241300050031'
Number of categories processed: 4625


from_category_id      3862
from_category_name    3174
count                  419
to_category_id         163
to_category_name       162
dtype: int64

In [8]:
# Dump the payload of every category node as a Python literal so it can be
# read back with ast.literal_eval.
# NumPy scalars are converted to built-in values first: NumPy >= 2 prints them
# as e.g. `np.int64(5)`, which ast.literal_eval cannot parse.
cat_nodes_plain = {
    category_id: {
        key: (value.item() if hasattr(value, 'item') else value)
        for key, value in category_node.data.items()
    }
    for category_id, category_node in cat_nodes.items()
}

with open(WEEK4 + '/data/cat_nodes.py', 'w') as f:
    f.write(repr(cat_nodes_plain))



In [9]:
import ast 
# Read the dumped dictionary back; ast.literal_eval only accepts Python
# literals, so the file content is parsed rather than executed.
with open(f'{WEEK4}/data/cat_nodes.py') as cat_nodes_file:
    cat_nodes2 = ast.literal_eval(cat_nodes_file.read())


In [10]:
# Display the dictionary that was parsed back from cat_nodes.py.
cat_nodes2

{'cat00000': {'id': 'cat00000',
  'name': 'Best Buy',
  'count': 281692,
  'parent_id': None},
 'abcat0100000': {'id': 'abcat0100000',
  'name': 'TV & Home Theater',
  'count': 25750,
  'parent_id': 'cat00000'},
 'pcmcat245100050028': {'id': 'pcmcat245100050028',
  'name': 'Office',
  'count': 8267,
  'parent_id': 'cat00000'},
 'abcat0800000': {'id': 'abcat0800000',
  'name': 'Mobile Phones',
  'count': 25818,
  'parent_id': 'cat00000'},
 'abcat0300000': {'id': 'abcat0300000',
  'name': 'Car, Marine & GPS',
  'count': 6805,
  'parent_id': 'cat00000'},
 'pcmcat149900050025': {'id': 'pcmcat149900050025',
  'name': 'Recycling',
  'count': 0,
  'parent_id': 'cat00000'},
 'pcmcat133600050011': {'id': 'pcmcat133600050011',
  'name': 'Online Trade In',
  'count': 0,
  'parent_id': 'cat00000'},
 'abcat0600000': {'id': 'abcat0600000',
  'name': 'Movies & Music',
  'count': 36601,
  'parent_id': 'cat00000'},
 'abcat0200000': {'id': 'abcat0200000',
  'name': 'Audio & MP3',
  'count': 41209,
  'pa

In [11]:


# Remove the per-event columns, then de-duplicate on what is left (category,
# normalized query and the category metadata).
queries_df_with_query_count_dedupped = (
    queries_df_with_query_count
    .drop(columns=['user', 'sku', 'Unnamed: 0', 'id', 'query'])
    .drop_duplicates()
)
queries_df_with_query_count_dedupped

Unnamed: 0,category,query_normalized,query_count,name,parent_id,parent_name,depth
0,abcat0101001,television panason 50 pulgada,11437,All Flat-Panel TVs,abcat0101000,TVs,4
1,abcat0101001,sharp,11437,All Flat-Panel TVs,abcat0101000,TVs,4
2,abcat0101001,rca,11437,All Flat-Panel TVs,abcat0101000,TVs,4
3,abcat0101001,samsung 40,11437,All Flat-Panel TVs,abcat0101000,TVs,4
4,abcat0101001,lcd tv,11437,All Flat-Panel TVs,abcat0101000,TVs,4
...,...,...,...,...,...,...,...
1847442,pcmcat235500050003,3ds bundl,1,Nintendo 3DS Consoles,pcmcat232900050000,Nintendo 3DS,4
1847443,pcmcat240500050027,camera light,1,Softboxes,pcmcat240500050025,Photography Lighting,7
1847444,pcmcat235500050005,portabl dvv,1,Nintendo 3DS Accessories,pcmcat232900050000,Nintendo 3DS,4
1847445,cat02737,drum,1,Buddhist,cat02014,World,5


In [12]:
# Attach the rollup target (`to_category_id`) of each category to the
# de-duplicated queries. The inner join keeps only categories listed in
# `rollup_df`.
queries_df_with_query_count_dedupped_with_rollup = (
    queries_df_with_query_count_dedupped
    .merge(rollup_df, left_on='category', right_on='from_category_id', how='inner')
    .sort_values(by='category')
)

queries_df_with_query_count_dedupped_with_rollup

Unnamed: 0,category,query_normalized,query_count,name,parent_id,parent_name,depth,from_category_id,from_category_name,count,to_category_id,to_category_name
281122,abcat0020004,cellphon,3,Unique Gifts,abcat0010000,Gift Center,3,abcat0020004,Unique Gifts,3,cat00000,Best Buy
281123,abcat0020004,cell phone,3,Unique Gifts,abcat0010000,Gift Center,3,abcat0020004,Unique Gifts,3,cat00000,Best Buy
281124,abcat0020004,beverag center,3,Unique Gifts,abcat0010000,Gift Center,3,abcat0020004,Unique Gifts,3,cat00000,Best Buy
0,abcat0101001,television panason 50 pulgada,11437,All Flat-Panel TVs,abcat0101000,TVs,4,abcat0101001,All Flat-Panel TVs,11437,abcat0101001,All Flat-Panel TVs
7621,abcat0101001,panson 3d,11437,All Flat-Panel TVs,abcat0101000,TVs,4,abcat0101001,All Flat-Panel TVs,11437,abcat0101001,All Flat-Panel TVs
...,...,...,...,...,...,...,...,...,...,...,...,...
279226,pcmcat99000050002,8339283,12,Dynex Networking,pcmcat99000050001,Networking,5,pcmcat99000050002,Dynex Networking,12,pcmcat128500050004,Name Brands
279230,pcmcat99000050002,dsl modem,12,Dynex Networking,pcmcat99000050001,Networking,5,pcmcat99000050002,Dynex Networking,12,pcmcat128500050004,Name Brands
279229,pcmcat99000050002,router wire,12,Dynex Networking,pcmcat99000050001,Networking,5,pcmcat99000050002,Dynex Networking,12,pcmcat128500050004,Name Brands
279227,pcmcat99000050002,ethernet router,12,Dynex Networking,pcmcat99000050001,Networking,5,pcmcat99000050002,Dynex Networking,12,pcmcat128500050004,Name Brands


In [13]:
# Spot-check the rollup result for a single category.
queries_df_with_query_count_dedupped_with_rollup.loc[
    queries_df_with_query_count_dedupped_with_rollup['category'] == 'pcmcat171900050029'
]


Unnamed: 0,category,query_normalized,query_count,name,parent_id,parent_name,depth,from_category_id,from_category_name,count,to_category_id,to_category_name
103471,pcmcat171900050029,blackberri torch case,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
103481,pcmcat171900050029,laptop 17,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
103482,pcmcat171900050029,belkin,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
106529,pcmcat171900050029,evo 3g,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
106530,pcmcat171900050029,tdident cyclop case,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
...,...,...,...,...,...,...,...,...,...,...,...,...
105432,pcmcat171900050029,lifeproof case,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
105431,pcmcat171900050029,d link,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
105429,pcmcat171900050029,desir s,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted
106895,pcmcat171900050029,motorolla atrix phone case,3992,Fitted,abcat0811006,Mobile Phone Cases & Clips,5,pcmcat171900050029,Fitted,3992,pcmcat171900050029,Fitted


In [14]:
%env FASTTEXT_LABEL_FILE $WEEK4 + '/data/query_classification.fasttext'
FASTTEXT_LABEL_FILE = WEEK4 + '/data/query_classification.fasttext'

%env FASTTEXT_LABEL_FILE_SHUFFLED $WEEK4 + '/data/query_classification_shuffled.fasttext'
FASTTEXT_LABEL_FILE_SHUFFLED = WEEK4 + '/data/query_classification_shuffled.fasttext'

%env FASTTEXT_LABEL_FILE_TRAINING $WEEK4 + '/data/query_classification_train.fasttext'
FASTTEXT_LABEL_FILE_TRAINING = WEEK4 + '/data/query_classification_train.fasttext'

%env FASTTEXT_LABEL_FILE_TEST $WEEK4 + '/data/query_classification_test.fasttext'
FASTTEXT_LABEL_FILE_TEST = WEEK4 + '/data/query_classification_test.fasttext'





queries_df_with_query_count_dedupped_with_rollup_shuffled = queries_df_with_query_count_dedupped_with_rollup.sample(frac=1)

desired_entries = queries_df_with_query_count_dedupped_with_rollup_shuffled.shape[0]
head_size = math.ceil(desired_entries * .75)
tail_size = math.ceil(desired_entries * .20)

fasttext_train_df = queries_df_with_query_count_dedupped_with_rollup_shuffled.head(head_size)
#.sample(frac=1)
fasttext_test_df = queries_df_with_query_count_dedupped_with_rollup_shuffled.tail(tail_size)
#.sample(frac=1)
# making sure there's no overlap data between train and test data
#fasttext_test_df = fasttext_test_df[~fasttext_test_df.isin(fasttext_train_df)].dropna()
#.sample(frac=1)

print(
f'Generating training data: \n' +
f'  head_size: {head_size}\n' +
f'  tail_size: {tail_size}\n' +
f'  fasttext_train_df #rows: {fasttext_train_df.shape[0]}\n' +
f'  fasttext_test_df #rows: {fasttext_test_df.shape[0]}\n'
)



with open(FASTTEXT_LABEL_FILE_TRAINING, 'w') as file:
    for index, row in fasttext_train_df.iterrows():
        fasttext_row = f"__label__{row['to_category_id']} {row['query_normalized']}"
        file.write(fasttext_row + '\n')

with open(FASTTEXT_LABEL_FILE_TEST, 'w') as file:
    for index, row in fasttext_test_df.iterrows():
        fasttext_row = f"__label__{row['to_category_id']} {row['query_normalized']}"
        file.write(fasttext_row + '\n')



env: FASTTEXT_LABEL_FILE=/workspace/search_with_machine_learning_course/week4 + '/data/query_classification.fasttext'
env: FASTTEXT_LABEL_FILE_SHUFFLED=/workspace/search_with_machine_learning_course/week4 + '/data/query_classification_shuffled.fasttext'
env: FASTTEXT_LABEL_FILE_TRAINING=/workspace/search_with_machine_learning_course/week4 + '/data/query_classification_train.fasttext'
env: FASTTEXT_LABEL_FILE_TEST=/workspace/search_with_machine_learning_course/week4 + '/data/query_classification_test.fasttext'
Generating training data: 
  head_size: 211371
  tail_size: 56366
  fasttext_train_df #rows: 211371
  fasttext_test_df #rows: 56366



In [394]:
# Train a supervised fastText classifier on the training label file.
# The -output value is a path prefix; the evaluation and prediction cells load
# `query-classifier-model.bin` from it.
!~/fastText-0.9.2/fasttext supervised \
    -input $FASTTEXT_LABEL_FILE_TRAINING \
    -output $WEEK4/data/query-classifier-model \
    -epoch 25 \
    -lr .05 \
    -loss hs \
    -wordNgrams 2


Read 0M words
Number of words:  39993
Number of labels: 158
Progress: 100.0% words/sec/thread:   32292 lr:  0.000000 avg.loss:  2.165621 ETA:   0h 0m 0s  0h 0m54s36s lr:  0.027545 avg.loss:  2.722070 ETA:   0h 0m33s 2.659781 ETA:   0h 0m30sm22s  0h 0m 2s


In [391]:
# Evaluate the trained model on the test label file. The optional trailing
# number selects k for the reported P@k / R@k: the first run reports P@1 / R@1,
# the next two report k=3 and k=5.
!~/fastText-0.9.2/fasttext test \
    $WEEK4/data/query-classifier-model.bin \
    $FASTTEXT_LABEL_FILE_TEST

print('')    

# Same evaluation at k=3.
!~/fastText-0.9.2/fasttext test \
    $WEEK4/data/query-classifier-model.bin \
    $FASTTEXT_LABEL_FILE_TEST \
    3
print('')

# Same evaluation at k=5.
!~/fastText-0.9.2/fasttext test \
    $WEEK4/data/query-classifier-model.bin \
    $FASTTEXT_LABEL_FILE_TEST \
    5

N	56366
P@1	0.332
R@1	0.332

N	56366
P@3	0.166
R@3	0.499

N	56366
P@5	0.113
R@5	0.563


In [15]:
import fasttext
import functions


# Sample queries used to eyeball the classifier's predictions.
inputs = ['iphone', 'ipad', 'apple iphone', 'playstation 3', 'age of empires', 'sharp tv', 'haystack']

number_of_predictions = 5
model = fasttext.load_model(WEEK4 + '/data/query-classifier-model.bin')

# Prefix written in front of every category id in the training files.
LABEL_PREFIX = '__label__'

# `raw_query` rather than `input`: a loop variable at notebook scope would
# otherwise shadow the builtin for every cell executed afterwards.
for raw_query in inputs:
    # Apply the notebook's normalization helper before predicting.
    input_stemmed = functions.transform_name(raw_query)

    print(f"predition for {raw_query} / stemmed: {input_stemmed}:")
    predictions, scores = model.predict(input_stemmed, k=number_of_predictions)

    for prediction, score in zip(predictions, scores):
        # Strip only the leading prefix; splitting on '__' would cut off any
        # category id that itself contains a double underscore.
        if prediction.startswith(LABEL_PREFIX):
            label = prediction[len(LABEL_PREFIX):]
        else:
            label = prediction
        node = cat_nodes[label]
        print(f"{label} - {node.data['name']} ({node.data['count']}):{score}")
    print('\n')

predition for iphone / stemmed: iphon:
pcmcat191200050015 - iPhone Accessories (2200):0.043403368443250656
abcat0208011 - iPod & MP3 Speakers, Docks & Radios (1827):0.037946850061416626
pcmcat248700050021 - Home (3703):0.037323445081710815
abcat0307005 - Car Accessories (3348):0.03704296052455902
abcat0300000 - Car, Marine & GPS (6805):0.0349414125084877


predition for ipad / stemmed: ipad:
pcmcat231800050009 - Tablet Accessories (3564):0.07089034467935562
pcmcat209000050007 - iPad (1762):0.06924150884151459
pcmcat217900050000 - iPad Accessories (3164):0.05312281847000122
pcmcat230600050006 - Sports Fan Shop (1001):0.03462045639753342
pcmcat248700050021 - Home (3703):0.03370543569326401


predition for apple iphone / stemmed: appl iphon:
pcmcat209400050001 - All Mobile Phones with Plans (4907):0.17464517056941986
pcmcat201900050009 - Screen Protectors (1371):0.0925264060497284
abcat0800000 - Mobile Phones (25818):0.07401757687330246
abcat0208007 - iPod & MP3 Player Accessories (2902):



{'cat00000': <__main__.Node at 0x7f60cd1ad340>,
 'abcat0100000': <__main__.Node at 0x7f60da89aeb0>,
 'pcmcat245100050028': <__main__.Node at 0x7f60da89af10>,
 'abcat0800000': <__main__.Node at 0x7f60cc7bcfd0>,
 'abcat0300000': <__main__.Node at 0x7f60cc7bc250>,
 'pcmcat149900050025': <__main__.Node at 0x7f60cc7bcb80>,
 'pcmcat133600050011': <__main__.Node at 0x7f60cc7bc340>,
 'abcat0600000': <__main__.Node at 0x7f60cc7be0d0>,
 'abcat0200000': <__main__.Node at 0x7f60cc7bec40>,
 'pcmcat248000050016': <__main__.Node at 0x7f60cc7bef40>,
 'abcat0700000': <__main__.Node at 0x7f60cc7be850>,
 'pcmcat248700050021': <__main__.Node at 0x7f60cc7be820>,
 'pcmcat164600050001': <__main__.Node at 0x7f60cc7be520>,
 'pcmcat128500050004': <__main__.Node at 0x7f60cc7be2e0>,
 'pcmcat230000050010': <__main__.Node at 0x7f60cc7bee50>,
 'pcmcat102500050032': <__main__.Node at 0x7f60cc7bedc0>,
 'pcmcat159800050001': <__main__.Node at 0x7f60cc7be3d0>,
 'pcmcat226500050012': <__main__.Node at 0x7f60cc7bea30>,
 '