In [1]:
import pickle
from model import Node, categorize_next, display_lazy_tree, format_node, format_time

In [2]:
import chardet

# Guess the CSV's text encoding from its first 8 MiB instead of reading the
# whole file; the detection result is reused when the file is parsed below.
with open("amazon_products.csv", mode="rb") as csv_bytes:
    result = chardet.detect(csv_bytes.read(8 * 1024 * 1024))

print(result)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [3]:
import pandas as pd
# Parse the product CSV using the encoding reported by the chardet sniff above.
df = pd.read_csv(
    "amazon_products.csv",
    encoding=result["encoding"],
)

In [4]:
# Work on a 20,000-row subset. The seed is fixed so that a fresh
# "Restart & Run All" draws the same rows every time; the size is capped at
# len(df) because DataFrame.sample raises when asked for more rows than exist
# (without replacement).
df = df.sample(n=min(20000, len(df)), random_state=42)
df.head()

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
347681,B0BDR8BST3,"Warm Bird Nest House for Cages, Hanging Bird B...",https://m.media-amazon.com/images/I/61ZE5YPoj9...,https://www.amazon.com/dp/B0BDR8BST3,3.5,0,12.58,0.0,178,False,0
693671,B086KMYNSS,"Face Mask, Pack of 50 (5081)",https://m.media-amazon.com/images/I/41j2VaJpQ+...,https://www.amazon.com/dp/B086KMYNSS,4.5,0,3.99,0.0,162,False,600
179343,B099DHX1S6,FitTurn Bands Compatible with Fitbit Luxe Meta...,https://m.media-amazon.com/images/I/61U0E2KLrp...,https://www.amazon.com/dp/B099DHX1S6,3.9,236,13.98,0.0,128,False,0
277908,B0B8MWKJWC,Tanstic 4Pcs 2 Inch U Groove Wheels with Brack...,https://m.media-amazon.com/images/I/71aVEwTFf5...,https://www.amazon.com/dp/B0B8MWKJWC,3.7,8,17.99,0.0,141,False,0
1281311,B09VK7L627,17Pcs 1.57inch Small Brass Bell Copper Grazing...,https://m.media-amazon.com/images/I/81lc11KPni...,https://www.amazon.com/dp/B09VK7L627,4.7,0,14.99,0.0,182,False,0


In [5]:
# Load the pickled category tree used for the interactive cells below.
# NOTE(review): unpickling can execute arbitrary code -- only load tree files
# that were produced by this project.
with open("./tree_v4_improved.pkl", mode="rb") as tree_file:
    root = pickle.load(tree_file)

In [6]:
from typing import Callable, Optional, Tuple
from langchain_core.language_models.chat_models import BaseChatModel

from model.data import TokenCounts

def classify(item_description: str, root: Node, create_llm: Callable[[], BaseChatModel]) -> Tuple[Optional[Node], TokenCounts]:
    """Descend the category tree from ``root`` and pick a leaf for an item.

    At every non-leaf node ``categorize_next`` is asked to choose one of the
    node's still-unexplored children. The chosen child is explored
    recursively; if that subtree yields nothing, the child is dropped from the
    candidate list ("masked") and the same node is asked again with the
    remaining children.

    Args:
        item_description: Text describing the item to classify.
        root: Node at which the descent starts. A leaf ``root`` is returned
            as-is.
        create_llm: Zero-argument factory forwarded to ``categorize_next``.

    Returns:
        ``(node, tokens)``. ``node`` is the leaf that was reached, or ``None``
        when ``categorize_next`` declined to choose at the starting node or
        every child of it was tried without success. ``tokens`` accumulates
        the token counts of every ``categorize_next`` call made on the way,
        including calls in subtrees that were abandoned.
    """
    def classify_recur(item_description: str, node: Node) -> Tuple[Optional[Node], TokenCounts]:
        if node.is_leaf():
            return node, TokenCounts()
        # Copy the child list: masking removes entries and must not touch the tree.
        children = [*node.children]
        all_tokens = TokenCounts()
        print(f"Entering {node.condition}")
        while len(children) > 0:
            print(f"Trying {node.condition} with {len(children)} unexplored child nodes.")
            choice, tokens = categorize_next(item_description=item_description, nodes=children, create_llm=create_llm)
            # Single quotes inside the replacement field: reusing the outer
            # double quote only parses on Python 3.12+ (PEP 701).
            print(f"Chose {choice.condition if choice else 'None'}")
            all_tokens += tokens
            if choice is None:
                # No child fits: report failure upward so the caller can mask
                # this node and try one of its siblings.
                print("Retrying previous node with mask.")
                return (None, all_tokens)

            print("Trying chosen subtree.")
            result, tokens = classify_recur(item_description=item_description, node=choice)
            print(f"Subtree yielded choice {result.condition if result else 'None'}.")
            all_tokens += tokens
            if result is not None:
                print(f"Yielding subtree result {result.condition}")
                return (result, all_tokens)

            # Dead end below `choice`: exclude it and ask again at this node.
            print(f"Retrying {node.condition} with mask.")
            children.remove(choice)

        # Every child was tried and rejected.
        return None, all_tokens

    return classify_recur(item_description=item_description, node=root)

In [7]:
from langchain_ollama import ChatOllama
def create_llm():
    """Return a new ChatOllama client for the ``qwen2.5:32b`` model.

    The temperature override is currently disabled, so the client is built
    without an explicit temperature argument.
    """
    return ChatOllama(
        model="qwen2.5:32b",
        # temperature=0,
    )

In [8]:
# Show the loaded tree; the recorded output below is a VBox wrapping a
# FigureWidget. `max_initial_depth` is passed through to the helper unchanged.
display_lazy_tree(
    root,
    max_initial_depth=5,
)

VBox(children=(FigureWidget({
    'data': [{'branchvalues': 'total',
              'ids': [34a435e9-b882-45d3-…

In [9]:
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_community.callbacks import get_openai_callback


# Structured-output schema: prompt_tentative_description passes this class to
# `with_structured_output`, and the caller reads the `.description` field.
# Documented with `#` comments on purpose -- NOTE(review): pydantic copies a
# class docstring into the generated schema, which would presumably change
# what is sent to the model; confirm before converting this to a docstring.
class Description(BaseModel):
    # Free-text justification from the model; declared before the answer field.
    reasoning: str = Field(description="Reasoning on your choice.")
    # The condensed, generic item description used for classification.
    description: str = Field(description="The shortened description")

def prompt_tentative_description(item_description: str, create_llm: Callable[[], BaseChatModel]) -> Tuple[Description, TokenCounts]:
    """Ask the LLM for a short, generic description of an e-commerce item.

    Args:
        item_description: Raw item text substituted into the prompt.
        create_llm: Zero-argument factory producing the chat model to query.

    Returns:
        ``(response, tokens)`` -- the structured ``Description`` answer from the
        model, and the prompt/completion/total token counts recorded by the
        callback around the call.
    """
    prompt_text = """
    From the given e-commerce item description give a simplified short, generic description that would be useful in categorizing an item into product standardizations.
    From specific product names and branding provide a description that includes the true nature of the product.
    Use the given tool to provide your answer.
    
    {item}
    """.strip()

    # Render the template to a plain string, then send it as one human message.
    rendered = ChatPromptTemplate.from_template(prompt_text).format(item=item_description)

    with get_openai_callback() as usage:
        structured_llm = create_llm().with_structured_output(Description)
        answer = structured_llm.invoke([HumanMessage(rendered)])

    counts = TokenCounts(
        prompt=usage.prompt_tokens,
        completion=usage.completion_tokens,
        total=usage.total_tokens,
    )
    return answer, counts
    

In [10]:
from ipywidgets import IntProgress, VBox, Label
from IPython.display import display
import time

def create_classification_sample(df: pd.DataFrame, root: Node) -> Tuple[pd.DataFrame, TokenCounts]:
    """Classify every row of ``df`` against the tree at ``root``.

    For each row the ``title`` value is first condensed with
    ``prompt_tentative_description``; the condensed text is then handed to
    ``classify``. A progress bar, an items counter and a remaining-time
    estimate are displayed and refreshed after every row.

    The LLM factory is not a parameter: the module-level ``create_llm`` is
    used for both steps.

    Args:
        df: Frame with a ``title`` column; rows are processed in iteration
            order.
        root: Root node of the tree to classify against.

    Returns:
        ``(result, all_tokens)``. ``result`` holds one row per input row with
        the columns ``item``, ``imputed_desc`` and ``classification``
        (the string ``"None"`` when no node was selected). ``all_tokens`` is
        the sum of the token counts of both steps over all rows.
    """
    items: list[dict[str, object]] = []
    all_tokens = TokenCounts()

    total_rows = len(df)
    progress_bar = IntProgress(min=0, max=total_rows, description='Progress:', bar_style='info')
    label = Label(value=f"0/{total_rows} items processed")
    time_label = Label(value="Estimating time...")
    display(VBox([progress_bar, label, time_label]))

    processed_rows = 0
    start_time = time.time()

    for _, row in df.iterrows():
        title = row["title"]
        print(f"\tItem: {title}")
        short_desc, desc_tokens = prompt_tentative_description(item_description=title, create_llm=create_llm)
        # NOTE(review): the structured-output call may hand back None when the
        # model's reply carries no parseable payload (confirm for the installed
        # LangChain version). Fall back to the raw title instead of aborting
        # the whole run on an AttributeError.
        description = short_desc.description if short_desc is not None else str(title)
        print(f"\tShort Desc: {description}")
        node, tokens = classify(item_description=description, root=root, create_llm=create_llm)
        record = {
            "item": title,
            "imputed_desc": description,
            "classification": format_node(node) if node else "None"
        }
        items.append(record)
        all_tokens += tokens + desc_tokens

        processed_rows += 1
        elapsed_time = time.time() - start_time
        progress_bar.value = processed_rows
        label.value = f"{processed_rows}/{total_rows} items processed"
        # Linear extrapolation from the mean time per processed row so far.
        estimated_total_time = elapsed_time / processed_rows * total_rows
        remaining_time = estimated_total_time - elapsed_time
        time_label.value = f"Estimated remaining time: {format_time(remaining_time)}"

    result = pd.DataFrame(items)
    return result, all_tokens


In [12]:
import os
import time

# Evaluate every tree version on the same 100 items. The seed is fixed so a
# rerun scores the same rows; the size is capped because DataFrame.sample
# raises when asked for more rows than exist (without replacement).
df_sample = df.sample(n=min(100, len(df)), random_state=42)

# One entry per tree version: the pickled tree to load and the CSV that
# receives its classifications.
args: list[dict[str, str]] = [
    {
        "model_file": "tree_v3_improved.pkl",
        "output_file": "classifications_v3_improved.csv"
    },
    {
        "model_file": "tree_v3.pkl",
        "output_file": "classifications_v3.csv"
    },
    {
        "model_file": "tree_v4_improved.pkl",
        "output_file": "classifications_v4_improved.csv"
    },
    {
        "model_file": "tree_v4.pkl",
        "output_file": "classifications_v4.csv"
    },
]

output_dir = "classification_results"
tokens_output_file = "tokens.csv"

os.makedirs(output_dir, exist_ok=True)


tokens_dicts = []
for argset in args:
    start_time = time.time()
    # NOTE(review): unpickling can execute arbitrary code -- only load tree
    # files produced by this project. This also rebinds the notebook-level
    # `root`, which afterwards refers to the last tree in `args`.
    with open(argset["model_file"], "rb") as f:
        root = pickle.load(f)

    out_df, tokens = create_classification_sample(df_sample, root)
    elapsed_time = time.time() - start_time
    tokens_dicts.append({
        **argset,
        "prompt": tokens.prompt,
        "completion": tokens.completion,
        "total": tokens.total,
        "elapsed_seconds": elapsed_time
    })
    out_df.to_csv(f'{output_dir}/{argset["output_file"]}')
    print(f"Saved classifications from {argset['model_file']} to {output_dir}/{argset['output_file']}")
    # Rewrite the running token table after every tree so a failure in a
    # later run does not discard the statistics already collected.
    pd.DataFrame(tokens_dicts).to_csv(tokens_output_file)

tokens_df = pd.DataFrame(tokens_dicts)
tokens_df.to_csv(tokens_output_file)

VBox(children=(IntProgress(value=0, bar_style='info', description='Progress:'), Label(value='0/100 items proce…

	Item: FUNCRECOL ABS-Like 3D Printer Resin 1KG Grey
	Short Desc: 3D Printer Resin - Grey - 1KG
Entering All Products
Trying All Products with 20 unexplored child nodes.
Chose Additive Manufacturing Products
Trying chosen subtree.
Subtree yielded choice Additive Manufacturing Products.
Yielding subtree result Additive Manufacturing Products
	Item: JOINLGO 4-CH GPS WiFi 1080P AHD Mobile Vehicle Car DVR MDVR Video Recorder Kit Remote Live View on PC Phone with IP69 Night Vision 2.0MP Rear Side View Car Camera for Truck Bus RV
	Short Desc: Vehicle Car DVR Kit with Multiple Cameras and Remote Viewing Capability
Entering All Products
Trying All Products with 20 unexplored child nodes.
Chose Auto Parts & Accessories
Trying chosen subtree.
Entering Auto Parts & Accessories
Trying Auto Parts & Accessories with 12 unexplored child nodes.
Chose Automotive Exterior Accessories
Trying chosen subtree.
Subtree yielded choice Automotive Exterior Accessories.
Yielding subtree result Automotive Exterior

VBox(children=(IntProgress(value=0, bar_style='info', description='Progress:'), Label(value='0/100 items proce…

	Item: FUNCRECOL ABS-Like 3D Printer Resin 1KG Grey
	Short Desc: 3D Printer Resin, Grey, 1KG
Entering All Products
Trying All Products with 19 unexplored child nodes.
Chose Additive Manufacturing Products
Trying chosen subtree.
Subtree yielded choice Additive Manufacturing Products.
Yielding subtree result Additive Manufacturing Products
	Item: JOINLGO 4-CH GPS WiFi 1080P AHD Mobile Vehicle Car DVR MDVR Video Recorder Kit Remote Live View on PC Phone with IP69 Night Vision 2.0MP Rear Side View Car Camera for Truck Bus RV
	Short Desc: Vehicle Security System: 4-Channel AHD Car DVR Kit with GPS, WiFi Remote Live View, 1080P Recording, Night Vision, Rear Camera
Entering All Products
Trying All Products with 19 unexplored child nodes.
Chose Automotive Accessories & Parts
Trying chosen subtree.
Entering Automotive Accessories & Parts
Trying Automotive Accessories & Parts with 12 unexplored child nodes.
Chose Automotive Performance Parts & Accessories
Trying chosen subtree.
Subtree yielded c

VBox(children=(IntProgress(value=0, bar_style='info', description='Progress:'), Label(value='0/100 items proce…

	Item: FUNCRECOL ABS-Like 3D Printer Resin 1KG Grey
	Short Desc: 3D Printer Resin, 1KG
Entering All Products
Trying All Products with 12 unexplored child nodes.
Chose Printmaking Supplies
Trying chosen subtree.
Subtree yielded choice Printmaking Supplies.
Yielding subtree result Printmaking Supplies
	Item: JOINLGO 4-CH GPS WiFi 1080P AHD Mobile Vehicle Car DVR MDVR Video Recorder Kit Remote Live View on PC Phone with IP69 Night Vision 2.0MP Rear Side View Car Camera for Truck Bus RV
	Short Desc: Vehicle DVR Kit with 4CH AHD Cameras, 1080P Video Recording, GPS, WiFi Remote Live View, Night Vision
Entering All Products
Trying All Products with 12 unexplored child nodes.
Chose Vehicle Maintenance & Powersports Equipment
Trying chosen subtree.
Entering Vehicle Maintenance & Powersports Equipment
Trying Vehicle Maintenance & Powersports Equipment with 7 unexplored child nodes.
Chose RV Parts & Accessories
Trying chosen subtree.
Subtree yielded choice RV Parts & Accessories.
Yielding subtree

VBox(children=(IntProgress(value=0, bar_style='info', description='Progress:'), Label(value='0/100 items proce…

	Item: FUNCRECOL ABS-Like 3D Printer Resin 1KG Grey
	Short Desc: 3D Printer Resin - Grey, 1KG (ABS-like)
Entering All Products
Trying All Products with 10 unexplored child nodes.
Chose Education & Research Supplies
Trying chosen subtree.
Entering Education & Research Supplies
Trying Education & Research Supplies with 2 unexplored child nodes.
Chose None
Retrying previous node with mask.
Subtree yielded choice None.
Retrying All Products with mask.
Trying All Products with 9 unexplored child nodes.
Chose Professional & Business Tools
Trying chosen subtree.
Entering Professional & Business Tools
Trying Professional & Business Tools with 2 unexplored child nodes.
Chose None
Retrying previous node with mask.
Subtree yielded choice None.
Retrying All Products with mask.
Trying All Products with 8 unexplored child nodes.
Chose Industrial & Commercial Goods
Trying chosen subtree.
Entering Industrial & Commercial Goods
Trying Industrial & Commercial Goods with 4 unexplored child nodes.
Chose I

In [13]:
# `tokens` was last assigned inside the loop over `args`, so this displays the
# totals for the final entry only (tree_v4.pkl), not for all four runs.
tokens

TokenCounts(prompt=76492, completion=45351, total=121843)