In [2]:
import pickle
from model import Node, categorize_next, display_lazy_tree, format_node, format_time

In [3]:
import chardet

# Guess the CSV's text encoding from its first 8 MiB instead of reading the whole file.
with open("amazon_products.csv", mode="rb") as csv_file:
    head_bytes = csv_file.read(8 << 20)

# `result` is kept as a global: the cell that loads the CSV reads result["encoding"].
result = chardet.detect(head_bytes)
print(result)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [4]:
import pandas as pd
# Load the product table, decoding it with the encoding chardet reported.
detected_encoding = result["encoding"]
df = pd.read_csv("amazon_products.csv", encoding=detected_encoding)

In [5]:
# Keep a 20,000-row random subset of the products. The draw is seeded so that
# re-executing the notebook selects the same rows from the same CSV.
df = df.sample(20000, random_state=42)
df.head()

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
1150788,B002JTG0PO,"Polk Audio MC60 2-Way In-Ceiling 6.5"" Speaker ...",https://m.media-amazon.com/images/I/81-1RNG2Np...,https://www.amazon.com/dp/B002JTG0PO,4.7,0,149.0,0.0,82,False,0
477829,B00S15FGE6,Chad Uniform Oxford (Toddler/Little Kid/Big Kid),https://m.media-amazon.com/images/I/61o+VZ39Oa...,https://www.amazon.com/dp/B00S15FGE6,4.5,0,56.49,65.0,97,False,0
1126164,B06ZYGB2NM,"CRAFTMEMORE 1/4"" Hole Size 100 Sets Gunmetal B...",https://m.media-amazon.com/images/I/616S8TsTOE...,https://www.amazon.com/dp/B06ZYGB2NM,4.3,0,12.99,0.0,6,False,0
695195,B09364MYWD,Mini Size Glass Teapot Tea Kettle-with Stainle...,https://m.media-amazon.com/images/I/71Gqk22o0A...,https://www.amazon.com/dp/B09364MYWD,4.4,0,9.98,0.0,162,False,800
128464,B0BK8B3Y23,"Flying Orb Ball Toy, Hover/Boomerang Ball, Fly...",https://m.media-amazon.com/images/I/71n+Q5Sb0L...,https://www.amazon.com/dp/B0BK8B3Y23,4.0,632,32.0,0.0,228,False,100


In [6]:
# Load the pickled tree into `root`.
# Security note: unpickling can execute arbitrary code, so only load trusted files.
with open("./tree_v4_improved.pkl", mode="rb") as tree_file:
    root = pickle.load(tree_file)

In [6]:
from typing import Callable, Optional, Tuple
from langchain_core.language_models.chat_models import BaseChatModel

from model.data import TokenCounts

def classify(item_description: str, root: Node, create_llm: Callable[[], BaseChatModel]) -> Tuple[Optional[Node], TokenCounts]:
    """Find a leaf of the tree for an item, backtracking over rejected subtrees.

    At each non-leaf node ``categorize_next`` is asked to pick one of the
    children that have not been ruled out yet. The picked child is explored
    recursively; if its subtree yields nothing, that child is removed from the
    candidates and ``categorize_next`` is asked again with the remaining ones.
    If ``categorize_next`` picks nothing, the node gives up immediately.

    Progress is traced with ``print`` at every step.

    Args:
        item_description: Description of the item to classify.
        root: Node at which the search starts.
        create_llm: Zero-argument chat-model factory, forwarded to
            ``categorize_next``.

    Returns:
        ``(leaf, tokens)``: ``leaf`` is the leaf node that was reached, or
        ``None`` if the search found none; ``tokens`` accumulates the token
        counts returned by every ``categorize_next`` call that was made.
    """

    def classify_recur(node: Node) -> Tuple[Optional[Node], TokenCounts]:
        if node.is_leaf():
            return node, TokenCounts()

        # Work on a copy so that masking rejected children never mutates the tree.
        children = [*node.children]
        all_tokens = TokenCounts()
        print(f"Entering {node.condition}")
        while children:
            print(f"Trying {node.condition} with {len(children)} unexplored child nodes.")
            choice, tokens = categorize_next(item_description=item_description, nodes=children, create_llm=create_llm)
            # The label is built outside the f-string: reusing the enclosing quote
            # character inside a replacement field is a SyntaxError before Python 3.12.
            choice_label = choice.condition if choice else "None"
            print(f"Chose {choice_label}")
            all_tokens += tokens
            if choice is None:
                print("Retrying previous node with mask.")
                return None, all_tokens

            print("Trying chosen subtree.")
            result, tokens = classify_recur(choice)
            result_label = result.condition if result else "None"
            print(f"Subtree yielded choice {result_label}.")
            all_tokens += tokens
            if result is not None:
                print(f"Yielding subtree result {result.condition}")
                return result, all_tokens

            # Dead end below `choice`: mask it and ask again among the rest.
            print(f"Retrying {node.condition} with mask.")
            children.remove(choice)

        # Every child was tried and rejected.
        return None, all_tokens

    return classify_recur(root)

In [7]:
from langchain_ollama import ChatOllama
create_llm = lambda: ChatOllama(
    model="qwen2.5:32b",
    # temperature=0,
)

In [7]:
# Show the loaded tree; max_initial_depth=5 presumably caps how many levels
# are rendered up front (confirm against model.display_lazy_tree).
tree_view = display_lazy_tree(root, max_initial_depth=5)
tree_view

VBox(children=(FigureWidget({
    'data': [{'branchvalues': 'total',
              'ids': [9d02c85c-ac42-4982-…

In [9]:
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_community.callbacks import get_openai_callback


# Structured-output schema handed to with_structured_output() in
# prompt_tentative_description.
# NOTE(review): documented with comments instead of a docstring because a
# docstring on a pydantic model may become part of the schema description sent
# to the model — confirm before adding one.
class Description(BaseModel):
    # Free-text justification for the chosen wording; declared before the answer field.
    reasoning: str = Field(description="Reasoning on your choice.")
    # The shortened description; callers pass this on to classify().
    description: str = Field(description="The shortened description")

def prompt_tentative_description(item_description: str, create_llm: Callable[[], BaseChatModel]) -> Tuple[Description, TokenCounts]:
    """Ask the LLM for a short, generic description of an e-commerce item.

    Args:
        item_description: Raw item text (for example a product title).
        create_llm: Zero-argument chat-model factory; the model it returns is
            wrapped with ``with_structured_output(Description)``.

    Returns:
        ``(response, tokens)``: ``response`` is whatever the structured-output
        model returns for the ``Description`` schema, and ``tokens`` holds the
        prompt/completion/total counts recorded by the callback.
    """
    template = """
    From the given e-commerce item description give a simplified short, generic description that would be useful in categorizing an item into product standardizations.
    From specific product names and branding provide a description that includes the true nature of the product.
    Use the given tool to provide your answer.
    
    {item}
    """.strip()

    chat_prompt = ChatPromptTemplate.from_template(template)
    # format_messages() returns the human message itself. format() would flatten
    # the chat into one string that starts with a "Human: " role label, and that
    # label would then be sent to the model as part of the prompt text.
    messages = chat_prompt.format_messages(item=item_description)

    # NOTE(review): get_openai_callback is OpenAI-oriented; confirm that it
    # records token usage for the model produced by create_llm.
    with get_openai_callback() as cb:
        llm = create_llm().with_structured_output(Description)
        response = llm.invoke(messages)

    return response, TokenCounts(prompt=cb.prompt_tokens, completion=cb.completion_tokens, total=cb.total_tokens)
    

In [10]:
from ipywidgets import IntProgress, VBox, Label
from IPython.display import display
import time

def create_classification_sample(df: pd.DataFrame, root: Node) -> Tuple[pd.DataFrame, TokenCounts]:
    """Classify every row of ``df`` against the tree and collect the results.

    For each value in the ``title`` column, a short description is requested
    from the LLM and then classified with ``classify``. A progress bar with an
    estimated remaining time is displayed while the rows are processed.

    Note: the LLM factory is not a parameter; the notebook-level ``create_llm``
    is used for both calls.

    Args:
        df: Frame with a ``title`` column holding the item text.
        root: Root node passed to ``classify``.

    Returns:
        ``(result, tokens)``: ``result`` has one row per input row with the
        columns ``item``, ``imputed_desc`` and ``classification`` (the string
        ``"None"`` when no node was found); ``tokens`` is the sum of the token
        counts of all LLM calls.
    """
    records: list[dict[str, object]] = []
    all_tokens = TokenCounts()

    total_rows = len(df)
    progress_bar = IntProgress(min=0, max=total_rows, description='Progress:', bar_style='info')
    label = Label(value=f"0/{total_rows} items processed")
    time_label = Label(value="Estimating time...")
    display(VBox([progress_bar, label, time_label]))

    start_time = time.time()

    # Only the title is needed, so iterate the column directly instead of
    # materialising a Series per row with iterrows().
    for processed_rows, title in enumerate(df["title"], start=1):
        print(f"\tItem: {title}")
        short_desc, desc_tokens = prompt_tentative_description(item_description=title, create_llm=create_llm)
        print(f"\tShort Desc: {short_desc.description}")
        node, tokens = classify(item_description=short_desc.description, root=root, create_llm=create_llm)
        records.append({
            "item": title,
            "imputed_desc": short_desc.description,
            "classification": format_node(node) if node else "None"
        })
        all_tokens += tokens + desc_tokens

        # Linear extrapolation: average time per processed row times the total.
        elapsed_time = time.time() - start_time
        progress_bar.value = processed_rows
        label.value = f"{processed_rows}/{total_rows} items processed"
        estimated_total_time = elapsed_time / processed_rows * total_rows
        remaining_time = estimated_total_time - elapsed_time
        time_label.value = f"Estimated remaining time: {format_time(remaining_time)}"

    result = pd.DataFrame(records)
    return result, all_tokens


In [None]:
# The same 100 rows are classified with every tree below. The draw is seeded so
# that re-running this cell picks the same rows from `df`.
df_sample = df.sample(100, random_state=42)

# One entry per tree variant: the pickled tree to load and the CSV that
# receives its classifications (written inside `output_dir`).
args: list[dict[str, str]] = [
    {
        "model_file": "tree_v3_improved.pkl",
        "output_file": "classifications_v3_improved.csv"
    },
    {
        "model_file": "tree_v3.pkl",
        "output_file": "classifications_v3.csv"
    },
    {
        "model_file": "tree_v4_improved.pkl",
        "output_file": "classifications_v4_improved.csv"
    },
    {
        "model_file": "tree_v4.pkl",
        "output_file": "classifications_v4.csv"
    },
]

output_dir = "classification_results"
# Token/timing summary; written relative to the working directory, not output_dir.
tokens_output_file = "tokens.csv"

import os
import time

os.makedirs(output_dir, exist_ok=True)

# One summary row per tree variant: its file names, token usage and elapsed time.
tokens_dicts = []
for argset in args:
    model_file = argset["model_file"]
    output_path = f'{output_dir}/{argset["output_file"]}'

    # The timer starts before the tree is loaded, so load time is included.
    start_time = time.time()
    # Unpickling can execute arbitrary code; only load trusted tree files.
    with open(model_file, "rb") as tree_file:
        root = pickle.load(tree_file)

    out_df, tokens = create_classification_sample(df_sample, root)
    elapsed_time = time.time() - start_time

    summary = dict(argset)
    summary.update(
        prompt=tokens.prompt,
        completion=tokens.completion,
        total=tokens.total,
        elapsed_seconds=elapsed_time,
    )
    tokens_dicts.append(summary)

    out_df.to_csv(output_path)
    print(f"Saved classifications from {model_file} to {output_path}")

tokens_df = pd.DataFrame(tokens_dicts)
tokens_df.to_csv(tokens_output_file)

VBox(children=(IntProgress(value=0, bar_style='info', description='Progress:'), Label(value='0/100 items proce…

	Item: WOCRAFT 100g(About 76pcs) Craft Supplies Small Antique Bronze Silver Sun Moon Star Charms Pendants for Jewelry Making Findings Crafting Accessory for DIY Necklace Bracelet (M206)
	Short Desc: Small antique bronze silver sun, moon, star charms for DIY jewelry
Entering All Products
Trying All Products with 12 unexplored child nodes.
Chose None
Retrying previous node with mask.
	Item: Dinosaur Swaddle Blanket, Dinosaur Baby Stuff, Newborn Soft Wearable Blanket Swaddle Up Sleep Sacks for Boys Girls Dinosaur Shower Gift
	Short Desc: Soft wearable blanket or sleep sack for newborns, suitable for both boys and girls.
Entering All Products
Trying All Products with 12 unexplored child nodes.
Chose Children's Education & Apparel
Trying chosen subtree.
Entering Children's Education & Apparel
Trying Children's Education & Apparel with 8 unexplored child nodes.
Chose None
Retrying previous node with mask.
Subtree yielded choice None.
Retrying All Products with mask.
Trying All Products with 