In [1]:
import chardet

# Guess the CSV's character encoding from its raw bytes; the guess is kept in `result`.
# NOTE(review): the recorded run reports a confidence of 0.73, so the detected encoding is a
# best guess rather than a certainty.
with open("data-unspsc-codes.csv", 'rb') as f:
    # The whole file is read into memory and handed to chardet in one call.
    result = chardet.detect(f.read())
    print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [2]:
import pandas as pd

# Load the CSV with the encoding chardet reported; relies on `result` still holding the
# detection dict produced by the encoding-detection cell.
df = pd.read_csv("data-unspsc-codes.csv", encoding=result['encoding'])

# Preview the first rows.
df.head()

Unnamed: 0,Segment,Segment Name,Family,Family Name,Class,Class Name,Commodity,Commodity Name
0,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101501,Cats
1,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101502,Dogs
2,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101504,Mink
3,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101505,Rats
4,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101506,Horses


In [3]:
# Hierarchy name columns, ordered from the broadest level to the most specific one.
cat_cols = ["Segment Name", "Family Name", "Class Name", "Commodity Name"]

In [None]:
from itertools import combinations

# For every hierarchy level, report its cardinality. For all levels except the last, also report
# how many distinct values sit directly beneath each value and whether any child value is shared
# by two different parents (which would make the hierarchy ambiguous).
for i, col in enumerate(cat_cols):
    unique = pd.unique(df[col])
    n_unique = len(unique)

    total = len(df[col])
    print(f"col: {col}\nunqiue: {n_unique}\ntotal: {total}\n")

    if i < len(cat_cols) - 1:
        next_col = cat_cols[i + 1]
        next_branch_counts = []
        next_branches: dict[str, set] = {}

        for uc in unique:
            df_next = df[df[col] == uc]
            # Compute the distinct children once and reuse them for both the
            # branch count and the overlap check below.
            children = set(pd.unique(df_next[next_col]))
            next_branch_counts.append(len(children))
            next_branches[uc] = children

        print(f"next level:\nmax:{max(next_branch_counts)}\navg:{sum(next_branch_counts)/len(next_branch_counts)}\n")

        # Pairwise comparison of every two parents' child sets; any shared child is reported.
        ambiguous = False
        for a, b in combinations(next_branches.keys(), 2):
            intersection = next_branches[a] & next_branches[b]
            if len(intersection) > 0:
                ambiguous = True
                print(f"The following nodes appear in both {a} and {b} for {next_col}: {intersection}")

        if ambiguous:
            print(f"{col} has ambiguous branches\n")
        else:
            print(f"{col} does NOT have ambiguous branches\n")
            

col: Segment Name
unqiue: 57
total: 71502

next level:
max:43
avg:8.157894736842104

Segment Name does NOT have ambiguous branches

col: Family Name
unqiue: 465
total: 71502

next level:
max:67
avg:11.425806451612903

Family Name does NOT have ambiguous branches

col: Class Name
unqiue: 5313
total: 71502

next level:
max:99
avg:13.45793337097685

Class Name does NOT have ambiguous branches

col: Commodity Name
unqiue: 71502
total: 71502



In [4]:
from typing import Any, Callable, Union



class Node:
    """Class representing a node in the tree.

    Args:
        condition: Description/condition for belonging to this node; None for the root.
        parent: The parent Node; None for the root.
        extras: Extra enrichment data to attach. When omitted, each node gets its own
            fresh empty dict. A dict supplied by the caller is stored as-is (even if empty).
    """

    breadcrumb_name: Union[str, None] = None
    """The name of the breadcrumb found in the original dataset or None"""

    condition: Union[str, None] = None
    """The description or condition representing belonging to this node or None if this Node is the root"""

    extras: dict[str, Any]
    """Extra information stored at this node for enrichment"""

    parent: Union['Node', None]
    """The parent Node or None if this Node is the root"""

    children: list['Node']
    """The child Nodes"""

    # TODO: create next condition when children are updated
    # TODO: children should be property

    def is_leaf(self) -> bool:
        """Return True when this node has no children."""
        return len(self.children) == 0

    def is_root(self) -> bool:
        """Return True when this node has no parent."""
        # Identity check: `== None` would defer to any custom __eq__ on the parent object.
        return self.parent is None

    def is_from_breadcrumb(self) -> bool:
        """Return True when `breadcrumb_name` has been set on this node."""
        return self.breadcrumb_name is not None

    def __init__(
        self,
        condition: Union[str, None] = None,
        parent: Union['Node', None] = None,
        extras: Union[dict[str, Any], None] = None,
    ):
        self.condition = condition
        self.parent = parent
        # A None sentinel replaces the mutable `{}` default; only a missing argument
        # triggers creation of a new dict, so a caller's (possibly empty) dict is kept.
        self.extras = extras if extras is not None else {}
        self.children = []

    def add_children(self, children: list['Node']):
        """Append `children` to this node's child list (their `parent` is not modified)."""
        self.children.extend(children)
        
        
def _create_partial_tree_from_breadcrumb(df: pd.DataFrame, parent: Node, idx: int, breadcrumb_cols: list[str], extra_cols_map: dict[str, list[str]] = None) -> list[Node]:
    """Build the nodes for hierarchy level `idx` (and, recursively, every level below it).

    One node is created per distinct value of `breadcrumb_cols[idx]` found in `df`. Each node
    receives that value as its condition, `parent` as its parent, and — for every column listed
    under this level in `extra_cols_map` — the distinct values of that column among the node's rows.

    Nodes are constructed with condition/parent/extras only; `breadcrumb_name` is not assigned here.

    Args:
        df: Rows belonging to `parent` (the full dataset at the top level).
        parent: Node the created nodes are attached under.
        idx: Position of the current level within `breadcrumb_cols`.
        breadcrumb_cols: Ordered hierarchy columns.
        extra_cols_map: Optional mapping from a hierarchy column to the extra columns to record.

    Returns:
        The nodes created for this level, in order of first appearance in `df`.
    """
    level_col = breadcrumb_cols[idx]
    # Extra columns are the same for every value at this level.
    enrich_cols = extra_cols_map.get(level_col, []) if extra_cols_map else []
    has_deeper_level = idx + 1 < len(breadcrumb_cols)

    level_nodes = []
    for value in pd.unique(df[level_col]):
        rows = df[df[level_col] == value]
        node = Node(
            condition=value,
            parent=parent,
            extras={extra_col: pd.unique(rows[extra_col]) for extra_col in enrich_cols},
        )
        if has_deeper_level:
            # Recurse on this value's rows only, so children are scoped to their parent.
            deeper_nodes = _create_partial_tree_from_breadcrumb(
                rows,
                node,
                idx + 1,
                breadcrumb_cols=breadcrumb_cols,
                extra_cols_map=extra_cols_map,
            )
            node.add_children(deeper_nodes)
        level_nodes.append(node)
    return level_nodes
        
        
    
    
def create_tree_from_breadcrumbs(df: pd.DataFrame, breadcrumb_cols: list[str], extra_cols_map: dict[str, list[str]] = None) -> Node:
    """Create a tree from the breadcrumb (hierarchy) columns of a dataset.

    Suitable for datasets that already carry a hierarchy. `breadcrumb_cols` is the ordered
    list of columns representing the hierarchical levels, broadest first. `extra_cols_map`
    names, per breadcrumb column, the extra columns whose values are stored in `Node.extras`.

    Args:
        df (pd.DataFrame): dataset
        breadcrumb_cols (list[str]): ordered hierarchy columns
        extra_cols_map (dict[str, list[str]], optional): extra columns used to enrich the
            nodes of each breadcrumb column. Defaults to None.

    Returns:
        Node: a condition-less root node whose children are the first-level nodes.
    """
    tree_root = Node()
    first_level = _create_partial_tree_from_breadcrumb(
        df,
        parent=tree_root,
        idx=0,
        breadcrumb_cols=breadcrumb_cols,
        extra_cols_map=extra_cols_map,
    )
    tree_root.add_children(first_level)
    return tree_root
    

In [5]:
# Build the full tree; each level also records its numeric code column in `Node.extras`.
root = create_tree_from_breadcrumbs(
    df,
    breadcrumb_cols=["Segment Name", "Family Name", "Class Name", "Commodity Name"],
    extra_cols_map={
        "Segment Name": ["Segment"],
        "Family Name": ["Family"],
        "Class Name": ["Class"],
        "Commodity Name": ["Commodity"],
    },
)

In [6]:
def _check_branches(nodes: list[Node]):
    from itertools import combinations


    n_sub_branches = 0
    max_sub_branches = 0
    all_children = []
    for node in nodes:
        n_sub_branches += len(node.children)
        all_children.extend(node.children)
        max_sub_branches = max(len(node.children), max_sub_branches)
    
    avg_sub_branches = n_sub_branches / len(nodes)
    
    print(f"sub_branches: {n_sub_branches}, avg: {avg_sub_branches}, max: {max_sub_branches}\n")
    
    if len(all_children) > 0:
        _check_branches(all_children)

def check_tree(root: Node):
    """Print per-level branching statistics for the tree, starting at `root`'s children."""
    first_level = root.children
    _check_branches(first_level)

In [7]:
# Print branching statistics for each level of the tree built above.
check_tree(root)

sub_branches: 465, avg: 8.157894736842104, max: 43

sub_branches: 5313, avg: 11.425806451612903, max: 67

sub_branches: 71502, avg: 13.45793337097685, max: 99

sub_branches: 0, avg: 0.0, max: 0



In [8]:
import plotly.express as px

def display_tree(root: Node):
    """Render the subtree under `root` as a Plotly sunburst chart.

    Every node becomes one sector labelled with its condition ("Unnamed" when the
    condition is falsy, e.g. for a condition-less root).

    Sectors are linked to their parents through explicit unique ids rather than through
    their labels. Without ids, Plotly matches `parents` against `names`, so two nodes with
    the same label in different branches would be treated as the same sector.

    Args:
        root: Node whose subtree is drawn; it becomes the centre of the chart.
    """
    ids = []
    labels = []
    parents = []

    def traverse(node: Node, parent_id: str = ""):
        # The node's position in traversal order is unique even when labels repeat.
        node_id = str(len(ids))
        ids.append(node_id)
        labels.append(str(node.condition or "Unnamed"))
        # An empty parent id marks the top-level sector.
        parents.append(parent_id)

        for child in node.children:
            traverse(child, parent_id=node_id)

    traverse(root)

    fig = px.sunburst(
        ids=ids,
        names=labels,
        parents=parents,
    )
    fig.update_layout(margin=dict(t=10, l=10, r=10, b=10))
    fig.show()


In [59]:
def get_node_conditions(node: 'Node') -> list:
    """Return the conditions on the path from the root down to `node`.

    Walks the `parent` links upward from `node`, collecting each non-None `condition`,
    and returns them ordered root-first. Nodes without a condition (such as the root)
    contribute nothing.

    Args:
        node: Starting node; passing None yields an empty list.

    Returns:
        The collected conditions, outermost ancestor first.
    """
    conditions = []
    current = node
    # Identity test: `!= None` would be routed through any custom __eq__/__ne__.
    while current is not None:
        if current.condition is not None:
            conditions.append(current.condition)
        current = current.parent

    conditions.reverse()
    return conditions

def format_node(node: Node):
    """Return the breadcrumb path of `node` as a single " > "-separated string.

    Conditions are converted with `str()` before joining, so non-string conditions
    (for example numeric or missing values read from a dataset) no longer make
    `str.join` raise a TypeError.

    Args:
        node: Node whose ancestor conditions (root-first) are formatted.
    """
    labels = get_node_conditions(node=node)
    return " > ".join(str(label) for label in labels)

In [69]:
# Pick one node three levels down by position: 4th child of the root, its 1st child,
# and that node's 3rd child.
node = root.children[3].children[0].children[2]

# Show the node's breadcrumb path.
print(format_node(node))

# Plot the subtree below the selected node.
display_tree(node)

Resin and Rosin and Rubber and Foam and Film and Elastomeric Materials > Rubber and elastomers > Elastomers


In [70]:
from langchain_ollama import OllamaEmbeddings

# Module-level embedding model (served through Ollama) used by create_vector_store below.
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

from langchain_core.vectorstores import InMemoryVectorStore

def create_vector_store(texts: list[str]) -> InMemoryVectorStore:
    """Embed `texts` with the module-level `embeddings` model and return them in an in-memory store.

    Args:
        texts: The strings to embed and store.

    Returns:
        A new InMemoryVectorStore populated from `texts`.
    """
    return InMemoryVectorStore.from_texts(texts, embedding=embeddings)

In [71]:
# Collect the condition of every direct child of the selected `node`.
cats = [n.condition for n in node.children]

# Embed them into the module-level `vectorstore` used by later cells.
vectorstore = create_vector_store(cats)

In [72]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

def sample_from_embeddings(vectorstore: InMemoryVectorStore, samples: int = 10) -> list[str]:
    """Pick up to `samples` representative texts from a vector store.

    The stored vectors are L2-normalised and clustered with KMeans; for each cluster
    centre the stored text whose vector has the highest cosine similarity to that centre
    is returned.

    Args:
        vectorstore: Store whose entries expose 'vector' and 'text'.
        samples: Desired number of representatives. It is capped at the number of stored
            entries, because KMeans cannot form more clusters than there are points.

    Returns:
        The selected texts, one per cluster (an empty list when the store is empty or
        `samples` is not positive).
    """
    from sklearn.metrics.pairwise import cosine_similarity

    texts = []
    vectors = []
    for entry in vectorstore.store.values():
        texts.append(entry['text'])
        vectors.append(entry['vector'])

    # Guard the cases KMeans/normalize would reject: no data, or a non-positive request.
    k = min(samples, len(vectors))
    if k <= 0:
        return []

    normalized_embeddings = normalize(vectors, norm='l2', axis=1)

    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    kmeans.fit(normalized_embeddings)

    # Row i holds the similarity of cluster centre i to every stored vector.
    sims = cosine_similarity(kmeans.cluster_centers_, normalized_embeddings)
    representative_indices = np.argmax(sims, axis=1)

    return [texts[i] for i in representative_indices]

In [None]:
# Number of entries currently held by the vector store.
# NOTE(review): the recorded output below also shows a list of ten category names that this
# statement alone does not produce — presumably a sampling call was removed from the cell; confirm.
print(len(vectorstore.store))



25


['Polyacrylate ACM',
 'Synthetic polyisoprene IR',
 'Ethylene propylene EP',
 'Acrylonitrile butadiene NBR',
 'Fluorosilicone FVMQ',
 'Polyether urethane EU',
 'Chloropolyethylene CM',
 'Copolyester',
 'Epichlorohydrin ECO',
 'Polyolenfinic']

In [119]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
from pydantic import BaseModel, Field

# Structured answer requested from the chat model: a pair of complementary categories.
# NOTE(review): documentation is kept in `#` comments on purpose — a class docstring on a
# pydantic model may end up in the schema handed to the model; confirm before adding one.
class CategoryAnswer(BaseModel):
    # First proposed category; its description text is part of the field definition.
    category1: str = Field(description="The first created category that will divide the subset of items in the dataset into the next two subsets.")
    # Second proposed category, described as the negative of the first.
    category2: str = Field(description="The second created category that is the negative of the first one.")

def ask_model_category(node: Node, scope: str = "Products across all industries", max_samples: int = 10):
    """Ask the chat model for two categories that split the children of `node`.

    A representative sample of the node's child conditions is chosen by embedding them
    and calling `sample_from_embeddings`. The sample, the node's ancestor conditions and
    the dataset scope are inserted into the prompt, and the model's answer is parsed into
    a `CategoryAnswer`.

    The sample is drawn from this node's own children rather than from a module-level
    vector store, so the result no longer depends on which node was embedded last.

    Args:
        node: Node whose children should be divided into two groups.
        scope: Description of what the full dataset covers, inserted into the prompt.
        max_samples: Upper bound on the number of child conditions shown to the model.

    Returns:
        The structured answer produced by the model.

    Raises:
        ValueError: If `node` has no child conditions to sample from.
    """
    child_conditions = [str(child.condition) for child in node.children if child.condition is not None]
    if not child_conditions:
        raise ValueError("node has no child conditions to categorize")

    node_store = create_vector_store(child_conditions)
    selected_cats = sample_from_embeddings(vectorstore=node_store, samples=min(max_samples, len(child_conditions)))

    def _as_bullets(items) -> str:
        # Every item gets its own "* " marker (previously the first one had none). Continuation
        # lines carry the same four-space indent the template puts before the placeholder.
        return "\n    ".join(f"* {item}" for item in items)

    template = """
    Your job is to create two new categories that will serve as nodes in a tree to help people find items.
    
    The parent categories for this subset of the dataset are:
    {parent_conditions}
    
    The following are a subset of categorizations that exist below this level and need to be divided into two smaller groups:
    {conditions}

    The items in the full dataset cover the scope of: {scope}

    Choose a pair of categories that divide the ones lower than this level into two roughly equal groups.
    For any item in the scope, the categories you provide should not be ambiguous or overlap.
    The categories you provide should be specific to items that already fit inside the parent categories and divide those items further according to the instructions above.
    ALL items that satisfy the parent categories need to satisfy at least one of the categories you provide.
    
    Simply negating the first category is one valid answer: ex: Plants and Not Plants
    """

    prompt = ChatPromptTemplate.from_template(template)
    # format_messages yields message objects directly; format() would return a single string
    # carrying a role prefix, which then ended up inside the human message's content.
    messages = prompt.format_messages(
        conditions=_as_bullets(selected_cats),
        parent_conditions=_as_bullets(get_node_conditions(node)),
        scope=scope,
    )

    llm = ChatOllama(
        model="llama3.1",
        temperature=0,
    )

    llm = llm.with_structured_output(CategoryAnswer)

    return llm.invoke(messages)

In [120]:
# Ask the model for a two-way category split for the selected `node` and display the answer.
# NOTE(review): `result` is also the name the first cell uses for the chardet output that the
# CSV-loading cell reads via result['encoding']; re-running that cell after this one would fail.
result = ask_model_category(node=node)
result

CategoryAnswer(category1='Thermoplastic Elastomers', category2='Thermosetting Elastomers')