In [1]:
import chardet

# Sniff the CSV's text encoding from its raw bytes so pandas can decode it later.
with open("data-unspsc-codes.csv", "rb") as csv_file:
    result = chardet.detect(csv_file.read())

# `result` is a dict; its 'encoding' entry is reused when the CSV is loaded.
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [2]:
import pandas as pd

# Decode the CSV with the encoding chardet reported for the raw bytes.
detected_encoding = result["encoding"]
df = pd.read_csv("data-unspsc-codes.csv", encoding=detected_encoding)

# Preview the first rows to confirm the hierarchy columns were parsed.
df.head()

Unnamed: 0,Segment,Segment Name,Family,Family Name,Class,Class Name,Commodity,Commodity Name
0,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101501,Cats
1,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101502,Dogs
2,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101504,Mink
3,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101505,Rats
4,10000000,Live Plant and Animal Material and Accessories...,10100000,Live animals,10101500,Livestock,10101506,Horses


In [3]:
# Name columns of the hierarchy, ordered from the broadest level to the most specific.
cat_cols = [
    "Segment Name",
    "Family Name",
    "Class Name",
    "Commodity Name",
]

In [4]:
from itertools import combinations

# For every hierarchy level: report its cardinality, the fan-out to the next level,
# and whether any next-level name is shared by two different values of this level
# (which would make the name-based hierarchy ambiguous).
for i, col in enumerate(cat_cols):
    n_unique = df[col].nunique(dropna=False)
    total = len(df[col])
    # The "unqiue" label spelling is left untouched so the printed text stays the same.
    print(f"col: {col}\nunqiue: {n_unique}\ntotal: {total}\n")

    # The last level has no level below it, so there is nothing more to analyse.
    if i < len(cat_cols) - 1:
        child_col = cat_cols[i + 1]

        # Map each value of this level to the set of distinct values directly below it.
        # One pass over the de-duplicated (parent, child) pairs replaces a full-frame
        # boolean-mask scan per parent value; drop_duplicates keeps first-occurrence
        # order, so parents are visited in the same order pd.unique would give.
        next_branches: dict[str, set] = {}
        level_pairs = df[[col, child_col]].drop_duplicates()
        for parent, child in zip(level_pairs[col], level_pairs[child_col]):
            next_branches.setdefault(parent, set()).add(child)

        next_branch_counts = [len(children) for children in next_branches.values()]
        print(f"next level:\nmax:{max(next_branch_counts)}\navg:{sum(next_branch_counts)/len(next_branch_counts)}\n")

        # Invert the mapping: a child listed under more than one parent is ambiguous.
        # This avoids intersecting every pair of parents (quadratic in the number of
        # parents); only parents that actually share a child are ever paired.
        parents_by_child: dict = {}
        for parent, children in next_branches.items():
            for child in children:
                parents_by_child.setdefault(child, []).append(parent)

        shared_by_pair: dict = {}
        for child, parents in parents_by_child.items():
            # `parents` is already in parent order, so each pair comes out as (earlier, later).
            for pair in combinations(parents, 2):
                shared_by_pair.setdefault(pair, set()).add(child)

        # Report pairs in the order a pairwise scan over the parents would visit them.
        parent_pos = {parent: pos for pos, parent in enumerate(next_branches)}
        ambiguous = bool(shared_by_pair)
        for a, b in sorted(shared_by_pair, key=lambda pair: (parent_pos[pair[0]], parent_pos[pair[1]])):
            intersection = shared_by_pair[(a, b)]
            print(f"The following nodes appear in both {a} and {b} for {child_col}: {intersection}")

        if ambiguous:
            print(f"{col} has ambiguous branches\n")
        else:
            print(f"{col} does NOT have ambiguous branches\n")
            

col: Segment Name
unqiue: 57
total: 71502

next level:
max:43
avg:8.157894736842104

Segment Name does NOT have ambiguous branches

col: Family Name
unqiue: 465
total: 71502

next level:
max:67
avg:11.425806451612903

Family Name does NOT have ambiguous branches

col: Class Name
unqiue: 5313
total: 71502

next level:
max:99
avg:13.45793337097685

Class Name does NOT have ambiguous branches

col: Commodity Name
unqiue: 71502
total: 71502



In [None]:
from model import create_tree_from_breadcrumbs, check_tree, Node, display_tree, create_vector_store, ask_model_category, format_node, optimize_tree, ProgressBars, display_lazy_tree

In [6]:
# Build the category tree from the full table: one tree level per breadcrumb column,
# with the matching code column passed along for each level via extra_cols_map.
root = create_tree_from_breadcrumbs(
    df,
    breadcrumb_cols=[
        "Segment Name",
        "Family Name",
        "Class Name",
        "Commodity Name",
    ],
    extra_cols_map={
        "Segment Name": ["Segment"],
        "Family Name": ["Family"],
        "Class Name": ["Class"],
        "Commodity Name": ["Commodity"],
    },
)

In [7]:
# Sanity-check the freshly built tree. In the recorded run this printed one
# "sub_branches / avg / max" line per level, and the avg/max values equal the
# "next level" figures printed by the column analysis earlier in the notebook.
check_tree(root)

sub_branches: 465, avg: 8.157894736842104, max: 43

sub_branches: 5313, avg: 11.425806451612903, max: 67

sub_branches: 71502, avg: 13.45793337097685, max: 99

sub_branches: 0, avg: 0.0, max: 0



In [8]:
# Pick the first top-level child of the root for a closer look.
node = root.children[0]

# In the recorded run format_node produced the node's segment name.
print(format_node(node))

# Render only the top of this subtree (max_depth=2).
display_tree(node, max_depth=2)

Live Plant and Animal Material and Accessories and Supplies
Generating Visual Tree...
Displayng...


In [9]:
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Labels (`condition`) of the selected node's direct children; these are the texts
# that get embedded into the vector store below.
cats = [child.condition for child in node.children]

# Embedding model served through Ollama.
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

vectorstore = create_vector_store(texts=cats, embeddings=embeddings)


def create_llm():
    """Return a new ChatOllama client for the qwen2.5:14b model.

    Defined with `def` rather than a lambda bound to a name (PEP 8), so the
    factory has a real name in tracebacks. It is passed around as a zero-argument
    callable, so every call builds a separate client.
    """
    return ChatOllama(
        model="qwen2.5:14b",
        # temperature=0,  # left disabled: the model's default temperature is used
    )

In [13]:
# NOTE(review): the recorded run of this cell raised
# "TypeError: create_vector_store() missing 1 required positional argument: 'embeddings'".
# This call passes `embeddings`, so the failing create_vector_store call is presumably
# made inside ask_model_category (model.py) — confirm and fix there, then re-run.
# NOTE(review): on success this rebinds `cats`, which an earlier cell set to the list of
# child labels.
cats, tokens = ask_model_category(node=root, embeddings=embeddings, create_llm=create_llm)
cats

TypeError: create_vector_store() missing 1 required positional argument: 'embeddings'

In [81]:
# Distinct segment names in order of first appearance, used to choose what to filter out.
df["Segment Name"].unique()

array(['Live Plant and Animal Material and Accessories and Supplies',
       'Mineral and Textile and Inedible Plant and Animal Materials',
       'Chemicals including Bio Chemicals and Gas Materials',
       'Resin and Rosin and Rubber and Foam and Film and Elastomeric Materials',
       'Paper Materials and Products',
       'Fuels and Fuel Additives and Lubricants and Anti corrosive Materials',
       'Mining and Well Drilling Machinery and Accessories',
       'Farming and Fishing and Forestry and Wildlife Machinery and Accessories',
       'Building and Construction Machinery and Accessories',
       'Industrial Manufacturing and Processing Machinery and Accessories',
       'Material Handling and Conditioning and Storage Machinery and their Accessories and Supplies',
       'Commercial and Military and Private Vehicles and their Accessories and Components',
       'Power Generation and Distribution Machinery and Accessories',
       'Tools and General Machinery',
       'Structur

In [82]:
# Segments to drop: every segment whose name mentions "services" (case-insensitive),
# plus the food/beverage/tobacco segment.
segment_names = pd.unique(df["Segment Name"])
segments_to_remove = [name for name in segment_names if "services" in name.lower()]
segments_to_remove.append("Food Beverage and Tobacco Products")

# Families to drop: those in the live plant/animal segment whose name mentions
# "live" or "fresh" (case-insensitive).
in_live_segment = df["Segment Name"] == "Live Plant and Animal Material and Accessories and Supplies"
families_to_remove = [
    name
    for name in pd.unique(df.loc[in_live_segment, "Family Name"])
    if "live" in name.lower() or "fresh" in name.lower()
]

# Build one combined mask instead of re-filtering the frame once per excluded value;
# the surviving rows and their order are the same.
drop_mask = df["Segment Name"].isin(segments_to_remove) | df["Family Name"].isin(families_to_remove)
df_filt = df[~drop_mask]

len(df_filt)

31576

In [83]:
# Rebuild the tree from the filtered rows only (same breadcrumb/code column layout).
root = create_tree_from_breadcrumbs(
    df_filt,
    breadcrumb_cols=[
        "Segment Name",
        "Family Name",
        "Class Name",
        "Commodity Name",
    ],
    extra_cols_map={
        "Segment Name": ["Segment"],
        "Family Name": ["Family"],
        "Class Name": ["Class"],
        "Commodity Name": ["Commodity"],
    },
)

# Progress widget sized by the number of remaining rows (passed as n_leaves).
progress_bars = ProgressBars(n_leaves=len(df_filt))
display(progress_bars.ui)

# Per the recorded log, optimize_tree works node by node, asking the LLM for new
# intermediate categories and rebalancing the subcategories into them; max_children=25
# is presumably the fan-out limit that triggers this — confirm in model.py.
optimize_tree(
    root=root,
    max_children=25,
    progress_bars=progress_bars,
    embeddings=embeddings,
    create_llm=create_llm,
)

VBox(children=(IntProgress(value=0, bar_style='info', description='Leaves Completed:', layout=Layout(width='90…

Working on node 'None' with 38 subcategories.
Created categories: ['Infrastructure Components', 'Safety and Security Equipment', 'Media and Visual Technology', 'Biological Materials and Supplies', 'Industrial Machinery and Tools', 'Leisure and Sports Goods', 'Energy Resources and Chemicals', 'Manufacturing and Processing Systems', 'Jewelry and Luxury Items', 'Financial Services and Instruments', 'Telecommunications and Information Tech', 'Real Estate and Land Development', 'Publishing and Media Products']
Rebalanced 33/38 categories.
Failed to categorize: ['Paper Materials and Products', 'Apparel and Luggage and Personal Care Products', 'Furniture and Furnishings', 'Musical Instruments and Games and Toys and Arts and Crafts and Educational Equipment and Materials and Accessories and Supplies', 'Organizations and Clubs']
Working on node 'Plate' with 26 subcategories.
Created categories: ['Metallic Plates', 'Non-Metallic Plates']
Rebalanced 26/26 categories.
Failed to categorize: []
Work

In [84]:
import pickle

# Persist the optimized tree so it can be reloaded without re-running the LLM step.
with open("tree.pkl", mode="wb") as tree_file:
    pickle.dump(root, tree_file)

In [85]:
# Reload the tree written by the previous cell into a separate name (`root2`).
# SECURITY: unpickling executes arbitrary code from the file; only load a tree.pkl
# that this notebook produced itself, never one obtained from an untrusted source.
with open("tree.pkl", mode="rb") as tree_file:
    root2 = pickle.load(tree_file)

In [87]:
# Spot-check the reloaded tree: in the recorded run the first top-level child is
# 'Infrastructure Components', the first category listed in the optimize_tree log,
# rather than one of the original segment names.
root2.children[0].condition

'Infrastructure Components'

In [106]:
# Render the reloaded tree as an interactive widget, starting at max_initial_depth=3.
# NOTE(review): the name suggests deeper levels are loaded on demand — confirm in model.py.
display_lazy_tree(root2, max_initial_depth=3)

VBox(children=(FigureWidget({
    'data': [{'branchvalues': 'total',
              'ids': [071ca4c8-a548-49a2-…