In [1]:
import os

# These logging-related environment variables are set before the project
# imports below, and only if they are not already set (setdefault), so any
# value exported in the shell wins.
# NOTE(review): presumably meant to quiet gRPC / glog log output produced by
# the imported pipeline modules — confirm.
os.environ.setdefault("GRPC_VERBOSITY", "NONE")
os.environ.setdefault("GLOG_minloglevel", "2")
from src.pipeline_4_agent import SequentialAgentPipeline
from src.retrieval_then_llm import SmartClassifier, run_smartclassifier
import pandas as pd
from langfuse import Langfuse, get_client

# Langfuse client handle; later cells call langfuse.shutdown() on it.
langfuse = get_client()
# Ticket table; later cells read its "ticket" column.
tickets = pd.read_csv("data/dataset_for_categorization.csv")

In [2]:
# Sequential-agent pipeline, constructed from the taxonomy CSV path.
pipeline = SequentialAgentPipeline("data/cleaned.csv")

# Accumulators filled by the batch classification cell: one path string per
# ticket for each of the two approaches.
final_categorizations, kb_results = [], []

Loading and preparing taxonomy from data/cleaned.csv...


In [3]:
# Smoke test on one ticket (row label 6): run it through the smart classifier
# helper, print the returned result, then run the same ticket through the
# sequential-agent pipeline.
xd = await run_smartclassifier(tickets.at[6, "ticket"])
print(xd)
basta = await pipeline.run(tickets.at[6, "ticket"])
# NOTE(review): shutdown() is called here although later cells keep using
# `pipeline`; if those runs are expected to be traced, confirm the client
# still reports after shutdown (a flush may be what is intended) — TODO confirm.
langfuse.shutdown()

Initializing Smart Classifier (Domain→Cat1→Cat2→Cat3)...
Loading existing FAISS index from taxonomy.index...
{'best_path': {'L1_domain': {'id': 'dom.1b59a5fdf0ef', 'confidence': 0.9, 'name': 'Toki'}, 'L2_cat1': {'id': 'cat1.38f67d7524e1', 'confidence': 0.9, 'name': 'Үйлчилгээ'}, 'L3_cat2': {'id': 'cat2.aae242753c1c', 'confidence': 0.85, 'name': 'Гар утас лизинг'}, 'L4_cat3': {'id': 'cat3.b3c7195654a0', 'confidence': 0.85, 'name': 'Төхөөрөмж болон дагалдах хэрэгсэлтэй холбоотой'}}, 'rationale': 'Сонгосон зам: Toki > Үйлчилгээ > Гар утас лизинг > Төхөөрөмж болон дагалдах хэрэгсэлтэй холбоотой. Хэрэглэгч "гар утасны зээл/лизинг эргэн төлөх" талаар заавар хүсэж байгаагаас энэ нь гар утас лизинг, төхөөрөмжтэй холбоотой хамгийн тохирох ангилал гэж үзэв.', 'abstain': False, 'selected_index': 4, 'path_mn': 'Toki > Үйлчилгээ > Гар утас лизинг > Төхөөрөмж болон дагалдах хэрэгсэлтэй холбоотой'}
Toki>Хэтэвч>Кредит эргэн төлөлттэй холбоотой>L2-Эргэн төлөлт оруулсан


In [None]:
for ticket in tickets['ticket']:
    final_categorization = await pipeline.run(ticket)
    result = await classifier.classify(ticket)
    keys_in_order = ["domain", "category_1", "category_2", "category_3"]
    valid_parts = [final_categorization.get(key) for key in keys_in_order]
    non_empty_parts = [
        part for part in valid_parts if part is not None and str(part).strip() != ''
    ]
    output = " > ".join(non_empty_parts)
    final_categorizations.append(output)
    kb_results.append(result["path_mn"])

In [None]:
# Attach both approaches' predicted paths to the ticket table and persist it
# for the scoring cell.

# Fail early, with a readable message, if the classification loop did not
# produce exactly one result per ticket (e.g. it was interrupted or run twice).
assert len(final_categorizations) == len(tickets), (
    f"expected {len(tickets)} seq-agent results, got {len(final_categorizations)}"
)
assert len(kb_results) == len(tickets), (
    f"expected {len(tickets)} kb-agent results, got {len(kb_results)}"
)

tickets['seq agent'] = final_categorizations
tickets['kb agent'] = kb_results

# index=False keeps the row index out of the file, so reading it back with
# pd.read_csv does not introduce a stray "Unnamed: 0" column.
tickets.to_csv("data/evaluatedtwofold.csv", index=False)

In [None]:
# cell5 — quick scores/preview
import pandas as pd
from rapidfuzz import fuzz, process

# Load the evaluation table from disk (the same path the export cell writes),
# so scoring works from the saved file rather than from in-memory results.
df = pd.read_csv("data/evaluatedtwofold.csv")


def part_match(a: str, b: str) -> float:
    """
    Score how far two '>'-separated category paths agree from the left.

    Each path is split on '>', segments are stripped of surrounding
    whitespace and blank segments are dropped. The score is the length of the
    common leading run of equal segments divided by the length of the longer
    path.

    Args:
        a: First path; any non-string value is treated as an empty path.
        b: Second path; any non-string value is treated as an empty path.

    Returns:
        A float in [0, 1]. Two empty paths score 1.0.
    """

    def _segments(path) -> list:
        # Non-strings (e.g. NaN / None) count as "no path".
        if not isinstance(path, str):
            return []
        stripped = (raw.strip() for raw in path.split(">"))
        return [seg for seg in stripped if seg]

    left, right = _segments(a), _segments(b)
    longest = max(len(left), len(right))
    if longest == 0:
        return 1.0

    # Walk forward while both paths still have a segment and they are equal.
    shared = 0
    limit = min(len(left), len(right))
    while shared < limit and left[shared] == right[shared]:
        shared += 1
    return shared / longest


def safe_str(x):
    """Return "" when pd.isna(x) is true, otherwise str(x)."""
    if pd.isna(x):
        return ""
    return str(x)


# Column names: the employee-assigned reference label and the two model
# output columns that are scored against it.
EMP_COL = "employee"
SEQ_COL = "seq agent"
KB_COL = "kb agent"


def _fuzzy_ratio(reference: str, candidate: str) -> float:
    """rapidfuzz token-set ratio, divided by 100."""
    return fuzz.token_set_ratio(reference, candidate) / 100.0


def _score_column(model_col: str, scorer):
    """Apply `scorer(employee_label, model_label)` row by row over `df`."""
    return df.apply(
        lambda row: scorer(safe_str(row[EMP_COL]), safe_str(row[model_col])),
        axis=1,
    )


# (new score column, model column it scores, scoring function) — listed in the
# order the columns are added to the frame.
base_scores = [
    ("Fuzzy similarity", SEQ_COL, _fuzzy_ratio),
    ("Fuzzy similarity (kb)", KB_COL, _fuzzy_ratio),
    ("Part match (seq)", SEQ_COL, part_match),
    ("Part match (kb)", KB_COL, part_match),
]
for score_name, scored_col, score_fn in base_scores:
    df[score_name] = _score_column(scored_col, score_fn)

# Optional "2.5" variant columns: scored the same way, but only when present.
for variant_col in ("seq agent 2.5", "kb agent 2.5"):
    if variant_col not in df.columns:
        continue
    df[f"Fuzzy similarity ({variant_col})"] = _score_column(variant_col, _fuzzy_ratio)
    df[f"Part match ({variant_col})"] = _score_column(variant_col, part_match)

# Columns shown in the preview: identifiers, labels, the base scores, plus any
# fuzzy-similarity columns created for the optional variants.
variant_prefixes = (
    "Fuzzy similarity (seq agent 2.5",
    "Fuzzy similarity (kb agent 2.5",
)
preview_cols = [
    "Case ID",
    "ticket",
    "employee",
    SEQ_COL,
    KB_COL,
    "Fuzzy similarity",
    "Fuzzy similarity (kb)",
    "Part match (seq)",
    "Part match (kb)",
]
preview_cols.extend(name for name in df.columns if name.startswith(variant_prefixes))

# Ignore requested columns that are absent from the frame; show the first 10 rows.
available_cols = [name for name in preview_cols if name in df.columns]
preview = df[available_cols].head(10)

# Save the fully scored table, then display the preview as the cell output.
out_path = "data/scored_evaluation.csv"
df.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
preview

In [None]:
xd = await classifier.classify(tickets.at[6, "ticket"])
print(xd)
basta = await pipeline.run(tickets.at[6, "ticket"])
langfuse.shutdown()