In [1]:
import pandas as pd
import re
import json

In [2]:
# Load the raw run log. The code below reads its "policy", "intent",
# "intent_processing", "type_definition" and "tool_call" columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data_path = "/home/vmadmin/intent/src/test/claude-sonnet-4-5_1.csv"
df = pd.read_csv(data_path)

def extract_usage_block(text: str):
    """Return the argument text of the first ``usage=Usage(...)`` call in ``text``.

    The returned string is everything between the opening parenthesis of
    ``Usage(`` and its matching closing parenthesis, with nested parentheses
    kept intact.

    Args:
        text: String to search.

    Returns:
        The inner argument text, or ``None`` when the marker is absent or the
        call is never closed.
    """
    prefix = "usage=Usage("
    found_at = text.find(prefix)
    if found_at < 0:
        return None

    body_start = found_at + len(prefix)
    open_parens = 1  # the "(" that ends the prefix is already open
    for pos in range(body_start, len(text)):
        symbol = text[pos]
        if symbol == "(":
            open_parens += 1
        elif symbol == ")":
            open_parens -= 1
            if open_parens == 0:
                return text[body_start:pos]
    return None

def split_top_level_args(s: str):
    """Split a repr-style argument list on its top-level commas.

    A comma only separates arguments when it is outside every ``(...)``,
    ``[...]`` and ``{...}`` group and outside every quoted string literal, so
    values such as ``b='x, y'`` or ``c=[1, 2]`` stay in one piece.

    Args:
        s: Argument text, e.g. ``"a=1, b=Foo(x=1, y=2), c='x, y'"``.

    Returns:
        The arguments in order, each stripped of surrounding whitespace.
        Empty arguments (e.g. from ``",,"`` or a trailing comma) are dropped.
    """

    def scan(honor_quotes: bool):
        # Returns (parts, unterminated) where ``unterminated`` tells whether
        # the scan ended while still inside a quoted string.
        parts = []
        buf = []
        depth = 0
        quote = None  # quote character of the string literal being read, if any
        escaped = False  # previous character inside the literal was a backslash
        for ch in s:
            if quote is not None:
                buf.append(ch)
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            if honor_quotes and ch in "'\"":
                quote = ch
            elif ch in "([{":
                depth += 1
            elif ch in ")]}":
                depth -= 1
            if ch == "," and depth == 0:
                part = "".join(buf).strip()
                if part:
                    parts.append(part)
                buf = []
            else:
                buf.append(ch)
        tail = "".join(buf).strip()
        if tail:
            parts.append(tail)
        return parts, quote is not None

    parts, unterminated = scan(honor_quotes=True)
    if unterminated:
        # A stray apostrophe (not a real string literal) would otherwise
        # swallow the rest of the input; fall back to ignoring quotes.
        parts, _ = scan(honor_quotes=False)
    return parts

def parse_cache_creation(value: str):
    """Parse a ``CacheCreation(key=value, ...)`` repr string into a dict.

    Args:
        value: Text expected to start with ``CacheCreation(`` and end with ``)``.

    Returns:
        A dict of the ``key=value`` pairs found inside the call. ``None`` and
        ``null`` become ``None``, all-digit values become ``int``, and anything
        else is kept as the stripped string. An empty dict is returned when
        ``value`` is empty or does not have the expected wrapper.
    """
    prefix = "CacheCreation("
    if not (value and value.startswith(prefix) and value.endswith(")")):
        return {}

    parsed = {}
    for assignment in split_top_level_args(value[len(prefix):-1]):
        name, separator, raw = assignment.partition("=")
        if not separator:
            # Positional fragment without "=": nothing to record.
            continue
        name, raw = name.strip(), raw.strip()
        if raw in {"None", "null"}:
            parsed[name] = None
        elif raw.isdigit():
            parsed[name] = int(raw)
        else:
            parsed[name] = raw
    return parsed

def parse_usage_fields(usage_block: str):
    """Turn the argument text of a ``Usage(...)`` repr into a dict.

    Args:
        usage_block: The text between the parentheses of ``Usage(...)``;
            may be empty or ``None``.

    Returns:
        A dict keyed by argument name. ``None``/``null`` become ``None``,
        single-quoted values lose their quotes, all-digit values become
        ``int``, and every other value is kept as the stripped string (nested
        calls such as ``CacheCreation(...)`` stay as raw text). Returns an
        empty dict for empty input.
    """
    parsed = {}
    if not usage_block:
        return parsed

    for assignment in split_top_level_args(usage_block):
        name, separator, raw = assignment.partition("=")
        if not separator:
            # Fragment without "=": not a keyword argument, skip it.
            continue
        name = name.strip()
        raw = raw.strip()
        if raw in {"None", "null"}:
            parsed[name] = None
        elif raw.startswith("'") and raw.endswith("'"):
            parsed[name] = raw[1:-1]
        elif raw.isdigit():
            parsed[name] = int(raw)
        else:
            parsed[name] = raw
    return parsed

def extract_messages(text: str):
    """Extract every ``Message(id='...', ...)`` repr substring from ``text``.

    Each message runs from the ``Message(`` that precedes an ``id='...'``
    argument to its matching closing parenthesis. Parentheses inside quoted
    string literals (message text, tool input values) are ignored while
    balancing, so content such as ``text='step 1) go'`` no longer cuts the
    message short and hides its trailing ``usage=Usage(...)``.

    Args:
        text: Text to search; any non-string value (e.g. NaN) yields ``[]``.

    Returns:
        The message substrings in order of appearance. A message whose
        closing parenthesis is never found extends to the end of ``text``.
    """
    if not isinstance(text, str):
        return []

    def find_end(start: int, honor_quotes: bool):
        # Index just past the ")" that closes the call opened at ``start``,
        # or None when the call is never closed.
        depth = 0
        quote = None  # quote character of the string literal being read, if any
        escaped = False  # previous character inside the literal was a backslash
        for i in range(start, len(text)):
            ch = text[i]
            if quote is not None:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            if honor_quotes and ch in "'\"":
                quote = ch
            elif ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
                if depth == 0:
                    return i + 1
        return None

    messages = []
    for match in re.finditer(r"Message\(id='([^']+)'", text):
        start = match.start()
        end = find_end(start, honor_quotes=True)
        if end is None:
            # A stray apostrophe can leave the quote-aware scan "inside a
            # string" forever; retry with plain parenthesis balancing.
            end = find_end(start, honor_quotes=False)
        if end is None:
            end = len(text)
        messages.append(text[start:end])
    return messages

def extract_tool_names(text: str):
    """Return the distinct tool names of all ``ToolUseBlock(...)`` reprs in ``text``.

    For each ``ToolUseBlock(`` the block's own ``name='...'`` keyword argument
    is located by scanning its top-level arguments, skipping quoted strings
    and nested ``(...)``/``[...]``/``{...}`` groups. A ``)`` inside the
    preceding ``input=`` value therefore no longer hides the name. When that
    scan finds nothing, the previous regex-based lookup is tried for the
    block so earlier results are preserved.

    Args:
        text: Text to search; any non-string value (e.g. NaN) yields ``[]``.

    Returns:
        Tool names in order of first appearance, without duplicates.
    """
    if not isinstance(text, str):
        return []

    marker = "ToolUseBlock("
    name_kwarg = re.compile(r"name='([^']+)'")
    legacy_lookup = re.compile(r"ToolUseBlock\([^\)]*?name='([^']+)'")

    def scan_for_name(body_start: int):
        # Walk the call body; depth 1 means "directly inside ToolUseBlock(...)".
        depth = 1
        quote = None  # quote character of the string literal being read, if any
        escaped = False  # previous character inside the literal was a backslash
        for i in range(body_start, len(text)):
            ch = text[i]
            if quote is not None:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            if depth == 1 and ch == "n":
                prev = text[i - 1]
                # Require a word boundary so e.g. "rename='x'" is not mistaken
                # for the "name" argument.
                if not (prev.isalnum() or prev == "_"):
                    found = name_kwarg.match(text, i)
                    if found:
                        return found.group(1)
            if ch in "'\"":
                quote = ch
            elif ch in "([{":
                depth += 1
            elif ch in ")]}":
                depth -= 1
                if depth == 0:
                    return None
        return None

    # keep order, unique
    seen = set()
    ordered = []
    pos = text.find(marker)
    while pos != -1:
        body_start = pos + len(marker)
        name = scan_for_name(body_start)
        if name is None:
            fallback = legacy_lookup.match(text, pos)
            name = fallback.group(1) if fallback else None
        if name is not None and name not in seen:
            seen.add(name)
            ordered.append(name)
        pos = text.find(marker, body_start)
    return ordered

def parse_policy_status(value):
    """Return the ``"status"`` entry of a JSON-encoded policy object.

    Args:
        value: Raw cell value; expected to be a JSON object string.

    Returns:
        The value stored under ``"status"``, or ``None`` when ``value`` is not
        a non-blank string, is not valid JSON, does not decode to a JSON
        object, or has no ``"status"`` key.
    """
    if not isinstance(value, str) or not value.strip():
        return None
    try:
        data = json.loads(value)
    except json.JSONDecodeError:
        return None
    # Valid JSON may decode to a list, scalar or None, none of which has .get().
    if not isinstance(data, dict):
        return None
    return data.get("status")

def _first_group(pattern: str, text: str):
    """Return capture group 1 of the first ``pattern`` match in ``text``, or None."""
    found = re.search(pattern, text)
    return found.group(1) if found else None


# Flatten the source frame: one record per Message(...) found in each row's
# "intent_processing" and "type_definition" cells.
records = []
for _, row in df.iterrows():
    row_policy_status = parse_policy_status(row.get("policy"))
    for col in ("intent_processing", "type_definition"):
        raw = row.get(col, "")
        raw_is_blank = not isinstance(raw, str) or not raw.strip()
        if col == "intent_processing" and raw_is_blank:
            # No intent_processing text for this row: read "tool_call" instead
            # (the record is still labelled "intent_processing").
            raw = row.get("tool_call", "")
        for msg in extract_messages(raw):
            names = extract_tool_names(msg)
            record = {
                "intent": row.get("intent"),
                "column": col,
                "message_id": _first_group(r"Message\(id='([^']+)'", msg),
                "model": _first_group(r"model='([^']+)'", msg),
                "tool_use_names": ", ".join(names) if names else None,
                "policy_status": row_policy_status,
            }
            # Every parsed Usage(...) argument becomes a "usage_<name>" column.
            record.update(
                (f"usage_{key}", val)
                for key, val in parse_usage_fields(extract_usage_block(msg)).items()
            )
            records.append(record)

df_new = pd.DataFrame(records)
if "policy_status" in df_new.columns:
    # Move policy_status to the last column, after all usage_* columns.
    df_new["policy_status"] = df_new.pop("policy_status")
display(df_new)

Unnamed: 0,intent,column,message_id,model,tool_use_names,usage_cache_creation,usage_cache_creation_input_tokens,usage_cache_read_input_tokens,usage_input_tokens,usage_output_tokens,usage_server_tool_use,usage_service_tier,policy_status
0,Create a slice to support video journalists tr...,intent_processing,msg_01EaXvCSuofRCrzS6hZiziVF,claude-sonnet-4-5-20250929,create_session,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,2463,205,,standard,Policy created successfully
1,Create a slice to support video journalists tr...,type_definition,msg_01YZfmYG4eiPxAGuEGULeNyQ,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,310,7,,standard,Policy created successfully
2,Provision a slice for a university campus even...,intent_processing,msg_01Td8gmQSxWHV7VYp6MXSZ4Q,claude-sonnet-4-5-20250929,create_session,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,3253,196,,standard,Policy created successfully
3,Provision a slice for a university campus even...,type_definition,msg_01VeSGgfocPVxvsDuxD7vrYV,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,312,7,,standard,Policy created successfully
4,Set up a slice for a shopping mall to improve ...,intent_processing,msg_01PViTwtzG4c9QZso8sKbh6j,claude-sonnet-4-5-20250929,create_session,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,4108,174,,standard,Policy created successfully
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Provision a slice for smart greenhouse climate...,intent_processing,msg_017gjb2A76nRPWEvkrG8Y8gf,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,32753,147,,standard,
113,Establish a slice for public safety siren moni...,intent_processing,msg_015jHpFaEEzFyHCzTV1ursn9,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,33210,158,,standard,
114,Deploy a slice for high-resolution mobile mapp...,intent_processing,msg_01LsSZhPyQ4GsShNGsoHC5Rm,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,33680,149,,standard,
115,Create a slice for autonomous ferry navigation...,intent_processing,msg_01KPzj5fSFy5mMQTDwNRXoJ3,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,34134,146,,standard,


In [6]:
# Re-render the flattened per-message table (df_new) built by the extraction loop.
display(df_new)

Unnamed: 0,intent,column,message_id,model,tool_use_names,usage_cache_creation,usage_cache_creation_input_tokens,usage_cache_read_input_tokens,usage_input_tokens,usage_output_tokens,usage_server_tool_use,usage_service_tier,policy_status
0,Create a slice to support video journalists tr...,intent_processing,msg_01EaXvCSuofRCrzS6hZiziVF,claude-sonnet-4-5-20250929,create_session,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,2463,205,,standard,Policy created successfully
1,Create a slice to support video journalists tr...,type_definition,msg_01YZfmYG4eiPxAGuEGULeNyQ,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,310,7,,standard,Policy created successfully
2,Provision a slice for a university campus even...,intent_processing,msg_01Td8gmQSxWHV7VYp6MXSZ4Q,claude-sonnet-4-5-20250929,create_session,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,3253,196,,standard,Policy created successfully
3,Provision a slice for a university campus even...,type_definition,msg_01VeSGgfocPVxvsDuxD7vrYV,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,312,7,,standard,Policy created successfully
4,Set up a slice for a shopping mall to improve ...,intent_processing,msg_01PViTwtzG4c9QZso8sKbh6j,claude-sonnet-4-5-20250929,create_session,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,4108,174,,standard,Policy created successfully
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Provision a slice for smart greenhouse climate...,intent_processing,msg_017gjb2A76nRPWEvkrG8Y8gf,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,32753,147,,standard,
113,Establish a slice for public safety siren moni...,intent_processing,msg_015jHpFaEEzFyHCzTV1ursn9,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,33210,158,,standard,
114,Deploy a slice for high-resolution mobile mapp...,intent_processing,msg_01LsSZhPyQ4GsShNGsoHC5Rm,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,33680,149,,standard,
115,Create a slice for autonomous ferry navigation...,intent_processing,msg_01KPzj5fSFy5mMQTDwNRXoJ3,claude-sonnet-4-5-20250929,,"CacheCreation(ephemeral_1h_input_tokens=0, eph...",0,0,34134,146,,standard,


In [22]:
# Sanity-check the flattened table (df_new) against the source frame (df).
for label, value in (
    ("df shape:", df.shape),
    ("df_new shape:", df_new.shape),
    ("total rows df:", len(df)),
    ("total rows df_new:", len(df_new)),
):
    print(label, value)
print("unique intents in df_new:", df_new["intent"].nunique())

if "column" not in df_new.columns:
    print("columns df_new:", list(df_new.columns))
else:
    print("rows by column:")
    display(df_new["column"].value_counts())

# Count the non-null cells of each source column that exists in df.
for source_col, label in (
    ("intent_processing", "rows with intent_processing:"),
    ("type_definition", "rows with type_definition:"),
):
    if source_col in df.columns:
        print(label, df[source_col].notna().sum())

df shape: (100, 5)
df_new shape: (117, 13)
total rows df: 100
total rows df_new: 117
unique intents in df_new: 100
rows by column:


column
intent_processing    100
type_definition       17
Name: count, dtype: int64

rows with intent_processing: 17
rows with type_definition: 17


In [25]:
# Persist the flattened per-message table; index=False leaves the row index out of the file.
# NOTE(review): absolute, machine-specific output path — consider a configurable results directory.
df_new.to_csv("/home/vmadmin/intent/src/results/anthropic-sonnet-4-5.csv", index=False, header=True)