In [None]:
import dbnl
import pandas as pd
import json
from datetime import UTC, datetime, timedelta
import numpy as np
import random
from dbnl_otel_converter import dbnl_df_from_otel_file


# Sanity check: the installed SDK version should match the one documented at https://docs.dbnl.com/
print(f"dbnl version: {dbnl.__version__}")

In [None]:
# Log in to DBNL using the default local Sandbox endpoints.
# app_url is the base URL of the web app; in this notebook it is only used to build the
# links printed by log_multi_df_to_dbnl. It must point at the same deployment as api_url,
# otherwise the printed links lead to a different environment than the one receiving data.
# No trailing slash: the link templates add their own "/".
app_url = "http://localhost:8080"  # sandbox default
dbnl.login(
    api_url="http://localhost:8080/api",  # sandbox default
    api_token="<DBNL_API_KEY>",  # found at http://localhost:8080/tokens
)

In [None]:
# Create the project that will receive the uploaded traces (or, going by the function
# name, fetch it if one with this name already exists — confirm against the DBNL docs).
# The returned object's `id` and `namespace_id` are used later when logging data and
# when printing links to the project.
project = dbnl.get_or_create_project(
    name="ADK Calc AB Testing Example",
    schedule="daily",  # How often DBNL analyzes new data
    default_llm_model_name="quickstart_model" # From step (2) in quickstart
)

In [None]:
# Load the first trace export (v0-only, judging by the file name) through the OTEL converter helper.
# add_fields() later reads the "input", "output" and "traces_data" columns of this frame.
df = dbnl_df_from_otel_file("data/traces_v0_only.jsonl")

In [None]:
# Canned user-feedback messages (customize as you like).
# compute_feedback() picks one at random: a complaint accompanies a score of 1
# (agent output differs from the expected value) ...
complaints = [
    "That’s not right!", "Wrong result again!", "Calculator failed.",
    "Off by a mile.", "Bad math output!", "Totally incorrect!",
    "Oops, wrong calc.", "Computation error.", "Answer is wrong.",
    "Incorrect result.", "Math seems broken.", "Calculation flaw.",
    "Miscalculated that.", "Wrong total shown.", "Completely off!",
    "This seems buggy.", "Bad arithmetic!", "The math is wrong.",
    "Way off the mark.", "Error in result!"
]
# ... and a praise accompanies a score of 5 (agent output equals the expected value).
praises = [
    "Perfect result!", "Nice work!", "Correct again!", "Spot on!",
    "You nailed it!", "Looks good!", "Math checks out!", "Well done!",
    "Accurate answer!", "Exactly right!", "All good here!", "Bang on target!",
    "That’s correct!", "Great calculation!", "Awesome result!",
    "Nice precision!", "Flawless math!", "Right on point!",
    "Excellent job!", "Spotless result!"
]

def compute_expected(cell):
    """Return the ground-truth value of the expression a user sent to the agent.

    ``cell`` is a doubly JSON-encoded string; once decoded, the expression text is
    read from ``['new_message']['parts'][0]['text']`` and evaluated as Python.

    Returns the result as a ``float``, or ``None`` when the cell cannot be decoded,
    the expected keys are missing, or the text does not evaluate to a number.
    """
    # NOTE(review): eval() runs arbitrary Python taken from the trace data. That is
    # acceptable for the bundled demo files but unsafe for untrusted traces; swap in
    # a restricted arithmetic evaluator before pointing this at real user input.
    try:
        value = float(
            eval(json.loads(json.loads(cell))["new_message"]["parts"][0]["text"])
        )
    except Exception:
        value = None
    return value

def compute_feedback(row, p_keep=0.11):
    """Simulate sparse user feedback for one trace row.

    With probability ``1 - p_keep`` (89% by default) no feedback is produced.
    Otherwise the agent's numeric output (``row["output"]``) is compared with the
    expected value derived from ``row["input"]``: an exact match yields a score of 5
    and a random praise, a mismatch yields a score of 1 and a random complaint. If
    either value cannot be determined, no feedback is produced.

    Draws from the global ``np.random`` and ``random`` generators.

    Returns a ``pd.Series`` with keys ``feedback_score`` and ``feedback_text``.
    """
    def _feedback(score, text):
        return pd.Series({"feedback_score": score, "feedback_text": text})

    # Most rows get no feedback at all.
    if np.random.rand() > p_keep:
        return _feedback(None, None)

    expected = compute_expected(row["input"])
    actual = extract_math_output(row["output"])

    # Without both values there is nothing to judge.
    if expected is None or actual is None:
        return _feedback(None, None)
    if actual == expected:
        return _feedback(5, random.choice(praises))
    return _feedback(1, random.choice(complaints))

def extract_math_output(cell):
    """Return the agent's numeric answer from an output cell.

    ``cell`` is a doubly JSON-encoded string; once decoded, the answer text is read
    from ``['content']['parts'][0]['text']`` and converted with ``float``.

    Returns the answer as a ``float``, or ``None`` when the cell cannot be decoded,
    the expected keys are missing, or the text is not a number.
    """
    try:
        return float(json.loads(json.loads(cell))['content']['parts'][0]['text'])
    # Catch Exception rather than everything: a bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit and make a long ``apply`` impossible to interrupt.
    except Exception:
        return None

def extract_agent_input(cell):
    """Return the raw text the user sent to the agent.

    ``cell`` is a doubly JSON-encoded string; once decoded, the text is read from
    ``['new_message']['parts'][0]['text']``.

    Returns the text as stored, or ``None`` when the cell cannot be decoded or the
    expected keys are missing.
    """
    try:
        return json.loads(json.loads(cell))['new_message']['parts'][0]['text']
    # Catch Exception rather than everything: a bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit and make a long ``apply`` impossible to interrupt.
    except Exception:
        return None

def extract_agent_version(cell):
    """Classify a trace as coming from agent ``"v0"`` or ``"v1"``.

    Returns ``"v0"`` when the string form of ``cell`` contains the marker
    ``"Version: v0"``; everything else (including missing data) is reported as ``"v1"``.
    """
    # This is extremely hacky. Ideally you could get your cohort from your feature
    # flag system and join it here.
    return "v0" if "Version: v0" in str(cell) else "v1"

def compute_absolute_error(row):
    """Return ``|expected - actual|`` for one trace row.

    The expected value is derived from ``row["input"]`` and the actual value from
    ``row["output"]``. Returns ``None`` when either one cannot be determined.
    """
    expected = compute_expected(row["input"])
    actual = extract_math_output(row["output"])
    if expected is not None and actual is not None:
        return abs(expected - actual)
    return None

def est_cost_from_gen_ai_tokens(spans):
    """Estimate the cost of one trace from its spans' token-usage attributes.

    Sums ``gen_ai.usage.input_tokens`` and ``gen_ai.usage.output_tokens`` across all
    spans, weighting each by its per-token price.

    Args:
        spans: iterable of mappings whose optional ``"attributes"`` entry is an
            iterable of ``(key, value)`` pairs, or ``None``.

    Returns:
        The estimated cost (``0`` when ``spans`` is ``None`` or nothing is counted).
        Token values may be integers-as-strings (optionally wrapped in double quotes)
        or plain numbers; anything that cannot be read as an integer is ignored.
    """
    COST = { # per token
        "gen_ai.usage.input_tokens": 0.000000075,
        "gen_ai.usage.output_tokens": 0.00000030,
    }

    if spans is None:
        return 0

    total = 0
    for span in spans:
        attrs = span.get("attributes")
        if attrs is None:
            # Span without attributes (missing or explicitly None): nothing to add.
            continue
        # attrs is a list of (key, value) tuples
        for key, val in attrs:
            if not isinstance(key, str) or key not in COST:
                continue
            if val is None or isinstance(val, bool):
                continue  # not a token count
            if isinstance(val, str):
                # strip wrapping quotes if present before parsing
                val = val.strip('"')
            try:
                tokens = int(val)
            except (TypeError, ValueError, OverflowError):
                continue  # ignore non-numeric weirdness
            total += tokens * COST[key]
    return total

def add_fields(df):
    """Add the derived analysis columns to ``df`` (modified in place) and return it.

    Reads the ``input``, ``output`` and ``traces_data`` columns and adds, in order:
    ``output_expected``, ``feedback_score``, ``feedback_text``, ``math_output``,
    ``agent_input``, ``agent_version``, ``absolute_error`` and ``total_cost``.
    The feedback columns are randomised, so repeated calls give different feedback.
    """
    # Ground truth for each request.
    df["output_expected"] = df["input"].apply(compute_expected)

    # Sparse simulated user feedback (score + text), computed row by row.
    feedback = df.apply(compute_feedback, axis=1)
    df[["feedback_score", "feedback_text"]] = feedback

    # Values pulled out of the raw request/response payloads.
    df["math_output"] = df["output"].apply(extract_math_output)
    df["agent_input"] = df["input"].apply(extract_agent_input)
    df["agent_version"] = df["traces_data"].apply(extract_agent_version)
    df["absolute_error"] = df.apply(compute_absolute_error, axis=1)

    # Cost estimate from the converted spans (presumably one span collection per
    # trace — confirm against the dbnl.convert_otlp_traces_data documentation).
    df["total_cost"] = dbnl.convert_otlp_traces_data(data=df["traces_data"]).apply(
        est_cost_from_gen_ai_tokens
    )
    return df

In [None]:
def break_into_days(df, start_day=datetime.now(tz=UTC), num_days=1):
    day_dfs = [chunk.reset_index(drop=True) for chunk in np.array_split(df, num_days)]

    for idx, df in enumerate(day_dfs):
        df['timestamp'] = start_day.replace(hour=12) + timedelta(days=(idx))
        
    return day_dfs
    

In [None]:
def log_multi_df_to_dbnl(day_dfs):
    """Upload each per-day DataFrame to the current DBNL project.

    For every frame, the earliest value in its ``timestamp`` column determines the
    day: data is logged for the window from that day's midnight to the next
    midnight. Uses the notebook-level ``project`` and ``app_url``.

    Empty frames are skipped, as are days DBNL reports as already uploaded (error
    message containing "Data already exists"); any other error is re-raised.
    """
    # app_url may be configured with a trailing slash; drop it so the links below
    # do not contain "//ns/...".
    project_url = f"{app_url.rstrip('/')}/ns/{project.namespace_id}/projects/{project.id}"

    print("Uploading data...")
    print(f"See status at: {project_url}/status")

    total = len(day_dfs)
    for idx, day_df in enumerate(day_dfs):
        if day_df.empty:
            # Happens when there are fewer rows than days; there is no timestamp to
            # derive a window from and nothing to upload.
            print(f"{idx + 1} / {total} no data for this day, skipping...")
            continue

        first_ts = day_df['timestamp'].min()
        print(f"{idx + 1} / {total} publishing log data for {first_ts.date()}")
        data_start_t = first_ts.replace(hour=0, minute=0, second=0, microsecond=0)
        data_end_t = data_start_t + timedelta(days=1)
        try:
            dbnl.log(
                project_id=project.id,
                data_start_time=data_start_t,
                data_end_time=data_end_t,
                data=day_df,
            )
        except Exception as e:
            # Re-running the notebook should not fail on days that were already logged.
            if "Data already exists" in str(e):
                print("Data already exists, skipping...")
                continue
            raise

    print("You can now explore your data in DBNL!")
    print(project_url)

In [None]:
# Enrich the traces loaded earlier with the derived analysis columns.
df = add_fields(df)
print("Loaded", len(df), "traces.")

# Spread the rows over 8 consecutive days starting 14 days ago, then upload day by day.
day_dfs = break_into_days(
    df,
    num_days=8,
    start_day=datetime.now(tz=UTC) - timedelta(days=14),
)
log_multi_df_to_dbnl(day_dfs)


In [None]:
# Second batch: load the mixed trace export and enrich it the same way.
df = add_fields(dbnl_df_from_otel_file("data/traces_mix.jsonl"))
print("Loaded", len(df), "traces.")

# Spread the rows over 3 consecutive days starting 6 days ago, then upload day by day.
day_dfs = break_into_days(
    df,
    num_days=3,
    start_day=datetime.now(tz=UTC) - timedelta(days=6),
)
log_multi_df_to_dbnl(day_dfs)

In [None]:
# Final batch: load the v1-only trace export and enrich it the same way.
df = add_fields(dbnl_df_from_otel_file("data/traces_v1_only.jsonl"))
print("Loaded", len(df), "traces.")

# Spread the rows over 3 consecutive days starting 3 days ago, then upload day by day.
day_dfs = break_into_days(
    df,
    num_days=3,
    start_day=datetime.now(tz=UTC) - timedelta(days=3),
)
log_multi_df_to_dbnl(day_dfs)