# Explore langfuse tags

Questions to discuss:
* How important is this topic?
* Do we want to automate insights about tags? (e.g., a script, a weekly confluence page, etc.)
    * If so, do we want to introduce more "order" and restrictions so we can keep track of costs associated to tags? 

In [35]:
import pandas as pd
from datetime import date, timedelta

from aily_ai_brain.common.secrets_manager import get_aws_secret_key
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session, sessionmaker
from aily_py_commons.aily_logging import aily_logging as logging
from aily_py_commons.io.env_vars import (
    INFRASTRUCTURE_PROD,
    AilySettings,
)

## Connect to DB

In [36]:
def create_db_session_standard(tenant_name: str, environment: str):
    """
    Retrieves the database info for the given tenant using AWS credentials,
    then, creates a new database session for it.
    """
    secret_name = f"aily/{tenant_name}/{environment}/rds-langfuse/dbo"
    creds = {k: get_aws_secret_key(secret_name, k) for k in ["username", "password", "host", "port", "dbName"]}

    dialect_driver = "postgresql+psycopg2://"
    engine_url = (
        f"{dialect_driver}{creds['username']}:{creds['password']}@{creds['host']}:{creds['port']}/{creds['dbName']}"
    )
    sql_engine = create_engine(engine_url)
    sql_session = sessionmaker(bind=sql_engine)
    return sql_session

In [37]:
AilySettings(INFRASTRUCTURE_PROD)



AilySettings(AILY_PREFIX='aily', AILY_ENV='prod', AILY_TENANT='infrastructure', AWS_REGION='eu-central-1', AWS_PROFILE='aws-infrastructure')

In [38]:
tenant_name = "infrastructure"
environment = "prod"
tenant_session = create_db_session_standard(tenant_name, environment)

  creds = {k: get_aws_secret_key(secret_name, k) for k in ["username", "password", "host", "port", "dbName"]}


## Download data

In [39]:
def get_traces(session: Session, columns: list[str] = [], ids: list[int] = [], from_date: date = None) -> pd.DataFrame:
    """
    Return traces without the "input", "output",
    and "project_id" columns by default because they will be overwritten
    """
    default_columns = [
        "id",
        "timestamp",
        "tags",
    ]

    if not columns:
        selected_columns = default_columns
    else:
        selected_columns = columns

    selected_columns_text = ", ".join(f'"{col}"' for col in selected_columns)

    conditions = []
    params = {}

    if ids:
        conditions.append("id = ANY (ARRAY[:ids])")
        params["ids"] = ids

    if from_date:
        conditions.append("created_at >= :from_date")
        params["from_date"] = from_date

    if conditions:
        query_where = " WHERE " + " AND ".join(conditions)
    else:
        query_where = ""

    query = text(
        f"""
        SELECT {selected_columns_text}
        FROM public.traces
        {query_where}
        """
    )

    result = session.execute(query, params=params)

    df = pd.DataFrame(result.fetchall(), columns=result.keys())
    return df

def get_generation_observations(
    session: Session, columns: list[str] = [], ids: list[int] = [], from_date: date = None
) -> pd.DataFrame:
    """
    Return observations of type generation filtered by columns and ids.
    """
    default_columns = [
        "id",
        "type",
        "trace_id",
        "completion_tokens",
        "prompt_tokens",
        "total_tokens",
        "version",
        "created_at",
        "unit",
        "internal_model",
    ]

    if not columns:
        selected_columns = default_columns
    else:
        selected_columns = columns

    selected_columns_text = ", ".join(f'"{col}"' for col in selected_columns)

    conditions = ["type = 'GENERATION'"]
    params = {}

    if ids:
        conditions.append("id = ANY (ARRAY[:ids])")
        params["ids"] = ids

    if from_date:
        conditions.append("created_at >= :from_date")
        params["from_date"] = from_date

    query_where = " WHERE " + " AND ".join(conditions)

    query = text(
        f"""
        SELECT {selected_columns_text}
        FROM public.observations
        {query_where}
        """
    )

    result = session.execute(query, params=params)

    df = pd.DataFrame(result.fetchall(), columns=result.keys())
    return df

def get_models(session: Session, columns: list[str] = []) -> pd.DataFrame:
    """
    Return models and their costs.
    """
    if not columns:
        # Select all columns if columns list is empty
        query = text(
            """SELECT *
            FROM public.models
            """
        )
    else:
        # Select specified columns
        selected_columns_text = ", ".join(f'"{col}"' for col in columns)
        query = text(
            f"""SELECT {selected_columns_text}
            FROM public.models
            """
        )

    result = session.execute(query)
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
    return df

In [40]:
today = date.today()
from_date = today - timedelta(days=365)

with tenant_session() as session:
    print("Downloading models:")
    df_models = get_models(session)
    
    print("Downloading traces:")
    df_traces = get_traces(session=session, from_date=from_date)
    
    print("Downloading observations:")
    df_generation_observations = get_generation_observations(session, from_date=from_date)
    

Downloading models:
Downloading traces:
Downloading observations:


In [41]:
df_generation_observations

Unnamed: 0,id,type,trace_id,completion_tokens,prompt_tokens,total_tokens,version,created_at,unit,internal_model
0,coreproduct-3e371bd7-1e95-40f3-946b-52f1a1fa0284,GENERATION,coreproduct-7131923f-3a53-49ea-a625-c97d1b1f1f00,4,540,544,,2024-06-26 07:46:54.650,TOKENS,gpt-3.5-turbo
1,ed823260-9e78-46d8-a785-d3bcd66bbd97,GENERATION,e6286e89-d23c-4cb5-b347-a078e6ac65b4,54,555,609,,2024-06-20 18:57:11.317,TOKENS,gpt-4-turbo-preview
2,a36cbbbc-b946-42c2-9847-2b9f79540855,GENERATION,0f581d01-e994-45fa-b3be-1551efab6d93,2,359,361,,2024-06-20 19:08:17.643,TOKENS,gpt-3.5-turbo
3,d37d6f66-9536-4bdc-90ac-1afe329dab44,GENERATION,52e65955-0f90-4a70-8db3-7076d4fdacc7,587,2519,3106,,2024-06-21 09:11:10.345,TOKENS,gpt-3.5-turbo
4,c49b7fd4-bd20-4692-8502-411894f730a3,GENERATION,0b1eefa8-53d0-4cb1-9c2e-e485edd8c9e4,702,1745,2447,,2024-06-21 09:11:20.674,TOKENS,gpt-3.5-turbo
...,...,...,...,...,...,...,...,...,...,...
823568,coreproduct-29a574e2-db8f-4c2a-a124-64e4af07df9a,GENERATION,coreproduct-1b8861d6-179d-4be6-8b6b-7b421ac5c492,0,2807,2807,,2024-07-17 13:35:05.180,TOKENS,gpt-4-turbo-preview
823569,coreproduct-32dadc0b-c038-43d7-a29a-ebf2ba906efd,GENERATION,coreproduct-dd6db9f8-8ed6-4926-b0da-9671cf70dd63,174,402,576,,2024-07-17 13:35:47.769,TOKENS,gpt-4-turbo-preview
823570,coreproduct-278185f3-a2b7-4787-a9bb-7c592b9d5139,GENERATION,coreproduct-dd6db9f8-8ed6-4926-b0da-9671cf70dd63,424,2806,3230,,2024-07-17 13:35:55.571,TOKENS,gpt-4-turbo-preview
823571,coreproduct-2e61c230-012d-4a87-8fc4-e76db089e29f,GENERATION,coreproduct-dd6db9f8-8ed6-4926-b0da-9671cf70dd63,348,2804,3152,,2024-07-17 13:35:55.603,TOKENS,gpt-4-turbo-preview


## Estimate cost of each observation

In order to estimate the cost of each observation, we need to multiply the `input_price` of the model by the total number of `prompt_tokens`and the `output_price` by the number of `completion_tokens`

In [42]:
def calculate_cost(row, df_models):
  model_info = df_models[df_models["model_name"] == row["internal_model"]]
  
  if model_info.empty:
    return 0

  observation_date = pd.to_datetime(row["created_at"])
  model_prices = model_info[["start_date", "input_price", "output_price"]]
  
  cost = 0
  
  if model_prices.empty:
    return 0
  
  elif model_prices.shape[0] == 1:
    token_prices = model_prices.iloc[0]
    input_price = token_prices["input_price"]
    output_price = token_prices["output_price"]
    cost = row["prompt_tokens"] * input_price + row["completion_tokens"] * output_price
    
  elif model_prices.shape[0] > 1:
    most_recent_prices = model_prices[model_prices["start_date"] <= observation_date].sort_values(by="start_date", ascending=False)
    # print(most_recent_prices)
    if most_recent_prices.empty:
      # TODO: We could also return a cost of 0
      raise Exception(f"The observation {row} is younger than any of the multiple associated prices")
    else:
      token_prices = most_recent_prices.sort_values(by="start_date", ascending=False).iloc[0]
      input_price = token_prices["input_price"]
      output_price = token_prices["output_price"]
      cost = row["prompt_tokens"] * input_price + row["completion_tokens"] * output_price
  
  return cost

In [43]:
df_generation_observations.loc[:, "cost"] = df_generation_observations.apply(calculate_cost, axis=1, args=(df_models,))

## Question answering

* Cuantas trazas hay
* Cuantas trazas de los ultimos X dias que tienen la palabra prod o dev
* Cuanto nos gastamos en prod
* Cuanto nos gastamos en dev
* Cuanto nos gastamos en el resto de trazas
* Rankear las trazas no prod o dev


### How many unique tags are? 

In [44]:
print(f"There are {df_traces['tags'].value_counts().shape[0]} unique tag combinations")

There are 2003 unique tag combinations


In [45]:
# Step 1: Flatten the list values
all_values = [item for sublist in df_traces['tags'] for item in sublist]

# Step 2: Find unique values
unique_values = set(all_values)

# Step 3: Count unique values
unique_count = len(unique_values)

print(f"Number of unique tags: {unique_count}")

Number of unique tags: 1422


###  How many traces with the word `prod` or `dev`

In [46]:
def how_many_contain_the_tag(df, tag_name):
    # Check each list for the presence of the number 2
    contains_it = df['tags'].apply(lambda x: tag_name in x)
    
    # Sum the boolean values to get the count
    count_rows_with_it = contains_it.sum()
    
    print(f"Number of rows containing the tag '{tag_name}': {count_rows_with_it}/{df.shape[0]}")
    
    return count_rows_with_it
    
def how_many_do_not_contain_tags(df, tags_list):
    # Check each list for the absence of all tags in tags_list
    does_not_contain_any = df['tags'].apply(lambda x: all(tag not in x for tag in tags_list))
    
    # Sum the boolean values to get the count
    count_rows_without_tags = does_not_contain_any.sum()
    
    print(f"Number of rows not containing any of the tags {tags_list}: {count_rows_without_tags}/{df.shape[0]}")
    
    return count_rows_without_tags

def how_many_contain_all_tags(df, tags_list):
    # Check each list for the presence of all tags in tags_list
    does_not_contain_any = df['tags'].apply(lambda x: all(tag in x for tag in tags_list))
    
    # Sum the boolean values to get the count
    count_rows_with_tags = does_not_contain_any.sum()
    
    print(f"Number of rows containing all of the tags {tags_list}: {count_rows_with_tags}/{df.shape[0]}")
    
    return count_rows_with_tags
    
n_1 = how_many_contain_the_tag(df_traces, "prod")
n_2 = how_many_contain_the_tag(df_traces, "dev")
n_3 = how_many_do_not_contain_tags(df_traces, ["prod", "dev"])
n_4 = how_many_contain_all_tags(df_traces, ["prod", "dev"])

# We can see that the calculations are correct, there are 2 traces with both dev and prod tags
n_1 + n_2 + n_3

Number of rows containing the tag 'prod': 336668/440968
Number of rows containing the tag 'dev': 79914/440968
Number of rows not containing any of the tags ['prod', 'dev']: 27181/440968
Number of rows containing all of the tags ['prod', 'dev']: 2795/440968


443763

### Rank unique trace combinations according to `cost`

In [47]:
df_generation_observations

Unnamed: 0,id,type,trace_id,completion_tokens,prompt_tokens,total_tokens,version,created_at,unit,internal_model,cost
0,coreproduct-3e371bd7-1e95-40f3-946b-52f1a1fa0284,GENERATION,coreproduct-7131923f-3a53-49ea-a625-c97d1b1f1f00,4,540,544,,2024-06-26 07:46:54.650,TOKENS,gpt-3.5-turbo,0.000276000000000000000000000000
1,ed823260-9e78-46d8-a785-d3bcd66bbd97,GENERATION,e6286e89-d23c-4cb5-b347-a078e6ac65b4,54,555,609,,2024-06-20 18:57:11.317,TOKENS,gpt-4-turbo-preview,0.007170000000000000000000000000
2,a36cbbbc-b946-42c2-9847-2b9f79540855,GENERATION,0f581d01-e994-45fa-b3be-1551efab6d93,2,359,361,,2024-06-20 19:08:17.643,TOKENS,gpt-3.5-turbo,0.000182500000000000000000000000
3,d37d6f66-9536-4bdc-90ac-1afe329dab44,GENERATION,52e65955-0f90-4a70-8db3-7076d4fdacc7,587,2519,3106,,2024-06-21 09:11:10.345,TOKENS,gpt-3.5-turbo,0.002140000000000000000000000000
4,c49b7fd4-bd20-4692-8502-411894f730a3,GENERATION,0b1eefa8-53d0-4cb1-9c2e-e485edd8c9e4,702,1745,2447,,2024-06-21 09:11:20.674,TOKENS,gpt-3.5-turbo,0.001925500000000000000000000000
...,...,...,...,...,...,...,...,...,...,...,...
823568,coreproduct-29a574e2-db8f-4c2a-a124-64e4af07df9a,GENERATION,coreproduct-1b8861d6-179d-4be6-8b6b-7b421ac5c492,0,2807,2807,,2024-07-17 13:35:05.180,TOKENS,gpt-4-turbo-preview,0.02807000000000000000000000000
823569,coreproduct-32dadc0b-c038-43d7-a29a-ebf2ba906efd,GENERATION,coreproduct-dd6db9f8-8ed6-4926-b0da-9671cf70dd63,174,402,576,,2024-07-17 13:35:47.769,TOKENS,gpt-4-turbo-preview,0.009240000000000000000000000000
823570,coreproduct-278185f3-a2b7-4787-a9bb-7c592b9d5139,GENERATION,coreproduct-dd6db9f8-8ed6-4926-b0da-9671cf70dd63,424,2806,3230,,2024-07-17 13:35:55.571,TOKENS,gpt-4-turbo-preview,0.04078000000000000000000000000
823571,coreproduct-2e61c230-012d-4a87-8fc4-e76db089e29f,GENERATION,coreproduct-dd6db9f8-8ed6-4926-b0da-9671cf70dd63,348,2804,3152,,2024-07-17 13:35:55.603,TOKENS,gpt-4-turbo-preview,0.03848000000000000000000000000


### Rank tag combinations according to `cost`

In [48]:
merged_df = df_generation_observations.merge(df_traces, left_on='trace_id', right_on='id')

# Group by the 'tags' column and aggregate the costs
tag_combinations_cost = merged_df.groupby(merged_df['tags'].apply(tuple))['cost'].sum().reset_index()

# Rename columns for clarity
tag_combinations_cost.columns = ['tags', 'total_cost']

tag_combinations_cost.sort_values(["total_cost"], ascending=False)

Unnamed: 0,tags,total_cost
1920,"(prod, scanner, post-processing)",844.9957930000000000000000000
1930,"(prod, sql_agent_langgraph, auto_insights)",464.7707900000000000000000000
1924,"(prod, scanner, prnewswire, extract_content)",426.8825920000000000000000000
1665,"(dev, spend, moderna, invoice_categorization)",353.4621440000000000000000000
1828,"(prod, scanner, globenewswire, extract_content)",243.2467830000000000000000000
...,...,...
1948,"(test, benchmarking, NousResearch/Meta-Llama-3...",0
1947,"(test, benchmarking, NousResearch/Meta-Llama-3...",0
1945,"(test, benchmarking, NousResearch/Meta-Llama-3...",0
1684,"(dev, test_NewBedrock)",0


In [49]:
tag_combinations_cost["total_cost"].sum()

Decimal('7364.514307820000001042507020')

### Rank unique tags according to `cost`

In [50]:
# Expand the 'tags' column so each tag has its own row
expanded_df = merged_df.explode('tags')

# Group by the 'tags' column and aggregate the costs
unique_tags_cost = expanded_df.groupby('tags')['cost'].sum().reset_index()

# Rename columns for clarity
unique_tags_cost.columns = ['tag', 'total_cost']

unique_tags_cost.sort_values(["total_cost"], ascending=False)

Unnamed: 0,tag,total_cost
1281,prod,5279.621634800000001036622130
1331,scanner,3771.614579000000000000000000
990,dev,1966.068155650000000000078240
1005,extract_content,1485.012530500000000000000000
1274,post-processing,978.6641080000000000000000000
...,...,...
1316,sales_by_geo,0
103,NousResearch/Meta-Llama-3-8B-Instruct_06062024...,0
100,NousResearch/Meta-Llama-3-8B-Instruct_06052024...,0
101,NousResearch/Meta-Llama-3-8B-Instruct_06062024...,0


### Adam's questions

How much have we spent on `prod` and `dev`

In [51]:
def contain_the_tag(df, tag_name):  
    return df[df["tags"].apply(lambda x: tag_name in x)]  
    
def do_not_contain_tags(df, tags_list):
    return df[df['tags'].apply(lambda x: all(tag not in x for tag in tags_list))]

def contain_all_tags(df, tags_list):
    return df[df['tags'].apply(lambda x: all(tag in x for tag in tags_list))]

In [63]:
df_contain_prod_tag = contain_the_tag(tag_combinations_cost, "prod").sort_values(["total_cost"], ascending=False)
number = df_contain_prod_tag.shape[0]
cost = df_contain_prod_tag["total_cost"].sum()

print(f"There are {number} tag combinations with 'prod'. They cost {round(cost, 2)}$")
df_contain_prod_tag

There are 405 tag combinations with 'prod'. They cost 5279.62$


Unnamed: 0,tags,total_cost
1920,"(prod, scanner, post-processing)",844.9957930000000000000000000
1930,"(prod, sql_agent_langgraph, auto_insights)",464.7707900000000000000000000
1924,"(prod, scanner, prnewswire, extract_content)",426.8825920000000000000000000
1828,"(prod, scanner, globenewswire, extract_content)",243.2467830000000000000000000
58,"(GRA, RnD, prod, strategy_acceleration)",238.1254240000000000000000000
...,...,...
272,"(brain_example_langfuse, genai, prod)",0.000243000000000000000000000000
1749,"(prod, aily_brain_examples, anonymizer, fake_n...",0.000184500000000000000000000000
1745,"(prod, aily-ai-brain, tests, test_get_llm4)",0.000082000000000000000000000000
1780,"(prod, decision_advisor)",0


In [64]:
df_contain_dev_tag = contain_the_tag(tag_combinations_cost, "dev").sort_values(["total_cost"], ascending=False)
number = df_contain_dev_tag.shape[0]
cost = df_contain_dev_tag["total_cost"].sum()

print(f"There are {number} tag combinations with 'dev'. They cost {round(cost, 2)}$")

df_contain_dev_tag

There are 1492 tag combinations with 'dev'. They cost 1935.70$


Unnamed: 0,tags,total_cost
1665,"(dev, spend, moderna, invoice_categorization)",353.4621440000000000000000000
154,"(agent_network, aily_ai_brain, benchmarking, dev)",214.7574445000000000000000000
402,"(dev, RnD, GRA, fast_track, pathway_recommender)",211.3444240000000000000000000
374,"(dev,)",113.5171130000000000000000000
1658,"(dev, scanner, post-processing, sec-edgar)",65.14933000000000000000000000
...,...,...
1180,"(dev, core, scanner, google_search, companies/...",0.000183000000000000000000000000
1308,"(dev, core, scanner, google_search, companies/...",0.000183000000000000000000000000
1528,"(dev, fin_quarterly_report_test, summary)",0
1527,"(dev, fin_quarterly_report_test, sales_by_geo)",0


In [60]:
tags_combined = ["prod", "dev"]
df_contain_both_prod_dev_tags = contain_all_tags(tag_combinations_cost, tags_combined).sort_values(["total_cost"], ascending=False)
number = df_contain_both_prod_dev_tags.shape[0]
cost = df_contain_both_prod_dev_tags["total_cost"].sum()

print(f"There are {number} tag combinations with both {tags_combined}. They cost {round(cost, 2)}$")

df_contain_both_prod_dev_tags.head(10)

There are 4 tag combinations with both ['prod', 'dev']. They cost 14.09$


Unnamed: 0,tags,total_cost
1754,"(prod, anonymizer_utils, dev, anonymizer)",7.789392
198,"(anonymizer, anonymizer_utils, dev, prod)",5.89488
1755,"(prod, anonymizer_utils, dev, anonymizer, name...",0.378504
1524,"(dev, fin_annual_reports, genai, prod, test)",0.0264


In [62]:
tags_combined = ["prod", "dev"]
contain_neither_prod_dev_tags = do_not_contain_tags(tag_combinations_cost, tags_combined).sort_values(["total_cost"], ascending=False)
number = contain_neither_prod_dev_tags.shape[0]
cost = contain_neither_prod_dev_tags["total_cost"].sum()

print(f"There are {number} tag combinations with neither {tags_combined}. They cost {round(cost, 2)}$")

contain_neither_prod_dev_tags.head(10)

There are 62 tag combinations with neither ['prod', 'dev']. They cost 163.29$


Unnamed: 0,tags,total_cost
1951,"(uat, clinops_articles, rnd_whispers)",28.968533
195,"(ailybot, gpt)",26.6257
1953,"(uat, ctgov_trial_updates, rnd_whispers)",26.4258545
284,"(clinops_articles, rnd_whispers, uat)",24.1895505
186,"(ailybot, confluence-agent)",15.185268
366,"(ctgov_trial_updates, rnd_whispers, uat)",8.1146625
257,"(benchmarking, GPT4_06052024-1744, POS_dataset)",5.8578
256,"(benchmarking, GPT4_06052024-1739, POS_dataset)",5.8578
1952,"(uat, ctgov_new_trials, rnd_whispers)",5.7665405
181,"(ailybot, claude)",5.12938


In [66]:
df_contain_prod_tag.to_csv(f"prod_tags_with_cost.csv", index=False)