In [1]:
# Load variables from a .env file into the process environment before any
# client is created. Later cells read configuration from the environment
# (e.g. TRULENS_DB_CONN_STRING); presumably the LangSmith credentials come from
# the same file -- TODO confirm.
from dotenv import load_dotenv

load_dotenv()

True

## LangSmith Analysis

In [None]:
## Setup LangSmith
# Client() is built with no arguments, so its endpoint and API key have to come
# from the environment -- presumably the variables loaded by load_dotenv()
# in the first cell; TODO confirm.
from langsmith import Client
client = Client()

In [None]:
# Print the name of every dataset returned by the LangSmith client, one per line.
dataset_names = (dataset.name for dataset in client.list_datasets())
for dataset_name in dataset_names:
    print(dataset_name)

In [None]:
# Print every project that references the "blockchain_solana" dataset.
# NOTE: the loop variable `project` stays bound to the last project listed and
# the following cells read it directly, so this cell has to run first and has
# to yield at least one project, otherwise those cells raise NameError.
for project in client.list_projects(reference_dataset_name="blockchain_solana"):
    print(project)

In [None]:
# Show the feedback statistics of `project`, i.e. the last project left bound
# by the listing loop in the previous cell.
project.feedback_stats

In [None]:
# Show the id of `project` (the last project left bound by the listing loop).
project.id

In [None]:
# Flatten the currently bound `project` into a single dict: identity and run
# metadata first, then timing and token usage, then the sample count ("n") and
# mean ("avg") of each feedback score. Key order matches the exported columns.
metadata = project.extra["metadata"]

results = {
    "id": f"{project.id}",
    "name": project.name,
    "url": project.url,
    "dataset": metadata["dataset"],
    "collection": metadata["collection"],
    "eval-model": metadata["eval-model"],
    "eval-run": metadata["run"],
    "start_time": project.start_time.isoformat(),
    "last_run_start_time": project.last_run_start_time.isoformat(),
    "run_count": project.run_count,
    "latency_p50": project.latency_p50.total_seconds(),
    "latency_p99": project.latency_p99.total_seconds(),
    "prompt_tokens": project.prompt_tokens,
    "completion_tokens": project.completion_tokens,
    "total_tokens": project.total_tokens,
}

# (feedback key, output column for "n", output column for "avg")
feedback_columns = (
    ("answer_correctness_score", "answer_correctness_n", "answer_correctness_avg"),
    ("answer_relevancy_score", "answer_relevancy_n", "answer_relevancy_avg"),
    ("context_recall_score", "context_recall_n", "context_recall_avg"),
    ("context_relevancy_score", "context_relevancy_n", "context_relevancy_avg"),
    ("faithfulness_score", "faithfulness_n", "faithfulness_avg"),
)
feedback = project.feedback_stats
for score_key, n_column, avg_column in feedback_columns:
    score = feedback[score_key]
    results[n_column] = score["n"]
    results[avg_column] = score["avg"]

results

In [None]:
import json

# Feedback scores exported per project. Each one is looked up in
# project.feedback_stats under "<metric>_score" and written as two columns,
# "<metric>_n" and "<metric>_avg".
FEEDBACK_METRICS = (
    "answer_correctness",
    "answer_relevancy",
    "context_recall",
    "context_relevancy",
    "faithfulness",
)


def _project_record(project):
    """Flatten one LangSmith project into a JSON-serialisable dict.

    Every optional value is handled on its own: a missing last-run timestamp,
    latency percentile, or feedback score becomes ``None`` (``null`` in the
    JSON output) instead of raising. This replaces the previous all-or-nothing
    switch on ``project.total_tokens``, which crashed the export when a project
    had tokens but lacked one of these values.

    Args:
        project: a project object as yielded by ``client.list_projects``.

    Returns:
        dict with the same keys, in the same order, as the previous export.

    Raises:
        KeyError: if ``project.extra["metadata"]`` lacks one of the expected
            keys ("dataset", "collection", "eval-model", "run"). This is
            unchanged from the previous behaviour.
    """
    metadata = project.extra["metadata"]
    last_run = project.last_run_start_time
    p50 = project.latency_p50
    p99 = project.latency_p99
    # feedback_stats may be absent for a project without feedback.
    feedback_stats = project.feedback_stats or {}

    record = {
        "id": f"{project.id}",
        "name": project.name,
        "url": project.url,
        "dataset": metadata["dataset"],
        "collection": metadata["collection"],
        "eval-model": metadata["eval-model"],
        "eval-run": metadata["run"],
        "start_time": project.start_time.isoformat(),
        "last_run_start_time": last_run.isoformat() if last_run is not None else None,
        "run_count": project.run_count,
        "latency_p50": p50.total_seconds() if p50 is not None else None,
        "latency_p99": p99.total_seconds() if p99 is not None else None,
        "prompt_tokens": project.prompt_tokens,
        "completion_tokens": project.completion_tokens,
        "total_tokens": project.total_tokens,
    }
    for metric in FEEDBACK_METRICS:
        stats = feedback_stats.get(f"{metric}_score") or {}
        record[f"{metric}_n"] = stats.get("n")
        record[f"{metric}_avg"] = stats.get("avg")
    return record


# Write one JSON object per line for every project of every dataset.
with open("results.jsonl", "w", encoding="utf-8") as out_file:
    for dataset in client.list_datasets():
        for project in client.list_projects(reference_dataset_id=dataset.id):
            results = _project_record(project)
            out_file.write(json.dumps(results) + "\n")

In [None]:
import pandas

# Convert the JSON-lines export into a CSV file with a header row. The
# DataFrame index is written too, as the first (unnamed) column.
jsonl_path, csv_path = "results.jsonl", "results.csv"
df = pandas.read_json(jsonl_path, lines=True, orient="records")
df.to_csv(csv_path, header=True)

## TruLens Analysis

In [None]:
%pip install psycopg2-binary


In [2]:
import os

from trulens_eval import Tru

# The TruLens database connection string is read from the environment; the
# value is passed through unchanged (None when the variable is not set).
trulens_db_url = os.getenv("TRULENS_DB_CONN_STRING")
tru = Tru(database_url=trulens_db_url)

🦑 Tru initialized with db url postgresql://postgres:***@127.0.0.1:5432 .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [13]:
def _count_records(app_id):
    """Return how many records TruLens has stored for a single app id."""
    records_df, _feedback_columns = tru.get_records_and_feedback([app_id])
    return len(records_df)


# Map every app id known to TruLens to its number of stored records.
app_ids_to_example_count = {
    app["app_id"]: _count_records(app["app_id"]) for app in tru.get_apps()
}

app_ids_to_example_count

{'llama_512_da31_blockchain_solana': 58,
 'lc_512_4676_blockchain_solana': 58,
 'llama_512_da31_braintrust_coda_help_desk': 100,
 'lc_512_4676_braintrust_coda_help_desk': 100,
 'lc_512_4676_covid_qa': 316,
 'llama_512_da31_covid_qa': 316,
 'lc_512_4676_evaluating_llm_survey_paper': 276,
 'llama_512_da31_evaluating_llm_survey_paper': 276,
 'lc_512_4676_history_of_alexnet': 160,
 'llama_512_da31_history_of_alexnet': 160,
 'lc_512_4676_llama_2_paper': 100,
 'lc_512_4676_mini_squad_v2': 40,
 'llama_512_da31_llama_2_paper': 100,
 'llama_512_da31_mini_squad_v2': 195,
 'llama_512_da31_origin_of_covid_19': 24,
 'llama_512_da31_patronus_ai_financebench': 98,
 'llama_512_da31_paul_grahman_essay': 44,
 'llama_512_da31_uber_10k': 822}

In [17]:
# Build the leaderboard for exactly the apps counted in the previous cell.
# The ids are passed as a real list instead of a dict keys view, because the
# API is documented as taking a list of app ids.
# NOTE(review): the saved output of this cell shows llama_512_da31_uber_10k with
# a groundedness of about 2.0e10 and an agreement above 1, while every other
# row is within 0-1 -- presumably a few bad feedback rows; verify before
# relying on the exported numbers.
board = tru.get_leaderboard(app_ids=list(app_ids_to_example_count))
board

Unnamed: 0_level_0,qs_relevance_with_cot_reasons,relevance_with_cot_reasons,agreement_measure,groundedness_measure_with_cot_reasons,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lc_512_4676_braintrust_coda_help_desk,0.787,0.958,0.773,0.7473972,2.8,0.0
lc_512_4676_blockchain_solana,0.777586,0.994828,0.793103,0.8565887,3.0,0.0
lc_512_4676_llama_2_paper,0.728,0.889,0.675,0.6891071,2.8,0.0
lc_512_4676_covid_qa,0.697785,0.955063,0.706329,0.755267,2.167722,0.0
llama_512_da31_origin_of_covid_19,0.683333,0.983333,0.741667,0.8659226,3.291667,0.0
llama_512_da31_covid_qa,0.663054,0.958544,0.821519,0.8250409,2.167722,0.0
llama_512_da31_uber_10k,0.599574,0.983942,1.039416,20275750000.0,1.399027,0.0
lc_512_4676_evaluating_llm_survey_paper,0.580797,0.905435,0.842029,0.8473688,2.199275,0.0
llama_512_da31_braintrust_coda_help_desk,0.572,0.991,0.784,0.7685465,2.8,0.0
lc_512_4676_mini_squad_v2,0.4975,0.935,0.7425,0.5016544,3.05,0.0


In [9]:
# Save the TruLens leaderboard under its own file name. The LangSmith section
# of this notebook already writes "results.csv", so reusing that path here
# silently overwrote the LangSmith export whenever the whole notebook was run.
board.to_csv("trulens_leaderboard.csv")

In [11]:
# Number of rows in the leaderboard DataFrame.
len(board)

11