In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before any
# client is created; a later cell reads TRULENS_DB_CONN_STRING via os.getenv.
# NOTE(review): presumably the LangSmith settings used by Client() are also
# expected to come from here — confirm.
load_dotenv()

True

## LangSmith Analysis

In [None]:
## Setup LangSmith
from langsmith import Client
# Client() is built with no arguments, so no endpoint or API key is passed here.
# NOTE(review): presumably its configuration is picked up from the environment
# populated by load_dotenv() — confirm.
client = Client()

In [None]:
# Print the name of every dataset returned by the client.
# `dataset` stays bound in the kernel namespace after the loop finishes.
for dataset in client.list_datasets():
    print(dataset.name)

In [None]:
# Print each project whose reference dataset is "blockchain_solana".
# The following cells read `project` directly, i.e. whichever project this
# loop bound last; if the listing is empty, `project` is never assigned here.
for project in client.list_projects(reference_dataset_name="blockchain_solana"):
    print(project)

In [None]:
# Display the feedback statistics of the project left bound by the listing loop.
# Later cells index this mapping as feedback_stats["<metric>_score"]["n" | "avg"].
project.feedback_stats

In [None]:
# Display the id of the project left bound by the listing loop.
project.id

In [None]:
# Flatten the currently bound `project` into one flat record: identity,
# experiment metadata, timing, token usage, then per-metric feedback stats.
metadata = project.extra["metadata"]
feedback = project.feedback_stats

results = {
    # Identity
    "id": f"{project.id}",
    "name": project.name,
    "url": project.url,
    # Experiment metadata attached to the project
    "dataset": metadata["dataset"],
    "collection": metadata["collection"],
    "eval-model": metadata["eval-model"],
    "eval-run": metadata["run"],
    # Timing and volume
    "start_time": project.start_time.isoformat(),
    "last_run_start_time": project.last_run_start_time.isoformat(),
    "run_count": project.run_count,
    "latency_p50": project.latency_p50.total_seconds(),
    "latency_p99": project.latency_p99.total_seconds(),
    # Token usage
    "prompt_tokens": project.prompt_tokens,
    "completion_tokens": project.completion_tokens,
    "total_tokens": project.total_tokens,
}

# Each metric contributes "<metric>_n" and "<metric>_avg", read from
# feedback_stats["<metric>_score"], in this fixed column order.
for metric in (
    "answer_correctness",
    "answer_relevancy",
    "context_recall",
    "context_relevancy",
    "faithfulness",
):
    stats = feedback[f"{metric}_score"]
    results[f"{metric}_n"] = stats["n"]
    results[f"{metric}_avg"] = stats["avg"]

results

In [None]:
import json

# Metrics exported per project. Each one is read from
# feedback_stats["<metric>_score"] and written as "<metric>_n" / "<metric>_avg".
FEEDBACK_METRICS = (
    "answer_correctness",
    "answer_relevancy",
    "context_recall",
    "context_relevancy",
    "faithfulness",
)


def project_to_record(project):
    """Flatten one LangSmith project into a JSON-serialisable dict.

    A project whose ``total_tokens`` is falsy is treated as having produced no
    results: its last-run time, latencies and feedback columns are written as
    ``None``. For any other project each of those values is read
    independently, and a value that is absent becomes ``None`` instead of
    aborting the whole export.

    Args:
        project: An item yielded by ``client.list_projects``. Its
            ``extra["metadata"]`` must provide the ``dataset``, ``collection``,
            ``eval-model`` and ``run`` keys.

    Returns:
        dict: One flat record with the same keys, in the same order, for
        every project.

    Raises:
        KeyError: If a required metadata key is missing.
        TypeError: If ``project.extra`` is ``None``.
    """
    metadata = project.extra["metadata"]
    has_results = bool(project.total_tokens)

    # Only touch the run-derived attributes when the project reports tokens.
    last_run = project.last_run_start_time if has_results else None
    latency_p50 = project.latency_p50 if has_results else None
    latency_p99 = project.latency_p99 if has_results else None
    feedback_stats = (project.feedback_stats or {}) if has_results else {}

    record = {
        "id": f"{project.id}",
        "name": project.name,
        "url": project.url,
        "dataset": metadata["dataset"],
        "collection": metadata["collection"],
        "eval-model": metadata["eval-model"],
        "eval-run": metadata["run"],
        "start_time": project.start_time.isoformat(),
        "last_run_start_time": last_run.isoformat() if last_run is not None else None,
        "run_count": project.run_count,
        "latency_p50": latency_p50.total_seconds() if latency_p50 is not None else None,
        "latency_p99": latency_p99.total_seconds() if latency_p99 is not None else None,
        "prompt_tokens": project.prompt_tokens,
        "completion_tokens": project.completion_tokens,
        "total_tokens": project.total_tokens,
    }

    for metric in FEEDBACK_METRICS:
        # A metric that was never scored simply yields empty columns.
        stats = feedback_stats.get(f"{metric}_score") or {}
        record[f"{metric}_n"] = stats.get("n")
        record[f"{metric}_avg"] = stats.get("avg")

    return record


# Write one JSON line per project, across every dataset the client can see.
# Mode "w" truncates any previous results.jsonl.
with open("results.jsonl", "w") as out_file:
    for dataset in client.list_datasets():
        for project in client.list_projects(reference_dataset_id=dataset.id):
            results = project_to_record(project)
            out_file.write(json.dumps(results) + "\n")

In [None]:
import pandas

# Read the line-delimited JSON export back in and save the same table as CSV.
# to_csv keeps its default index handling, so the row index is written as the
# first column of results.csv.
df = pandas.read_json(
    path_or_buf="results.jsonl",
    lines=True,
    orient="records",
)
df.to_csv(path_or_buf="results.csv", header=True)

## TruLens Analysis

In [6]:
%pip install psycopg2-binary


Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.4 kB)
Downloading psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.9

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
from trulens_eval import Tru
import os

# Connect TruLens to the database named by TRULENS_DB_CONN_STRING.
# os.getenv returns None when the variable is unset, so None would be passed
# as database_url. NOTE(review): confirm what Tru does in that case.
# NOTE(review): the recorded output warns that secret keys may be written to
# the database and points at Tru's `database_redact_keys` option.
tru = Tru(database_url=os.getenv("TRULENS_DB_CONN_STRING"))

🦑 Tru initialized with db url postgresql://postgres:***@127.0.0.1:5432 .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [14]:
# Keep only the ids of recorded apps whose id ends in "_512".
# NOTE(review): the meaning of the "_512" suffix is not stated here — confirm.
app_ids = [
    candidate_id
    for candidate_id in (entry["app_id"] for entry in tru.get_apps())
    if candidate_id.endswith("_512")
]


In [15]:
# Show the leaderboard restricted to the app ids collected in `app_ids`.
# NOTE(review): in the recorded output, covid_qa_open_ai_512 has a
# groundedness value of 283657500000.0 and patronus_ai_financebench_llama_512
# an agreement value of 6.739796, far from every other row — worth checking
# the underlying records before relying on these averages.
tru.get_leaderboard(app_ids=app_ids)

Unnamed: 0_level_0,qs_relevance_with_cot_reasons,relevance_with_cot_reasons,groundedness_measure_with_cot_reasons,agreement_measure,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
braintrust_coda_help_desk_open_ai_512,0.8,0.942,0.7507057,0.776,1.615,0.0
blockchain_solana_open_ai_512,0.780172,0.986207,0.8695539,0.808621,1.663793,0.0
llama_2_paper_open_ai_512,0.7315,0.8905,0.6920077,0.673,1.615,0.0
covid_qa_open_ai_512,0.702532,0.953481,283657500000.0,0.710127,1.618671,0.0
evaluating_llm_survey_paper_open_ai_512,0.589312,0.906341,0.7871784,0.825,1.726449,0.0
mini_squad_v2_open_ai_512,0.50375,0.93,0.5059211,0.71875,2.0125,0.0
blockchain_solana_llama_512,0.482031,0.996552,0.8849967,0.748276,2.137931,0.0
history_of_alexnet_open_ai_512,0.056563,0.799687,0.06372329,0.492188,1.565625,0.0
patronus_ai_financebench_llama_512,,0.993878,,6.739796,1.857143,0.0
origin_of_covid_19_llama_512,,0.983333,,0.733333,1.666667,0.0
