In [None]:
from dotenv import load_dotenv

# Load environment variables from a local .env file before any client is built.
# NOTE(review): presumably this is where the LangSmith credentials and
# TRULENS_DB_CONN_STRING (read further down) come from — confirm.
load_dotenv()

## LangSmith Analysis

In [None]:
## Setup LangSmith
# Single LangSmith client shared by every query in this section.
# NOTE(review): Client() is built without arguments, so it presumably takes its
# API key / endpoint from the environment — confirm they are set.
from langsmith import Client
client = Client()

In [None]:
# Print the name of every dataset this client can list; a quick check that the
# connection works and of the dataset names used below.
for dataset in client.list_datasets():
    print(dataset.name)

In [None]:
# Print every project that references the "blockchain_solana" dataset.
# NOTE: the following cells read `project` after this loop has finished, so they
# see the LAST project yielded here and raise NameError if nothing was yielded.
for project in client.list_projects(reference_dataset_name="blockchain_solana"):
    print(project)

In [None]:
# Feedback statistics of the last project left over from the listing loop.
project.feedback_stats

In [None]:
# Identifier of the last project left over from the listing loop.
project.id

In [None]:
# Flatten the interesting fields of the current `project` into one flat,
# JSON-friendly record (timestamps as ISO strings, latencies as seconds).
metadata = project.extra["metadata"]
feedback = project.feedback_stats

results = {
    "id": str(project.id),
    "name": project.name,
    "url": project.url,
    "dataset": metadata["dataset"],
    "collection": metadata["collection"],
    "eval-model": metadata["eval-model"],
    "eval-run": metadata["run"],
    "start_time": project.start_time.isoformat(),
    "last_run_start_time": project.last_run_start_time.isoformat(),
    "run_count": project.run_count,
    "latency_p50": project.latency_p50.total_seconds(),
    "latency_p99": project.latency_p99.total_seconds(),
    "prompt_tokens": project.prompt_tokens,
    "completion_tokens": project.completion_tokens,
    "total_tokens": project.total_tokens,
}

# Every feedback metric is read from feedback_stats["<metric>_score"] and adds a
# sample count ("<metric>_n") and an average ("<metric>_avg"), in this order.
for metric in (
    "answer_correctness",
    "answer_relevancy",
    "context_recall",
    "context_relevancy",
    "faithfulness",
):
    metric_stats = feedback[f"{metric}_score"]
    results[f"{metric}_n"] = metric_stats["n"]
    results[f"{metric}_avg"] = metric_stats["avg"]

results

In [None]:
import json

# Feedback metrics exported for every project. Each one is read from
# feedback_stats["<metric>_score"] and yields "<metric>_n" and "<metric>_avg".
FEEDBACK_METRICS = (
    "answer_correctness",
    "answer_relevancy",
    "context_recall",
    "context_relevancy",
    "faithfulness",
)


def _iso_or_none(moment):
    """Return ``moment.isoformat()``, or None when there is no timestamp."""
    return None if moment is None else moment.isoformat()


def _seconds_or_none(duration):
    """Return ``duration.total_seconds()``, or None when there is no duration."""
    return None if duration is None else duration.total_seconds()


def project_to_record(project):
    """Flatten one project into a JSON-serialisable dict (one JSONL row).

    Every optional piece of data (metadata entries, last-run timestamp,
    latencies, each feedback metric) is checked on its own and written as None
    when absent. This replaces a single check on ``total_tokens`` that was used
    as a proxy for all of them, so a project with tokens but, say, one missing
    feedback metric no longer aborts the whole export.

    Args:
        project: object exposing the attributes read below (id, name, url,
            extra, start_time, last_run_start_time, run_count, latency_p50,
            latency_p99, prompt_tokens, completion_tokens, total_tokens,
            feedback_stats).

    Returns:
        dict with a fixed key order, so that every row has the same columns.
    """
    metadata = (project.extra or {}).get("metadata") or {}
    feedback = project.feedback_stats or {}

    record = {
        "id": f"{project.id}",
        "name": project.name,
        "url": project.url,
        "dataset": metadata.get("dataset"),
        "collection": metadata.get("collection"),
        "eval-model": metadata.get("eval-model"),
        "eval-run": metadata.get("run"),
        "start_time": _iso_or_none(project.start_time),
        "last_run_start_time": _iso_or_none(project.last_run_start_time),
        "run_count": project.run_count,
        "latency_p50": _seconds_or_none(project.latency_p50),
        "latency_p99": _seconds_or_none(project.latency_p99),
        "prompt_tokens": project.prompt_tokens,
        "completion_tokens": project.completion_tokens,
        "total_tokens": project.total_tokens,
    }

    for metric in FEEDBACK_METRICS:
        metric_stats = feedback.get(f"{metric}_score") or {}
        record[f"{metric}_n"] = metric_stats.get("n")
        record[f"{metric}_avg"] = metric_stats.get("avg")

    return record


# One JSON line per project, over every project of every dataset.
with open("results.jsonl", "w") as out_file:
    for dataset in client.list_datasets():
        for project in client.list_projects(reference_dataset_id=dataset.id):
            results = project_to_record(project)
            out_file.write(json.dumps(results) + "\n")

In [None]:
import pandas

# Convert the JSON-lines export into a CSV file.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (to_csv default) — pass index=False if that is unwanted.
# NOTE(review): "results.csv" is also the target of `board.to_csv(...)` in the
# TruLens section, which overwrites this file when the notebook runs top to
# bottom.
df = pandas.read_json("results.jsonl", orient="records", lines=True)
df.to_csv("results.csv", header=True)

## TruLens Analysis

In [None]:
%pip install psycopg2-binary

In [None]:
from trulens_eval import Tru
import os

# Connect TruLens to the database named by TRULENS_DB_CONN_STRING.
# NOTE(review): os.getenv returns None when the variable is unset, so
# database_url=None is passed in that case — presumably TruLens then falls back
# to a default local database instead of failing; confirm that is intended.
tru = Tru(database_url=os.getenv("TRULENS_DB_CONN_STRING"))

In [None]:
# Number of recorded examples per TruLens app, keyed by app id.
app_ids_to_example_count = {}

for app_id in (app["app_id"] for app in tru.get_apps()):
    # Only the records frame is needed here; the feedback column names are ignored.
    records, _feedback_columns = tru.get_records_and_feedback([app_id])
    app_ids_to_example_count[app_id] = len(records)

app_ids_to_example_count

In [None]:
# Leaderboard restricted to the apps counted in `app_ids_to_example_count`.
# The ids are passed as a real list: `dict.keys()` is only a view object, while
# get_leaderboard is documented to take a list of app ids.
board = tru.get_leaderboard(app_ids=list(app_ids_to_example_count))
board

In [None]:
# Persist the leaderboard as CSV.
# NOTE(review): this reuses the file name "results.csv" that the LangSmith
# section writes earlier in this notebook, so that export is overwritten here —
# confirm this is intended, or use a different file name.
board.to_csv('results.csv')

In [None]:
import pandas as pd

# The leaderboard is indexed by app id; field 0 of the "_"-separated id is the
# tool and everything from field 3 onwards is the dataset name.
dataset_names = sorted({"_".join(app_id.split("_")[3:]) for app_id in board.index})

# Empty dataset x tool grid, built in one step: one all-NaN float row per
# dataset, sorted by name. Building it at once (instead of enlarging the frame
# row by row with None values) guarantees float columns and avoids repeated
# re-allocation.
df = pd.DataFrame(
    index=dataset_names,
    columns=["langchain", "llamaindex"],
    dtype="float",
)
df

In [None]:
# One independent (deep) copy of the empty dataset x tool grid per measurement.
(
    groundedness,
    answer_relevance,
    context_relevance,
    answer_correctness,
    latency,
    total_cost,
) = (df.copy(deep=True) for _ in range(6))

# Which leaderboard column feeds which grid, in fill order.
measurement_sources = [
    (groundedness, "groundedness_measure_with_cot_reasons"),
    (answer_relevance, "relevance_with_cot_reasons"),
    (context_relevance, "qs_relevance_with_cot_reasons"),
    (answer_correctness, "agreement_measure"),
    (latency, "latency"),
    (total_cost, "total_cost"),
]

# App-id prefix -> column name in the grids; unknown prefixes are used as-is.
tool_labels = {"lc": "langchain", "llama": "llamaindex"}

for app_id, leaderboard_row in board.iterrows():
    fields = app_id.split("_")
    dataset = "_".join(fields[3:])
    tool = tool_labels.get(fields[0], fields[0])

    for frame, source_column in measurement_sources:
        frame.at[dataset, tool] = leaderboard_row[source_column]


In [None]:
# Display the filled groundedness grid (rows: datasets, columns: tools).
groundedness

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [None]:
# Grouped bar chart of groundedness: one group per dataset, one bar per tool.
tool_colors = ListedColormap(['#0671b7', '#f678a7'])
fig, ax = plt.subplots()
groundedness.plot.bar(ax=ax, colormap=tool_colors)

# Axis labels, fixed 0-1 score range and title in a single call.
ax.set(
    xlabel='Dataset',
    ylabel='Score',
    ylim=[0, 1],
    title='Groundedness (answer supported by the context)',
)
fig.tight_layout(pad=1)

In [None]:
# Grouped bar chart of answer relevance: one group per dataset, one bar per tool.
tool_colors = ListedColormap(['#0671b7', '#f678a7'])
fig, ax = plt.subplots()
answer_relevance.plot.bar(ax=ax, colormap=tool_colors)

# The y-axis starts at 0.75 (not 0), which magnifies differences between tools.
ax.set(
    xlabel='Dataset',
    ylabel='Score',
    ylim=[0.75, 1],
    title='Answer Relevance (answer relevance to query)',
)
fig.tight_layout(pad=1)

In [None]:
# Grouped bar chart of context relevance: one group per dataset, one bar per tool.
tool_colors = ListedColormap(['#0671b7', '#f678a7'])
fig, ax = plt.subplots()
context_relevance.plot.bar(ax=ax, colormap=tool_colors)

# Axis labels, fixed 0-1 score range and title in a single call.
ax.set(
    xlabel='Dataset',
    ylabel='Score',
    ylim=[0, 1],
    title='Context Relevance (context relevance to query)',
)
fig.tight_layout(pad=1)

In [None]:
# Grouped bar chart of answer correctness: one group per dataset, one bar per tool.
tool_colors = ListedColormap(['#0671b7', '#f678a7'])
fig, ax = plt.subplots()
answer_correctness.plot.bar(ax=ax, colormap=tool_colors)

# Axis labels, fixed 0-1 score range and title in a single call.
ax.set(
    xlabel='Dataset',
    ylabel='Score',
    ylim=[0, 1],
    title='Answer Correctness (answer compared to ground)',
)
fig.tight_layout(pad=1)

In [None]:
# Grouped bar chart of latency: one group per dataset, one bar per tool.
fig, ax = plt.subplots()
ax = latency.plot.bar(colormap=ListedColormap(['#0671b7', '#f678a7']), ax=ax)

# Latency is a duration, not a 0-1 score: label the axis with its unit and leave
# the y-range auto-scaled.
# NOTE(review): TruLens reports leaderboard latency in seconds — confirm for the
# installed version.
ax.set_ylabel('Latency (seconds)')
ax.set_xlabel('Dataset')
ax.set_title('Latency')
fig.tight_layout(pad=1)

In [None]:
# Grouped bar chart of total cost: one group per dataset, one bar per tool.
fig, ax = plt.subplots()
ax = total_cost.plot.bar(colormap=ListedColormap(['#0671b7', '#f678a7']), ax=ax)

# Cost is an amount of money, not a 0-1 score: label the axis accordingly and
# leave the y-range auto-scaled.
# NOTE(review): TruLens reports cost in USD — confirm for the installed version.
ax.set_ylabel('Total cost (USD)')
ax.set_xlabel('Dataset')
ax.set_title('Total Cost')
fig.tight_layout(pad=1)

## Thorough Data Analysis

In [None]:
# Columns retained from each app's record frame: identification and raw text,
# the four feedback scores, and the run statistics.
columns_to_keep = [
    "record_id",
    "input",
    "output",
    "tags",
    "groundedness_measure_with_cot_reasons",
    "relevance_with_cot_reasons",
    "qs_relevance_with_cot_reasons",
    "agreement_measure",
    "latency",
    "total_tokens",
    "total_cost",
]

# app id -> per-record frame restricted to the columns above.
apps = {}
for app_id in (app["app_id"] for app in tru.get_apps()):
    records, _feedback_columns = tru.get_records_and_feedback([app_id])
    apps[app_id] = records[columns_to_keep]

### Testing Normality

The Shapiro-Wilk test checks whether a sample is plausibly drawn from a normal distribution. When two methods are compared on the same set of test cases (a paired test scenario), each data point in one dataset corresponds to a data point in the other dataset. In other words, the data points are paired.

This is particularly relevant when you want to:

1. **Perform a Paired Sample Test**: In a paired sample t-test or a Wilcoxon signed-rank test, the difference between each pair of observations is crucial. These tests are based on the differences within each pair (i.e., each data point in Method 1 is subtracted from the corresponding data point in Method 2). Therefore, it's important that the two datasets are aligned such that each data point in one dataset has a direct, corresponding data point in the other dataset.

1. **Test for Normality in Paired Differences**: When using the Shapiro-Wilk test in this context, you're typically testing the normality of these differences, not the individual datasets. Therefore, the datasets must be paired correctly before calculating these differences.

For example, if you have a list of test cases and you apply Method 1 and Method 2 to each test case, you should ensure that the results for each method are aligned such that the result of Method 1 for Test Case 1 is in the same position (same index) as the result of Method 2 for Test Case 1, and so on for all test cases.

In summary, for paired analyses, the order and pairing of data points between datasets are crucial. The results for each test case from each method need to be correctly aligned for the paired analysis to be valid.

In [None]:
# App ids collected so far; the tool and dataset names are parsed from these ids
# in the next cell.
apps.keys()

In [None]:
# Regroup the per-app frames as dataFrames[dataset][tool], each sorted by the
# `input` column so that the frames of two tools can be compared row by row.

dataFrames = {}

for app_id, records in apps.items():
    id_fields = app_id.split("_")
    tool = id_fields[0]                # first field of the id is the tool prefix
    dataset = "_".join(id_fields[3:])  # fields 3.. form the dataset name

    dataFrames.setdefault(dataset, {})[tool] = records.sort_values(by="input")

In [None]:
import numpy as np
import scipy.stats as stats

columns_to_test = [
    "groundedness_measure_with_cot_reasons",
    "relevance_with_cot_reasons",
    "qs_relevance_with_cot_reasons",
    "agreement_measure",
    "latency"]

# Significance level for rejecting H0 ("the paired differences are normal").
alpha = 0.05

for dataset in dataFrames:
    print(dataset)
    tool_frames = dataFrames[dataset]
    # A paired comparison needs results from both tools for this dataset.
    if "lc" not in tool_frames or "llama" not in tool_frames:
        print("\tResults for both tools are required, skipping :(")
        continue
    lc_df = tool_frames["lc"]
    llama_df = tool_frames["llama"]
    if len(lc_df) != len(llama_df):
        print("\tDataFrames do not have equal row counts, skipping :(")
        continue
    # Rows are paired by position (both frames are sorted by `input`), so equal
    # lengths alone do not prove that row i of both frames is the same test case.
    if lc_df["input"].to_list() != llama_df["input"].to_list():
        print("\tWARNING: inputs differ between the tools, pairs may be misaligned")
    for test in columns_to_test:
        print(f"\tTesting normality of {test}:")
        lc_values = lc_df[test].to_numpy(dtype=float)
        llama_values = llama_df[test].to_numpy(dtype=float)

        # Paired differences; pairs with a missing score on either side are
        # dropped, otherwise a single NaN turns the whole test result into NaN.
        differences = lc_values - llama_values
        differences = differences[~np.isnan(differences)]

        # Shapiro-Wilk is not defined for fewer than 3 observations.
        if differences.size < 3:
            print("\t\tFewer than 3 paired values, cannot test normality")
            continue

        # Perform the Shapiro-Wilk Test
        statistic, p_value = stats.shapiro(differences)

        print(f"\t\tShapiro-Wilk Test statistic: {statistic}")
        print("\t\tP-value:", p_value)

        # Interpretation. A NaN p-value compares False against alpha and must
        # not be reported as a rejection of H0.
        if np.isnan(p_value):
            print("\t\t\tTest is inconclusive (p-value is NaN)")
        elif p_value > alpha:
            print("\t\t\tData follows a normal distribution (fail to reject H0)")
        else:
            print("\t\t\tData does NOT follow a normal distribution (reject H0)")

Conclusion: Most dataset comparisons do NOT follow a normal distribution, so a non-parametric test is used below instead of a paired t-test.

### Testing Betterness

**Wilcoxon Signed-Rank Test**: This is a non-parametric alternative to the paired sample t-test and is used to compare two related samples or repeated measurements on a single sample to assess whether their population mean ranks differ. It is appropriate for this scenario, where the data is paired (the same cases tested with two different methods).

In [None]:
from scipy import stats
import numpy as np

columns_to_test = [
    "groundedness_measure_with_cot_reasons",
    "relevance_with_cot_reasons",
    "qs_relevance_with_cot_reasons",
    "agreement_measure",
    "latency"]

# Leaderboard column name -> short name used in the printed report.
column_translation = {
    "groundedness_measure_with_cot_reasons": "groundedness",
    "relevance_with_cot_reasons" : "answer_relevance",
    "qs_relevance_with_cot_reasons" : "context_relevance",
    "agreement_measure" : "answer_correctness",
    "latency" : "latency"
}

# Significance level for rejecting H0 ("no systematic difference between tools").
alpha = 0.05

for dataset in dataFrames:
    print(dataset)
    tool_frames = dataFrames[dataset]
    # A paired comparison needs results from both tools for this dataset.
    if "lc" not in tool_frames or "llama" not in tool_frames:
        print("\tResults for both tools are required, skipping :(")
        continue
    lc_df = tool_frames["lc"]
    llama_df = tool_frames["llama"]
    if len(lc_df) != len(llama_df):
        print("\tDataFrames do not have equal row counts, skipping :(")
        continue
    # Rows are paired by position (both frames are sorted by `input`), so equal
    # lengths alone do not prove that row i of both frames is the same test case.
    if lc_df["input"].to_list() != llama_df["input"].to_list():
        print("\tWARNING: inputs differ between the tools, pairs may be misaligned")
    for test in columns_to_test:
        translated = column_translation[test]
        print(f"\tTesting differences of {translated}:")
        lc_values = lc_df[test].to_numpy(dtype=float)
        llama_values = llama_df[test].to_numpy(dtype=float)

        # Keep only pairs where both tools have a score; a single NaN would
        # otherwise turn the statistic and the p-value into NaN.
        paired = ~(np.isnan(lc_values) | np.isnan(llama_values))
        lc_values = lc_values[paired]
        llama_values = llama_values[paired]

        # Positive difference means the Llama value is the larger one.
        differences = llama_values - lc_values

        # The signed-rank test ranks the non-zero differences; without any of
        # them there is nothing to test.
        if not np.any(differences):
            print('\t\tNo non-zero paired differences, test not applicable')
            continue

        # Perform the Wilcoxon Signed-Rank Test
        stat, p = stats.wilcoxon(lc_values, llama_values)
        print('\t\tStatistics=%.3f, p=%.3f' % (stat, p))

        # Calculate the median of the differences
        median_difference = np.median(differences)
        print('\t\tMedian of Differences:', median_difference)

        # Interpretation. A NaN p-value compares False against alpha and must
        # not be reported as a rejection of H0.
        if np.isnan(p):
            print('\t\t\tTest is inconclusive (p-value is NaN)')
        elif p > alpha:
            print('\t\t\tSame distribution (fail to reject H0)')
        else:
            print('\t\t\tDifferent distribution (reject H0)')
            if median_difference > 0:
                print('\t\t\t\tLlama generally scores higher.')
            elif median_difference < 0:
                print('\t\t\t\tLangchain generally scores higher.')
            else:
                print('\t\t\t\tNo difference in the median scores.')



In [None]:
from pathlib import Path

# Write one JSON-lines file per (dataset, tool) pair. The target directory is
# created first: to_json does not create missing parent directories, so the
# export would otherwise fail on a fresh checkout.
output_dir = Path("./results/tool_comparison")
output_dir.mkdir(parents=True, exist_ok=True)

for dataset_name, tool_frames in dataFrames.items():
    for tool_name, frame in tool_frames.items():
        frame.to_json(
            output_dir / f"{dataset_name}-{tool_name}.jsonl",
            orient="records",
            lines=True,
        )