In [50]:
import pandas as pd
import json
from dotenv import load_dotenv
from langsmith import Client
from jsonschema import validate, ValidationError
from pprint import pprint

# Read environment variables from the local .env file before the client is
# built (presumably this is where the LangSmith credentials live — confirm).
# NOTE(review): this is an absolute, machine-specific path.
load_dotenv(
    dotenv_path='/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/.env'
)

# Single LangSmith client shared by the cells below.
client = Client()

### Create a Dataset in LangSmith

In [13]:
# Name of the LangSmith dataset that the next cell looks up and, if absent, creates.
dataset_to_create = "omg"

In [None]:
# Define the dataset schema.
# Inputs: an object that must contain a string "question".
inputs_schema = {
    "type": "object",
    "properties": {
        "question": {"type": "string"},
    },
    "required": ["question"]
}

# Outputs: an object that must contain a string "ground_truth_answer";
# "ground_truth_sources" is optional and, when present, is an array of strings.
outputs_schema = {
    "type": "object",
    "properties": {
        "ground_truth_answer": {"type": "string"},
        "ground_truth_sources": {
            "type": "array",
            "items": {"type": "string"}
        },
    },
    "required": ["ground_truth_answer"]
}


# Check if the dataset exists. Passing dataset_name asks the server to filter
# instead of paging through every dataset in the workspace; the exact-name
# comparison is kept as a safeguard against looser server-side matching.
datasets = client.list_datasets(dataset_name=dataset_to_create)
dataset = next((d for d in datasets if d.name == dataset_to_create), None)

if dataset is not None:
    # The existing dataset is reused as-is; the schemas above are NOT applied to it.
    dataset_id = dataset.id
    print(
        f"Dataset '{dataset_to_create}' already exists with ID: {dataset_id}")
else:
    # If the dataset doesn't exist, create it with the schemas defined above
    print(f"Dataset '{dataset_to_create}' does not exist. Creating it now.")
    response = client.create_dataset(
        dataset_name=dataset_to_create,
        description="This is my imported dataset.",
        inputs_schema=inputs_schema,
        outputs_schema=outputs_schema,
    )
    dataset_id = response.id
    print(f"Dataset '{dataset_to_create}' created with ID: {dataset_id}")

### Format Evaluation Examples as JSONL

In [None]:
# Path of the exported dataset without its extension; the same stem is reused
# for the .jsonl file written further down.
# NOTE(review): this is an absolute, machine-specific path.
filename = "/Users/drew_wilkins/Downloads/dataset_gtv1_116f9507-0683-41a1-82aa-a905d6d8a225"

# Read the preprocessed CSV export into a DataFrame
csv_file = filename + '.csv'
df = pd.read_csv(csv_file)


# Ensure ground_truth_sources is formatted as an array
def format_ground_truth_sources(value):
    """Normalise one ``ground_truth_sources`` cell to a list of strings.

    The dataset's output schema declares ``ground_truth_sources`` as an array
    of strings, so every return path produces exactly that shape.

    Args:
        value: Raw cell value. May be missing (NaN/None), a blank string, a
            JSON-encoded array, a plain string, a non-string scalar, or an
            already-converted list/tuple.

    Returns:
        list[str]: ``[]`` for missing or blank values; the decoded items for a
        JSON array (non-string items are stringified, nulls are dropped); a
        one-element list otherwise.
    """
    # Sequences must be handled before pd.isna(): on a list it returns an
    # array, whose truth value is ambiguous.
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if item is not None]

    if pd.isna(value):
        return []  # Empty array for missing values

    if not isinstance(value, str):
        # e.g. a numeric cell; the schema only allows string items
        return [str(value)]

    text = value.strip()
    if text == "" or text == "NaN":
        return []  # Empty array for blank values

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Not JSON at all: treat the cell as a single source
        return [value]

    if isinstance(parsed, list):
        return [str(item) for item in parsed if item is not None]
    if isinstance(parsed, str):
        # A JSON string literal such as '"manual.pdf"'
        return [parsed]
    # Valid JSON but not an array (number, object, true/false/null): keep the
    # original text as the single source instead of leaking a non-list value.
    return [value]


# Ensure the ground_truth_sources column exists, then normalise every cell of
# it to a list (a blank cell becomes an empty list).
if 'output_ground_truth_sources' not in df.columns:
    df['output_ground_truth_sources'] = ""
df['output_ground_truth_sources'] = df['output_ground_truth_sources'].apply(
    format_ground_truth_sources
)

# Save as JSONL: one {"input": ..., "output": ...} object per line
jsonl_file = f'{filename}.jsonl'  # Replace with your desired JSON file name
with open(jsonl_file, 'w', encoding='utf-8') as f:
    for _, row in df.iterrows():
        question = row.get("input_question", "")
        answer = row.get("output_ground_truth_answer", "")
        jsonl_entry = {
            "input": {
                # A missing cell is written as null: json.dumps would otherwise
                # emit the bare token NaN, which is not valid JSON.
                "question": None if pd.isna(question) else question
            },
            "output": {
                "ground_truth_answer": None if pd.isna(answer) else answer,
                "ground_truth_sources": row.get("output_ground_truth_sources", [])
            }
        }
        f.write(json.dumps(jsonl_entry) + '\n')

# Report only after the file has been closed (and therefore flushed)
print(f"JSONL file saved to: {jsonl_file}")
df.head()

### Add Examples to the LangSmith Dataset

In [56]:
# Set parameters
# Name of the LangSmith dataset whose schema is fetched and to which the
# examples are uploaded in the cells below.
# Use "dataset_to_create" if continuing from above
dataset_name = "ASK-groundtruth-v2"

# Path of the JSONL file whose rows are validated and uploaded below.
# use "jsonl_file" if continuing from above
jsonl_file_to_add = jsonl_file

In [54]:
# Fetch the dataset and its schema.
# Passing dataset_name asks the server to filter instead of listing every
# dataset; the exact-name comparison is kept as a safeguard.
datasets = client.list_datasets(dataset_name=dataset_name)
dataset = next((d for d in datasets if d.name == dataset_name), None)

if dataset is None:
    # Fail here rather than just printing: otherwise input_schema/output_schema
    # stay undefined (or keep values from an earlier run against a different
    # dataset) and the validation cell works with the wrong schema.
    raise ValueError(f"Dataset '{dataset_name}' not found.")

print(f"Dataset '{dataset_name}' retrieved successfully.\n")

# Schemas the validation cell checks each example against
input_schema = dataset.inputs_schema
output_schema = dataset.outputs_schema

print("Input Schema:")
pprint(input_schema)
print("\nOutput Schema:")
pprint(output_schema)

Dataset 'oyoy' retrieved successfully.

Input Schema:
{'properties': {'question': {'type': 'string'}},
 'required': ['question'],
 'type': 'object'}

Output Schema:
{'properties': {'ground_truth_answer': {'type': 'string'},
                'ground_truth_sources': {'items': {'type': 'string'},
                                         'type': 'array'}},
 'required': ['ground_truth_answer'],
 'type': 'object'}


In [57]:
# Validate Examples Against the Schema
def validate_against_schema(data, schema):
    """Validate a dictionary against a JSON schema.

    Args:
        data: The instance to check (e.g. an example's "input" or "output").
        schema: JSON-schema dict, or None when there is no schema to enforce.

    Returns:
        bool: True when ``data`` conforms to ``schema`` or ``schema`` is None;
        False when validation fails, in which case the error is printed.
    """
    # Without a schema there is nothing to violate. jsonschema cannot validate
    # against None and would raise instead of returning a result.
    if schema is None:
        return True
    try:
        validate(instance=data, schema=schema)
        return True
    except ValidationError as e:
        print(f"Validation error: {e}")
        return False


# Read the JSONL file, skipping blank lines so that an empty or trailing line
# does not abort parsing.
with open(jsonl_file_to_add, "r", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]

# Keep only rows that have both sections and satisfy the dataset's schemas;
# every rejected row is reported.
valid_examples = []
for row in rows:
    if "input" in row and "output" in row:
        if validate_against_schema(row["input"], input_schema) and validate_against_schema(row["output"], output_schema):
            valid_examples.append(row)
        else:
            print(f"Validation error for example: {row}")
    else:
        print(f"Row missing 'input' or 'output': {row}")


# Upload the valid examples to the dataset
if valid_examples:
    inputs = [example["input"] for example in valid_examples]
    outputs = [example["output"] for example in valid_examples]

    try:
        response = client.create_examples(
            dataset_name=dataset_name,
            inputs=inputs,
            outputs=outputs,
        )
    except Exception as e:
        # Best-effort: report the failure without aborting the notebook
        print("Error adding examples:")
        print(e)
    else:
        print(
            f"Successfully added {len(inputs)} examples to dataset '{dataset_name}'.")
        # Printed only on success, so a failed or skipped upload can neither
        # raise NameError nor show a response left over from an earlier cell.
        print(response)
else:
    print("No valid examples to add.")

Successfully added 42 examples to dataset 'ASK-groundtruth-v2'.
None
