In [None]:
# Install the packages needed to load a HuggingFace dataset into Arize Phoenix running in Docker.

!uv pip install arize-phoenix datasets pandas pyarrow

Audited 4 packages in 5ms


In [10]:
"""
Load HuggingFace dataset into Arize Phoenix running in Docker.
"""
import pandas as pd
from datasets import load_dataset
import phoenix as px

def load_ragas_to_phoenix(
    dataset_name: str = "dwb2023/ragas-golden-dataset-v2",
    phoenix_dataset_name: str = "ragas_golden_v2"
) -> object:
    """
    Fetch a RAGAS dataset from HuggingFace and upload it to Phoenix.

    The "train" split is downloaded, converted to a DataFrame, reshaped by
    ``prepare_ragas_for_phoenix`` and uploaded with ``question`` as the
    input key and ``reference_answer`` as the output key. The upload is
    first attempted against ``http://localhost:6006``; if that raises, it
    is attempted once more with a client built without an explicit endpoint.

    Args:
        dataset_name: HF dataset identifier
        phoenix_dataset_name: Name for the dataset in Phoenix

    Returns:
        Whatever ``upload_dataset`` returns for the successful attempt

    Raises:
        Exception: The error raised by the second attempt when both
            attempts fail.
    """
    print(f"Loading {dataset_name} from HuggingFace...")
    frame = load_dataset(dataset_name, split="train").to_pandas()
    print(f"Loaded {len(frame)} rows with columns: {list(frame.columns)}")

    # Rename/normalise the RAGAS columns before handing them to Phoenix.
    prepared = prepare_ragas_for_phoenix(frame)

    def _push(client):
        # Single place that knows the upload arguments; used by both attempts.
        return client.upload_dataset(
            dataframe=prepared,
            dataset_name=phoenix_dataset_name,
            input_keys=["question"],  # user_input -> question
            output_keys=["reference_answer"],  # reference -> reference_answer
        )

    try:
        # First attempt: the Docker instance at its explicit local address.
        uploaded = _push(px.Client(endpoint="http://localhost:6006"))
        print(f"✅ Dataset uploaded to Phoenix: {uploaded}")
        return uploaded
    except Exception as primary_error:
        print(f"❌ Upload failed: {primary_error}")
        try:
            # Second attempt: let the client pick its own endpoint.
            uploaded = _push(px.Client())
            print(f"✅ Dataset uploaded (fallback method): {uploaded}")
            return uploaded
        except Exception as fallback_error:
            print(f"❌ Fallback failed: {fallback_error}")
            raise

def _join_contexts(value: object) -> str:
    """Collapse one ``contexts`` cell into a single newline-separated string."""
    # A plain string is already in its final form.
    if isinstance(value, str):
        return value
    # Sequence columns usually arrive from ``to_pandas()`` as numpy arrays,
    # not lists, so accept any ordered list-like instead of ``list`` only.
    # dicts and sets are excluded: they keep the plain ``str()`` fallback.
    if pd.api.types.is_list_like(value) and not isinstance(
        value, (dict, set, frozenset)
    ):
        return '\n'.join(str(item) for item in value)
    return str(value)


def prepare_ragas_for_phoenix(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform RAGAS dataset to Phoenix-friendly format.

    RAGAS columns: ['user_input', 'reference_contexts', 'reference', 'synthesizer_name']
    are renamed to ['question', 'contexts', 'reference_answer', 'model_name'].
    Each ``contexts`` cell is flattened to one newline-joined string, an
    ``id`` column (0..n-1) is added when absent, and the text columns are
    cast to ``str``. Columns that are not present are simply skipped.

    Args:
        df: DataFrame holding the RAGAS rows; it is not modified.

    Returns:
        A new DataFrame with the renamed and normalised columns.
    """
    # Rename columns for clarity
    column_mapping = {
        'user_input': 'question',
        'reference': 'reference_answer',
        'reference_contexts': 'contexts',
        'synthesizer_name': 'model_name',
    }

    # Work on an explicit copy so the caller's frame is never touched.
    df_clean = df.copy().rename(columns=column_mapping)

    # Convert list-like cells to strings (Phoenix works better with strings)
    if 'contexts' in df_clean.columns:
        df_clean['contexts'] = df_clean['contexts'].apply(_join_contexts)

    # Add ID column if missing
    if 'id' not in df_clean.columns:
        df_clean['id'] = range(len(df_clean))

    # Ensure string types for text columns
    for col in ('question', 'reference_answer', 'contexts'):
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].astype(str)

    print(f"Prepared dataset with columns: {list(df_clean.columns)}")
    return df_clean

# Usage example
if __name__ == "__main__":
    import os

    # Fix environment first: drop any inherited collector endpoint
    # (presumably so it cannot redirect the client -- TODO confirm).
    os.environ.pop('PHOENIX_COLLECTOR_ENDPOINT', None)

    # Phoenix already runs in Docker, so no launch_app() call is needed.
    dataset = load_ragas_to_phoenix()

    print("🎉 Access Phoenix UI at: http://localhost:6006")

Loading dwb2023/ragas-golden-dataset-v2 from HuggingFace...
Loaded 12 rows with columns: ['user_input', 'reference_contexts', 'reference', 'synthesizer_name']
Prepared dataset with columns: ['question', 'contexts', 'reference_answer', 'model_name', 'id']
📤 Uploading dataset...
❌ Upload failed: Dataset with the same name already exists: name='ragas_golden_v2'
📤 Uploading dataset...
❌ Fallback failed: Dataset with the same name already exists: name='ragas_golden_v2'


DatasetUploadError: Dataset with the same name already exists: name='ragas_golden_v2'