# H2O Drive to Text2Everything - End-to-End Integration

This notebook demonstrates a complete end-to-end workflow for transferring data from H2O Drive to the Text2Everything API using the official SDK.

## Overview

This notebook will guide you through:
1. Setting up the environment and authentication
2. Connecting to H2O Drive and Text2Everything
3. Discovering and selecting projects
4. Creating and configuring database connectors (Snowflake)
5. Loading data from H2O Drive
6. Preparing data for the SDK
7. Uploading data using bulk operations
8. Verifying the upload results

## Prerequisites

- H2O Drive access with data organized in the expected structure
- Text2Everything API access with a valid access token
- Required Python packages installed

## 1. Setup and Installation

In [None]:
# Install required packages if not already installed
import sys
import subprocess

def _guess_import_name(requirement):
    """Best-effort guess of the importable module name for a pip requirement.

    Handles plain names, names with extras/version specifiers/markers
    (``"h2o_drive>=4.1.0"``) and paths to wheel or sdist archives
    (``"./pkg-1.0-py3-none-any.whl"``). This is a heuristic: distributions
    whose import name differs from their distribution name need an explicit
    ``import_name`` passed to ``install_package``.
    """
    import re

    text = requirement.strip()
    if text.endswith((".whl", ".tar.gz", ".zip")):
        # Archive file names start with "<distribution>-<version>-...".
        file_name = text.replace("\\", "/").rsplit("/", 1)[-1]
        text = file_name.split("-", 1)[0]
    else:
        # Cut at the first extras / version specifier / marker / URL character.
        text = re.split(r"[\s<>=!~;\[@]", text, maxsplit=1)[0]
    # Distribution names may use "-" where the module name uses "_".
    return text.replace("-", "_")


def install_package(package, import_name=None):
    """Install *package* with pip unless its module can already be imported.

    Args:
        package: A pip requirement: a distribution name, a name with a
            version specifier, or a path to a wheel/sdist file. It is passed
            to pip unchanged.
        import_name: Module name used for the import probe. When omitted it
            is derived from ``package``; previously the raw requirement string
            was imported, which always failed for anything but a bare name
            and caused a reinstall on every run.

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.

    Note:
        Only importability is checked. A module that is already installed in
        an older version than the one requested is left untouched.
    """
    import importlib

    module_name = import_name or _guess_import_name(package)
    try:
        importlib.import_module(module_name)
    except (ImportError, ValueError):
        # ValueError covers an empty derived module name; let pip decide.
        print(f"📦 Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-qqq", package])
        print(f"✅ {package} installed successfully")
    else:
        print(f"✅ {package} is already installed")

# Make sure every dependency used by this notebook is available
for requirement in (
    "h2o_drive>=4.1.0",
    "tqdm",
    "pydantic",
    "./text2everything_sdk-0.1.5-py3-none-any.whl",
):
    install_package(requirement)

print("\n🎉 All packages are ready!")

## 2. Import Libraries and Setup

In [None]:
import os
import sys
import json
import asyncio
from pathlib import Path
from typing import Dict, List, Any, Tuple, Optional

# H2O Drive imports
import h2o_drive
from h2o_drive import core

# Text2Everything SDK imports
from text2everything_sdk import Text2EverythingClient
from text2everything_sdk.exceptions import (
    AuthenticationError,
    ValidationError,
    NotFoundError,
    RateLimitError,
    ServerError
)

# Import our integration components
from drive_to_t2e_integration import DriveManager, prepare_data_for_sdk

print("✅ All libraries imported successfully")

## 3. Configuration Setup

Define the configuration for this notebook, structured like the `config_example.py` file. **IMPORTANT: Set your access token in `T2E_ACCESS_TOKEN` below before running the notebook.**

In [None]:
# Notebook-wide settings: adjust the values below to match your environment.
# IMPORTANT: fill in T2E_ACCESS_TOKEN before running the cells that follow.
T2E_ACCESS_TOKEN = ""  # Replace with your access token
T2E_WORKSPACE_NAME = ""  # Optional workspace name

# Text2Everything API settings
T2E_CONFIG = dict(
    base_url="http://text2everything.text2everything.svc.cluster.local:8000",
    access_token=T2E_ACCESS_TOKEN,
    workspace_name=T2E_WORKSPACE_NAME,
    timeout=200,
    max_retries=1,
)

# H2O Drive settings
DRIVE_CONFIG = dict(
    auto_connect=True,  # Automatically connect to H2O Drive
    default_bucket="user",  # Use user bucket by default
)

# Data processing options
DATA_CONFIG = dict(
    validate_before_upload=True,  # Enable validation before upload
    skip_empty_files=True,  # Skip files with no content
    auto_generate_names=True,  # Auto-generate names from filenames if missing
)

# Logging preferences
LOGGING_CONFIG = dict(
    level="INFO",  # DEBUG, INFO, WARNING, ERROR
    show_progress=True,  # Show progress bars
    verbose_errors=True,  # Show detailed error messages
)

print("✅ Configuration structure defined")
print("📋 Available configurations:")
for config_name, summary in (
    ("T2E_CONFIG", "API settings"),
    ("DRIVE_CONFIG", "H2O Drive settings"),
    ("DATA_CONFIG", "Data processing options"),
    ("LOGGING_CONFIG", "Logging preferences"),
):
    print(f"   - {config_name}: {summary}")

## 4. Validate Configuration

In [None]:
# Refuse to continue until an access token has been supplied

if not T2E_CONFIG["access_token"]:
    for message in (
        "❌ Access token not configured!",
        "Please set your OIDC access token in the T2E_CONFIG above.",
        "Update the 'access_token' field with your actual token.",
    ):
        print(message)
    raise ValueError("Access token must be configured in T2E_CONFIG before proceeding")

# Echo the effective settings; the token is masked before it is printed
print("✅ Configuration validated successfully")
print("\n📋 Final Configuration:")
print(f"   Base URL: {T2E_CONFIG['base_url']}")

raw_token = T2E_CONFIG['access_token']
# Only the last four characters are shown; tokens of four characters or
# fewer are hidden completely.
masked_token = (
    '*' * (len(raw_token) - 4) + raw_token[-4:]
    if len(raw_token) > 4
    else '****'
)
print(f"   Access Token: {masked_token}")

workspace_label = T2E_CONFIG.get('workspace_name') or '(none)'
print(f"   Workspace: {workspace_label}")
print(f"   Timeout: {T2E_CONFIG['timeout']}s")
print(f"   Max Retries: {T2E_CONFIG['max_retries']}")
print(f"   Validation Enabled: {DATA_CONFIG['validate_before_upload']}")
print(f"   Progress Display: {LOGGING_CONFIG['show_progress']}")

## 5. Connect to H2O Drive

In [None]:
# Open the H2O Drive session and wrap the user bucket in a DriveManager
print("🔌 Connecting to H2O Drive...")

try:
    drive = h2o_drive.connect()
    bucket = drive.user_bucket()
    drive_manager = DriveManager(bucket)
except Exception as e:
    # Later cells depend on drive_manager, so report and stop here
    print(f"❌ H2O Drive connection failed: {e}")
    raise
else:
    print("✅ H2O Drive connected successfully")

## 6. Initialize Text2Everything SDK

In [None]:
# Build the Text2Everything client from the values in T2E_CONFIG
print("🔌 Initializing Text2Everything SDK...")

try:
    client_options = {
        "base_url": T2E_CONFIG['base_url'],
        "access_token": T2E_CONFIG['access_token'],
        "workspace_name": T2E_CONFIG.get('workspace_name'),  # optional entry
        "timeout": T2E_CONFIG['timeout'],
        "max_retries": T2E_CONFIG['max_retries'],
    }
    sdk_client = Text2EverythingClient(**client_options)
except Exception as e:
    # Later cells depend on sdk_client, so report and stop here
    print(f"❌ SDK initialization failed: {e}")
    raise
else:
    print("✅ Text2Everything SDK initialized successfully")

## 7. Discover Available Projects

In [None]:
# Fetch the Text2Everything projects visible to this client and print them
print("📋 Discovering Text2Everything projects...")

try:
    t2e_projects = sdk_client.projects.list()
    project_count = len(t2e_projects)
    print(f"✅ Found {project_count} Text2Everything projects:")

    for position, entry in enumerate(t2e_projects, start=1):
        print(f"   {position}. {entry.name} (ID: {entry.id})")
        # The description line is only printed when one is set
        if entry.description:
            print(f"      Description: {entry.description}")

    if not t2e_projects:
        print("❌ No Text2Everything projects found. Please create a project first.")

except Exception as e:
    print(f"❌ Error listing T2E projects: {e}")
    raise

### Optional: Create a New Text2Everything Project

If no projects were found above, you can create a new project using the cell below. Uncomment and run it if needed.

In [None]:
# # Uncomment and run this cell if you need to create a new Text2Everything project

# # Project configuration
# PROJECT_NAME = "My H2O Drive Integration Project"  # Change this to your desired project name
# PROJECT_DESCRIPTION = "Project created for H2O Drive to Text2Everything integration"

# try:
#     print(f"🔨 Creating new Text2Everything project: {PROJECT_NAME}")
    
#     new_project = sdk_client.projects.create(
#         name=PROJECT_NAME,
#         description=PROJECT_DESCRIPTION
#     )
    
#     print(f"✅ Project created successfully!")
#     print(f"   Name: {new_project.name}")
#     print(f"   ID: {new_project.id}")
#     print(f"   Description: {new_project.description}")
    
#     # Add the new project to the list
#     t2e_projects.append(new_project)
#     print(f"\n📋 Updated project list now contains {len(t2e_projects)} projects")
    
# except Exception as e:
#     print(f"❌ Failed to create project: {e}")
#     print("Please check your API key and permissions")

In [None]:
# List H2O Drive projects
print("📋 Discovering H2O Drive projects...")

try:
    drive_projects = await drive_manager.list_projects_in_drive()
    
    # Check if only 'home' exists and look deeper
    if len(drive_projects) == 1 and drive_projects[0] == "home":
        print("🔍 Found 'home' as the only project. Looking for subdirectories within home/...")
        home_subdirs = await drive_manager.list_subdirectories("home")
        
        if home_subdirs:
            print(f"✅ Found {len(home_subdirs)} subdirectories in home/:")
            actual_projects = [f"home/{subdir}" for subdir in home_subdirs]
        else:
            print("⚠️  No subdirectories found in home/. Using 'home' as the project.")
            actual_projects = drive_projects
    else:
        actual_projects = drive_projects
    
    print(f"✅ Found {len(actual_projects)} H2O Drive projects:")
    for i, project in enumerate(actual_projects, 1):
        print(f"   {i}. {project}")
    
    # Update drive_projects for later use
    drive_projects = actual_projects
    
    if not drive_projects:
        print("❌ No H2O Drive projects found. Please upload data to Drive first.")
        
except Exception as e:
    print(f"❌ Error listing Drive projects: {e}")
    raise

## 7.5. Create and Configure Snowflake Connector

Before proceeding with data integration, let's set up a Snowflake connector that can be used for analytics and data warehouse operations alongside your H2O Drive data.

### Snowflake Configuration

Configure your Snowflake connection parameters. **IMPORTANT: Update these values with your actual Snowflake credentials before running.**

In [None]:
# Snowflake connection settings
# IMPORTANT: every value below is a placeholder; replace it with your own
# Snowflake credentials before creating the connector.

SNOWFLAKE_CONFIG = dict(
    account="your-account.snowflakecomputing.com",  # Replace with your Snowflake account URL
    warehouse="COMPUTE_WH",  # Replace with your warehouse name
    database="H2O_ANALYTICS",  # Replace with your database name
    schema="PUBLIC",  # Replace with your schema name
    username="your-username",  # Replace with your Snowflake username
    password="your-password",  # Replace with your Snowflake password
    role="ANALYST_ROLE",  # Optional: Replace with your role
)

# Show the non-secret part of the configuration
print("📋 Snowflake Configuration:")
for label, field in (
    ("Account", "account"),
    ("Database", "database"),
    ("Warehouse", "warehouse"),
    ("Schema", "schema"),
):
    print(f"   {label}: {SNOWFLAKE_CONFIG[field]}")
print("   ⚠️  Please update the configuration above with your actual Snowflake credentials")

### Create Snowflake Connector

Create a Snowflake connector that can be used for analytics workflows and data warehouse operations.

In [None]:
# Create the Snowflake connector, but only once real credentials were entered.
#
# The placeholder values shipped with the notebook are listed here; if any of
# them is still present, connector creation is skipped instead of sending the
# placeholder credentials to the API.
_SNOWFLAKE_PLACEHOLDERS = {
    "account": "your-account.snowflakecomputing.com",
    "username": "your-username",
    "password": "your-password",
}
snowflake_configured = not any(
    SNOWFLAKE_CONFIG[field] == placeholder
    for field, placeholder in _SNOWFLAKE_PLACEHOLDERS.items()
)

# Defined up front so later cells can test it even when creation is skipped or fails
snowflake_connector = None

if not snowflake_configured:
    print("❌ Snowflake credentials not configured!")
    print("Please update the SNOWFLAKE_CONFIG above with your actual credentials.")
    print("Skipping connector creation. You can continue without Snowflake integration.")
else:
    print("🔌 Creating Snowflake connector...")

    try:
        snowflake_connector = sdk_client.connectors.create(
            name="H2O Drive Analytics Warehouse",
            description="Snowflake data warehouse for H2O Drive analytics and processed data",
            db_type="snowflake",
            host=SNOWFLAKE_CONFIG["account"],
            port=443,  # Snowflake standard HTTPS port
            username=SNOWFLAKE_CONFIG["username"],
            password=SNOWFLAKE_CONFIG["password"],
            database=SNOWFLAKE_CONFIG["database"],
            config={
                "warehouse": SNOWFLAKE_CONFIG["warehouse"],
                # "schema": SNOWFLAKE_CONFIG.get("schema", "PUBLIC"),
                "role": SNOWFLAKE_CONFIG.get("role")
            }
        )
    except Exception as e:
        # The connector is optional, so a failure is reported but not re-raised
        print(f"❌ Failed to create Snowflake connector: {e}")
        print("Please check your Snowflake credentials and configuration")
        print("You can continue with the notebook without the Snowflake connector if needed")
        snowflake_connector = None
    else:
        print("✅ Snowflake connector created successfully!")
        print(f"   Connector ID: {snowflake_connector.id}")
        print(f"   Name: {snowflake_connector.name}")
        print(f"   Database: {snowflake_connector.database}")
        print(f"   Host: {snowflake_connector.host}")

### Manage Snowflake Connectors

List and manage your Snowflake connectors for future reference.

In [None]:
# Show the Snowflake connectors known to Text2Everything
print("📋 Managing Snowflake connectors...")

try:
    # Fetch every connector, then keep the ones whose db_type is "snowflake"
    all_connectors = sdk_client.connectors.list()
    snowflake_connectors = []
    for candidate in all_connectors:
        if candidate.db_type.lower() == "snowflake":
            snowflake_connectors.append(candidate)

    print(f"✅ Found {len(snowflake_connectors)} Snowflake connector(s):")
    for entry in snowflake_connectors:
        print(f"   • {entry.name} (ID: {entry.id})")
        print(f"     Database: {entry.database}")
        print(f"     Host: {entry.host}")
        if entry.description:
            print(f"     Description: {entry.description}")
        print()

    # Remember the ID of the connector created by this notebook, if there is one
    if snowflake_connector:
        SNOWFLAKE_CONNECTOR_ID = snowflake_connector.id
        print(f"💾 Snowflake Connector ID saved for future use: {SNOWFLAKE_CONNECTOR_ID}")
        for usage_line in (
            "   This connector can now be used in Text2Everything workflows for:",
            "   - Querying Snowflake data alongside H2O Drive content",
            "   - Analytics workflows that combine multiple data sources",
            "   - Data enrichment and validation operations",
        ):
            print(usage_line)

    if not snowflake_connectors:
        print("ℹ️  No Snowflake connectors found. This is normal if connector creation failed.")

except Exception as e:
    # Connector management is optional, so errors are reported but not re-raised
    print(f"❌ Error managing connectors: {e}")
    print("This doesn't affect the rest of the H2O Drive integration workflow")

## 8. Select Projects for Integration

### 8.1 From Text2Everything

In [None]:
# Pick the Text2Everything project that will receive the data
if not t2e_projects:
    raise ValueError("No Text2Everything projects available")

print("\n🎯 Select Text2Everything project (destination):")

# Select the first available project (change index as needed)
project_index = 0  # Change this to select a different project

# An index past the end of the list falls back to the first project
selected_t2e_project = t2e_projects[
    project_index if project_index < len(t2e_projects) else 0
]
project_id = selected_t2e_project.id

print(f"✅ Selected T2E project: {selected_t2e_project.name} (ID: {project_id})")

### 8.2 From Drive

In [None]:
# Pick the H2O Drive project the data will be read from
if not drive_projects:
    raise ValueError("No H2O Drive projects available")

print("\n🎯 Select H2O Drive project (source):")

# Select the first available project (change index as needed)
drive_project_index = 0  # Change this to select a different project

# An index past the end of the list falls back to the first project
selected_drive_project = drive_projects[
    drive_project_index if drive_project_index < len(drive_projects) else 0
]

print(f"✅ Selected Drive project: {selected_drive_project}")

## 9. Load Data from H2O Drive

In [None]:
# Load project data from H2O Drive
print(f"📥 Loading data from H2O Drive project: {selected_drive_project}")

try:
    project_data = await drive_manager.load_project_data(selected_drive_project)
    
    # Display summary
    total_files = sum(len(files) for files in project_data.values())
    print(f"\n📊 Data loaded successfully:")
    print(f"   Total files: {total_files}")
    
    for data_type, files in project_data.items():
        print(f"   - {data_type}: {len(files)} files")
        
        # Show first few file names as examples
        if files:
            for i, (file_key, _) in enumerate(files[:3]):
                filename = Path(file_key).name
                print(f"     • {filename}")
            if len(files) > 3:
                print(f"     ... and {len(files) - 3} more files")
    
    if total_files == 0:
        print("❌ No data found in selected Drive project")
        raise ValueError("No data available for upload")
        
except Exception as e:
    print(f"❌ Error loading data from Drive: {e}")
    raise

## 10. Prepare Data for SDK

In [None]:
# Convert the loaded Drive data into the structures the SDK upload expects
print("🔧 Preparing data for Text2Everything SDK...")

try:
    sdk_ready_data = prepare_data_for_sdk(project_data)

    print("✅ Data prepared successfully:")
    for data_type, items in sdk_ready_data.items():
        print(f"   - {data_type}: {len(items)} items")

        if not items:
            continue

        # Preview the first item. Only a real plural "s" is stripped for the
        # heading, so "schema_metadata" is no longer cut to "schema_metadat".
        singular = data_type[:-1] if data_type.endswith("s") else data_type
        print(f"     Example {singular} structure:")
        for key, value in items[0].items():
            # Long strings are shortened to 50 characters to keep the preview readable
            if isinstance(value, str) and len(value) > 50:
                display_value = value[:50] + "..."
            else:
                display_value = value
            print(f"       {key}: {display_value}")
        print()

except Exception as e:
    print(f"❌ Error preparing data: {e}")
    raise

## 11. Upload Data Using SDK Bulk Operations

In [None]:
# Upload data to Text2Everything using SDK bulk operations
print("🚀 Starting bulk upload to Text2Everything...")


def _bulk_upload(title, plural, short_plural, payload, create, describe):
    """Upload one data type and return its entry for ``upload_results``.

    Args:
        title: Capitalised label used in failure messages (e.g. "Contexts").
        plural: Label used in the progress and success messages.
        short_plural: Label used in the "... and N more" line.
        payload: The prepared items to upload.
        create: Callable that takes ``payload`` and returns the created items.
        describe: Callable that returns the display label of one created item.

    Returns:
        A dict with ``success``, ``total`` and ``failed`` counts, plus
        ``items`` on success or ``error`` (the exception text) on failure.

    Raises:
        AuthenticationError, RateLimitError: Propagated unchanged. These are
            not specific to one data type, so the caller aborts the whole
            upload instead of retrying with the next data type.
    """
    total = len(payload)
    print(f"\n📤 Uploading {total} {plural}...")
    try:
        created = create(payload)
    except (AuthenticationError, RateLimitError):
        raise
    except ValidationError as e:
        print(f"   ⚠️  {title} upload had validation issues: {e}")
        return {'success': 0, 'total': total, 'failed': total, 'error': str(e)}
    except Exception as e:
        print(f"   ❌ {title} upload failed: {e}")
        return {'success': 0, 'total': total, 'failed': total, 'error': str(e)}

    print(f"   ✅ {len(created)} {plural} uploaded successfully")

    # Show up to three of the created items as a sample
    for item in created[:3]:
        print(f"     • {describe(item)} (ID: {item.id})")
    if len(created) > 3:
        print(f"     ... and {len(created) - 3} more {short_plural}")

    return {'success': len(created), 'total': total, 'failed': 0, 'items': created}


upload_results = {}

# One row per data type:
# (results key, failure label, progress label, "more" label, upload call, item label)
upload_plan = [
    (
        'contexts', "Contexts", "contexts", "contexts",
        lambda payload: sdk_client.contexts.bulk_create(
            project_id=project_id,
            contexts=payload
        ),
        lambda item: item.name,
    ),
    (
        'schema_metadata', "Schema metadata", "schema metadata items", "schemas",
        lambda payload: sdk_client.schema_metadata.bulk_create(
            project_id=project_id,
            schema_metadata_list=payload,
            validate=True
        ),
        lambda item: item.name,
    ),
    (
        'golden_examples', "Golden examples", "golden examples", "examples",
        lambda payload: sdk_client.golden_examples.bulk_create(
            project_id=project_id,
            golden_examples=payload
        ),
        lambda item: item.description,
    ),
]

try:
    for key, title, plural, short_plural, create, describe in upload_plan:
        payload = sdk_ready_data.get(key)
        # Data types that are missing or empty are skipped entirely
        if payload:
            upload_results[key] = _bulk_upload(
                title, plural, short_plural, payload, create, describe
            )

except AuthenticationError as e:
    print(f"❌ Authentication failed: {e}")
    print("Please check your access token in T2E_CONFIG")
    raise
except RateLimitError as e:
    # getattr keeps this handler from failing if the error carries no retry_after
    print(f"❌ Rate limit exceeded. Retry after: {getattr(e, 'retry_after', 'unknown')} seconds")
    raise
except Exception as e:
    print(f"❌ Unexpected error during upload: {e}")
    raise

## 12. Upload Summary and Results

In [None]:
# Summarise what the upload cell recorded in upload_results
print("\n📊 Upload Summary")
print("=" * 50)

# Aggregate the per-type counters
total_successful = 0
total_attempted = 0
total_failed = 0
for outcome in upload_results.values():
    total_successful += outcome.get('success', 0)
    total_attempted += outcome.get('total', 0)
    total_failed += outcome.get('failed', 0)

print(f"📈 Overall Results:")
print(f"   Total items attempted: {total_attempted}")
print(f"   Successfully uploaded: {total_successful}")
print(f"   Failed uploads: {total_failed}")
if total_attempted > 0:
    print(f"   Success rate: {(total_successful/total_attempted*100):.1f}%")
else:
    # Avoid dividing by zero when nothing was attempted
    print("   Success rate: N/A")

print(f"\n📋 Detailed Results:")
for data_type, outcome in upload_results.items():
    succeeded = outcome.get('success', 0)
    attempted = outcome.get('total', 0)
    failures = outcome.get('failed', 0)

    # Green when nothing failed, warning for a partial success, red otherwise
    if failures == 0:
        status_icon = "✅"
    elif succeeded > 0:
        status_icon = "⚠️"
    else:
        status_icon = "❌"
    print(f"   {status_icon} {data_type}: {succeeded}/{attempted} successful")

    if 'error' in outcome:
        print(f"      Error: {outcome['error']}")

# Closing verdict
if total_successful == total_attempted and total_attempted > 0:
    verdict = "\n🎉 All data uploaded successfully!"
elif total_successful > 0:
    verdict = "\n✅ Upload completed with some issues. Check the details above."
else:
    verdict = "\n❌ Upload failed. Please check the errors above."
print(verdict)

## 13. Verify Upload Results

In [None]:
# Read the project back from the API for every data type that reported
# at least one successful upload
print("🔍 Verifying uploaded data...")

try:
    # Contexts
    contexts_outcome = upload_results.get('contexts', {})
    if contexts_outcome.get('success', 0) > 0:
        contexts_list = sdk_client.contexts.list(project_id=project_id)
        print(f"✅ Verified {len(contexts_list)} contexts in project")

        # Sample: the first three contexts
        for ctx in contexts_list[:3]:
            print(f"   • {ctx.name}: {len(ctx.content)} characters")

    # Schema metadata
    schemas_outcome = upload_results.get('schema_metadata', {})
    if schemas_outcome.get('success', 0) > 0:
        schemas_list = sdk_client.schema_metadata.list(project_id=project_id)
        print(f"✅ Verified {len(schemas_list)} schema metadata items in project")

        # Sample: the first three schemas
        for entry in schemas_list[:3]:
            print(f"   • {entry.name}: {type(entry.schema_data).__name__} data")

    # Golden examples
    examples_outcome = upload_results.get('golden_examples', {})
    if examples_outcome.get('success', 0) > 0:
        examples_list = sdk_client.golden_examples.list(project_id=project_id)
        print(f"✅ Verified {len(examples_list)} golden examples in project")

        # Sample: the first three examples, with the query cut to 50 characters
        for sample in examples_list[:3]:
            print(f"   • {sample.description}: '{sample.user_query[:50]}...'")

    print("\n✅ Verification complete!")

except Exception as e:
    # Verification is informational, so failures are reported but not re-raised
    print(f"⚠️  Verification failed: {e}")
    print("This might be normal if the upload had issues.")

## 14. Cleanup

In [None]:
# Clean up resources
try:
    sdk_client.close()
except Exception as e:
    # Cleanup stays best-effort, but the problem is reported instead of hidden.
    # Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate.
    print(f"⚠️  Could not close SDK client cleanly: {e}")
else:
    print("✅ SDK client connection closed")

print("\n🎉 End-to-End Integration Complete!")
print("\n📝 Summary of what we accomplished:")
print("   1. ✅ Connected to H2O Drive and Text2Everything")
print("   2. ✅ Discovered available projects")
print("   3. ✅ Loaded data from H2O Drive")
print("   4. ✅ Prepared data for the SDK")
print("   5. ✅ Uploaded data using bulk operations")
print("   6. ✅ Verified upload results")
print("\nYou can now use your uploaded data in Text2Everything!")