# Simple H2O Drive Upload

This notebook demonstrates how to upload local data files to H2O Drive using a simple, direct approach.

## What you'll learn:
- Connect directly to H2O Drive
- Load local project data from filesystem
- Upload files to Drive using the native API
- Verify uploads and list Drive contents
- Track upload progress with simple status messages

## Overview

We will:
1. Connect to H2O Drive directly
2. Load data from local project folder
3. Upload files to Drive with organized structure
4. Verify the uploads

## Setup and Configuration

In [None]:
# Install required packages
# pip is invoked through `sys.executable`, i.e. the interpreter running this kernel,
# rather than whichever `pip` happens to be first on PATH.
import sys
# Presumably provides the `h2o_discovery` module imported in the next cell — confirm
!{sys.executable} -m pip install -qqq h2o-cloud-discovery
# Drive client, constrained to version 4 or newer
!{sys.executable} -m pip install -qqq "h2o-drive>=4"
# Presumably provides `dotenv.load_dotenv`, used by the configuration cell — confirm
!{sys.executable} -m pip install -qqq python-dotenv

# NOTE(review): printed unconditionally; nothing here inspects pip's exit status,
# so this message does not prove the installs succeeded.
print("✅ Packages installed successfully!")

In [None]:
# Standard library imports
import os
import json
import asyncio
import tempfile
from pathlib import Path
from typing import Dict, List, Any, Tuple

# H2O Drive imports
import h2o_drive
import h2o_discovery
from dotenv import load_dotenv

print("✅ All imports successful!")

In [None]:
# Configuration - Update these for your environment
# Load environment variables if available
load_dotenv(".env.upload")

# Set your H2O Cloud environment and token
# Option 1: Set directly (replace with your values)
# os.environ["H2O_CLOUD_ENVIRONMENT"] = "https://your-environment.h2o.ai/"
# os.environ["H2O_CLOUD_CLIENT_PLATFORM_TOKEN"] = "your-token-here"

# Option 2: Use environment variables from .env file
# (Recommended for security)

print("✅ Configuration loaded!")
print(f"Environment: {os.environ.get('H2O_CLOUD_ENVIRONMENT', 'Not set')}")
token = os.environ.get('H2O_CLOUD_CLIENT_PLATFORM_TOKEN', '')
if token:
    # Echo the last 4 characters only when enough of the token stays hidden.
    # Slicing unconditionally would print a short token (<= 4 chars) in full,
    # because '*' * negative_number is the empty string.
    visible_suffix = token[-4:] if len(token) > 8 else ""
    print(f"Token: {'*' * (len(token) - len(visible_suffix)) + visible_suffix}")
else:
    print("Token: Not set")

## 1. Connect to H2O Drive

Connect directly to H2O Drive using the native API.

In [None]:
# Connect to H2O Drive
# This cell defines `drive_client` and `bucket`; the upload and verification
# cells further down use `bucket`, so it must run before them.
print("🔌 Connecting to H2O Drive...")

try:
    # Discover H2O services
    # NOTE(review): presumably driven by the H2O_CLOUD_ENVIRONMENT /
    # H2O_CLOUD_CLIENT_PLATFORM_TOKEN variables named in the error hint below — confirm
    discovery = h2o_discovery.discover()
    
    # Connect to Drive
    drive_client = h2o_drive.connect(discovery=discovery)
    bucket = drive_client.user_bucket()
    
    print("✅ Connected to H2O Drive successfully!")
    
    # Test connection by listing some objects
    # Top-level `await` relies on the notebook kernel's support for awaiting in cells.
    objects = await bucket.list_objects()
    print(f"📁 Found {len(objects)} objects in your Drive")
    
except Exception as e:
    # Print a configuration hint, then re-raise so the failure is not hidden
    # and the original traceback is still shown.
    print(f"❌ Failed to connect to H2O Drive: {e}")
    print("Please check your H2O_CLOUD_ENVIRONMENT and H2O_CLOUD_CLIENT_PLATFORM_TOKEN")
    raise

## 2. Load Local Project Data

Load data from your local project folder. Update the path below to match your data location.

In [None]:
def _read_json_file(file_path: Path) -> Any:
    """Parse one UTF-8 encoded JSON file and return the decoded object."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _read_text_file(file_path: Path) -> str:
    """Return the full contents of one UTF-8 encoded text file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()


def _load_matching_files(folder: Path, pattern: str, reader) -> List[Tuple[str, Any]]:
    """Read every file under ``folder`` matching ``pattern`` with ``reader``.

    Files that cannot be read or parsed are reported and skipped so that one
    bad file does not abort the whole load. A missing folder yields ``[]``.
    """
    loaded: List[Tuple[str, Any]] = []
    if not folder.is_dir():
        return loaded

    # glob() yields entries in filesystem order; sorting keeps the result (and
    # therefore previews and upload order) stable across machines and runs.
    for file_path in sorted(folder.glob(pattern)):
        try:
            loaded.append((str(file_path), reader(file_path)))
        except Exception as e:
            print(f"⚠️ Error loading {file_path}: {e}")
    return loaded


def load_local_project_data(project_path: Path) -> Dict[str, List[Tuple[str, Any]]]:
    """
    Load project data from local filesystem.

    Expected structure:
    project_path/
    ├── schema_metadata/
    ├── contexts/
    └── golden_examples/

    Args:
        project_path: Root folder of the project data (``Path`` or ``str``).

    Returns:
        A dict with the keys ``"schema_metadata"``, ``"contexts"`` and
        ``"golden_examples"``. Each value is a list of ``(file_path, data)``
        tuples where ``file_path`` is a string and ``data`` is the parsed JSON
        object, or the raw text for ``.txt`` files. Sub-folders are searched
        recursively; missing sub-folders produce empty lists. Within
        ``"contexts"`` the JSON files come first, followed by the text files.

    Unreadable or malformed files are reported with a warning and skipped.
    """
    project_path = Path(project_path)

    project_data = {
        # Load schema metadata
        "schema_metadata": _load_matching_files(
            project_path / "schema_metadata", "**/*.json", _read_json_file
        ),
        # Load contexts (JSON and TXT files)
        "contexts": (
            _load_matching_files(project_path / "contexts", "**/*.json", _read_json_file)
            + _load_matching_files(project_path / "contexts", "**/*.txt", _read_text_file)
        ),
        # Load golden examples
        "golden_examples": _load_matching_files(
            project_path / "golden_examples", "**/*.json", _read_json_file
        ),
    }

    return project_data

print("✅ Data loading function defined!")

In [None]:
# Folder that holds the local project data - UPDATE THIS PATH
local_data_path = Path("jsons_generados_20250828")  # Change this to your data folder

if local_data_path.exists():
    print(f"✅ Found local data path: {local_data_path}")

    # Overview: for each sub-folder, the number of JSON plus TXT files found recursively
    for item in local_data_path.iterdir():
        if not item.is_dir():
            continue
        json_total = sum(1 for _match in item.glob("**/*.json"))
        txt_total = sum(1 for _match in item.glob("**/*.txt"))
        print(f"📁 {item.name}: {json_total + txt_total} files")

    # Read the files into memory
    print("\n📥 Loading project data...")
    project_data = load_local_project_data(local_data_path)

    # Per-category counts, each with a preview of up to three file names
    print("\n📊 Loaded Project Data Summary:")
    total_files = 0
    for data_type, files in project_data.items():
        total_files += len(files)
        print(f"  - {data_type}: {len(files)} files")

        for filename in [Path(entry[0]).name for entry in files[:3]]:
            print(f"    • {filename}")
        remaining = len(files) - 3
        if remaining > 0:
            print(f"    ... and {remaining} more")

    print(f"\n✅ Total files loaded: {total_files}")
else:
    # Help the reader pick the right folder by listing what is available here
    print(f"❌ Local data path not found: {local_data_path}")
    print("Please update the 'local_data_path' variable above with the correct path to your data")
    print("Current working directory:", Path.cwd())
    print("Available directories:")
    for item in (entry for entry in Path.cwd().iterdir() if entry.is_dir()):
        print(f"  📁 {item.name}")

## 3. Upload Files to H2O Drive

Upload the loaded data to H2O Drive with organized structure.

In [None]:
async def upload_data_to_drive(bucket, project_data: Dict[str, List[Tuple[str, Any]]], project_name: str = "my_project"):
    """
    Upload project data to H2O Drive with organized structure.

    Each entry is written to a temporary UTF-8 file (text as-is, anything else
    serialized as indented JSON) and uploaded under the key
    ``"{project_name}/{data_type}/{filename}"``, where ``filename`` is the base
    name of the original path. The temporary file is removed afterwards
    whether or not the upload succeeded.

    Args:
        bucket: Object exposing an awaitable ``upload_file(local_path, key)``.
        project_data: Mapping of data type to a list of ``(file_path, data)``
            tuples, as produced by ``load_local_project_data``.
        project_name: Key prefix under which the files are stored.

    Returns:
        A dict mapping each data type to
        ``{"success": int, "failed": int, "errors": List[str]}``. The three
        standard data types are always present; any additional data type found
        in ``project_data`` gets its own entry.

    NOTE(review): only the base file name is used for the key, so two files
    with the same name in different sub-folders of one data type overwrite
    each other in Drive.
    """
    upload_results = {
        "schema_metadata": {"success": 0, "failed": 0, "errors": []},
        "contexts": {"success": 0, "failed": 0, "errors": []},
        "golden_examples": {"success": 0, "failed": 0, "errors": []}
    }

    for data_type, files in project_data.items():
        if not files:
            print(f"⚠️ No {data_type} files to upload")
            continue

        # Tolerate data types beyond the three predefined ones instead of
        # failing with a KeyError while recording the outcome.
        type_results = upload_results.setdefault(
            data_type, {"success": 0, "failed": 0, "errors": []}
        )

        print(f"\n📤 Uploading {len(files)} {data_type} files...")

        for file_path, data in files:
            filename = Path(file_path).name
            drive_key = f"{project_name}/{data_type}/{filename}"
            # Reset per file so cleanup never acts on a previous iteration's path.
            temp_path = None

            try:
                # Create temporary file. The encoding is explicit because the
                # data was read as UTF-8 and JSON is dumped with
                # ensure_ascii=False; the platform default encoding could
                # corrupt or reject non-ASCII characters.
                with tempfile.NamedTemporaryFile(
                    mode='w', delete=False, suffix='.tmp', encoding='utf-8'
                ) as tmp_file:
                    # Record the path before writing so a failed write still
                    # gets cleaned up (delete=False leaves the file behind).
                    temp_path = tmp_file.name
                    if isinstance(data, str):
                        # Text content
                        tmp_file.write(data)
                    else:
                        # JSON content
                        json.dump(data, tmp_file, indent=2, ensure_ascii=False)

                # Upload to Drive
                await bucket.upload_file(temp_path, drive_key)

                type_results["success"] += 1
                print(f"  ✅ Uploaded {filename}")

            except Exception as e:
                type_results["failed"] += 1
                type_results["errors"].append(f"{filename}: {str(e)}")
                print(f"  ❌ Failed to upload {filename}: {e}")

            finally:
                # Clean up temp file on success and on failure alike
                if temp_path is not None and os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                    except OSError as cleanup_error:
                        print(f"  ⚠️ Could not remove temporary file {temp_path}: {cleanup_error}")

    return upload_results

print("✅ Upload function defined!")

In [None]:
# Upload the data to Drive
if 'project_data' in locals() and any(len(files) > 0 for files in project_data.values()):
    project_name = "home/my_uploaded_project"  # Change this to your desired project name, keep the 'home/' prefix
    if not "home" in project_name.split("/")[0]:
        project_name = "home/" + project_name

    print(f"🚀 Starting upload to H2O Drive...")
    print(f"Project name: {project_name}")
    
    # Perform the upload
    upload_results = await upload_data_to_drive(bucket, project_data, project_name)
    
    # Display results summary
    print("\n📊 Upload Results Summary:")
    print("=" * 50)
    
    total_success = 0
    total_failed = 0
    
    for data_type, results in upload_results.items():
        success = results["success"]
        failed = results["failed"]
        total_success += success
        total_failed += failed
        
        status_icon = "✅" if failed == 0 else "⚠️" if success > 0 else "❌"
        print(f"{status_icon} {data_type}: {success} successful, {failed} failed")
        
        # Show errors if any
        if results["errors"]:
            for error in results["errors"][:3]:  # Show first 3 errors
                print(f"    • {error}")
            if len(results["errors"]) > 3:
                print(f"    ... and {len(results['errors']) - 3} more errors")
    
    print(f"\n📈 Overall: {total_success} successful, {total_failed} failed")
    
    if total_failed == 0:
        print("🎉 All files uploaded successfully!")
    elif total_success > 0:
        print("✅ Upload completed with some issues.")
    else:
        print("❌ Upload failed completely.")
        
else:
    print("❌ No project data available to upload. Please run the data loading step first.")

## 4. Verify Uploads

List the uploaded files in H2O Drive to verify the upload was successful.

In [None]:
# List all objects in Drive to see our uploads
print("🔍 Verifying uploads in H2O Drive...")

try:
    all_objects = await bucket.list_objects()
    
    # Filter objects that belong to our project
    if 'project_name' in locals():
        project_objects = [obj for obj in all_objects if obj.key.startswith(f"{project_name}/")]
        
        if project_objects:
            print(f"\n📁 Found {len(project_objects)} files for project '{project_name}':")
            
            # Group by data type
            by_type = {"schema_metadata": [], "contexts": [], "golden_examples": []}
            
            for obj in project_objects:
                key_parts = obj.key.split('/')
                if len(key_parts) >= 3:
                    data_type = key_parts[1]
                    if data_type in by_type:
                        by_type[data_type].append(obj.key)
            
            # Display organized results
            for data_type, files in by_type.items():
                if files:
                    print(f"\n  📂 {data_type}: {len(files)} files")
                    for file_key in files[:5]:  # Show first 5 files
                        filename = Path(file_key).name
                        print(f"    • {filename}")
                    if len(files) > 5:
                        print(f"    ... and {len(files) - 5} more files")
        else:
            print(f"❌ No files found for project '{project_name}'")
    else:
        print(f"📁 Total objects in Drive: {len(all_objects)}")
        
        # Show recent objects
        if all_objects:
            print("\nRecent objects:")
            for obj in all_objects[-10:]:  # Show last 10 objects
                print(f"  • {obj.key}")
    
except Exception as e:
    print(f"❌ Error verifying uploads: {e}")

## 5. Summary and Cleanup

Review what we accomplished. Temporary files are removed as part of each upload, so no further cleanup is required.

In [None]:
# Final summary
print("📋 Upload Session Summary:")
print("=" * 40)

if 'upload_results' not in locals():
    print("No upload was performed in this session.")
else:
    # Aggregate the per-category counters produced by the upload step
    outcomes = list(upload_results.values())
    total_successful = sum(outcome["success"] for outcome in outcomes)
    total_attempted = total_successful + sum(outcome["failed"] for outcome in outcomes)

    print(f"📊 Files processed: {total_attempted}")
    print(f"✅ Successfully uploaded: {total_successful}")
    print(f"❌ Failed uploads: {total_attempted - total_successful}")

    # The rate and project name are only reported when at least one file went through
    if total_successful > 0:
        success_rate = (total_successful / total_attempted) * 100
        print(f"📈 Success rate: {success_rate:.1f}%")

        if 'project_name' in locals():
            print(f"📁 Project name in Drive: {project_name}")

print("\n🎉 Simple Drive Upload Complete!")
print("\n📚 What you accomplished:")
for accomplishment_line in (
    "   • Connected directly to H2O Drive",
    "   • Loaded local project data",
    "   • Uploaded files with organized structure",
    "   • Verified uploads in Drive",
    "   • Simple progress tracking",
):
    print(accomplishment_line)