# Uploading Documents

Learn how to upload documents to Storm API for learning and RAG (Retrieval-Augmented Generation).

## Setup

In [None]:
import requests
import json
import os
import time
from datetime import datetime

# Configuration
# The API key comes from the STORM_API_KEY environment variable; the
# placeholder default only keeps the notebook runnable until a key is set.
API_KEY = os.getenv("STORM_API_KEY", "your-api-key-here")
# Base URL of the Storm API. Endpoint paths are appended to it per request,
# so it must contain exactly one scheme and no trailing slash.
API_URL = "https://live-stargate.sionic.im"

# Sent with every request in this notebook.
headers = {"storm-api-key": API_KEY}

print("✅ Setup complete")

## 1. Upload Methods Overview

Storm API supports two document upload methods:
- **File Upload**: Direct file upload from your local system
- **URL Upload**: Upload from URLs (Google Drive, S3, etc.)

In [None]:
# Get agent and bucket IDs first.
# On success this cell defines `agent_id` and `bucket_id`. Later cells test for
# `bucket_id` before uploading, so on any failure we report the problem and
# leave those names undefined instead of raising.
response = requests.get(
    f"{API_URL}/api/v2/agents",
    headers=headers,
    params={"page": 1, "size": 1}
)

if response.status_code != 200:
    print(f"❌ Failed to list agents: {response.status_code}")
    print(response.text)
else:
    agents = response.json()["data"]["data"]
    if not agents:
        # An empty page would otherwise raise IndexError on agents[0].
        print("❌ No agents found. Create an agent before uploading documents.")
    else:
        agent = agents[0]
        agent_id = agent["id"]
        print(f"✅ Using agent: {agent['name']}")

        # Get first bucket
        response = requests.get(
            f"{API_URL}/api/v2/buckets",
            headers=headers,
            params={"agentId": agent_id, "page": 1, "size": 1}
        )

        if response.status_code != 200:
            # A failed listing is not the same as "no buckets": do not create
            # a new bucket just because the request itself went wrong.
            print(f"❌ Failed to list buckets: {response.status_code}")
            print(response.text)
        else:
            buckets = response.json()["data"]["data"]
            if buckets:
                bucket = buckets[0]
                bucket_id = bucket["id"]
                print(f"✅ Using bucket: {bucket['name']}")
            else:
                print("❌ No buckets found. Creating one...")
                response = requests.post(
                    f"{API_URL}/api/v2/buckets",
                    headers=headers,
                    json={"agentId": agent_id, "name": "test-bucket"}
                )
                # Accept any non-error status here; the exact success code of
                # the create endpoint is not shown in this notebook.
                if response.ok:
                    bucket = response.json()["data"]
                    bucket_id = bucket["id"]
                    print(f"✅ Created bucket: {bucket['name']}")
                else:
                    print(f"❌ Failed to create bucket: {response.status_code}")
                    print(response.text)

## 2. Upload Document by File

Upload documents directly from your file system.

In [None]:
def upload_document_by_file(bucket_id, file_path, parser_type="DEFAULT", webhook_url=None):
    """Upload a local file to a Storm API bucket.

    Sends a multipart POST to ``/api/v2/documents/by-file`` carrying the file
    contents, with the bucket and parser selection as form fields.

    Args:
        bucket_id: ID of the bucket that receives the document.
        file_path: Path of the local file to upload.
        parser_type: Parser to request; this notebook uses "DEFAULT" and
            "STORM_PARSE".
        webhook_url: Optional URL sent as the ``webhookUrl`` form field.

    Returns:
        The ``data`` object of the JSON response on HTTP 200, or ``None`` when
        ``file_path`` is not an existing regular file or the request is
        rejected (the status code and response body are printed).
    """

    # Require a regular file: a directory also "exists" but cannot be opened
    # for reading, which would raise instead of returning None.
    if not os.path.isfile(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    # Prepare form data
    data = {
        "bucketId": bucket_id,
        "parserType": parser_type
    }

    if webhook_url:
        data["webhookUrl"] = webhook_url

    # Upload file; the handle stays open only for the duration of the request
    with open(file_path, 'rb') as f:
        files = {"file": (os.path.basename(file_path), f)}

        response = requests.post(
            f"{API_URL}/api/v2/documents/by-file",
            headers=headers,
            data=data,
            files=files
        )

    if response.status_code == 200:
        document = response.json()["data"]
        print(f"✅ Document uploaded successfully!")
        print(f"   ID: {document['id']}")
        print(f"   Name: {document['name']}")
        print(f"   Status: {document['status']}")
        print(f"   Parser: {document['parserType']}")
        return document
    else:
        print(f"❌ Upload failed: {response.status_code}")
        print(response.text)
        return None

# Create a sample file to upload
sample_file = "sample_document.txt"
with open(sample_file, "w", encoding="utf-8") as f:
    f.write("""Storm API Document Upload Example

This is a sample document to demonstrate the Storm API document upload functionality.

Key Features:
- Document learning with AI
- Intelligent search and retrieval
- Multi-format support
- Real-time processing

Storm API makes it easy to build RAG applications!""")

# Upload the document. The temporary file is removed in `finally` so it does
# not linger when no bucket was resolved earlier or when the upload raises.
try:
    if 'bucket_id' in locals():
        uploaded_doc = upload_document_by_file(bucket_id, sample_file)
finally:
    # Clean up
    os.remove(sample_file)

## 3. Upload Document by URL

Upload documents from external URLs like Google Drive or S3.

In [None]:
def upload_document_by_url(bucket_id, url, parser_type="DEFAULT", webhook_url=None):
    """Ask Storm API to ingest a document hosted at ``url``.

    POSTs a JSON body (bucket ID, source URL, parser type and, when given,
    the webhook URL) to ``/api/v2/documents/by-url``.

    Returns the ``data`` object of the JSON response on HTTP 200. For any
    other status the code and response body are printed and ``None`` is
    returned.
    """
    payload = {"bucketId": bucket_id, "url": url, "parserType": parser_type}
    if webhook_url:
        payload["webhookUrl"] = webhook_url

    response = requests.post(
        f"{API_URL}/api/v2/documents/by-url",
        headers=headers,
        json=payload,
    )

    # Anything other than HTTP 200 is reported and treated as a failed upload.
    if response.status_code != 200:
        print(f"❌ Upload failed: {response.status_code}")
        print(response.text)
        return None

    document = response.json()["data"]
    print(f"✅ Document URL uploaded successfully!")
    # Print the summary fields in a fixed order, one per line.
    for label, field in (("ID", "id"), ("Name", "name"), ("Source", "source"), ("Status", "status")):
        print(f"   {label}: {document[field]}")
    return document

# Placeholder links for the two hosting styles discussed below
# (swap in real URLs before uploading)
example_urls = [
    "https://drive.google.com/file/d/YOUR_FILE_ID/view?usp=sharing",
    "https://your-bucket.s3.amazonaws.com/document.pdf",
]

# Emit the per-host access requirements as one block of text
print("\n".join([
    "📌 URL Upload Examples:",
    "\nGoogle Drive:",
    "1. Make sure file is publicly accessible",
    "2. Use the sharing link",
    "\nS3:",
    "1. File must be publicly readable",
    "2. Or use pre-signed URLs",
]))

# Uncomment to test with a real URL
# if 'bucket_id' in locals():
#     url = "YOUR_DOCUMENT_URL_HERE"
#     uploaded_url_doc = upload_document_by_url(bucket_id, url)

## 4. Parser Types

Storm API supports different parser types for document processing.

In [None]:
# Describe the two parser types side by side in a single print call
print("\n".join([
    "📋 Parser Types:\n",
    "1. DEFAULT Parser:",
    "   • Fast processing (10-60 seconds)",
    "   • Standard text extraction",
    "   • Good for most documents",
    "   • Preserves basic formatting",
    "\n2. STORM_PARSE Parser:",
    "   • Advanced processing (may take longer)",
    "   • Better handling of complex layouts",
    "   • Extracts tables and structured data",
    "   • Ideal for technical documents",
]))

# Example: Compare parsers
def compare_parsers(bucket_id, file_path):
    """Upload one file once per parser type and collect the successes.

    Returns a dict keyed by parser name ("DEFAULT", "STORM_PARSE") whose
    values are the documents returned by ``upload_document_by_file``.
    Parsers whose upload returned a falsy result are left out.
    """
    uploaded = {}

    for parser_name in ("DEFAULT", "STORM_PARSE"):
        print(f"\nTesting {parser_name} parser...")
        outcome = upload_document_by_file(bucket_id, file_path, parser_type=parser_name)
        if not outcome:
            continue
        uploaded[parser_name] = outcome

    return uploaded

# Create a test document with complex formatting
complex_file = "complex_document.txt"
try:
    # The content contains non-ASCII bullet characters, so pin the encoding
    # instead of relying on the platform default.
    with open(complex_file, "w", encoding="utf-8") as f:
        f.write("""Financial Report Q4 2024

| Category | Q3 2024 | Q4 2024 | Change |
|----------|---------|---------|--------|
| Revenue  | $1.2M   | $1.5M   | +25%   |
| Expenses | $800K   | $900K   | +12.5% |
| Profit   | $400K   | $600K   | +50%   |

Key Insights:
• Strong revenue growth driven by new product launches
• Controlled expense increase despite expansion
• Profit margin improved from 33% to 40%
""")

    # Test different parsers
    # if 'bucket_id' in locals():
    #     parser_results = compare_parsers(bucket_id, complex_file)
finally:
    # Clean up, even if writing the file or the (optional) comparison above
    # fails; the existence check avoids masking the original error.
    if os.path.exists(complex_file):
        os.remove(complex_file)

## 5. Monitor Document Processing

In [None]:
def check_document_status(document_id, agent_id, bucket_id):
    """Fetch the current processing status of a document.

    Issues GET ``/api/v2/documents`` with the agent, bucket and document IDs
    as query parameters.

    Returns the ``status`` field of the first document in the response, or
    ``None`` when the request does not return HTTP 200 or the list is empty.
    """
    query = {
        "agentId": agent_id,
        "bucketId": bucket_id,
        "documentId": document_id,
    }
    response = requests.get(
        f"{API_URL}/api/v2/documents",
        headers=headers,
        params=query,
    )

    if response.status_code != 200:
        return None

    listed = response.json()["data"]["data"]
    # NOTE(review): only the first entry is inspected, so this presumably
    # relies on the API honouring the documentId filter — confirm.
    return listed[0]["status"] if listed else None

def wait_for_document_processing(document_id, agent_id, bucket_id, timeout=300):
    """Poll a document's status until it reaches a terminal state or times out.

    The status is checked via ``check_document_status`` and re-checked after a
    5-second pause for as long as less than ``timeout`` seconds have elapsed.
    A progress dot is printed after every non-terminal check.

    Returns ``True`` when the status becomes "completed"; ``False`` when it
    becomes "failed" or the timeout elapses first.
    """
    began = time.time()

    print(f"⏳ Waiting for document to process...", end="")

    while (time.time() - began) < timeout:
        current = check_document_status(document_id, agent_id, bucket_id)

        # Both terminal states end the wait; only "completed" counts as success.
        if current in ("completed", "failed"):
            finished_ok = current == "completed"
            print(" ✅ Completed!" if finished_ok else " ❌ Failed!")
            return finished_ok

        print(".", end="", flush=True)
        time.sleep(5)

    print(" ⏱️ Timeout!")
    return False

# Monitor the uploaded document.
# Runs only when an earlier cell defined `uploaded_doc` and the upload
# succeeded (a failed upload leaves it as None).
if 'uploaded_doc' in locals() and uploaded_doc:
    doc_id = uploaded_doc['id']
    print(f"\nMonitoring document: {doc_id}")
    
    # Check initial status (as reported in the upload response)
    print(f"Initial status: {uploaded_doc['status']}")
    
    # Wait for processing; polling is skipped unless the upload response
    # reported the "progress" status.
    if uploaded_doc['status'] == "progress":
        # `success` is True only if polling observed the "completed" status.
        success = wait_for_document_processing(doc_id, agent_id, bucket_id)

## 6. Batch Upload

In [None]:
import concurrent.futures

def batch_upload_documents(bucket_id, file_paths, max_workers=3):
    """Upload multiple documents concurrently and print a summary.

    Each path is handed to ``upload_document_by_file`` on a thread pool.

    Args:
        bucket_id: ID of the bucket that receives every document.
        file_paths: Iterable of local file paths to upload.
        max_workers: Maximum number of uploads running at the same time.

    Returns:
        A list with one dict per file, in completion order (not input
        order). Every dict has the keys ``"file"``, ``"success"`` and
        ``"document"`` (``None`` when the upload failed); entries for uploads
        that raised additionally carry ``"error"`` with the exception text.
    """

    results = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all uploads, remembering which file each future belongs to
        future_to_file = {
            executor.submit(upload_document_by_file, bucket_id, fp): fp
            for fp in file_paths
        }

        # Collect results as they finish
        for future in concurrent.futures.as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                document = future.result()
            except Exception as e:
                # One failing upload must not abort the rest of the batch.
                # "document" is included so every entry has the same shape.
                results.append({
                    "file": file_path,
                    "success": False,
                    "document": None,
                    "error": str(e)
                })
            else:
                results.append({
                    "file": file_path,
                    "success": document is not None,
                    "document": document
                })

    # Summary
    successful = sum(1 for r in results if r["success"])
    print(f"\n📊 Batch Upload Summary:")
    print(f"   • Total: {len(results)}")
    print(f"   • Successful: {successful}")
    print(f"   • Failed: {len(results) - successful}")

    return results

# Build three small throwaway text files to feed the batch uploader
test_files = [f"batch_test_{n}.txt" for n in range(1, 4)]
for n, path in enumerate(test_files, start=1):
    with open(path, "w") as handle:
        handle.write(
            f"Document {n}\n"
            "        \n"
            f"This is test document number {n} for batch upload.\n"
            "It contains sample content for Storm API processing.\n"
        )

print(f"Created {len(test_files)} test files for batch upload")

# Upload them together when a bucket was resolved earlier in the notebook
if 'bucket_id' in locals():
    batch_results = batch_upload_documents(bucket_id, test_files)

    # One line per file, prefixed with a success/failure marker
    print("\n📄 Individual Results:")
    for entry in batch_results:
        marker = "✅" if entry["success"] else "❌"
        print(f"{marker} {entry['file']}")

# Remove the temporary files whether or not the upload branch ran
for path in test_files:
    os.remove(path)

## 7. Webhook Integration

Use webhooks to get notified when document processing completes.

In [None]:
# Sample of the payload shape used in the walkthrough below
webhook_example = dict(
    documentId="1234567890",
    status="completed",
    bucketId="bucket-123",
    name="document.pdf",
    characters=5000,
    processingTime=45,
    timestamp="2024-01-01T12:00:00Z",
)

# Steps 1-2: where the webhook lives and how to pass it on upload
print("\n".join([
    "🪝 Webhook Integration:\n",
    "1. Set up webhook endpoint:",
    "   POST https://your-server.com/webhooks/storm-document",
    "\n2. Include webhook URL in upload:",
]))
print("""   upload_document_by_file(
       bucket_id,
       "document.pdf",
       webhook_url="https://your-server.com/webhooks/storm-document"
   )""")

# Step 3: the example payload rendered as indented JSON
print("\n3. Expected webhook payload:", json.dumps(webhook_example, indent=2), sep="\n")

# Step 4: a minimal Flask handler, shown as text only (it is not executed here)
print("\n4. Webhook handler example (Flask):", """@app.route('/webhooks/storm-document', methods=['POST'])
def handle_storm_webhook():
    data = request.json
    
    if data['status'] == 'completed':
        # Document ready for use
        process_completed_document(data['documentId'])
    elif data['status'] == 'failed':
        # Handle failure
        log_failed_document(data['documentId'])
    
    return {'status': 'ok'}, 200""", sep="\n")

## 8. Best Practices

In [None]:
# Print the four best-practice groups as one block of text
print("\n".join([
    "📚 Document Upload Best Practices:\n",
    "1. File Size & Format:",
    "   • Maximum file size: 10 MB",
    "   • Supported formats: PDF, DOCX, TXT, etc.",
    "   • Compress large files before upload",
    "\n2. Error Handling:",
    "   • Always check response status",
    "   • Implement retry logic for failures",
    "   • Log errors for debugging",
    "\n3. Performance:",
    "   • Use batch upload for multiple files",
    "   • Implement progress tracking",
    "   • Use webhooks for async processing",
    "\n4. Organization:",
    "   • Use meaningful file names",
    "   • Organize documents in appropriate buckets",
    "   • Add metadata when available",
]))

# Example: Robust upload function
def robust_upload(bucket_id, file_path, max_retries=3):
    """Upload a file with up-front validation and retry with backoff.

    The file is validated once before any attempt: it must exist and must not
    exceed 10 MB. Those problems are permanent, so they are reported and
    ``None`` is returned immediately rather than being retried. Upload
    attempts that raise, or that return no document, are retried up to
    ``max_retries`` times with exponential backoff (1s, 2s, 4s, ...) between
    attempts.

    Args:
        bucket_id: ID of the bucket that receives the document.
        file_path: Path of the local file to upload.
        max_retries: Maximum number of upload attempts.

    Returns:
        The document returned by ``upload_document_by_file``, or ``None`` if
        validation fails or every attempt fails.
    """
    max_bytes = 10 * 1024 * 1024  # 10 MB

    # Validate once: retrying cannot fix a missing or oversized file.
    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    try:
        file_size = os.path.getsize(file_path)
    except OSError as e:
        print(f"❌ Cannot read file size: {e}")
        return None

    if file_size > max_bytes:
        print(f"❌ File too large: {file_size / 1024 / 1024:.1f} MB")
        return None

    for attempt in range(max_retries):
        try:
            result = upload_document_by_file(bucket_id, file_path)
        except Exception as e:
            # Best-effort helper: report the error and fall through to retry.
            print(f"Attempt {attempt + 1} failed: {e}")
        else:
            if result:
                return result
            print(f"Attempt {attempt + 1} failed: upload returned no document")

        # Back off before the next attempt, but not after the last one.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)  # Exponential backoff

    return None

print("\n✅ Upload function with retry logic created")

## Summary

You've learned how to:
- ✅ Upload documents by file and URL
- ✅ Use different parser types
- ✅ Monitor document processing
- ✅ Perform batch uploads
- ✅ Integrate webhooks
- ✅ Follow best practices

## Next Steps

- [Document Processing](./03-document-processing.md) - Learn about document learning
- [Basic Chat](../04-chat-system/01-basic-chat.ipynb) - Start chatting with your documents