# NIC ETL Pipeline
Complete document processing pipeline from GitLab repository ingestion to Qdrant vector database storage.

In [None]:
# Cell 1: Environment Configuration and Constants
import os
from pathlib import Path
from typing import Dict, Any, Optional, Union
from dataclasses import dataclass
from dotenv import load_dotenv
import json
from enum import Enum

class Environment(Enum):
    """Deployment environments the pipeline configuration can target."""

    # Each value is the lowercase string that the NIC_ENVIRONMENT variable
    # is compared against when the active environment is detected.
    DEVELOPMENT = "development"
    STAGING = "staging"
    PRODUCTION = "production"

class ConfigurationManager:
    def __init__(self, env_file: Optional[str] = None):
        """Load, build and validate the pipeline configuration.

        Args:
            env_file: Path of the dotenv file to load. Falls back to ``.env``
                when omitted or empty.

        Raises:
            ValueError: If the assembled configuration fails validation.
        """
        self.env_file = env_file or ".env"

        # Load the dotenv file FIRST: environment detection reads
        # NIC_ENVIRONMENT from the process environment, so detecting before
        # loading would silently ignore a NIC_ENVIRONMENT set in the file.
        self._load_environment_variables()
        self.environment = self._detect_environment()

        # Assemble the configuration, then fail fast on invalid values.
        self.config = self._build_configuration()
        self._validate_configuration()
    
    def _detect_environment(self) -> Environment:
        """Resolve the active environment from ``NIC_ENVIRONMENT``.

        The variable is lower-cased and matched against the enum values;
        when unset it is treated as ``development``. An unrecognised name
        is reported on stdout and also resolves to development.
        """
        requested = os.getenv('NIC_ENVIRONMENT', 'development').lower()

        for candidate in Environment:
            if candidate.value == requested:
                return candidate

        print(f"Unknown environment '{requested}', defaulting to development")
        return Environment.DEVELOPMENT
    
    def _load_environment_variables(self):
        """Load environment variables from .env file"""
        env_path = Path(self.env_file)
        
        if env_path.exists():
            load_dotenv(env_path)
            print(f"Loaded configuration from {env_path}")
        else:
            print(f"No .env file found at {env_path}, using environment variables and defaults")
    
    def _build_configuration(self) -> Dict[str, Any]:
        """Build the complete configuration from environment variables.

        Every option is read from the process environment (which includes
        anything loaded from the dotenv file) with a built-in default, and
        environment-specific overrides are applied last.

        Credentials (``GITLAB_TOKEN``, ``QDRANT_API_KEY``) deliberately have
        NO built-in default: secrets must never live in source code. When
        they are not supplied they resolve to an empty string, which
        ``_validate_configuration`` reports as a missing required field.

        Returns:
            Nested dictionary keyed by section name.

        Raises:
            ValueError: If a numeric variable cannot be parsed as int/float.
        """

        def flag(name: str) -> bool:
            # Feature switches default to enabled; only "true" (any case) is on.
            return os.getenv(name, 'true').lower() == 'true'

        # Tolerate spaces and empty items, e.g. "pt, en," -> ['pt', 'en'].
        ocr_languages = [
            language.strip()
            for language in os.getenv('DOCLING_OCR_LANGUAGES', 'pt,en').split(',')
            if language.strip()
        ]

        config = {
            'environment': self.environment.value,

            # GitLab Configuration
            'gitlab': {
                'url': os.getenv('GITLAB_URL', 'http://gitlab.processa.info'),
                # Secret: no fallback value on purpose.
                'token': os.getenv('GITLAB_TOKEN', ''),
                'project': os.getenv('GITLAB_PROJECT', 'nic/documentacao/base-de-conhecimento'),
                'branch': os.getenv('GITLAB_BRANCH', 'main'),
                'folder': os.getenv('GITLAB_FOLDER', '30-Aprovados')
            },

            # Qdrant Configuration
            'qdrant': {
                'url': os.getenv('QDRANT_URL', 'https://qdrant.codrstudio.dev/'),
                # Secret: no fallback value on purpose.
                'api_key': os.getenv('QDRANT_API_KEY', ''),
                'collection': os.getenv('QDRANT_COLLECTION', 'nic')
            },

            # Processing Configuration
            'docling': {
                'enable_ocr': flag('DOCLING_ENABLE_OCR'),
                'ocr_languages': ocr_languages,
                'confidence_threshold': float(os.getenv('DOCLING_CONFIDENCE_THRESHOLD', '0.75'))
            },

            # Chunking Configuration
            'chunking': {
                'size': int(os.getenv('CHUNK_SIZE', '500')),
                'overlap': int(os.getenv('CHUNK_OVERLAP', '100')),
                'min_size': int(os.getenv('CHUNK_MIN_SIZE', '100'))
            },

            # Embedding Configuration
            'embedding': {
                'model': os.getenv('EMBEDDING_MODEL', 'BAAI/bge-m3'),
                'batch_size': int(os.getenv('EMBEDDING_BATCH_SIZE', '32')),
                'device': os.getenv('EMBEDDING_DEVICE', 'cpu'),
                'normalize': flag('EMBEDDING_NORMALIZE')
            },

            # Cache Configuration
            'cache': {
                'dir': Path(os.getenv('CACHE_DIR', './cache')),
                'state_file': Path(os.getenv('CACHE_STATE_FILE', './cache/pipeline_state.json')),
                'max_size_gb': float(os.getenv('CACHE_MAX_SIZE_GB', '10')),
                'cleanup_interval_hours': int(os.getenv('CACHE_CLEANUP_INTERVAL', '24'))
            },

            # Logging Configuration
            'logging': {
                'level': os.getenv('LOG_LEVEL', 'INFO'),
                'file': os.getenv('LOG_FILE', './logs/nic_etl.log'),
                'max_size_mb': int(os.getenv('LOG_MAX_SIZE_MB', '100')),
                'backup_count': int(os.getenv('LOG_BACKUP_COUNT', '5'))
            },

            # Performance Configuration
            'performance': {
                'max_workers': int(os.getenv('MAX_WORKERS', '4')),
                'timeout_seconds': int(os.getenv('TIMEOUT_SECONDS', '300')),
                'retry_attempts': int(os.getenv('RETRY_ATTEMPTS', '3')),
                'batch_size': int(os.getenv('BATCH_SIZE', '100'))
            },

            # Feature Flags
            'features': {
                'enable_caching': flag('FEATURE_ENABLE_CACHING'),
                'enable_parallel_processing': flag('FEATURE_PARALLEL_PROCESSING'),
                'enable_quality_checks': flag('FEATURE_QUALITY_CHECKS'),
                'enable_metrics': flag('FEATURE_METRICS')
            }
        }

        # Environment-specific overrides take precedence over env variables.
        return self._apply_environment_overrides(config)
    
    def _apply_environment_overrides(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """Overwrite selected options according to the active environment.

        The passed dictionary is modified in place and also returned. In
        production a warning is printed when the GitLab token is empty or
        starts with ``glpat-``.
        """
        # (section, option, forced value) triples per environment.
        overrides_by_environment = {
            Environment.DEVELOPMENT: (
                ('logging', 'level', 'DEBUG'),
                ('embedding', 'batch_size', 8),  # Smaller batches for development
                ('performance', 'max_workers', 2),
                ('cache', 'max_size_gb', 2),
            ),
            Environment.STAGING: (
                ('logging', 'level', 'INFO'),
                ('embedding', 'batch_size', 16),
                ('performance', 'max_workers', 4),
                ('features', 'enable_quality_checks', True),
            ),
            Environment.PRODUCTION: (
                ('logging', 'level', 'WARNING'),
                ('embedding', 'batch_size', 32),
                ('performance', 'max_workers', 8),
                ('performance', 'timeout_seconds', 600),
                ('features', 'enable_metrics', True),
            ),
        }

        for section, option, forced in overrides_by_environment.get(self.environment, ()):
            config[section][option] = forced

        # Production security settings
        if self.environment == Environment.PRODUCTION:
            token = config['gitlab']['token']
            if not token or token.startswith('glpat-'):
                print("WARNING: Using development token in production")

        return config
    
    def _validate_configuration(self):
        """Validate configuration values"""
        validation_errors = []
        
        # Validate required fields
        required_fields = [
            ('gitlab.url', self.config['gitlab']['url']),
            ('gitlab.token', self.config['gitlab']['token']),
            ('qdrant.url', self.config['qdrant']['url']),
            ('qdrant.api_key', self.config['qdrant']['api_key'])
        ]
        
        for field_name, value in required_fields:
            if not value or value.strip() == '':
                validation_errors.append(f"Required field '{field_name}' is empty")
        
        # Validate numeric ranges
        if self.config['chunking']['size'] <= 0:
            validation_errors.append("Chunk size must be positive")
        
        if self.config['chunking']['overlap'] >= self.config['chunking']['size']:
            validation_errors.append("Chunk overlap must be less than chunk size")
        
        if self.config['embedding']['batch_size'] <= 0:
            validation_errors.append("Embedding batch size must be positive")
        
        # Validate paths
        cache_dir = self.config['cache']['dir']
        if not cache_dir.parent.exists():
            validation_errors.append(f"Cache directory parent does not exist: {cache_dir.parent}")
        
        # Report validation errors
        if validation_errors:
            print("Configuration validation errors:")
            for error in validation_errors:
                print(f"  - {error}")
            raise ValueError(f"Configuration validation failed: {len(validation_errors)} errors")
        
        print(f"Configuration validation passed for {self.environment.value} environment")
    
    def get(self, key: str, default: Any = None) -> Any:
        """Get configuration value using dot notation"""
        keys = key.split('.')
        value = self.config
        
        for k in keys:
            if isinstance(value, dict) and k in value:
                value = value[k]
            else:
                return default
        
        return value
    
    def set(self, key: str, value: Any):
        """Set configuration value using dot notation"""
        keys = key.split('.')
        config = self.config
        
        for k in keys[:-1]:
            if k not in config:
                config[k] = {}
            config = config[k]
        
        config[keys[-1]] = value
    
    def update_runtime_config(self, updates: Dict[str, Any]):
        """Update configuration at runtime"""
        for key, value in updates.items():
            self.set(key, value)
        
        # Re-validate after updates
        self._validate_configuration()
    
    def export_config(self, include_secrets: bool = False) -> Dict[str, Any]:
        """Export configuration (optionally excluding secrets)"""
        config_copy = self.config.copy()
        
        if not include_secrets:
            # Mask sensitive values
            sensitive_keys = [
                'gitlab.token',
                'qdrant.api_key'
            ]
            
            for key in sensitive_keys:
                value = self.get(key)
                if value:
                    self.set(key, value[:8] + '***' if len(value) > 8 else '***')
        
        return config_copy
    
    def create_env_template(self, output_path: str = '.env.template'):
        """Create .env template file with all configuration options"""
        template_content = '''# NIC ETL Pipeline Configuration
# Copy this file to .env and update values as needed

# Environment (development, staging, production)
NIC_ENVIRONMENT=development

# GitLab Configuration
GITLAB_URL=http://gitlab.processa.info
GITLAB_TOKEN=your_gitlab_token_here
GITLAB_PROJECT=nic/documentacao/base-de-conhecimento
GITLAB_BRANCH=main
GITLAB_FOLDER=30-Aprovados

# Qdrant Configuration
QDRANT_URL=https://qdrant.codrstudio.dev/
QDRANT_API_KEY=your_qdrant_api_key_here
QDRANT_COLLECTION=nic

# Processing Configuration
DOCLING_ENABLE_OCR=true
DOCLING_OCR_LANGUAGES=pt,en
DOCLING_CONFIDENCE_THRESHOLD=0.75

# Chunking Configuration
CHUNK_SIZE=500
CHUNK_OVERLAP=100
CHUNK_MIN_SIZE=100

# Embedding Configuration
EMBEDDING_MODEL=BAAI/bge-m3
EMBEDDING_BATCH_SIZE=32
EMBEDDING_DEVICE=cpu
EMBEDDING_NORMALIZE=true

# Cache Configuration
CACHE_DIR=./cache
CACHE_STATE_FILE=./cache/pipeline_state.json
CACHE_MAX_SIZE_GB=10
CACHE_CLEANUP_INTERVAL=24

# Logging Configuration
LOG_LEVEL=INFO
LOG_FILE=./logs/nic_etl.log
LOG_MAX_SIZE_MB=100
LOG_BACKUP_COUNT=5

# Performance Configuration
MAX_WORKERS=4
TIMEOUT_SECONDS=300
RETRY_ATTEMPTS=3
BATCH_SIZE=100

# Feature Flags
FEATURE_ENABLE_CACHING=true
FEATURE_PARALLEL_PROCESSING=true
FEATURE_QUALITY_CHECKS=true
FEATURE_METRICS=true
'''
        
        with open(output_path, 'w') as f:
            f.write(template_content)
        
        print(f"Environment template created at {output_path}")

# Build the notebook-wide configuration. Constructing the manager loads the
# dotenv file, assembles the settings and raises ValueError if validation fails.
config_manager = ConfigurationManager()
# CONFIG is the manager's own dictionary (not a copy), so both names see the
# same data.
CONFIG = config_manager.config

# Convenience constants for the most frequently used settings. They are
# captured once here and do not follow later changes made to CONFIG.
GITLAB_URL = CONFIG['gitlab']['url']
GITLAB_TOKEN = CONFIG['gitlab']['token']
GITLAB_PROJECT = CONFIG['gitlab']['project']
GITLAB_BRANCH = CONFIG['gitlab']['branch']
GITLAB_FOLDER = CONFIG['gitlab']['folder']

QDRANT_URL = CONFIG['qdrant']['url']
QDRANT_API_KEY = CONFIG['qdrant']['api_key']
QDRANT_COLLECTION = CONFIG['qdrant']['collection']

CACHE_DIR = CONFIG['cache']['dir']
STATE_FILE = CONFIG['cache']['state_file']

# Create the cache directory (and any missing parents) if it is not there yet.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Print a short, secret-free summary of the active configuration.
print(f"\n=== NIC ETL Configuration ===")
print(f"Environment: {CONFIG['environment']}")
print(f"GitLab: {CONFIG['gitlab']['url']}")
print(f"Qdrant: {CONFIG['qdrant']['url']}")
print(f"Cache: {CONFIG['cache']['dir']}")
print(f"Embedding Model: {CONFIG['embedding']['model']}")
print(f"Chunk Size: {CONFIG['chunking']['size']} tokens")
# Only the feature flags that are switched on are listed.
print(f"Features: {', '.join([k for k, v in CONFIG['features'].items() if v])}")
print(f"================================\n")

In [None]:
# Cell 2: Dependencies Installation and Imports
import subprocess
import sys
import importlib
import pkg_resources
from packaging import version
import warnings
from typing import List, Dict, Tuple, Optional

class DependencyManager:
    """Manages package dependencies and imports for the NIC ETL pipeline"""
    
    def __init__(self):
        self.required_packages = {
            # Core dependencies
            'requests': '>=2.28.0',
            'python-dotenv': '>=0.19.0',
            'pathlib': None,  # Built-in
            'json': None,     # Built-in
            
            # GitLab integration
            'python-gitlab': '>=3.0.0',
            'urllib3': '>=1.26.0',
            
            # Document processing
            'docling': '>=1.0.0',
            'pypdf': '>=3.0.0',
            'python-docx': '>=0.8.11',
            'pillow': '>=9.0.0',
            
            # Text processing and embeddings
            'transformers': '>=4.21.0',
            'torch': '>=1.12.0',
            'sentence-transformers': '>=2.2.0',
            'tokenizers': '>=0.13.0',
            
            # Vector database
            'qdrant-client': '>=1.6.0',
            
            # Data processing
            'pandas': '>=1.5.0',
            'numpy': '>=1.21.0',
            
            # Utilities
            'tqdm': '>=4.64.0',
            'psutil': '>=5.9.0',
            'filelock': '>=3.8.0',
            'hashlib': None,  # Built-in
            'datetime': None, # Built-in
            'logging': None,  # Built-in
            'concurrent.futures': None,  # Built-in
            'multiprocessing': None,     # Built-in
            'threading': None,           # Built-in
            'time': None,                # Built-in
            'os': None,                  # Built-in
            'sys': None,                 # Built-in
            'shutil': None,              # Built-in
            'tempfile': None,            # Built-in
            'zipfile': None,             # Built-in
            'mimetypes': None,           # Built-in
        }
        
        self.failed_imports = []
        self.successful_imports = []
    
    def check_package_version(self, package_name: str, required_version: Optional[str] = None) -> Tuple[bool, str]:
        """Check if package is installed and meets version requirements"""
        try:
            if required_version is None:
                # Built-in module or no version requirement
                importlib.import_module(package_name)
                return True, "built-in"
            
            installed_version = pkg_resources.get_distribution(package_name).version
            
            if required_version.startswith('>='):
                min_version = required_version[2:]
                if version.parse(installed_version) >= version.parse(min_version):
                    return True, installed_version
                else:
                    return False, f"installed: {installed_version}, required: {required_version}"
            else:
                # Exact version match
                if installed_version == required_version:
                    return True, installed_version
                else:
                    return False, f"installed: {installed_version}, required: {required_version}"
        
        except (importlib.util.find_spec, pkg_resources.DistributionNotFound, ImportError):
            return False, "not installed"
    
    def install_package(self, package_name: str) -> bool:
        """Install ``package_name`` with pip, using the running interpreter.

        ``package_name`` is handed to pip as one argument, so it may carry a
        version specifier. The call is limited to 300 seconds.

        Returns:
            ``True`` when pip exits with status 0, ``False`` on a non-zero
            exit, a timeout or any other error.
        """
        pip_command = [sys.executable, '-m', 'pip', 'install', package_name, '--quiet']

        try:
            print(f"Installing {package_name}...")
            completed = subprocess.run(
                pip_command,
                capture_output=True,
                text=True,
                timeout=300
            )

            if completed.returncode != 0:
                print(f"✗ Failed to install {package_name}: {completed.stderr}")
                return False

            print(f"✓ Successfully installed {package_name}")
            return True

        except subprocess.TimeoutExpired:
            print(f"✗ Timeout installing {package_name}")
            return False
        except Exception as e:
            print(f"✗ Error installing {package_name}: {e}")
            return False
    
    def ensure_dependencies(self, auto_install: bool = True) -> bool:
        """Ensure all required dependencies are installed"""
        print("Checking dependencies...")
        
        missing_packages = []
        version_mismatches = []
        
        for package, required_version in self.required_packages.items():
            is_ok, version_info = self.check_package_version(package, required_version)
            
            if is_ok:
                self.successful_imports.append(f"{package} ({version_info})")
            else:
                if "not installed" in version_info:
                    missing_packages.append(package)
                else:
                    version_mismatches.append(f"{package}: {version_info}")
                
                self.failed_imports.append(f"{package}: {version_info}")
        
        # Report status
        print(f"✓ {len(self.successful_imports)} packages available")
        
        if missing_packages:
            print(f"✗ {len(missing_packages)} packages missing: {', '.join(missing_packages)}")
        
        if version_mismatches:
            print(f"⚠ {len(version_mismatches)} version mismatches:")
            for mismatch in version_mismatches:
                print(f"  - {mismatch}")
        
        # Auto-install missing packages
        if auto_install and missing_packages:
            print("\\nInstalling missing packages...")
            
            for package in missing_packages:
                required_version = self.required_packages[package]
                package_spec = f"{package}{required_version}" if required_version else package
                
                if self.install_package(package_spec):
                    # Re-check after installation
                    is_ok, version_info = self.check_package_version(package, required_version)
                    if is_ok:
                        self.successful_imports.append(f"{package} ({version_info})")
                        self.failed_imports = [x for x in self.failed_imports if not x.startswith(package)]
        
        return len(self.failed_imports) == 0
    
    def import_dependencies(self) -> bool:
        """Import all required dependencies"""
        print("\\nImporting dependencies...")
        
        imports_successful = True
        
        try:
            # Core Python modules
            global os, sys, json, hashlib, datetime, logging, time, shutil, tempfile, zipfile, mimetypes
            global threading, multiprocessing, concurrent
            import os
            import sys
            import json
            import hashlib
            import datetime
            import logging
            import time
            import shutil
            import tempfile
            import zipfile
            import mimetypes
            import threading
            import multiprocessing
            import concurrent.futures
            
            print("✓ Core Python modules")
            
            # Environment and configuration
            global Path, load_dotenv
            from pathlib import Path
            from dotenv import load_dotenv
            
            print("✓ Environment modules")
            
            # HTTP and API clients
            global requests, gitlab, urllib3
            import requests
            import gitlab
            import urllib3
            
            print("✓ HTTP and API modules")
            
            # Document processing
            global docling, pypdf, docx, PIL
            try:
                import docling
                print("✓ Docling (OCR and document processing)")
            except ImportError:
                print("⚠ Docling not available - OCR features will be limited")
                docling = None
            
            import pypdf
            import docx
            from PIL import Image
            
            print("✓ Document processing modules")
            
            # Text processing and embeddings
            global transformers, torch, sentence_transformers, tokenizers
            import transformers
            import torch
            import sentence_transformers
            import tokenizers
            
            print("✓ Text processing and embedding modules")
            
            # Vector database
            global qdrant_client
            import qdrant_client
            from qdrant_client.models import Distance, VectorParams, PointStruct
            
            print("✓ Vector database modules")
            
            # Data processing
            global pd, np
            import pandas as pd
            import numpy as np
            
            print("✓ Data processing modules")
            
            # Utilities
            global tqdm, psutil, filelock
            from tqdm import tqdm
            import psutil
            import filelock
            
            print("✓ Utility modules")
            
        except ImportError as e:
            print(f"✗ Import error: {e}")
            imports_successful = False
        
        return imports_successful
    
    def validate_critical_dependencies(self) -> bool:
        """Validate that critical dependencies are working correctly"""
        print("\\nValidating critical dependencies...")
        
        validations_passed = True
        
        try:
            # Test configuration loading
            from dotenv import load_dotenv
            print("✓ Environment configuration")
            
            # Test HTTP requests
            import requests
            print("✓ HTTP client")
            
            # Test GitLab client (without actual connection)
            import gitlab
            print("✓ GitLab client")
            
            # Test document processing
            import pypdf
            import docx
            from PIL import Image
            print("✓ Document processors")
            
            # Test embeddings
            import transformers
            import torch
            print("✓ Embedding models")
            
            # Test vector database
            import qdrant_client
            print("✓ Vector database client")
            
            # Test data processing
            import pandas as pd
            import numpy as np
            print("✓ Data processing")
            
        except Exception as e:
            print(f"✗ Validation error: {e}")
            validations_passed = False
        
        return validations_passed
    
    def print_summary(self):
        """Print dependency summary"""
        print("\\n" + "="*60)
        print("DEPENDENCY SUMMARY")
        print("="*60)
        
        print(f"Total packages: {len(self.required_packages)}")
        print(f"Successfully loaded: {len(self.successful_imports)}")
        print(f"Failed/Missing: {len(self.failed_imports)}")
        
        if self.failed_imports:
            print("\\nFailed imports:")
            for failed in self.failed_imports:
                print(f"  ✗ {failed}")
        
        print("\\nSuccessful imports:")
        for success in self.successful_imports:
            print(f"  ✓ {success}")
        
        print("="*60 + "\\n")

# Create the dependency manager used by the rest of this cell.
dep_manager = DependencyManager()

# Step 1: check installed packages and pip-install the missing ones.
if dep_manager.ensure_dependencies(auto_install=True):
    print("✅ All dependencies satisfied")
else:
    print("⚠️ Some dependencies are missing or have version conflicts")

# Step 2: import everything into the notebook namespace.
if dep_manager.import_dependencies():
    print("✅ All imports successful")
else:
    print("❌ Some imports failed")

# Step 3: smoke-test the critical packages.
if dep_manager.validate_critical_dependencies():
    print("✅ All validations passed")
else:
    print("❌ Some validations failed")

# Print the collected results.
dep_manager.print_summary()

# Silence FutureWarning/UserWarning for everything that runs after this point.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# A real newline; the escaped form printed a literal backslash-n.
# NOTE(review): this line is printed even when a step above reported failures.
print("\n🚀 Dependencies loaded successfully - Ready for pipeline operations!")

In [None]:
# Error Handling and Monitoring Framework
import logging
import time
import traceback
import functools
from typing import Dict, Any, Optional, Callable, List
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
import json
from pathlib import Path

class ErrorSeverity(Enum):
    """How serious a recorded error is, from least to most severe."""

    # Plain string literals: the backslash-escaped quotes that were here
    # are not valid Python.
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"

class ErrorCategory(Enum):
    """Broad origin of a recorded error."""

    # Plain string literals: the backslash-escaped quotes that were here
    # are not valid Python.
    NETWORK = "network"
    PROCESSING = "processing"
    VALIDATION = "validation"
    RESOURCE = "resource"
    CONFIGURATION = "configuration"
    EXTERNAL_SERVICE = "external_service"

@dataclass
class ErrorEvent:
    """A single recorded error, as stored in the error handler's log."""

    # When the event was recorded.
    timestamp: datetime
    # Seriousness and broad origin of the error.
    severity: ErrorSeverity
    category: ErrorCategory
    # Name of the pipeline component that reported the error.
    component: str
    # Text of the exception and the name of its class.
    message: str
    exception_type: str
    # Formatted traceback text.
    stack_trace: str
    # Free-form extra details supplied by the caller; empty by default.
    context: Dict[str, Any] = field(default_factory=dict)
    # Number of attempts made; set by the retry helper on final failure.
    retry_count: int = 0
    # NOTE(review): nothing in the visible code changes this flag — confirm
    # where events are marked resolved.
    resolved: bool = False

class ErrorHandler:
    """Record, log and count pipeline errors, and retry failing calls.

    Every handled error is appended to ``error_log`` as an ``ErrorEvent``,
    written to the logger at a level matching its severity, and counted in
    ``metrics``. HIGH and CRITICAL errors are additionally appended to
    ``./logs/alerts.log``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Set up logging, retry policies and empty metrics.

        Args:
            config: Pipeline configuration; only its optional ``'logging'``
                section (``file`` and ``level``) is read here.
        """
        self.config = config
        self.error_log: List[ErrorEvent] = []
        self.retry_policies: Dict[str, Dict[str, Any]] = {}

        # Setup logging
        self._setup_logging()

        # Initialize default retry policies
        self._setup_retry_policies()

        # Performance metrics
        # NOTE(review): retry_success_rate and avg_resolution_time are never
        # updated anywhere in this class.
        self.metrics = {
            'total_errors': 0,
            'errors_by_category': {},
            'errors_by_severity': {},
            'retry_success_rate': 0.0,
            'avg_resolution_time': 0.0
        }

    def _setup_logging(self):
        """Configure root logging to a file and the console.

        The log file's parent directory is created when missing.
        """
        log_config = self.config.get('logging', {})

        # Create logs directory
        log_file = Path(log_config.get('file', './logs/nic_etl.log'))
        log_file.parent.mkdir(parents=True, exist_ok=True)

        # Configure logging
        logging.basicConfig(
            level=getattr(logging, log_config.get('level', 'INFO')),
            format='%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )

        self.logger = logging.getLogger('NIC_ETL_ErrorHandler')
        self.logger.info("Error handling system initialized")

    def _setup_retry_policies(self):
        """Install the retry policies, keyed by error category value.

        Categories without their own entry use ``'default'``. Delays are in
        seconds.
        """
        self.retry_policies = {
            'network': {
                'max_retries': 5,
                'base_delay': 1.0,
                'exponential_backoff': True,
                'max_delay': 60.0
            },
            'processing': {
                'max_retries': 3,
                'base_delay': 2.0,
                'exponential_backoff': True,
                'max_delay': 30.0
            },
            'external_service': {
                'max_retries': 4,
                'base_delay': 5.0,
                'exponential_backoff': True,
                'max_delay': 120.0
            },
            'default': {
                'max_retries': 2,
                'base_delay': 1.0,
                'exponential_backoff': False,
                'max_delay': 10.0
            }
        }

    def handle_error(self,
                    exception: Exception,
                    component: str,
                    category: ErrorCategory,
                    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
                    context: Optional[Dict[str, Any]] = None) -> ErrorEvent:
        """Record an exception as an ``ErrorEvent`` and return it.

        The event is logged, stored in ``error_log`` and counted in
        ``metrics``; HIGH and CRITICAL severities also trigger an alert.

        Args:
            exception: The exception being reported.
            component: Name of the component reporting it.
            category: Broad origin of the error.
            severity: Seriousness; defaults to MEDIUM.
            context: Optional extra details stored with the event.

        Returns:
            The newly created event.
        """
        # Format the traceback from the exception object itself, so the
        # result is meaningful even when called outside an ``except`` block
        # (format_exc() would yield "NoneType: None" there).
        stack_trace = ''.join(
            traceback.format_exception(type(exception), exception, exception.__traceback__)
        )

        error_event = ErrorEvent(
            timestamp=datetime.now(),
            severity=severity,
            category=category,
            component=component,
            message=str(exception),
            exception_type=type(exception).__name__,
            stack_trace=stack_trace,
            context=context or {}
        )

        # Log error
        self._log_error(error_event)

        # Store error
        self.error_log.append(error_event)

        # Update metrics
        self._update_metrics(error_event)

        # Check if alert is needed
        if severity in [ErrorSeverity.HIGH, ErrorSeverity.CRITICAL]:
            self._trigger_alert(error_event)

        return error_event

    def _log_error(self, error_event: ErrorEvent):
        """Write the event to the logger at a level matching its severity."""
        log_message = f"[{error_event.category.value}] {error_event.component}: {error_event.message}"

        if error_event.severity == ErrorSeverity.CRITICAL:
            self.logger.critical(log_message)
        elif error_event.severity == ErrorSeverity.HIGH:
            self.logger.error(log_message)
        elif error_event.severity == ErrorSeverity.MEDIUM:
            self.logger.warning(log_message)
        else:
            self.logger.info(log_message)

        # Log context if available
        if error_event.context:
            # default=str: context may hold values JSON cannot encode (paths,
            # datetimes, ...); error reporting must not fail because of them.
            self.logger.debug(f"Error context: {json.dumps(error_event.context, default=str)}")

    def _update_metrics(self, error_event: ErrorEvent):
        """Increment the total, per-category and per-severity counters."""
        self.metrics['total_errors'] += 1

        # Update category metrics
        category = error_event.category.value
        self.metrics['errors_by_category'][category] = (
            self.metrics['errors_by_category'].get(category, 0) + 1
        )

        # Update severity metrics
        severity = error_event.severity.value
        self.metrics['errors_by_severity'][severity] = (
            self.metrics['errors_by_severity'].get(severity, 0) + 1
        )

    def _trigger_alert(self, error_event: ErrorEvent):
        """Log an alert and append it to ``./logs/alerts.log``."""
        alert_message = f"ALERT: {error_event.severity.value.upper()} error in {error_event.component}"
        self.logger.critical(alert_message)

        # In production, this would integrate with alerting systems
        # For now, we'll write to a special alert log
        alert_file = Path('./logs/alerts.log')
        # The main log may be configured elsewhere, so ./logs is not
        # guaranteed to exist yet.
        alert_file.parent.mkdir(parents=True, exist_ok=True)
        with open(alert_file, 'a', encoding='utf-8') as f:
            f.write(f"{datetime.now().isoformat()} - {alert_message}\n")

    def retry_with_backoff(self, func: Callable, *args, **kwargs) -> Any:
        """Call ``func`` and retry it on failure according to a retry policy.

        Two reserved keyword arguments are consumed here and not forwarded
        to ``func``: ``_component`` (name used in messages, defaults to
        ``func.__name__``) and ``_category`` (an ``ErrorCategory`` selecting
        the policy, defaults to PROCESSING).

        Returns:
            Whatever ``func`` returns on its first successful call.

        Raises:
            Exception: The last exception raised by ``func`` once all
                attempts are used up; it is recorded as a HIGH error first.
        """
        component = kwargs.pop('_component', func.__name__)
        category = kwargs.pop('_category', ErrorCategory.PROCESSING)

        # Get retry policy
        policy_name = category.value if category.value in self.retry_policies else 'default'
        policy = self.retry_policies[policy_name]

        last_exception = None

        for attempt in range(policy['max_retries'] + 1):
            try:
                result = func(*args, **kwargs)

                # Log successful retry
                if attempt > 0:
                    self.logger.info(f"Retry successful for {component} on attempt {attempt + 1}")

                return result

            except Exception as e:
                last_exception = e

                if attempt < policy['max_retries']:
                    # Calculate delay: doubles per attempt, capped at max_delay
                    if policy['exponential_backoff']:
                        delay = min(
                            policy['base_delay'] * (2 ** attempt),
                            policy['max_delay']
                        )
                    else:
                        delay = policy['base_delay']

                    # Log retry attempt
                    self.logger.warning(
                        f"Attempt {attempt + 1} failed for {component}, "
                        f"retrying in {delay:.1f}s: {str(e)}"
                    )

                    time.sleep(delay)
                else:
                    # Final failure
                    error_event = self.handle_error(
                        e, component, category, ErrorSeverity.HIGH
                    )
                    error_event.retry_count = attempt + 1

        # All retries exhausted
        raise last_exception

    def get_error_summary(self, last_hours: int = 24) -> Dict[str, Any]:
        """Summarise the errors recorded during the last ``last_hours`` hours.

        Returns:
            Dictionary with the period, the total count, counts grouped by
            category, severity, component and exception type, and the list
            of CRITICAL errors (timestamp, component, message).
        """
        cutoff_time = datetime.now() - timedelta(hours=last_hours)

        recent_errors = [
            error for error in self.error_log
            if error.timestamp >= cutoff_time
        ]

        summary = {
            'time_period_hours': last_hours,
            'total_errors': len(recent_errors),
            'errors_by_category': {},
            'errors_by_severity': {},
            'errors_by_component': {},
            'critical_errors': [],
            'top_error_types': {}
        }

        # Analyze recent errors
        for error in recent_errors:
            # By category
            category = error.category.value
            summary['errors_by_category'][category] = (
                summary['errors_by_category'].get(category, 0) + 1
            )

            # By severity
            severity = error.severity.value
            summary['errors_by_severity'][severity] = (
                summary['errors_by_severity'].get(severity, 0) + 1
            )

            # By component
            component = error.component
            summary['errors_by_component'][component] = (
                summary['errors_by_component'].get(component, 0) + 1
            )

            # Critical errors
            if error.severity == ErrorSeverity.CRITICAL:
                summary['critical_errors'].append({
                    'timestamp': error.timestamp.isoformat(),
                    'component': error.component,
                    'message': error.message
                })

            # Error types
            error_type = error.exception_type
            summary['top_error_types'][error_type] = (
                summary['top_error_types'].get(error_type, 0) + 1
            )

        return summary

class HealthMonitor:
    """Run registered health checks and keep rolling performance metrics.

    Health checks are zero-argument callables registered under a name; a
    truthy return value means "healthy". Performance metrics are plain float
    samples stored per metric name, capped at ``MAX_METRIC_SAMPLES`` entries.
    """

    # Maximum number of samples retained per metric; older samples are dropped.
    MAX_METRIC_SAMPLES = 1000

    def __init__(self, error_handler: "ErrorHandler"):
        """Create a monitor with no checks and no metrics.

        Args:
            error_handler: Stored on the instance as ``error_handler``; this
                class does not otherwise use it.
        """
        self.error_handler = error_handler
        self.health_checks: Dict[str, Callable] = {}
        self.performance_metrics: Dict[str, List[float]] = {}
        # Result dict of the most recent run_health_checks() call, if any.
        self.last_health_check = None

    def register_health_check(self, name: str, check_func: Callable[[], bool]):
        """Register ``check_func`` under ``name``, replacing any previous one."""
        self.health_checks[name] = check_func

    def run_health_checks(self) -> Dict[str, Any]:
        """Run every registered health check and collect the outcomes.

        A check that returns a falsy value is reported as ``'unhealthy'``; a
        check that raises is reported as ``'error'`` with the exception text.
        Either outcome adds the check to ``failed_checks`` and sets
        ``overall_health`` to ``'degraded'``.

        Returns:
            A dict with ``timestamp`` (ISO-8601 string), ``overall_health``,
            ``checks`` (per-check ``status``, ``duration_ms`` and, on error,
            ``error``) and ``failed_checks`` (names in execution order). The
            same dict is stored in ``last_health_check``.
        """
        results = {
            'timestamp': datetime.now().isoformat(),
            'overall_health': 'healthy',
            'checks': {},
            'failed_checks': []
        }

        for name, check_func in self.health_checks.items():
            # perf_counter is monotonic, so the duration cannot go negative
            # or jump when the wall clock is adjusted.
            start_time = time.perf_counter()
            try:
                is_healthy = check_func()
            except Exception as e:
                # A crashing check must not abort the remaining checks.
                is_healthy = False
                check_result = {'status': 'error', 'error': str(e)}
            else:
                check_result = {
                    'status': 'healthy' if is_healthy else 'unhealthy'
                }

            elapsed = time.perf_counter() - start_time
            check_result['duration_ms'] = round(elapsed * 1000, 2)
            results['checks'][name] = check_result

            if not is_healthy:
                results['failed_checks'].append(name)
                results['overall_health'] = 'degraded'

        self.last_health_check = results
        return results

    def record_performance_metric(self, name: str, value: float):
        """Append ``value`` to metric ``name``, keeping only the newest samples."""
        samples = self.performance_metrics.setdefault(name, [])
        samples.append(value)

        # Drop the oldest entries once the cap is exceeded.
        if len(samples) > self.MAX_METRIC_SAMPLES:
            del samples[:-self.MAX_METRIC_SAMPLES]

    def get_performance_summary(self) -> Dict[str, Any]:
        """Return ``count``/``avg``/``min``/``max``/``latest`` per metric.

        Metrics that have no samples are omitted from the result.
        """
        summary = {}

        for name, values in self.performance_metrics.items():
            if not values:
                continue
            summary[name] = {
                'count': len(values),
                'avg': sum(values) / len(values),
                'min': min(values),
                'max': max(values),
                'latest': values[-1]
            }

        return summary

# Decorator for automatic error handling
def handle_errors(component: str, 
                 category: ErrorCategory = ErrorCategory.PROCESSING,
                 severity: ErrorSeverity = ErrorSeverity.MEDIUM,
                 retry: bool = False):
    """Build a decorator that reports exceptions to the global error handler.

    Any ``Exception`` escaping the wrapped call is passed to
    ``error_handler.handle_error`` and then re-raised, so callers still see
    the original exception.

    Args:
        component: Component name forwarded to the error handler.
        category: Error category forwarded to the error handler.
        severity: Severity forwarded to ``error_handler.handle_error``.
        retry: When true, the call is routed through
            ``error_handler.retry_with_backoff`` instead of being invoked
            directly.

    Returns:
        A decorator that wraps a function while preserving its metadata.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # The module-level error_handler is looked up at call time, so it
            # may be defined after this decorator is applied.
            try:
                if retry:
                    return error_handler.retry_with_backoff(
                        func, *args, _component=component, _category=category, **kwargs
                    )
                else:
                    return func(*args, **kwargs)
            except Exception as e:
                # Record the failure, then let it propagate unchanged.
                error_handler.handle_error(e, component, category, severity)
                raise
        return wrapper
    return decorator

# Initialize global error handler
error_handler = ErrorHandler(CONFIG)
health_monitor = HealthMonitor(error_handler)

# Register default health checks
def check_cache_directory():
    """Return True when CACHE_DIR exists and is a directory."""
    return CACHE_DIR.exists() and CACHE_DIR.is_dir()

def check_disk_space(min_free_gb: float = 1.0):
    """Return True when the volume holding CACHE_DIR has enough free space.

    Args:
        min_free_gb: Free space, in GiB (1024**3 bytes), that must be
            exceeded for the check to pass. Defaults to 1.0.
    """
    import shutil
    free_space_gb = shutil.disk_usage(CACHE_DIR).free / (1024**3)
    return free_space_gb > min_free_gb

health_monitor.register_health_check('cache_directory', check_cache_directory)
health_monitor.register_health_check('disk_space', check_disk_space)

print("✅ Error handling and monitoring system initialized")

In [None]:
# Cell 3: GitLab Connection and Authentication Functions
# TO BE IMPLEMENTED

In [None]:
# Cell 4: Document Retrieval and Caching Functions
# TO BE IMPLEMENTED

In [None]:
# Cell 5: Docling Processing and OCR Functions
# TO BE IMPLEMENTED

In [None]:
# Cell 6: Text Chunking Functions
# TO BE IMPLEMENTED

In [None]:
# Cell 7: Embedding Generation Functions
# TO BE IMPLEMENTED

In [None]:
# Cell 8: Qdrant Integration Functions
# TO BE IMPLEMENTED

In [None]:
# Cell 9: Metadata Management (NIC Schema)
# TO BE IMPLEMENTED

In [None]:
# Cell 10: Main Pipeline Orchestration
# TO BE IMPLEMENTED

In [None]:
# Cell 11: Testing and Validation Functions
# TO BE IMPLEMENTED

In [None]:
# Cell 12: Pipeline Execution and Monitoring
# TO BE IMPLEMENTED