In [0]:
%run ./EnvPrep

# Config

In [0]:
"""
Configuration settings for the Credit Transfer Analysis System
Updated to include batch processing options
"""

import os
from pathlib import Path

# Load environment variables from .env file
# This runs before `class Config` is defined on purpose: Config reads its
# settings with os.getenv() while the class body executes, so values from
# .env must already be in the process environment at that point.
# Loading is best-effort: every failure mode below only prints a message and
# lets the notebook continue with the defaults hard-coded in Config.
try:
    from dotenv import load_dotenv
    # Looked up relative to the current working directory, not this notebook.
    env_path = Path(".") / '.env'
    if env_path.exists():
        load_dotenv(env_path)
        print(f"Loaded environment variables from {env_path}")
    else:
        print(".env file is not found!!! .. using Config.py")
except ImportError:
    # python-dotenv is optional.
    print("python-dotenv not installed. Install with: pip install python-dotenv")
except Exception as e:
    # Any other problem while reading the file is reported but not fatal.
    print(f"Warning: Could not load .env file: {e}")


class Config:
    """
    System configuration with batch processing support.

    Every setting is a class attribute that is evaluated once, while this
    class body executes. Most settings read an environment variable through
    ``os.getenv`` and fall back to the literal default written next to them.

    Conventions used throughout the class body:
      * Boolean flags are True only when the variable equals "true"
        (case-insensitive); any other text yields False.
      * Numeric settings are converted with ``int()`` / ``float()``, so a
        malformed environment value raises ValueError when the class is
        defined.
    """
    
    # Project paths
    # All paths are relative to the current working directory.
    BASE_DIR = Path(".")
    DATA_DIR = BASE_DIR / "data"
    OUTPUT_DIR = BASE_DIR / "output"
    CACHE_DIR = BASE_DIR / "cache"
    
    # Create directories if they don't exist
    # NOTE: this is a side effect of defining the class. Because the loop runs
    # in the class body, `dir_path` also remains behind as a class attribute.
    for dir_path in [DATA_DIR, OUTPUT_DIR, CACHE_DIR]:
        dir_path.mkdir(exist_ok=True)

    # GPU Configuration
    VLLM_GPU_ID = int(os.getenv("VLLM_GPU_ID", "0"))  # GPU ID for vLLM
    EMBEDDING_GPU_ID = int(os.getenv("EMBEDDING_GPU_ID", "1"))  # GPU ID for embeddings
    GPU_MEMORY_UTILIZATION = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.9"))  # Memory fraction for vLLM
    
    # Azure OpenAI Configuration
    # The endpoint and deployment are read from ENDPOINT_URL / DEPLOYMENT_NAME
    # (these two variable names do not carry the AZURE_OPENAI_ prefix).
    AZURE_OPENAI_ENDPOINT = os.getenv("ENDPOINT_URL", "https://ehsaninstance1.openai.azure.com/")
    AZURE_OPENAI_DEPLOYMENT = os.getenv("DEPLOYMENT_NAME", "gpt-4o")
    # Credential: has no default and is None when the variable is unset.
    AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", None)
    AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
    AZURE_OPENAI_TIMEOUT = int(os.getenv("AZURE_OPENAI_TIMEOUT", "60"))
    AZURE_OPENAI_MAX_TOKENS = int(os.getenv("AZURE_OPENAI_MAX_TOKENS", "4000"))
    AZURE_OPENAI_TEMPERATURE = float(os.getenv("AZURE_OPENAI_TEMPERATURE", "0.0"))
    USE_AZURE_OPENAI = os.getenv("USE_AZURE_OPENAI", "false").lower() == "true"
    
    # vLLM Configuration
    USE_VLLM = os.getenv("USE_VLLM", "true").lower() == "true"
    # The default is one of the keys of the MODELS registry defined further
    # down in this class.
    VLLM_MODEL_NAME = os.getenv("VLLM_MODEL_NAME", "gpt-oss-120b")
    VLLM_NUM_GPUS = int(os.getenv("VLLM_NUM_GPUS", "1"))
    VLLM_MAX_MODEL_LEN = int(os.getenv("VLLM_MAX_MODEL_LEN", "8192"))
    
    # Batch Processing Configuration
    USE_VLLM_BATCH = os.getenv("USE_VLLM_BATCH", "true").lower() == "true"
    VLLM_BATCH_SIZE = int(os.getenv("VLLM_BATCH_SIZE", "8"))
    VLLM_MAX_BATCH_SIZE = int(os.getenv("VLLM_MAX_BATCH_SIZE", "16"))
    # NOTE(review): presumably seconds - the unit is not visible here; confirm
    # against the code that consumes this value.
    VLLM_BATCH_TIMEOUT = int(os.getenv("VLLM_BATCH_TIMEOUT", "120"))
    
    # Model directories
    MODEL_CACHE_DIR = os.getenv("MODEL_CACHE_DIR", "/root/.cache/huggingface/hub")
    EXTERNAL_MODEL_DIR = os.getenv("EXTERNAL_MODEL_DIR", "/Volumes/jsa_external_prod/external_vols/scratch/Scratch/Ehsan/Models")
    
    # Legacy Web API Configuration (kept for compatibility)
    GENAI_ENDPOINT = os.getenv("GENAI_ENDPOINT", "http://localhost:8080")
    GENAI_API_KEY = os.getenv("GENAI_API_KEY", None)
    GENAI_TIMEOUT = int(os.getenv("GENAI_TIMEOUT", "30"))
    USE_GENAI = os.getenv("USE_GENAI", "false").lower() == "true"
    
    # Embedding Configuration
    # The default name is one of the keys of EMBEDDING_MODELS (defined further
    # down in this class).
    EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME", "jinaai--jina-embeddings-v4")
    # NOTE(review): this default repeats EMBEDDING_GPU_ID's default (1), but
    # the two settings are read independently and can disagree.
    EMBEDDING_DEVICE = os.getenv("EMBEDDING_DEVICE", "cuda:1")
    EMBEDDING_BATCH_SIZE = int(os.getenv("EMBEDDING_BATCH_SIZE", "64"))
    EMBEDDING_MODE = os.getenv("EMBEDDING_MODE", "embedding")
    
    # Legacy embedding configurations
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
    EMBEDDING_ENDPOINT = os.getenv("EMBEDDING_ENDPOINT", None)
    EMBEDDING_API_KEY = os.getenv("EMBEDDING_API_KEY", None)
    USE_EMBEDDING_API = os.getenv("USE_EMBEDDING_API", "false").lower() == "true"
    EMBEDDING_DIM = int(os.getenv("EMBEDDING_DIM", "768"))
    
    # Analysis Configuration
    MIN_ALIGNMENT_SCORE = float(os.getenv("MIN_ALIGNMENT_SCORE", "0.5"))
    MAX_UNIT_COMBINATION = int(os.getenv("MAX_UNIT_COMBINATION", "3"))
    SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.8"))
    PARTIAL_THRESHOLD = float(os.getenv("PARTIAL_THRESHOLD", "0.6"))
    
    # Scoring Weights
    # The four defaults add up to 1.0 (0.5 + 0.25 + 0.15 + 0.1); nothing in
    # this class enforces that when values are overridden.
    COVERAGE_WEIGHT = float(os.getenv("COVERAGE_WEIGHT", "0.5"))
    CONTEXT_WEIGHT = float(os.getenv("CONTEXT_WEIGHT", "0.25"))
    QUALITY_WEIGHT = float(os.getenv("QUALITY_WEIGHT", "0.15"))
    EDGE_PENALTY_WEIGHT = float(os.getenv("EDGE_PENALTY_WEIGHT", "0.1"))
    
    # Edge Case Thresholds
    CONTEXT_IMBALANCE_THRESHOLD = float(os.getenv("CONTEXT_IMBALANCE_THRESHOLD", "0.3"))
    BREADTH_RATIO_MIN = float(os.getenv("BREADTH_RATIO_MIN", "0.7"))
    BREADTH_RATIO_MAX = float(os.getenv("BREADTH_RATIO_MAX", "1.5"))
    
    # Study Level Configuration
    # Fixed lookup tables (not overridable through the environment).
    STUDY_LEVEL_WEIGHTS = {
        "introductory": 0.2,
        "intermediate": 0.4,
        "advanced": 0.6,
        "specialized": 0.8,
        "postgraduate": 1.0
    }
    
    # Values are upper-case names; presumably they match members of the
    # project's SkillLevel enum - verify against models.enums.
    STUDY_LEVEL_SKILL_MAPPING = {
        "introductory": "NOVICE",
        "intermediate": "COMPETENT",
        "advanced": "PROFICIENT",
        "specialized": "EXPERT",
        "postgraduate": "EXPERT"
    }
    
    # Credit Hour Configuration
    CREDIT_POINT_TO_HOURS = float(os.getenv("CREDIT_POINT_TO_HOURS", "12.5"))
    HOUR_RATIO_MIN = float(os.getenv("HOUR_RATIO_MIN", "0.7"))
    HOUR_RATIO_MAX = float(os.getenv("HOUR_RATIO_MAX", "1.5"))
    
    # Logging Configuration
    # LOG_LEVEL is resolved as an attribute name of the `logging` module by
    # the logging setup that follows this class; LOG_FILE is the path handed
    # to its FileHandler.
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
    LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_FILE = OUTPUT_DIR / "analysis.log"
    
    # Cache Configuration
    ENABLE_CACHE = os.getenv("ENABLE_CACHE", "true").lower() == "true"
    CACHE_EXPIRY_DAYS = int(os.getenv("CACHE_EXPIRY_DAYS", "30"))
    
    # Report Configuration
    REPORT_FORMAT = os.getenv("REPORT_FORMAT", "json")
    INCLUDE_EDGE_CASES = os.getenv("INCLUDE_EDGE_CASES", "true").lower() == "true"
    MAX_RECOMMENDATIONS_PER_COURSE = int(os.getenv("MAX_RECOMMENDATIONS_PER_COURSE", "5"))
    
    @classmethod
    def get_config_dict(cls) -> dict:
        """Get configuration as dictionary"""
        return {
            "min_alignment_score": cls.MIN_ALIGNMENT_SCORE,
            "max_unit_combination": cls.MAX_UNIT_COMBINATION,
            "similarity_threshold": cls.SIMILARITY_THRESHOLD,
            "partial_threshold": cls.PARTIAL_THRESHOLD,
            "use_vllm_batch": cls.USE_VLLM_BATCH,
            "vllm_batch_size": cls.VLLM_BATCH_SIZE,
            "weights": {
                "coverage": cls.COVERAGE_WEIGHT,
                "context": cls.CONTEXT_WEIGHT,
                "quality": cls.QUALITY_WEIGHT,
                "edge_penalty": cls.EDGE_PENALTY_WEIGHT
            },
            "thresholds": {
                "context_imbalance": cls.CONTEXT_IMBALANCE_THRESHOLD,
                "breadth_ratio_min": cls.BREADTH_RATIO_MIN,
                "breadth_ratio_max": cls.BREADTH_RATIO_MAX
            }
        }
    
    @classmethod
    def get_azure_openai_config(cls) -> dict:
        """Get Azure OpenAI configuration as dictionary"""
        return {
            "endpoint": cls.AZURE_OPENAI_ENDPOINT,
            "deployment": cls.AZURE_OPENAI_DEPLOYMENT,
            "api_key": cls.AZURE_OPENAI_API_KEY,
            "api_version": cls.AZURE_OPENAI_API_VERSION,
            "timeout": cls.AZURE_OPENAI_TIMEOUT,
            "max_tokens": cls.AZURE_OPENAI_MAX_TOKENS,
            "temperature": cls.AZURE_OPENAI_TEMPERATURE
        }
    
    @classmethod
    def get_vllm_config(cls) -> dict:
        """Get vLLM configuration as dictionary"""
        return {
            "model_name": cls.VLLM_MODEL_NAME,
            "number_gpus": cls.VLLM_NUM_GPUS,
            "max_model_len": cls.VLLM_MAX_MODEL_LEN,
            "use_batch": cls.USE_VLLM_BATCH,
            "batch_size": cls.VLLM_BATCH_SIZE,
            "max_batch_size": cls.VLLM_MAX_BATCH_SIZE,
            "batch_timeout": cls.VLLM_BATCH_TIMEOUT,
            "model_cache_dir": cls.MODEL_CACHE_DIR,
            "external_model_dir": cls.EXTERNAL_MODEL_DIR
        }
    
    @classmethod
    def save_config(cls, filepath: str = None):
        """Save configuration to file"""
        import json
        
        if filepath is None:
            filepath = cls.OUTPUT_DIR / "config.json"
        
        config_dict = cls.get_config_dict()
        config_dict["azure_openai"] = cls.get_azure_openai_config()
        config_dict["vllm"] = cls.get_vllm_config()
        
        with open(filepath, 'w') as f:
            json.dump(config_dict, f, indent=2)
    
    @classmethod
    def load_config(cls, filepath: str):
        """Load configuration from file"""
        import json
        
        with open(filepath, 'r') as f:
            config = json.load(f)
        
        # Update class attributes
        for key, value in config.items():
            if key == "azure_openai":
                for azure_key, azure_value in value.items():
                    attr_name = f"AZURE_OPENAI_{azure_key.upper()}"
                    if hasattr(cls, attr_name):
                        setattr(cls, attr_name, azure_value)
            elif key == "vllm":
                for vllm_key, vllm_value in value.items():
                    if vllm_key == "use_batch":
                        setattr(cls, "USE_VLLM_BATCH", vllm_value)
                    elif vllm_key == "batch_size":
                        setattr(cls, "VLLM_BATCH_SIZE", vllm_value)
                    else:
                        attr_name = f"VLLM_{vllm_key.upper()}"
                        if hasattr(cls, attr_name):
                            setattr(cls, attr_name, vllm_value)
            elif hasattr(cls, key.upper()):
                setattr(cls, key.upper(), value)

    # Embedding model configurations
    # Registry of embedding models, keyed by a short "<org>--<name>" id.
    # EmbeddingInterface copies this table and looks its model_name up in it.
    # Per entry:
    #   model_id          - repository id handed to snapshot_download
    #   revision          - commit hash to pin, or None for no pinned revision
    #   trust_remote_code - forwarded to SentenceTransformer
    #   embedding_dim     - initial dimension; EmbeddingInterface replaces it
    #                       with the value reported by the loaded model
    EMBEDDING_MODELS = {
        "jinaai--jina-embeddings-v4": {
            "model_id": "jinaai/jina-embeddings-v4",
            "revision": "737fa5c46f0262ceba4a462ffa1c5bcf01da416f",
            "trust_remote_code": True,
            "embedding_dim": 768
        },
        "jinaai--jina-embeddings-v3": {
            "model_id": "jinaai/jina-embeddings-v3",
            "revision": None,
            "trust_remote_code": True,
            "embedding_dim": 1024
        },
        "BAAI--bge-large-en-v1.5": {
            "model_id": "BAAI/bge-large-en-v1.5",
            "revision": None,
            "trust_remote_code": False,
            "embedding_dim": 1024
        },
        "BAAI--bge-base-en-v1.5": {
            "model_id": "BAAI/bge-base-en-v1.5",
            "revision": None,
            "trust_remote_code": False,
            "embedding_dim": 768
        },
        "sentence-transformers--all-MiniLM-L6-v2": {
            "model_id": "sentence-transformers/all-MiniLM-L6-v2",
            "revision": None,
            "trust_remote_code": False,
            "embedding_dim": 384
        },
        "sentence-transformers--all-mpnet-base-v2": {
            "model_id": "sentence-transformers/all-mpnet-base-v2",
            "revision": None,
            "trust_remote_code": False,
            "embedding_dim": 768
        },
        "intfloat--e5-large-v2": {
            "model_id": "intfloat/e5-large-v2",
            "revision": None,
            "trust_remote_code": False,
            "embedding_dim": 1024
        },
        "WhereIsAI--UAE-Large-V1": {
            "model_id": "WhereIsAI/UAE-Large-V1",
            "revision": None,
            "trust_remote_code": False,
            "embedding_dim": 1024
        }
    }
    
    # Generation model configurations
    # Registry keyed the same way as EMBEDDING_MODELS; the default of
    # VLLM_MODEL_NAME ("gpt-oss-120b") is one of its keys.
    # "model_id" is a repository id, except for "gpt-oss-120b" where it is an
    # absolute local path.
    # NOTE(review): "template" presumably selects a prompt/chat template
    # family - its consumer is not in this cell; confirm before relying on it.
    MODELS = {
        "mistralai--Mistral-7B-Instruct-v0.2": {
            "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
            "revision": "41b61a33a2483885c981aa79e0df6b32407ed873",
            "template": "Mistral"
        },
        "mistralai--Mistral-7B-Instruct-v0.3": {
            "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
            "revision": "e0bc86c23ce5aae1db576c8cca6f06f1f73af2db",
            "template": "Mistral"
        },
        "neuralmagic--Meta-Llama-3.1-70B-Instruct-quantized.w4a16": {
            "model_id": "neuralmagic/Meta-Llama-3.1-70B-Instruct-quantized.w4a16",
            "revision": "8c670bcdb23f58a977e1440354beb7c3e455961d",
            "template": "Llama"
        },
        "meta-llama--Llama-3.1-8B-Instruct": {
            "model_id": "meta-llama/Llama-3.1-8B-Instruct",
            "revision": "0e9e39f249a16976918f6564b8830bc894c89659",
            "template": "Llama"
        },
        "neuralmagic--Meta-Llama-3.1-70B-Instruct-FP8": {
            "model_id": "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8",
            "revision": "08b31c0f951f2227f6cdbc088cdb6fd139aecf0f",
            "template": "Llama"
        },
        "microsoft--Phi-4-mini-instruct": {
            "model_id": "microsoft/Phi-4-mini-instruct",
            "revision": "c0fb9e74abda11b496b7907a9c6c9009a7a0488f",
            "template": "Phi"
        },
        "cortecs--Llama-3.3-70B-Instruct-FP8-Dynamic": {
            "model_id": "cortecs/Llama-3.3-70B-Instruct-FP8-Dynamic",
            "revision": "3722358cc2b990b22304489b2f87ef3bb876d6f6",
            "template": "Llama"
        },
        "gpt-oss-120b": {
            "model_id": "/Volumes/jsa_external_prod/external_vols/scratch/Scratch/Ehsan/Models/gpt-oss-120b",
            "revision": None,
            "template": "GPT"
        }
    }

import logging
# Configure logging
# LOG_LEVEL comes from the environment, so accept any letter case and fall
# back to INFO when the text is not the name of a logging level. Looking the
# raw value up with getattr() raised AttributeError for e.g. "debug" or a
# typo and aborted the whole cell.
_resolved_log_level = getattr(logging, str(Config.LOG_LEVEL).upper(), None)
if not isinstance(_resolved_log_level, int):
    _resolved_log_level = logging.INFO

# Messages go both to Config.LOG_FILE and to the notebook output.
# NOTE: logging.basicConfig() is a no-op when the root logger already has
# handlers (for example when this cell is re-run in the same kernel).
logging.basicConfig(
    level=_resolved_log_level,
    format=Config.LOG_FORMAT,
    handlers=[
        logging.FileHandler(Config.LOG_FILE),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by the cells below.
logger = logging.getLogger(__name__)
# Silence Py4J logs while keeping your app logs at INFO
logging.getLogger("py4j").setLevel(logging.WARNING)
logging.getLogger("py4j.clientserver").setLevel(logging.WARNING)
logging.getLogger("py4j.java_gateway").setLevel(logging.WARNING)

# EmbeddingInterface

In [0]:
"""
Interface for local embedding model integration with multi-GPU support
"""

import logging
import numpy as np
import torch
from typing import List, Optional, Union, Dict
import shutil
from pathlib import Path
from huggingface_hub import snapshot_download
# from config import Config

# logger = logging.getLogger(__name__)

# Import sentence-transformers
# Optional dependency: record whether it is importable instead of failing
# here. EmbeddingInterface.__init__ checks the flag and raises ImportError
# when the package is missing.
try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False
    # `logger` is the one created by the logging setup in the Config cell
    # (the local definition above is commented out), so that cell must have
    # run before this one.
    logger.error("sentence-transformers not installed. Please install it for embedding support.")


class EmbeddingInterface:
    """Interface for local embedding model with multi-GPU support"""
    
    def __init__(self, 
                 model_name: str = "jinaai--jina-embeddings-v4",
                 model_cache_dir: str = "/home/ehsan/.cache/huggingface/hub",
                 external_model_dir: str = "/Volumes/jsa_external_prod/external_vols/scratch/Scratch/Ehsan/Models",
                 device: str = "cuda",
                 batch_size: int = 32):
        """
        Set up the interface and load the embedding model.

        Args:
            model_name: Key into Config.EMBEDDING_MODELS. An unknown name is
                accepted: it is then used directly as the model id with a
                default configuration (a warning is logged).
            model_cache_dir: Directory used as the HuggingFace cache.
            external_model_dir: Directory that may hold pre-downloaded models.
            device: Target device ("cuda", "cuda:0", "cuda:1", "cpu"). Any
                value that does not start with "cuda" selects the CPU.
            batch_size: Batch size used by encode() when none is given.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            raise ImportError("sentence-transformers is required for embeddings. Install with: pip install sentence-transformers")

        # Plain bookkeeping of the constructor arguments.
        self.MODELS = Config.EMBEDDING_MODELS
        self.model_name = model_name
        self.model_cache_dir = Path(model_cache_dir)
        self.external_model_dir = Path(external_model_dir)
        self.device_str = device
        self.default_batch_size = batch_size

        # Resolve the device string into (device_id, torch.device).
        if not device.startswith("cuda"):
            self.device_id = None
            self.device = torch.device("cpu")
        else:
            # "cuda:N" selects GPU N; a bare "cuda" means GPU 0.
            self.device_id = int(device.split(":")[1]) if ":" in device else 0
            # Makes this GPU the current CUDA device for the process.
            torch.cuda.set_device(self.device_id)
            self.device = torch.device(f"cuda:{self.device_id}")

        logger.info(f"Embedding interface will use device: {self.device}")

        # Look the model up in the registry, or fall back to a generic entry.
        if model_name in self.MODELS:
            self.model_config = self.MODELS[model_name]
        else:
            logger.warning(f"Unknown model: {model_name}. Using default configuration.")
            self.model_config = {
                "model_id": model_name,
                "revision": None,
                "trust_remote_code": True,
                "embedding_dim": 768
            }

        self.embedding_dim = self.model_config.get("embedding_dim", 768)
        self.trust_remote_code = self.model_config.get("trust_remote_code", False)

        # Load the model (this may copy or download files).
        self.model = None
        self._initialize_model()

        # Per-instance cache used by encode() for single-text requests.
        self.cache = {}
        
    def _initialize_model(self):
        """
        Load the SentenceTransformer model and place it on ``self.device``.

        Steps: resolve the snapshot directory, construct the model, move it
        (and every sub-module) to the target device, refresh
        ``self.embedding_dim`` from the loaded model and finally verify the
        placement of parameters and buffers.

        Raises:
            Exception: Any error raised while resolving or loading the model
                is logged and re-raised unchanged.
        """
        from contextlib import nullcontext

        try:
            snapshot_location = self._get_snapshot_location()
            logger.info(f"Loading embedding model from: {snapshot_location}")
            
            # Enter a CUDA device context only for GPU targets. The CPU path
            # used to enter torch.cuda.device(0), which touches the CUDA
            # runtime and can fail on hosts without a usable GPU.
            if self.device_id is not None:
                device_context = torch.cuda.device(self.device_id)
            else:
                device_context = nullcontext()

            with device_context:
                # The target device is passed to the constructor and then
                # enforced again below.
                self.model = SentenceTransformer(
                    snapshot_location,
                    trust_remote_code=self.trust_remote_code,
                    device=self.device
                )
                
                # Manually move model to the correct device
                self.model = self.model.to(self.device)
                
                # Set the device in the model's internal state
                # (private attribute of the sentence-transformers model).
                self.model._target_device = self.device
                
                # Ensure all sub-modules are on the correct device
                for module in self.model.modules():
                    module.to(self.device)
                
                # If model has tokenizer, ensure it's configured correctly
                if hasattr(self.model, 'tokenizer'):
                    if hasattr(self.model.tokenizer, 'padding_side'):
                        self.model.tokenizer.padding_side = 'right'
            
            # Update embedding dimension from model
            self.embedding_dim = self.model.get_sentence_embedding_dimension()
            logger.info(f"Successfully loaded embedding model: {self.model_name} on {self.device} (dim={self.embedding_dim})")
            
            # Verify device placement
            self._verify_device_placement()
            
        except Exception as e:
            logger.error(f"Failed to initialize embedding model: {e}")
            raise
    
    def _verify_device_placement(self):
        """Verify all model parameters are on the correct device"""
        for name, param in self.model.named_parameters():
            if param.device != self.device:
                logger.warning(f"Parameter {name} is on {param.device}, moving to {self.device}")
                param.data = param.data.to(self.device)
        
        # Check buffers as well
        for name, buffer in self.model.named_buffers():
            if buffer.device != self.device:
                logger.warning(f"Buffer {name} is on {buffer.device}, moving to {self.device}")
                buffer.data = buffer.data.to(self.device)
    
    def _get_snapshot_location(self, copy_model: bool = True) -> str:
        """Get or download model snapshot location"""
        model_id = self.model_config['model_id']
        revision = self.model_config.get('revision')
        
        if revision:
            external_path = self.external_model_dir / f"models--{self.model_name}"
            cache_path = self.model_cache_dir / f"models--{self.model_name}"
            
            if copy_model and external_path.exists():
                try:
                    shutil.copytree(str(external_path), str(cache_path))
                    logger.info(f"Copied model from {external_path} to {cache_path}")
                except FileExistsError:
                    logger.info(f"Model already exists in cache: {cache_path}")
            
            snapshot_location = snapshot_download(
                repo_id=model_id,
                revision=revision,
                cache_dir=str(self.model_cache_dir)
            )
        else:
            if copy_model:
                external_path = self.external_model_dir / self.model_name
                cache_path = self.model_cache_dir / self.model_name
                
                if external_path.exists():
                    try:
                        shutil.copytree(str(external_path), str(cache_path))
                        logger.info(f"Copied model from {external_path} to {cache_path}")
                    except FileExistsError:
                        logger.info(f"Model already exists in cache: {cache_path}")
                    
                    snapshot_location = str(cache_path)
                else:
                    snapshot_location = snapshot_download(
                        repo_id=model_id,
                        cache_dir=str(self.model_cache_dir)
                    )
            else:
                cache_path = self.model_cache_dir / self.model_name
                if cache_path.exists():
                    snapshot_location = str(cache_path)
                else:
                    snapshot_location = snapshot_download(
                        repo_id=model_id,
                        cache_dir=str(self.model_cache_dir)
                    )
        
        return snapshot_location
    
    def encode(self, texts: Union[str, List[str]], 
               batch_size: Optional[int] = None,
               show_progress: bool = False,
               convert_to_tensor: bool = False,
               normalize_embeddings: bool = True) -> np.ndarray:
        """
        Generate embeddings for texts with proper device handling
        
        Args:
            texts: Single text or list of texts to encode
            batch_size: Batch size for encoding
            show_progress: Whether to show progress bar
            convert_to_tensor: Return PyTorch tensor instead of numpy
            normalize_embeddings: Whether to normalize embeddings
            
        Returns:
            Numpy array of embeddings (or tensor if convert_to_tensor=True)
        """
        # Handle single text input
        if isinstance(texts, str):
            texts = [texts]
        
        if not texts:
            return np.array([])
        
        # Use cache for single texts
        if len(texts) == 1 and texts[0] in self.cache and not convert_to_tensor:
            return self.cache[texts[0]]
        
        # Set batch size
        if batch_size is None:
            batch_size = self.default_batch_size
        
        # Ensure we're in the right CUDA context
        with torch.cuda.device(self.device_id) if self.device_id is not None else torch.cuda.device(0):
            # Double-check model is on correct device
            self.model._target_device = self.device
            
            # Use a custom encoding approach to ensure device consistency
            try:
                # For Jina models with task parameter
                embeddings = self.model.encode(
                    texts,
                    task='text-matching',
                    prompt_name='passage',
                    batch_size=batch_size,
                    show_progress_bar=show_progress,
                    convert_to_tensor=convert_to_tensor,
                    normalize_embeddings=normalize_embeddings
                )
                    
            except Exception as e:
                logger.warning(f"Custom encoding failed: {e}. Falling back to standard encode with device.")
                # Fallback to standard encode
                embeddings = self.model.encode(
                    texts,
                    task='text-matching',
                    prompt_name='passage',
                    batch_size=batch_size,
                    show_progress_bar=show_progress,
                    convert_to_tensor=convert_to_tensor,
                    normalize_embeddings=normalize_embeddings,
                    device=self.device
                )
        
        # Cache single text results
        if len(texts) == 1 and not convert_to_tensor:
            if isinstance(embeddings, np.ndarray):
                self.cache[texts[0]] = embeddings[0] if embeddings.ndim > 1 else embeddings
        
        return embeddings
    
    def encode_batch(self, texts: List[str], batch_size: Optional[int] = None) -> np.ndarray:
        """
        Encode multiple texts in batches (convenience method)
        """
        return self.encode(texts, batch_size=batch_size)
    
    def similarity(self, embeddings1: np.ndarray, 
                  embeddings2: np.ndarray) -> np.ndarray:
        """
        Calculate cosine similarity between embeddings
        """
        # Handle single embedding
        if embeddings1.ndim == 1:
            embeddings1 = embeddings1.reshape(1, -1)
        if embeddings2.ndim == 1:
            embeddings2 = embeddings2.reshape(1, -1)
        
        # Use numpy operations to avoid device issues
        # Normalize embeddings
        norm1 = embeddings1 / (np.linalg.norm(embeddings1, axis=1, keepdims=True) + 1e-10)
        norm2 = embeddings2 / (np.linalg.norm(embeddings2, axis=1, keepdims=True) + 1e-10)
        
        # Calculate cosine similarity
        similarity_matrix = np.dot(norm1, norm2.T)
        
        return similarity_matrix
    
    def similarity_score(self, text1: str, text2: str) -> float:
        """Calculate similarity score between two texts"""
        embeddings = self.encode([text1, text2])
        sim_matrix = self.similarity(embeddings[0:1], embeddings[1:2])
        return float(sim_matrix[0, 0])
    
    def find_most_similar(self, query: str, 
                         candidates: List[str], 
                         top_k: int = 5) -> List[tuple]:
        """Find most similar texts from candidates"""
        if not candidates:
            return []
        
        all_texts = [query] + candidates
        embeddings = self.encode(all_texts)
        
        query_embedding = embeddings[0:1]
        candidate_embeddings = embeddings[1:]
        
        similarities = self.similarity(query_embedding, candidate_embeddings)
        scores = similarities[0]
        
        top_indices = np.argsort(scores)[-top_k:][::-1]
        
        results = [(candidates[idx], float(scores[idx])) for idx in top_indices]
        return results
    
    def save_embeddings(self, embeddings: np.ndarray, 
                       texts: List[str], 
                       filepath: str):
        """
        Write embeddings and their texts to a JSON file.

        Besides the vectors and texts, the file records the model name, the
        embedding dimension and the device string of this instance.

        Args:
            embeddings: Array of embeddings; a torch tensor is moved to the
                CPU and converted to numpy first.
            texts: The texts the embeddings belong to.
            filepath: Destination file (overwritten if it exists).
        """
        import json

        array = embeddings.cpu().numpy() if torch.is_tensor(embeddings) else embeddings

        payload = {
            "embeddings": array.tolist(),
            "texts": texts,
            "model": self.model_name,
            "dimension": self.embedding_dim,
            "device": self.device_str
        }

        with open(filepath, 'w') as handle:
            json.dump(payload, handle)

        logger.info(f"Saved {len(texts)} embeddings to {filepath}")
    
    def load_embeddings(self, filepath: str) -> tuple:
        """
        Read embeddings back from a JSON file written by save_embeddings().

        Only the "embeddings" and "texts" entries are used; the stored model
        name, dimension and device are not checked against this instance.

        Args:
            filepath: File to read.

        Returns:
            Tuple (embeddings as numpy array, list of texts).
        """
        import json

        with open(filepath, 'r') as handle:
            payload = json.load(handle)

        vectors = np.array(payload["embeddings"])
        stored_texts = payload["texts"]

        logger.info(f"Loaded {len(stored_texts)} embeddings from {filepath}")
        return vectors, stored_texts
    
    def clear_cache(self):
        """
        Drop every cached embedding.

        The cache dictionary is emptied in place, so other references to
        the same dict observe the change as well.
        """
        cached_embeddings = self.cache
        cached_embeddings.clear()
        logger.info("Embedding cache cleared")

# SkillExtractor

In [0]:
"""
Advanced skill extraction from course descriptions using pure Gen AI
"""

import logging
from typing import List, Dict, Optional
from collections import defaultdict

from models.base_models import Skill, UnitOfCompetency, UniCourse
from models.enums import SkillLevel, SkillContext, SkillCategory, StudyLevel
from interfaces.genai_interface import GenAIInterface
# from interfaces.embedding_interface import EmbeddingInterface

# logger = logging.getLogger(__name__)


class SkillExtractor:
    """Advanced skill extraction using pure Gen AI approach"""
    
    def __init__(self, 
                 genai: Optional[GenAIInterface] = None,
                 embeddings: Optional[EmbeddingInterface] = None,
                 use_genai: bool = True):
        """
        Initialize skill extractor with Gen AI only approach
        
        Args:
            genai: GenAI interface for advanced extraction
            embeddings: Embedding interface for similarity
            use_genai: Whether to use GenAI for extraction
        """
        self.genai = genai
        self.embeddings = embeddings
        self.use_genai = use_genai and genai is not None
        
        # Cache for processed texts
        self.cache = {}
    
    def extract_from_vet_unit(self, unit: UnitOfCompetency) -> List[Skill]:
        """
        Extract skills from VET unit using Gen AI

        Results are cached per unit code; a cache hit re-assigns the cached
        list to ``unit.extracted_skills`` and returns it. With GenAI enabled
        the steps are: explicit extraction, implicit skills, composite
        decomposition, unit-wide context assignment, deduplication. Without
        GenAI a minimal list is built from the first five learning outcomes.

        Model output is treated as untrusted: entries that are not dicts or
        that lack a non-blank string "name" are skipped, and null or
        mis-typed optional fields fall back to their defaults instead of
        raising ``KeyError`` / ``AttributeError``.

        Args:
            unit: VET unit to extract skills from
            
        Returns:
            List of extracted skills (also stored on ``unit.extracted_skills``)
        """
        def _entry_name(entry):
            """Return the entry's non-blank string name, or None if unusable."""
            name = entry.get("name") if isinstance(entry, dict) else None
            return name if isinstance(name, str) and name.strip() else None

        def _text_or(value, default):
            """Return value when it is a string, otherwise the default."""
            return value if isinstance(value, str) else default

        logger.info(f"Extracting skills from VET unit: {unit.code}")
        
        # Check cache
        cache_key = f"vet_{unit.code}"
        if cache_key in self.cache:
            unit.extracted_skills = self.cache[cache_key]
            return self.cache[cache_key]
        
        skills = []
        
        # Combine all text sources
        full_text = unit.get_full_text()
        
        if self.use_genai:
            # Step 1: Extract skills using Gen AI
            ai_skills = self.genai.extract_skills_prompt(full_text, "VET unit") or []
            skills.extend(self._convert_ai_skills(ai_skills, f"VET:{unit.code}"))
            logger.debug(f"AI extracted {len(ai_skills)} skills from {unit.code}")
            
            # Step 2: Identify implicit skills
            explicit_skill_names = [
                name for name in map(_entry_name, ai_skills) if name is not None
            ]
            implicit_skills = self.genai.identify_implicit_skills(full_text, explicit_skill_names) or []
            
            for impl_skill in implicit_skills:
                impl_name = _entry_name(impl_skill)
                if impl_name is None:
                    logger.debug(f"Skipping implicit skill without a usable name: {impl_skill!r}")
                    continue
                
                # Get category from AI and validate it
                category_str = self.genai.categorize_skill(impl_name, full_text)
                category = self._validate_category(_text_or(category_str, "technical"))
                
                confidence = impl_skill.get("confidence")
                skill = Skill(
                    name=impl_name,
                    category=category,
                    level=SkillLevel.COMPETENT,
                    context=SkillContext.PRACTICAL,
                    confidence=0.6 if confidence is None else confidence,
                    source=f"VET:{unit.code}_implicit"
                )
                skills.append(skill)
            
            # Step 3: Handle composite skills
            skill_names = [s.name for s in skills]
            composite_result = self.genai.decompose_composite_skills(skill_names) or {}
            
            for comp_skill in composite_result.get("composite_skills") or []:
                if not isinstance(comp_skill, dict) or not comp_skill.get("is_composite"):
                    continue
                for component in comp_skill.get("components") or []:
                    component_name = _entry_name(component)
                    if component_name is None:
                        logger.debug(f"Skipping component without a usable name: {component!r}")
                        continue
                    
                    # Validate category
                    category = self._validate_category(
                        _text_or(component.get("category"), "technical")
                    )
                    
                    new_skill = Skill(
                        name=component_name,
                        category=category,
                        level=SkillLevel.COMPETENT,
                        context=SkillContext.PRACTICAL,
                        confidence=0.7,
                        source=f"VET:{unit.code}_decomposed"
                    )
                    skills.append(new_skill)
            
            # Step 4: Determine context
            context_result = self.genai.determine_context(full_text) or {}
            context_analysis = context_result.get("context_analysis")
            if isinstance(context_analysis, dict):
                primary_context = context_analysis.get("primary_context", "practical")
            else:
                primary_context = "practical"
            
            # The unit-wide context overrides whatever each skill carried so far
            if primary_context == "practical":
                unit_context = SkillContext.PRACTICAL
            elif primary_context == "theoretical":
                unit_context = SkillContext.THEORETICAL
            else:
                unit_context = SkillContext.HYBRID
            
            for skill in skills:
                skill.context = unit_context
            
            # Step 5: Deduplicate skills
            skills = self._deduplicate_skills_with_ai(skills)
        
        else:
            # Fallback: Create minimal skills from learning outcomes
            logger.warning(f"No GenAI available, using minimal extraction for {unit.code}")
            for outcome in unit.learning_outcomes[:5]:  # Limit to avoid too many
                skill = Skill(
                    name=self._clean_text(outcome)[:50],
                    category=SkillCategory.TECHNICAL,
                    level=SkillLevel.COMPETENT,
                    context=SkillContext.PRACTICAL,
                    confidence=0.5,
                    source=f"VET:{unit.code}_fallback"
                )
                skills.append(skill)
        
        # Cache and assign
        self.cache[cache_key] = skills
        unit.extracted_skills = skills
        
        logger.info(f"Extracted {len(skills)} skills from {unit.code}")
        return skills
    
    def extract_from_uni_course(self, course: UniCourse) -> List[Skill]:
        """
        Extract skills from university course using Gen AI

        Results are cached per course code; a cache hit re-assigns the cached
        list to ``course.extracted_skills`` and returns it. With GenAI enabled
        the steps are: study-level identification, explicit extraction,
        prerequisite analysis, assessment-context adjustment, implicit skills,
        level adjustment, deduplication. Without GenAI a minimal list is built
        from the first five learning outcomes and the first five topics.

        Model output is treated as untrusted: entries that are not dicts or
        that lack a non-blank string name are skipped, and null or mis-typed
        optional fields fall back to their defaults instead of raising
        ``KeyError`` / ``AttributeError``.

        Args:
            course: University course to extract skills from
            
        Returns:
            List of extracted skills (also stored on ``course.extracted_skills``)
        """
        def _entry_name(entry):
            """Return the entry's non-blank string name, or None if unusable."""
            name = entry.get("name") if isinstance(entry, dict) else None
            return name if isinstance(name, str) and name.strip() else None

        def _text_or(value, default):
            """Return value when it is a string, otherwise the default."""
            return value if isinstance(value, str) else default

        logger.info(f"Extracting skills from Uni course: {course.code}")
        
        # Check cache
        cache_key = f"uni_{course.code}"
        if cache_key in self.cache:
            course.extracted_skills = self.cache[cache_key]
            return self.cache[cache_key]
        
        skills = []
        
        # Combine all text sources
        full_text = course.get_full_text()
        
        if self.use_genai:
            # Step 1: Identify study level
            if not course.study_level or course.study_level == "intermediate":
                study_level_result = self.genai.identify_study_level(full_text) or {}
                course.study_level = study_level_result.get("study_level") or "intermediate"
            
            # Step 2: Extract skills using Gen AI
            ai_skills = self.genai.extract_skills_prompt(full_text, "university course") or []
            skills.extend(self._convert_ai_skills(ai_skills, f"UNI:{course.code}"))
            logger.debug(f"AI extracted {len(ai_skills)} skills from {course.code}")
            
            # Step 3: Analyze prerequisites
            if course.prerequisites:
                prereq_result = self.genai.analyze_prerequisites(course.prerequisites, full_text) or {}
                for prereq in prereq_result.get("prerequisites") or []:
                    if not isinstance(prereq, dict):
                        continue
                    for impl_skill in prereq.get("implied_skills") or []:
                        implied_name = _entry_name(impl_skill)
                        if implied_name is None:
                            logger.debug(f"Skipping implied skill without a usable name: {impl_skill!r}")
                            continue
                        skill = Skill(
                            name=implied_name,
                            category=SkillCategory.FOUNDATIONAL,
                            level=SkillLevel.from_string(_text_or(impl_skill.get("level"), "competent")),
                            context=SkillContext.THEORETICAL,
                            confidence=0.8,
                            source=f"UNI:{course.code}_prerequisite"
                        )
                        skills.append(skill)
            
            # Step 4: Analyze assessment context
            if course.assessment:
                assessment_result = self.genai.analyze_assessment(course.assessment) or {}
                assessment_context = assessment_result.get("primary_assessment_context", "hybrid")
                
                # Practical assessment softens purely theoretical skills to hybrid
                if assessment_context == "practical":
                    for skill in skills:
                        if skill.context == SkillContext.THEORETICAL:
                            skill.context = SkillContext.HYBRID
            
            # Step 5: Identify implicit skills
            explicit_skill_names = [s.name for s in skills]
            implicit_skills = self.genai.identify_implicit_skills(full_text, explicit_skill_names) or []
            
            for impl_skill in implicit_skills:
                impl_name = _entry_name(impl_skill)
                if impl_name is None:
                    logger.debug(f"Skipping implicit skill without a usable name: {impl_skill!r}")
                    continue
                
                # Get category from AI and validate it
                category_str = self.genai.categorize_skill(impl_name, full_text)
                category = self._validate_category(_text_or(category_str, "technical"))
                
                confidence = impl_skill.get("confidence")
                skill = Skill(
                    name=impl_name,
                    category=category,
                    level=SkillLevel.COMPETENT,
                    context=SkillContext.HYBRID,
                    confidence=0.6 if confidence is None else confidence,
                    source=f"UNI:{course.code}_implicit"
                )
                skills.append(skill)
            
            # Step 6: Adjust skill levels
            skills_dict = [{"name": s.name, "level": s.level.name} for s in skills]
            level_adjustment = self.genai.adjust_skill_levels(skills_dict, course.study_level, full_text) or {}
            
            # Only the first skill carrying a given name is adjusted
            first_by_name = {}
            for skill in skills:
                first_by_name.setdefault(skill.name, skill)
            
            for adj_skill in level_adjustment.get("adjusted_skills") or []:
                if not isinstance(adj_skill, dict):
                    continue
                target_name = adj_skill.get("skill_name")
                adjusted_level = adj_skill.get("adjusted_level")
                if not isinstance(target_name, str) or not isinstance(adjusted_level, str):
                    continue
                target = first_by_name.get(target_name)
                if target is not None:
                    target.level = SkillLevel.from_string(adjusted_level)
            
            # Step 7: Deduplicate skills
            skills = self._deduplicate_skills_with_ai(skills)
        
        else:
            # Fallback: Create minimal skills from learning outcomes
            logger.warning(f"No GenAI available, using minimal extraction for {course.code}")
            for outcome in course.learning_outcomes[:5]:
                skill = Skill(
                    name=self._clean_text(outcome)[:50],
                    category=SkillCategory.TECHNICAL,
                    level=SkillLevel.COMPETENT,
                    context=SkillContext.THEORETICAL,
                    confidence=0.5,
                    source=f"UNI:{course.code}_fallback"
                )
                skills.append(skill)
            
            # Add skills from topics
            for topic in course.topics[:5]:
                skill = Skill(
                    name=self._clean_text(topic)[:50],
                    category=SkillCategory.TECHNICAL,
                    level=SkillLevel.COMPETENT,
                    context=SkillContext.THEORETICAL,
                    confidence=0.5,
                    source=f"UNI:{course.code}_topic"
                )
                skills.append(skill)
        
        # Cache and assign
        self.cache[cache_key] = skills
        course.extracted_skills = skills
        
        logger.info(f"Extracted {len(skills)} skills from {course.code}")
        return skills
    
    def _validate_category(self, category_str: str) -> SkillCategory:
        """Validate and convert category string to SkillCategory enum"""
        valid_categories = {
            "technical": SkillCategory.TECHNICAL,
            "cognitive": SkillCategory.COGNITIVE,
            "practical": SkillCategory.PRACTICAL,
            "foundational": SkillCategory.FOUNDATIONAL,
            "professional": SkillCategory.PROFESSIONAL
        }
        
        category_lower = category_str.lower().strip()
        
        if category_lower in valid_categories:
            return valid_categories[category_lower]
        
        # Try to find closest match
        for valid_cat in valid_categories:
            if valid_cat in category_lower or category_lower in valid_cat:
                return valid_categories[valid_cat]
        
        # Default to technical
        logger.debug(f"Invalid category '{category_str}' defaulting to 'technical'")
        return SkillCategory.TECHNICAL
    
    def _validate_context(self, context_str: str) -> SkillContext:
        """Validate and convert context string to SkillContext enum"""
        valid_contexts = {
            "theoretical": SkillContext.THEORETICAL,
            "practical": SkillContext.PRACTICAL,
            "hybrid": SkillContext.HYBRID
        }
        
        context_lower = context_str.lower().strip()
        
        if context_lower in valid_contexts:
            return valid_contexts[context_lower]
        
        # Default to hybrid
        logger.debug(f"Invalid context '{context_str}' defaulting to 'hybrid'")
        return SkillContext.HYBRID
    
    def _convert_ai_skills(self, ai_skills: List[Dict], source: str) -> List[Skill]:
        """Convert AI-extracted skills to Skill objects"""
        skills = []
        
        for ai_skill in ai_skills:
            try:
                skill_name = ai_skill.get("name", "").strip()
                if len(skill_name) < 3 or len(skill_name) > 100:
                    continue
                
                # Validate category and context
                category = self._validate_category(ai_skill.get("category", "technical"))
                context = self._validate_context(ai_skill.get("context", "hybrid"))
                
                skill = Skill(
                    name=skill_name,
                    category=category,
                    level=SkillLevel.from_string(ai_skill.get("level", "competent")),
                    context=context,
                    keywords=ai_skill.get("keywords", [])[:10],
                    confidence=min(1.0, max(0.0, ai_skill.get("confidence", 0.8))),
                    source=source
                )
                skills.append(skill)
            except Exception as e:
                logger.debug(f"Failed to convert AI skill: {e}")
        
        return skills
    
    def _deduplicate_skills_with_ai(self, skills: List[Skill]) -> List[Skill]:
        """Deduplicate skills using Gen AI"""
        if len(skills) <= 1:
            return skills
        
        if not self.genai:
            # Simple deduplication without AI
            return self._simple_deduplicate(skills)
        
        # Convert skills for AI
        skills_dict = [
            {
                "name": s.name,
                "category": s.category.value,
                "level": s.level.name,
                "confidence": s.confidence
            }
            for s in skills
        ]
        
        # Use AI to identify duplicates
        dedup_result = self.genai.deduplicate_skills(skills_dict)
        
        # Build merged skills
        final_skills = []
        processed_names = set()
        
        # Process skill groups
        for group in dedup_result.get("skill_groups", []):
            similar_names = [s.lower() for s in group.get("similar_skills", [])]
            merged_name = group.get("merged_name", "")
            
            if merged_name:
                best_skill = None
                best_confidence = 0
                
                for skill in skills:
                    if skill.name.lower() in similar_names:
                        if skill.confidence > best_confidence:
                            best_skill = skill
                            best_confidence = skill.confidence
                        processed_names.add(skill.name.lower())
                
                if best_skill:
                    best_skill.name = merged_name
                    # Validate category before setting
                    merged_category = self._validate_category(group.get("merged_category", best_skill.category.value))
                    best_skill.category = merged_category
                    best_skill.level = SkillLevel.from_string(group.get("merged_level", best_skill.level.name))
                    final_skills.append(best_skill)
        
        # Add unique skills
        for skill_name in dedup_result.get("unique_skills", []):
            for skill in skills:
                if skill.name == skill_name and skill.name.lower() not in processed_names:
                    final_skills.append(skill)
                    processed_names.add(skill.name.lower())
        
        # Add any remaining skills
        for skill in skills:
            if skill.name.lower() not in processed_names:
                final_skills.append(skill)
        
        return final_skills
    
    def _simple_deduplicate(self, skills: List[Skill]) -> List[Skill]:
        """Simple deduplication without AI"""
        skill_dict = {}
        
        for skill in skills:
            key = skill.name.lower().strip()
            
            if key not in skill_dict:
                skill_dict[key] = skill
            else:
                # Keep the one with higher confidence
                if skill.confidence > skill_dict[key].confidence:
                    skill_dict[key] = skill
        
        return list(skill_dict.values())
    
    def _clean_text(self, text: str) -> str:
        """Clean text for skill name"""
        text = " ".join(text.split())  # Remove extra whitespace
        text = text.strip()
        
        # Remove common prefixes
        prefixes = ["ability to", "knowledge of", "understanding of", "skills in"]
        text_lower = text.lower()
        for prefix in prefixes:
            if text_lower.startswith(prefix):
                text = text[len(prefix):].strip()
        
        return text[:100]  # Limit length
    
    def clear_cache(self):
        """Clear the extraction cache"""
        self.cache.clear()
        logger.info("Extraction cache cleared")

# VLLMGenAIInterfaceBatch

In [0]:
"""
Interface for local GenAI model integration using vLLM with true batch processing
"""

import json
import logging
import re
import shutil
import torch
from typing import List, Dict, Any, Optional
from pathlib import Path
from huggingface_hub import snapshot_download
# from config import Config
from vllm import LLM, SamplingParams

# logger = logging.getLogger(__name__)


class VLLMGenAIInterfaceBatch:
    """Interface for local GenAI model integration using vLLM with batch processing"""
    
    def __init__(self, 
                 model_name: str = "meta-llama--Llama-3.1-8B-Instruct",
                 number_gpus: int = 1,
                 max_model_len: int = 8192,
                 batch_size: int = 8,
                 model_cache_dir: str = "/root/.cache/huggingface/hub",
                 external_model_dir: str = "/Volumes/jsa_external_prod/external_vols/scratch/Scratch/Ehsan/Models",
                 gpu_id: int = 0):  # Add explicit GPU ID parameter
        """
        Initialize vLLM GenAI interface with batch processing
        
        Args:
            model_name: Name of the model to use
            number_gpus: Number of GPUs for tensor parallelism
            max_model_len: Maximum model context length
            batch_size: Default batch size for processing
            model_cache_dir: Directory for HuggingFace cache
            external_model_dir: Directory containing pre-downloaded models
            gpu_id: GPU ID to use (default 0)
        """
        self.MODELS = Config.MODELS
        self.model_name = model_name
        self.number_gpus = number_gpus
        self.max_model_len = max_model_len
        self.batch_size = batch_size
        self.model_cache_dir = Path(model_cache_dir)
        self.external_model_dir = Path(external_model_dir)
        self.gpu_id = gpu_id
        
        # Set environment variable to control GPU visibility for vLLM
        if self.number_gpus == 1:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
            logger.info(f"Set CUDA_VISIBLE_DEVICES={gpu_id} for vLLM batch interface")
        else:
            # For multi-GPU, set the range starting from gpu_id
            gpu_list = ",".join(str(gpu_id + i) for i in range(number_gpus))
            os.environ["CUDA_VISIBLE_DEVICES"] = gpu_list
            logger.info(f"Set CUDA_VISIBLE_DEVICES={gpu_list} for vLLM batch interface")
        
        # Get model configuration
        if model_name not in self.MODELS:
            raise ValueError(f"Unknown model: {model_name}")
        
        self.model_config = self.MODELS[model_name]
        self.template = self.model_config.get("template", "Mistral")
        
        # Import prompts
        from extraction.genai_prompts import GenAIPrompts
        self.prompts = GenAIPrompts()
        
        # Initialize the model
        self.llm = None
        self._initialize_model()
        
    def _initialize_model(self):
        """Initialize the vLLM model"""
        try:
            snapshot_location = self._get_snapshot_location()
            logger.info(f"Loading model from: {snapshot_location}")
            
            # Initialize vLLM (it will use the GPUs specified by CUDA_VISIBLE_DEVICES)
            self.llm = LLM(
                model=snapshot_location,
                tensor_parallel_size=self.number_gpus,
                max_model_len=self.max_model_len,
                gpu_memory_utilization=0.9  # Allow vLLM to use 90% of GPU memory
            )
            logger.info(f"Successfully loaded model: {self.model_name} on GPU(s) specified by CUDA_VISIBLE_DEVICES")
            
        except Exception as e:
            logger.error(f"Failed to initialize model: {e}")
            raise

    
    def _get_snapshot_location(self, copy_model: bool = False) -> str:
        """Get or download model snapshot location"""
        model_id = self.model_config['model_id']
        revision = self.model_config['revision']
        
        if revision:
            external_path = self.external_model_dir / f"models--{self.model_name}"
            cache_path = self.model_cache_dir / f"models--{self.model_name}"
            
            if copy_model and external_path.exists():
                try:
                    shutil.copytree(str(external_path), str(cache_path))
                    logger.info(f"Copied model from {external_path} to {cache_path}")
                except FileExistsError:
                    logger.info(f"Model already exists in cache: {cache_path}")
            
            snapshot_location = snapshot_download(
                repo_id=model_id,
                revision=revision,
                cache_dir=str(self.model_cache_dir)
            )
        else:
            if copy_model:
                external_path = self.external_model_dir / self.model_name
                cache_path = self.model_cache_dir / self.model_name
                
                if external_path.exists():
                    try:
                        shutil.copytree(str(external_path), str(cache_path))
                        logger.info(f"Copied model from {external_path} to {cache_path}")
                    except FileExistsError:
                        logger.info(f"Model already exists in cache: {cache_path}")
                
                snapshot_location = str(cache_path)
            else:
                snapshot_location = model_id
        
        return snapshot_location
    
    def _format_instruction(self, sys_message: str, query: str) -> str:
        """Format instruction based on model template"""
        if self.template == 'Phi':
            return f'<|system|> {sys_message} <|end|><|user|> {query} <|end|><|assistant|>'
        elif self.template == 'Llama':
            return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>{sys_message}<|eot_id|><|start_header_id|>user<|end_header_id|>{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'''
        elif self.template == "GPT":
            return f'''<|start|>system<|message|>{sys_message}\n\nReasoning: low\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>{query}<|end|><|start|>assistant'''
        else:  # Default Mistral format
            return f'<s> [INST] {sys_message} [/INST]\nUser: {query}\nAssistant: '
    
    def _generate_batch(self, system_prompt: str, user_prompts: List[str], max_tokens: int = 2048) -> List[str]:
        """Generate responses for a batch of prompts"""
        full_prompts = [
            self._format_instruction(system_prompt, user_prompt) 
            for user_prompt in user_prompts
        ]
        
        sampling_params = SamplingParams(
            max_tokens=max_tokens,
            temperature=0.0,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            n=1,
            best_of=1
        )
        
        outputs = self.llm.generate(full_prompts, sampling_params=sampling_params)
        return [output.outputs[0].text for output in outputs]
    
    def _parse_json_response(self, response: str) -> Dict:
        """Parse JSON from model response"""
        try:
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON response: {e}")
        return {}
    
    # Batch extraction methods
    
    def extract_skills_batch(self, texts: List[str], contexts: List[str] = None) -> List[List[Dict]]:
        """
        Extract skills from multiple texts in batch
        
        Args:
            texts: List of texts to extract skills from
            contexts: List of context types (optional)
            
        Returns:
            List of extracted skills for each text
        """
        if contexts is None:
            contexts = ["course"] * len(texts)
        
        system_prompt = self.prompts.skill_extraction_prompt()
        user_prompts = [
            f"Context: {context}\n\nText to analyze:\n{text[:3000]}"
            for text, context in zip(texts, contexts)
        ]
        
        # Process in batches if needed
        all_results = []
        for i in range(0, len(user_prompts), self.batch_size):
            batch = user_prompts[i:i + self.batch_size]
            responses = self._generate_batch(system_prompt, batch)
            
            for response in responses:
                result = self._parse_json_response(response)
                all_results.append(result.get("skills", []))
        
        return all_results
    
    def identify_study_levels_batch(self, course_texts: List[str]) -> List[Dict]:
        """Identify study levels for multiple courses"""
        system_prompt = self.prompts.study_level_identification_prompt()
        user_prompts = [
            f"Course description:\n{text[:2000]}" 
            for text in course_texts
        ]
        
        all_results = []
        for i in range(0, len(user_prompts), self.batch_size):
            batch = user_prompts[i:i + self.batch_size]
            responses = self._generate_batch(system_prompt, batch, max_tokens=512)
            
            for response in responses:
                all_results.append(self._parse_json_response(response))
        
        return all_results
    
    def identify_implicit_skills_batch(self, texts: List[str], explicit_skills_lists: List[List[str]]) -> List[List[Dict]]:
        """Identify implicit skills for multiple texts"""
        system_prompt = self.prompts.implicit_skill_identification_prompt()
        user_prompts = [
            f"Course content: {text[:1500]}\n\nExplicit skills already identified: {', '.join(skills[:20])}"
            for text, skills in zip(texts, explicit_skills_lists)
        ]
        
        all_results = []
        for i in range(0, len(user_prompts), self.batch_size):
            batch = user_prompts[i:i + self.batch_size]
            responses = self._generate_batch(system_prompt, batch)
            
            for response in responses:
                result = self._parse_json_response(response)
                all_results.append(result.get("implicit_skills", []))
        
        return all_results
    
    def determine_contexts_batch(self, texts: List[str]) -> List[Dict]:
        """Determine theoretical vs practical context for multiple texts"""
        system_prompt = self.prompts.context_determination_prompt()
        user_prompts = [f"Text to analyze:\n{text[:2000]}" for text in texts]
        
        all_results = []
        for i in range(0, len(user_prompts), self.batch_size):
            batch = user_prompts[i:i + self.batch_size]
            responses = self._generate_batch(system_prompt, batch, max_tokens=512)
            
            for response in responses:
                all_results.append(self._parse_json_response(response))
        
        return all_results
    
    def analyze_skill_similarities_batch(self, skill_pairs: List[tuple]) -> List[float]:
        """Analyze similarity for multiple skill pairs"""
        system_prompt = self.prompts.skill_similarity_prompt()
        user_prompts = [
            f"Skill 1: {skill1}\nSkill 2: {skill2}"
            for skill1, skill2 in skill_pairs
        ]
        
        all_scores = []
        for i in range(0, len(user_prompts), self.batch_size):
            batch = user_prompts[i:i + self.batch_size]
            responses = self._generate_batch(system_prompt, batch, max_tokens=256)
            
            for response in responses:
                result = self._parse_json_response(response)
                all_scores.append(result.get("similarity_score", 0.5))
        
        return all_scores
    
    # Single-item methods that delegate to batch methods
    
    def extract_skills_prompt(self, text: str, context: str = "course") -> List[Dict]:
        """Extract skills from a single text (delegates to batch)"""
        results = self.extract_skills_batch([text], [context])
        return results[0] if results else []
    
    def identify_study_level(self, course_text: str) -> Dict:
        """Identify study level from single course (delegates to batch)"""
        results = self.identify_study_levels_batch([course_text])
        return results[0] if results else {}
    
    def identify_implicit_skills(self, text: str, explicit_skills: List[str]) -> List[Dict]:
        """Identify implicit skills for single text (delegates to batch)"""
        results = self.identify_implicit_skills_batch([text], [explicit_skills])
        return results[0] if results else []
    
    def determine_context(self, text: str) -> Dict:
        """Determine context for single text (delegates to batch)"""
        results = self.determine_contexts_batch([text])
        return results[0] if results else {}
    
    def analyze_skill_similarity(self, skill1: str, skill2: str) -> float:
        """Analyze similarity between two skills (delegates to batch)"""
        results = self.analyze_skill_similarities_batch([(skill1, skill2)])
        return results[0] if results else 0.5
    
    # Methods that remain single-processing (less common operations)
    
    def deduplicate_skills(self, skills: List[Dict]) -> Dict:
        """Deduplicate and merge similar skills"""
        system_prompt = self.prompts.skill_deduplication_prompt()
        skills_json = json.dumps(skills[:50], indent=2)
        user_prompt = f"Skills to analyze:\n{skills_json}"
        
        response = self._generate_batch(system_prompt, [user_prompt])[0]
        return self._parse_json_response(response)
    
    def decompose_composite_skills(self, skills: List[str]) -> Dict:
        """Decompose composite skills into components"""
        system_prompt = self.prompts.composite_skill_decomposition_prompt()
        user_prompt = f"Skills to analyze:\n{json.dumps(skills[:30], indent=2)}"
        
        response = self._generate_batch(system_prompt, [user_prompt])[0]
        return self._parse_json_response(response)
    
    def adjust_skill_levels(self, skills: List[Dict], study_level: str, course_text: str) -> Dict:
        """Adjust skill levels based on context"""
        system_prompt = self.prompts.skill_level_adjustment_prompt()
        user_prompt = f"""Study level: {study_level}
Course context: {course_text[:1000]}
Skills to adjust: {json.dumps(skills[:30], indent=2)}"""
        
        response = self._generate_batch(system_prompt, [user_prompt])[0]
        return self._parse_json_response(response)
    
    def extract_technology_versions(self, text: str) -> Dict:
        """Extract technology versions and assess currency"""
        system_prompt = self.prompts.technology_version_extraction_prompt()
        user_prompt = f"Text to analyze:\n{text[:2000]}"
        
        response = self._generate_batch(system_prompt, [user_prompt])[0]
        return self._parse_json_response(response)
    
    def analyze_prerequisites(self, prerequisites: List[str], course_text: str) -> Dict:
        """Analyze prerequisites and dependencies"""
        system_prompt = self.prompts.prerequisite_analysis_prompt()
        user_prompt = f"""Prerequisites: {', '.join(prerequisites)}
Course context: {course_text[:1000]}"""
        
        response = self._generate_batch(system_prompt, [user_prompt])[0]
        return self._parse_json_response(response)
    
    def detect_edge_cases(self, vet_text: str, uni_text: str, mapping_info: Dict) -> Dict:
        """Detect edge cases in credit mapping"""
        system_prompt = self.prompts.edge_case_detection_prompt()
        user_prompt = f"""VET content: {vet_text[:1000]}
University content: {uni_text[:1000]}
Mapping summary: {json.dumps(mapping_info, indent=2)}"""
        
        response = self._generate_batch(system_prompt, [user_prompt])[0]
        return self._parse_json_response(response)
    
    def extract_keywords(self, skill_name: str, context_text: str) -> List[str]:
        """Extract relevant keywords for a skill"""
        system_prompt = self.prompts.keyword_extraction_prompt()
        user_prompt = f"Skill: {skill_name}\nContext: {context_text[:500]}"
        
        response = self._generate_batch(system_prompt, [user_prompt], max_tokens=256)[0]
        result = self._parse_json_response(response)
        return result.get("keywords", [])
    
    def analyze_assessment(self, assessment_text: str) -> Dict:
        """Analyze assessment methods"""
        system_prompt = self.prompts.assessment_type_analysis_prompt()
        user_prompt = f"Assessment description:\n{assessment_text[:1500]}"
        
        response = self._generate_batch(system_prompt, [user_prompt])[0]
        return self._parse_json_response(response)
    
    def categorize_skill(self, skill_name: str, context: str = "") -> str:
        """Categorize a skill"""
        system_prompt = self.prompts.skill_categorization_prompt()
        user_prompt = f"Skill to categorize: {skill_name}\nContext: {context[:200]}"
        
        response = self._generate_batch(system_prompt, [user_prompt], max_tokens=256)[0]
        result = self._parse_json_response(response)
        return result.get("category", "technical")

# VLLMSkillExtractorBatch

In [0]:
"""
vLLM skill extraction using true batch processing for efficient multi-unit/course processing
"""

# import logging
from typing import List, Dict, Optional, Any

from models.base_models import Skill, UnitOfCompetency, UniCourse, VETQualification, UniQualification
from models.enums import SkillLevel, SkillContext, SkillCategory, StudyLevel
# from interfaces.embedding_interface import EmbeddingInterface

# logger = logging.getLogger(__name__)


class VLLMSkillExtractorBatch:
    """
    vLLM skill extraction using true batch processing.

    Texts of several VET units or university courses are submitted to the GenAI
    interface in one call per extraction step; the per-item replies are turned
    into ``Skill`` objects, post-processed, cached by unit/course code and
    attached to the unit/course as ``extracted_skills``.
    """

    # Batch size used when the GenAI interface does not expose a usable one
    DEFAULT_BATCH_SIZE = 8

    def __init__(self, 
                 genai=None,
                 embeddings: Optional[EmbeddingInterface] = None):
        """
        Initialize vLLM skill extractor with batch processing
        
        Args:
            genai: vLLM GenAI batch interface for extraction
            embeddings: Embedding interface for similarity (stored only; the
                extraction code in this class does not call it)
        """
        self.genai = genai
        self.embeddings = embeddings
        
        # Cache of extracted skills, keyed "vet_<unit code>" / "uni_<course code>"
        self.cache = {}
        
        # Batch processing size. A zero, negative or non-numeric value would make
        # the range() step in the qualification loops raise, so fall back to the
        # default in that case.
        configured = getattr(genai, 'batch_size', self.DEFAULT_BATCH_SIZE) if genai else self.DEFAULT_BATCH_SIZE
        try:
            configured = int(configured)
        except (TypeError, ValueError):
            configured = self.DEFAULT_BATCH_SIZE
        self.batch_size = configured if configured >= 1 else self.DEFAULT_BATCH_SIZE
    
    def extract_from_vet_qualification(self, vet_qual: VETQualification) -> Dict[str, List[Skill]]:
        """
        Extract skills from entire VET qualification using batch processing
        
        Args:
            vet_qual: VET qualification to process
            
        Returns:
            Dictionary mapping unit codes to extracted skills
        """
        logger.info(f"Batch extracting skills from VET qualification: {vet_qual.code}")
        
        # Split units into those already cached and those still to process
        units_to_process = []
        units_cached = []
        
        for unit in vet_qual.units:
            cache_key = f"vet_{unit.code}"
            if cache_key in self.cache:
                unit.extracted_skills = self.cache[cache_key]
                units_cached.append(unit)
            else:
                units_to_process.append(unit)
        
        if units_cached:
            logger.info(f"Using cached skills for {len(units_cached)} units")
        
        if units_to_process:
            logger.info(f"Processing {len(units_to_process)} VET units in batches")
            
            # Process all units in batches of self.batch_size
            for start in range(0, len(units_to_process), self.batch_size):
                self._process_vet_batch(units_to_process[start:start + self.batch_size])
        
        # Return all skills (cached and freshly extracted)
        all_skills = {unit.code: unit.extracted_skills for unit in vet_qual.units}
        
        logger.info(f"Extracted skills from {len(all_skills)} units")
        return all_skills
    
    def extract_from_uni_qualification(self, uni_qual: UniQualification) -> Dict[str, List[Skill]]:
        """
        Extract skills from entire university qualification using batch processing
        
        Args:
            uni_qual: University qualification to process
            
        Returns:
            Dictionary mapping course codes to extracted skills
        """
        logger.info(f"Batch extracting skills from Uni qualification: {uni_qual.code}")
        
        # Split courses into those already cached and those still to process
        courses_to_process = []
        courses_cached = []
        
        for course in uni_qual.courses:
            cache_key = f"uni_{course.code}"
            if cache_key in self.cache:
                course.extracted_skills = self.cache[cache_key]
                courses_cached.append(course)
            else:
                courses_to_process.append(course)
        
        if courses_cached:
            logger.info(f"Using cached skills for {len(courses_cached)} courses")
        
        if courses_to_process:
            logger.info(f"Processing {len(courses_to_process)} Uni courses in batches")
            
            # Process all courses in batches of self.batch_size
            for start in range(0, len(courses_to_process), self.batch_size):
                self._process_uni_batch(courses_to_process[start:start + self.batch_size])
        
        # Return all skills (cached and freshly extracted)
        all_skills = {course.code: course.extracted_skills for course in uni_qual.courses}
        
        logger.info(f"Extracted skills from {len(all_skills)} courses")
        return all_skills
    
    @staticmethod
    def _item_at(results: Any, idx: int, default: Any) -> Any:
        """Return ``results[idx]``, or ``default`` when the batch reply is missing, too short, or empty at that position"""
        if results is None or idx >= len(results):
            return default
        return results[idx] or default
    
    @staticmethod
    def _explicit_skill_names(ai_skills_batch: Any) -> List[List[str]]:
        """Collect the skill names per batch item, skipping entries that carry no "name" key"""
        return [
            [s["name"] for s in (skills or []) if isinstance(s, dict) and "name" in s]
            for skills in (ai_skills_batch or [])
        ]
    
    def _build_implicit_skills(self, implicit_skills: Any, source: str, default_context: SkillContext) -> List[Skill]:
        """Convert implicit-skill dicts to Skill objects tagged "<source>_implicit"; entries without a name are skipped"""
        built = []
        for impl_skill in implicit_skills or []:
            name = impl_skill.get("name") if isinstance(impl_skill, dict) else None
            if not name:
                # One malformed model item must not abort the whole batch
                logger.debug(f"Skipping implicit skill without a name for {source}")
                continue
            built.append(Skill(
                name=name,
                category=self._validate_category(impl_skill.get("category", "technical")),
                level=SkillLevel.COMPETENT,
                context=default_context,
                confidence=impl_skill.get("confidence", 0.6),
                source=f"{source}_implicit"
            ))
        return built
    
    def _apply_primary_context(self, skills: List[Skill], context_result: Any, default: str) -> None:
        """Overwrite the context of every skill with the item's primary context ("practical"/"theoretical", anything else becomes hybrid)"""
        analysis = context_result.get("context_analysis", {}) if isinstance(context_result, dict) else {}
        primary_context = analysis.get("primary_context", default) if isinstance(analysis, dict) else default
        
        # _validate_context lower-cases/strips, so "Practical" is honoured as well
        resolved = self._validate_context(primary_context)
        for skill in skills:
            skill.context = resolved
    
    def _decompose_composites(self, skills: List[Skill], source: str) -> List[Skill]:
        """Ask GenAI to decompose composite skills and return the components as new Skill objects tagged "<source>_decomposed" """
        skill_names = [s.name for s in skills]
        if not skill_names:
            return []
        
        composite_result = self.genai.decompose_composite_skills(skill_names)
        if not isinstance(composite_result, dict):
            return []
        
        components = []
        for comp_skill in composite_result.get("composite_skills") or []:
            if not isinstance(comp_skill, dict) or not comp_skill.get("is_composite"):
                continue
            for component in comp_skill.get("components") or []:
                name = component.get("name") if isinstance(component, dict) else None
                if not name:
                    continue
                components.append(Skill(
                    name=name,
                    category=self._validate_category(component.get("category", "technical")),
                    level=SkillLevel.COMPETENT,
                    context=SkillContext.PRACTICAL,
                    confidence=0.7,
                    source=f"{source}_decomposed"
                ))
        return components
    
    def _build_prerequisite_skills(self, course: UniCourse, course_text: str) -> List[Skill]:
        """Turn the implied skills of a course's prerequisites into Skill objects tagged "UNI:<code>_prerequisite" """
        if not course.prerequisites:
            return []
        
        prereq_result = self.genai.analyze_prerequisites(course.prerequisites, course_text[:1000])
        if not isinstance(prereq_result, dict):
            return []
        
        built = []
        for prereq in prereq_result.get("prerequisites") or []:
            if not isinstance(prereq, dict):
                continue
            for impl_skill in prereq.get("implied_skills") or []:
                name = impl_skill.get("name") if isinstance(impl_skill, dict) else None
                if not name:
                    continue
                built.append(Skill(
                    name=name,
                    category=SkillCategory.FOUNDATIONAL,
                    level=SkillLevel.from_string(impl_skill.get("level", "competent")),
                    context=SkillContext.THEORETICAL,
                    confidence=0.8,
                    source=f"UNI:{course.code}_prerequisite"
                ))
        return built
    
    def _adjust_levels_for_study_level(self, skills: List[Skill], course: UniCourse, course_text: str) -> None:
        """Let GenAI adjust skill levels for the course's study level; each adjustment goes to the first skill with that name"""
        skills_dict = [{"name": s.name, "level": s.level.name} for s in skills]
        if not skills_dict:
            return
        
        level_adjustment = self.genai.adjust_skill_levels(skills_dict, course.study_level, course_text[:1000])
        if not isinstance(level_adjustment, dict):
            return
        
        # First skill per name, matching the original first-match-then-break lookup
        first_by_name = {}
        for skill in skills:
            first_by_name.setdefault(skill.name, skill)
        
        for adj_skill in level_adjustment.get("adjusted_skills") or []:
            if not isinstance(adj_skill, dict):
                continue
            skill_name = adj_skill.get("skill_name")
            adjusted_level = adj_skill.get("adjusted_level")
            if not isinstance(skill_name, str) or adjusted_level is None:
                continue
            target = first_by_name.get(skill_name)
            if target is not None:
                target.level = SkillLevel.from_string(adjusted_level)
    
    def _process_vet_batch(self, units: List[UnitOfCompetency]):
        """Process a batch of VET units: extract, enrich, deduplicate, cache and assign skills"""
        if not units:
            return
        
        # Step 1: Extract skills using batch GenAI
        texts = [unit.get_full_text() for unit in units]
        contexts = ["VET unit"] * len(units)
        
        ai_skills_batch = self.genai.extract_skills_batch(texts, contexts)
        
        # Step 2: Identify implicit skills in batch
        explicit_skills_lists = self._explicit_skill_names(ai_skills_batch)
        implicit_skills_batch = self.genai.identify_implicit_skills_batch(texts, explicit_skills_lists)
        
        # Step 3: Determine contexts in batch
        context_results = self.genai.determine_contexts_batch(texts)
        
        # Step 4: Process each unit with batch results
        for idx, unit in enumerate(units):
            source = f"VET:{unit.code}"
            
            # Explicit skills first, then implicit ones
            skills = self._convert_ai_skills(self._item_at(ai_skills_batch, idx, []), source)
            skills.extend(
                self._build_implicit_skills(
                    self._item_at(implicit_skills_batch, idx, []), source, SkillContext.PRACTICAL
                )
            )
            
            # Apply the unit-level context to everything collected so far
            self._apply_primary_context(skills, self._item_at(context_results, idx, {}), "practical")
            
            # Step 5: Decompose composite skills (done per unit for now).
            # Components are added after the context pass and keep PRACTICAL.
            skills.extend(self._decompose_composites(skills, source))
            
            # Step 6: Deduplicate skills
            skills = self._deduplicate_skills_with_ai(skills)
            
            # Cache and assign
            self.cache[f"vet_{unit.code}"] = skills
            unit.extracted_skills = skills
            
            logger.debug(f"Extracted {len(skills)} skills from {unit.code}")
    
    def _process_uni_batch(self, courses: List[UniCourse]):
        """Process a batch of University courses: extract, enrich, deduplicate, cache and assign skills"""
        if not courses:
            return
        
        # Step 1: Identify study levels in batch
        texts = [course.get_full_text() for course in courses]
        study_level_results = self.genai.identify_study_levels_batch(texts)
        
        for idx, course in enumerate(courses):
            # Only fill in a missing study level or refine the "intermediate" placeholder
            if course.study_level and course.study_level != "intermediate":
                continue
            if study_level_results is not None and idx < len(study_level_results):
                level_info = study_level_results[idx]
                if isinstance(level_info, dict):
                    course.study_level = level_info.get("study_level", "intermediate")
        
        # Step 2: Extract skills using batch GenAI
        contexts = ["university course"] * len(courses)
        ai_skills_batch = self.genai.extract_skills_batch(texts, contexts)
        
        # Step 3: Identify implicit skills in batch
        explicit_skills_lists = self._explicit_skill_names(ai_skills_batch)
        implicit_skills_batch = self.genai.identify_implicit_skills_batch(texts, explicit_skills_lists)
        
        # Step 4: Determine contexts in batch
        context_results = self.genai.determine_contexts_batch(texts)
        
        # Step 5: Process each course with batch results
        for idx, course in enumerate(courses):
            source = f"UNI:{course.code}"
            
            # Explicit skills first, then implicit ones
            skills = self._convert_ai_skills(self._item_at(ai_skills_batch, idx, []), source)
            skills.extend(
                self._build_implicit_skills(
                    self._item_at(implicit_skills_batch, idx, []), source, SkillContext.HYBRID
                )
            )
            
            # Analyze prerequisites (individual processing for now)
            skills.extend(self._build_prerequisite_skills(course, texts[idx]))
            
            # Apply the course-level context to every collected skill
            self._apply_primary_context(skills, self._item_at(context_results, idx, {}), "hybrid")
            
            # Adjust skill levels based on study level
            self._adjust_levels_for_study_level(skills, course, texts[idx])
            
            # Deduplicate skills
            skills = self._deduplicate_skills_with_ai(skills)
            
            # Cache and assign
            self.cache[f"uni_{course.code}"] = skills
            course.extracted_skills = skills
            
            logger.debug(f"Extracted {len(skills)} skills from {course.code}")
    
    def extract_from_vet_unit(self, unit: UnitOfCompetency) -> List[Skill]:
        """
        Extract skills from a single VET unit (uses batch processing with size 1)
        
        Args:
            unit: VET unit to extract skills from
            
        Returns:
            List of extracted skills
        """
        logger.info(f"Extracting skills from single VET unit: {unit.code}")
        
        cache_key = f"vet_{unit.code}"
        if cache_key in self.cache:
            unit.extracted_skills = self.cache[cache_key]
            return self.cache[cache_key]
        
        # Process as a batch of 1
        self._process_vet_batch([unit])
        return unit.extracted_skills
    
    def extract_from_uni_course(self, course: UniCourse) -> List[Skill]:
        """
        Extract skills from a single university course (uses batch processing with size 1)
        
        Args:
            course: University course to extract skills from
            
        Returns:
            List of extracted skills
        """
        logger.info(f"Extracting skills from single Uni course: {course.code}")
        
        cache_key = f"uni_{course.code}"
        if cache_key in self.cache:
            course.extracted_skills = self.cache[cache_key]
            return self.cache[cache_key]
        
        # Process as a batch of 1
        self._process_uni_batch([course])
        return course.extracted_skills
    
    def _validate_category(self, category_str: str) -> SkillCategory:
        """
        Validate and convert category string to SkillCategory enum
        
        Matching is case-insensitive; a value that contains, or is contained in,
        a known category name is accepted as that category. Anything else
        (including non-string values such as None) becomes TECHNICAL.
        """
        valid_categories = {
            "technical": SkillCategory.TECHNICAL,
            "cognitive": SkillCategory.COGNITIVE,
            "practical": SkillCategory.PRACTICAL,
            "foundational": SkillCategory.FOUNDATIONAL,
            "professional": SkillCategory.PROFESSIONAL
        }
        
        # Model output may carry null / numbers here; don't crash on .lower()
        if not isinstance(category_str, str):
            logger.debug(f"Invalid category '{category_str}' defaulting to 'technical'")
            return SkillCategory.TECHNICAL
        
        category_lower = category_str.lower().strip()
        
        if category_lower in valid_categories:
            return valid_categories[category_lower]
        
        for valid_cat in valid_categories:
            if valid_cat in category_lower or category_lower in valid_cat:
                return valid_categories[valid_cat]
        
        logger.debug(f"Invalid category '{category_str}' defaulting to 'technical'")
        return SkillCategory.TECHNICAL
    
    def _validate_context(self, context_str: str) -> SkillContext:
        """
        Validate and convert context string to SkillContext enum
        
        Matching is case-insensitive; unknown or non-string values become HYBRID.
        """
        valid_contexts = {
            "theoretical": SkillContext.THEORETICAL,
            "practical": SkillContext.PRACTICAL,
            "hybrid": SkillContext.HYBRID
        }
        
        # Model output may carry null / numbers here; don't crash on .lower()
        if not isinstance(context_str, str):
            logger.debug(f"Invalid context '{context_str}' defaulting to 'hybrid'")
            return SkillContext.HYBRID
        
        context_lower = context_str.lower().strip()
        
        if context_lower in valid_contexts:
            return valid_contexts[context_lower]
        
        logger.debug(f"Invalid context '{context_str}' defaulting to 'hybrid'")
        return SkillContext.HYBRID
    
    def _convert_ai_skills(self, ai_skills: List[Dict], source: str) -> List[Skill]:
        """
        Convert AI-extracted skills to Skill objects
        
        Entries whose name is shorter than 3 or longer than 100 characters are
        dropped, keywords are capped at 10 and confidence is clamped to [0, 1].
        An entry that cannot be converted is logged and skipped.
        """
        skills = []
        
        for ai_skill in ai_skills or []:
            try:
                skill_name = ai_skill.get("name", "").strip()
                if len(skill_name) < 3 or len(skill_name) > 100:
                    continue
                
                category = self._validate_category(ai_skill.get("category", "technical"))
                context = self._validate_context(ai_skill.get("context", "hybrid"))
                
                skill = Skill(
                    name=skill_name,
                    category=category,
                    level=SkillLevel.from_string(ai_skill.get("level", "competent")),
                    context=context,
                    # A null keyword list should not cost us the whole skill
                    keywords=(ai_skill.get("keywords") or [])[:10],
                    confidence=min(1.0, max(0.0, ai_skill.get("confidence", 0.8))),
                    source=source
                )
                skills.append(skill)
            except Exception as e:
                logger.debug(f"Failed to convert AI skill: {e}")
        
        return skills
    
    def _deduplicate_skills_with_ai(self, skills: List[Skill]) -> List[Skill]:
        """
        Deduplicate skills using Gen AI
        
        For every group GenAI reports with a ``merged_name``, the member with the
        highest confidence (first one on ties) is kept and renamed/re-categorised/
        re-levelled to the merged values; the other members are dropped. Skills
        listed as unique, and skills GenAI did not mention at all, are kept as-is.
        
        Args:
            skills: Skills to deduplicate (the kept Skill objects are modified in place)
            
        Returns:
            Deduplicated list; the input list itself when it has at most one
            element or when GenAI returns no usable result
        """
        if len(skills) <= 1:
            return skills
        
        skills_dict = [
            {
                "name": s.name,
                "category": s.category.value,
                "level": s.level.name,
                "confidence": s.confidence
            }
            for s in skills
        ]
        
        dedup_result = self.genai.deduplicate_skills(skills_dict)
        if not isinstance(dedup_result, dict):
            logger.debug("Deduplication returned no usable result; keeping all skills")
            return skills
        
        # Snapshot the names before any merge renames a skill. Bookkeeping below is
        # done against these snapshots and list positions, so a renamed winner is
        # never mistaken for an unprocessed skill and appended a second time.
        original_names = [s.name for s in skills]
        original_keys = [name.lower() for name in original_names]
        
        final_skills = []
        processed_names = set()
        emitted = set()  # positions in `skills` already placed in final_skills
        
        for group in dedup_result.get("skill_groups") or []:
            if not isinstance(group, dict):
                continue
            merged_name = group.get("merged_name", "")
            if not merged_name:
                continue
            similar_names = {str(s).lower() for s in group.get("similar_skills") or []}
            
            best_idx = None
            for idx, skill in enumerate(skills):
                if original_keys[idx] not in similar_names:
                    continue
                processed_names.add(original_keys[idx])
                if idx in emitted:
                    continue
                # `best_idx is None` lets a zero-confidence member win, so a group of
                # zero-confidence skills is merged instead of silently discarded.
                if best_idx is None or skill.confidence > skills[best_idx].confidence:
                    best_idx = idx
            
            if best_idx is not None:
                best_skill = skills[best_idx]
                merged_category = self._validate_category(group.get("merged_category", best_skill.category.value))
                merged_level = SkillLevel.from_string(group.get("merged_level", best_skill.level.name))
                best_skill.name = merged_name
                best_skill.category = merged_category
                best_skill.level = merged_level
                final_skills.append(best_skill)
                emitted.add(best_idx)
        
        # Skills GenAI explicitly reported as unique (exact name match, first one wins)
        for skill_name in dedup_result.get("unique_skills") or []:
            for idx, skill in enumerate(skills):
                if idx in emitted:
                    continue
                if original_names[idx] == skill_name and original_keys[idx] not in processed_names:
                    final_skills.append(skill)
                    emitted.add(idx)
                    processed_names.add(original_keys[idx])
        
        # Anything GenAI did not mention is kept untouched
        for idx, skill in enumerate(skills):
            if idx not in emitted and original_keys[idx] not in processed_names:
                final_skills.append(skill)
        
        return final_skills
    
    def clear_cache(self):
        """Clear the extraction cache"""
        self.cache.clear()
        logger.info("Extraction cache cleared")
    
    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the extraction process (cache size, batch size, mode and feature list)"""
        return {
            "cache_size": len(self.cache),
            "batch_size": self.batch_size,
            "extraction_mode": "vllm_batch_processing",
            "features": [
                "true_batch_processing",
                "batch_skill_extraction",
                "batch_context_determination",
                "batch_study_level_detection",
                "parallel_implicit_skill_detection",
                "efficient_multi_unit_processing",
                "optimized_for_qualifications"
            ]
        }

# SkillMapper

In [0]:
"""
Skill mapping engine for credit transfer analysis using Gen AI
"""

# import logging
import numpy as np
from typing import List, Dict, Any, Optional
from collections import defaultdict

from models.base_models import Skill, SkillMapping
from models.enums import SkillLevel, SkillContext, EMBEDDING_MODE
# from interfaces.embedding_interface import EmbeddingInterface

# logger = logging.getLogger(__name__)


class SkillMapper:
    """Maps skills between VET and University courses using Gen AI and embeddings"""
    
    def __init__(self, 
                 embeddings: Optional[EmbeddingInterface] = None,
                 genai: Optional[Any] = None,
                 similarity_threshold: float = 0.8,
                 partial_threshold: float = 0.6,
                 embedding_mode: str = "embedding"):
        """
        Initialize skill mapper with Gen AI support
        
        Args:
            embeddings: Embedding interface for similarity calculation
            genai: GenAI interface for AI-based similarity
            similarity_threshold: Threshold for direct skill matches
            partial_threshold: Threshold for partial skill matches
        """
        self.embeddings = embeddings
        self.genai = genai
        self.similarity_threshold = similarity_threshold
        self.partial_threshold = partial_threshold
        self.similarity_cache = {}
        self.embedding_mode = embedding_mode.lower()
        if self.embedding_mode not in {"hybrid", "genai", "embedding"}:
            logger.warning(f"Unknown embedding mode '{self.embedding_mode}', defaulting to 'embedding'")
            self.embedding_mode = "embedding"
    
    def map_skills(self, 
                   vet_skills: List[Skill], 
                   uni_skills: List[Skill]) -> SkillMapping:
        """
        Greedily pair VET skills with university skills by similarity score.
        
        VET skills are visited in list order, twice. The first pass records a
        direct match when a skill's best score reaches ``similarity_threshold``
        and then zeroes that university skill's column, so it cannot be matched
        again. The second pass records a partial match (with gaps) for still
        unmatched VET skills whose best remaining score reaches
        ``partial_threshold``; partial matches do not reserve the university skill.
        
        Args:
            vet_skills: List of VET skills
            uni_skills: List of university skills
            
        Returns:
            SkillMapping with direct/partial matches, unmapped skills, coverage
            and context-alignment scores, and count metadata. An empty mapping
            is returned when either list is empty.
        """
        logger.info(f"Mapping {len(vet_skills)} VET skills to {len(uni_skills)} Uni skills")
        
        result = SkillMapping()
        
        if not (vet_skills and uni_skills):
            logger.warning("Empty skill list provided for mapping")
            return result
        
        scores = self._calculate_similarity_matrix(vet_skills, uni_skills)
        
        vet_taken = set()
        uni_taken = set()
        
        # Pass 1: direct matches
        for row_idx, vet_skill in enumerate(vet_skills):
            col_idx = np.argmax(scores[row_idx])
            top_score = scores[row_idx][col_idx]
            
            if not top_score >= self.similarity_threshold:
                continue
            
            target = uni_skills[col_idx]
            quality = self._assess_match_quality(vet_skill, target)
            result.direct_matches.append({
                "vet_skill": vet_skill,
                "uni_skill": target,
                "similarity": float(top_score),
                "quality": quality
            })
            vet_taken.add(row_idx)
            uni_taken.add(col_idx)
            
            # Reserve the university skill: nobody else may match this column
            scores[:, col_idx] = 0
        
        # Pass 2: partial matches for whatever is still unmatched
        for row_idx, vet_skill in enumerate(vet_skills):
            if row_idx in vet_taken:
                continue
            
            col_idx = np.argmax(scores[row_idx])
            top_score = scores[row_idx][col_idx]
            
            if not top_score >= self.partial_threshold:
                continue
            
            target = uni_skills[col_idx]
            quality = self._assess_match_quality(vet_skill, target)
            missing = self._identify_gaps(vet_skill, target)
            result.partial_matches.append({
                "vet_skill": vet_skill,
                "uni_skill": target,
                "similarity": float(top_score),
                "quality": quality,
                "gaps": missing
            })
            vet_taken.add(row_idx)
            uni_taken.add(col_idx)
        
        # Whatever was never claimed on either side
        result.unmapped_vet = [
            skill for idx, skill in enumerate(vet_skills) if idx not in vet_taken
        ]
        result.unmapped_uni = [
            skill for idx, skill in enumerate(uni_skills) if idx not in uni_taken
        ]
        
        # Aggregate scores
        result.coverage_score = self._calculate_coverage_score(result, uni_skills)
        result.context_alignment = self._calculate_context_alignment(result)
        
        # Counts and backend flags for downstream reporting
        result.metadata = {
            "total_vet_skills": len(vet_skills),
            "total_uni_skills": len(uni_skills),
            "direct_match_count": len(result.direct_matches),
            "partial_match_count": len(result.partial_matches),
            "unmapped_vet_count": len(result.unmapped_vet),
            "unmapped_uni_count": len(result.unmapped_uni),
            "use_ai": self.genai is not None,
            "use_embeddings": self.embeddings is not None
        }
        
        logger.info(
            f"Mapping complete: {len(result.direct_matches)} direct, "
            f"{len(result.partial_matches)} partial, "
            f"{len(result.unmapped_uni)} unmapped uni skills"
        )
        
        return result
    
    def _calculate_similarity_matrix(self, 
                                     vet_skills: List[Skill], 
                                     uni_skills: List[Skill]) -> np.ndarray:
        """Calculate pairwise similarity between skills using AI and/or embeddings"""
        
        # If both AI and embeddings available, use hybrid approach
        if self.genai and self.embeddings and self.embedding_mode == EMBEDDING_MODE.HYBRID.value:
            # Use embeddings for initial fast similarity
            vet_names = [s.name for s in vet_skills]
            uni_names = [s.name for s in uni_skills]
            
            vet_embeddings = self.embeddings.encode(vet_names)
            uni_embeddings = self.embeddings.encode(uni_names)
            
            similarity_matrix = self.embeddings.similarity(vet_embeddings, uni_embeddings)
            
            # For high-scoring pairs, refine with AI
            high_similarity_pairs = np.where(similarity_matrix > 0.5)
            
            for i, j in zip(high_similarity_pairs[0], high_similarity_pairs[1]):
                # Use AI for more accurate similarity
                ai_similarity = self.genai.analyze_skill_similarity(
                    vet_skills[i].name,
                    uni_skills[j].name
                )
                # Weighted average of embedding and AI similarity
                similarity_matrix[i, j] = 0.6 * similarity_matrix[i, j] + 0.4 * ai_similarity
        
        # If only GenAI available, use it exclusively
        elif self.genai and self.embedding_mode == EMBEDDING_MODE.GENAI.value:
            similarity_matrix = np.zeros((len(vet_skills), len(uni_skills)))
            
            for i, vet_skill in enumerate(vet_skills):
                for j, uni_skill in enumerate(uni_skills):
                    # Check cache first
                    cache_key = (vet_skill.name.lower(), uni_skill.name.lower())
                    if cache_key in self.similarity_cache:
                        similarity = self.similarity_cache[cache_key]
                    else:
                        similarity = self.genai.analyze_skill_similarity(
                            vet_skill.name, 
                            uni_skill.name
                        )
                        self.similarity_cache[cache_key] = similarity
                    
                    similarity_matrix[i, j] = similarity
        
        # If only embeddings available, use them
        elif self.embeddings and self.embedding_mode == EMBEDDING_MODE.EMBEDDING.value:
            vet_names = [s.name for s in vet_skills]
            uni_names = [s.name for s in uni_skills]
            
            vet_embeddings = self.embeddings.encode(vet_names)
            uni_embeddings = self.embeddings.encode(uni_names)
            
            similarity_matrix = self.embeddings.similarity(vet_embeddings, uni_embeddings)
        
        # Fallback to simple string matching
        else:
            similarity_matrix = np.zeros((len(vet_skills), len(uni_skills)))
            
            for i, vet_skill in enumerate(vet_skills):
                for j, uni_skill in enumerate(uni_skills):
                    similarity = self._calculate_string_similarity(
                        vet_skill.name, 
                        uni_skill.name
                    )
                    similarity_matrix[i, j] = similarity
        
        # Apply modifiers based on skill properties
        for i, vet_skill in enumerate(vet_skills):
            for j, uni_skill in enumerate(uni_skills):
                # Boost similarity for same category
                if vet_skill.category == uni_skill.category:
                    similarity_matrix[i, j] *= 1.1
                
                # Reduce similarity for very different contexts
                if (vet_skill.context == SkillContext.PRACTICAL and 
                    uni_skill.context == SkillContext.THEORETICAL):
                    similarity_matrix[i, j] *= 0.9
                elif (vet_skill.context == SkillContext.THEORETICAL and 
                      uni_skill.context == SkillContext.PRACTICAL):
                    similarity_matrix[i, j] *= 0.9
        
        # Normalize to [0, 1]
        similarity_matrix = np.clip(similarity_matrix, 0, 1)
        
        return similarity_matrix
    
    def _calculate_string_similarity(self, str1: str, str2: str) -> float:
        """Calculate string similarity as fallback"""
        str1_lower = str1.lower().strip()
        str2_lower = str2.lower().strip()
        
        # Check cache
        cache_key = (str1_lower, str2_lower)
        if cache_key in self.similarity_cache:
            return self.similarity_cache[cache_key]
        
        # Exact match
        if str1_lower == str2_lower:
            similarity = 1.0
        # One contains the other
        elif str1_lower in str2_lower or str2_lower in str1_lower:
            similarity = 0.85
        else:
            # Word overlap using Jaccard similarity
            words1 = set(str1_lower.split())
            words2 = set(str2_lower.split())
            
            if not words1 or not words2:
                similarity = 0.0
            else:
                intersection = words1.intersection(words2)
                union = words1.union(words2)
                similarity = len(intersection) / len(union) if union else 0.0
        
        # Cache result
        self.similarity_cache[cache_key] = similarity
        return similarity
    
    def _assess_match_quality(self, vet_skill: Skill, uni_skill: Skill) -> Dict[str, Any]:
        """Assess quality of skill match using AI if available"""
        quality = {
            "level_alignment": self._compare_levels(vet_skill.level, uni_skill.level),
            "context_compatibility": self._compare_contexts(vet_skill.context, uni_skill.context),
            "category_match": vet_skill.category == uni_skill.category,
            "confidence_product": vet_skill.confidence * uni_skill.confidence
        }
        
        # If AI available, get additional quality assessment
        if self.genai:
            # Could add a specific prompt for quality assessment
            pass
        
        # Overall quality score (weighted average)
        quality["overall"] = (
            quality["level_alignment"] * 0.4 +
            quality["context_compatibility"] * 0.3 +
            float(quality["category_match"]) * 0.2 +
            quality["confidence_product"] * 0.1
        )
        
        return quality
    
    def _compare_levels(self, vet_level: SkillLevel, uni_level: SkillLevel) -> float:
        """Compare skill proficiency levels"""
        try:
            level_diff = vet_level.value - uni_level.value
            
            if level_diff >= 0:
                # VET meets or exceeds requirement
                return 1.0
            elif level_diff == -1:
                # One level below - minor gap
                return 0.8
            elif level_diff == -2:
                # Two levels below - significant gap
                return 0.6
            else:
                # Large gap
                return 0.3
        except:
            # Fallback if comparison fails
            return 0.5
    
    def _compare_contexts(self, vet_context: SkillContext, uni_context: SkillContext) -> float:
        """Compare skill contexts"""
        if vet_context == uni_context:
            return 1.0
        elif vet_context == SkillContext.HYBRID or uni_context == SkillContext.HYBRID:
            # Hybrid matches reasonably with both
            return 0.85
        else:
            # Practical vs Theoretical mismatch
            return 0.7
    
    def _identify_gaps(self, vet_skill: Skill, uni_skill: Skill) -> List[str]:
        """Identify gaps between VET and Uni skill requirements"""
        gaps = []
        
        # Level gap
        try:
            if vet_skill.level.value < uni_skill.level.value:
                level_diff = uni_skill.level.value - vet_skill.level.value
                gaps.append(f"Proficiency gap: {level_diff} level(s) below required {uni_skill.level.name}")
        except:
            pass
        
        # Context gap
        if vet_skill.context != uni_skill.context and uni_skill.context != SkillContext.HYBRID:
            if uni_skill.context == SkillContext.THEORETICAL:
                gaps.append("Context gap: requires more theoretical foundation")
            elif uni_skill.context == SkillContext.PRACTICAL:
                gaps.append("Context gap: requires more practical application")
        
        # Confidence gap
        if vet_skill.confidence < 0.7:
            gaps.append("Low confidence in skill extraction")
        
        # Use AI for more detailed gap analysis if available
        if self.genai:
            # Could add specific gap analysis prompt
            pass
        
        return gaps
    
    def _calculate_coverage_score(self, mapping: SkillMapping, uni_skills: List[Skill]) -> float:
        """Calculate skill coverage score"""
        if not uni_skills:
            return 0.0
        
        # Weight skills by importance
        covered_count = len(mapping.direct_matches) + (len(mapping.partial_matches) * 0.5)
        total_count = len(uni_skills)
        
        return min(1.0, covered_count / total_count) if total_count > 0 else 0.0
    
    def _calculate_context_alignment(self, mapping: SkillMapping) -> float:
        """Calculate overall context alignment score"""
        if not mapping.direct_matches and not mapping.partial_matches:
            return 0.0
        
        scores = []
        
        for match in mapping.direct_matches:
            scores.append(match["quality"]["context_compatibility"])
        
        for match in mapping.partial_matches:
            scores.append(match["quality"]["context_compatibility"] * 0.7)
        
        return np.mean(scores) if scores else 0.0
    
    def get_skill_alignment_summary(self, mapping: SkillMapping) -> Dict[str, Any]:
        """Summarise a skill mapping as a score plus textual findings.

        Args:
            mapping: Mapping providing ``coverage_score``, ``context_alignment``,
                ``direct_matches``, ``partial_matches`` and ``unmapped_uni``.

        Returns:
            Dict with ``total_alignment_score`` (60% coverage, 40% context
            alignment), the lists ``strengths``, ``weaknesses`` and
            ``recommendations``, and ``analysis_method`` naming which of
            ``self.genai`` / ``self.embeddings`` are set.
        """
        # Name the matching back-ends this instance was configured with.
        if self.genai and self.embeddings:
            method = "hybrid"
        elif self.genai:
            method = "ai_only"
        elif self.embeddings:
            method = "embedding_only"
        else:
            method = "string_matching"

        strengths = []
        weaknesses = []
        recommendations = []

        total_score = (
            mapping.coverage_score * 0.6 +
            mapping.context_alignment * 0.4
        )

        # --- strengths ---
        if mapping.coverage_score > 0.7:
            strengths.append(f"Good skill coverage ({mapping.coverage_score:.1%})")
        if mapping.context_alignment > 0.8:
            strengths.append("Good practical/theoretical balance")

        if mapping.direct_matches or mapping.partial_matches:
            matched_total = len(mapping.direct_matches) + len(mapping.partial_matches)
            direct_ratio = len(mapping.direct_matches) / matched_total
        else:
            direct_ratio = 0
        if direct_ratio > 0.7:
            strengths.append(f"High proportion of direct matches ({direct_ratio:.1%})")

        # --- weaknesses ---
        if mapping.coverage_score < 0.5:
            weaknesses.append(f"Low skill coverage ({mapping.coverage_score:.1%})")
        if mapping.context_alignment < 0.6:
            weaknesses.append("Poor practical/theoretical balance")
        if len(mapping.unmapped_uni) > 5:
            weaknesses.append(f"{len(mapping.unmapped_uni)} critical skills not covered")

        # --- recommendations ---
        if mapping.unmapped_uni:
            recommendations.append(
                f"Bridge {len(mapping.unmapped_uni)} missing skills through supplementary training"
            )

        if mapping.context_alignment < 0.7:
            # Advice is driven by the context of the partially matched university skills.
            if any(m["uni_skill"].context == SkillContext.THEORETICAL for m in mapping.partial_matches):
                recommendations.append("Add theoretical foundation modules")
            if any(m["uni_skill"].context == SkillContext.PRACTICAL for m in mapping.partial_matches):
                recommendations.append("Include more practical/lab work")

        return {
            "total_alignment_score": total_score,
            "strengths": strengths,
            "weaknesses": weaknesses,
            "recommendations": recommendations,
            "analysis_method": method,
        }
    
    def clear_cache(self):
        """Empty ``self.similarity_cache`` in place and log that it was cleared."""
        cached_scores = self.similarity_cache
        cached_scores.clear()
        logger.info("Similarity cache cleared")

# CreditTransferAnalyzer

In [0]:
"""
Main credit transfer analyzer - Updated to support batch processing
"""

# import logging
from typing import List, Dict, Any, Optional
from datetime import datetime
import json
# from config import Config

from models.base_models import (
    VETQualification, UniQualification, CreditTransferRecommendation,
    UnitOfCompetency, UniCourse, SkillMapping
)
from models.enums import RecommendationType
# from extraction.skill_extractor import SkillExtractor
# from mapping.skill_mapper import SkillMapper
from mapping.edge_cases import EdgeCaseHandler
from interfaces.genai_interface import GenAIInterface
# from interfaces.embedding_interface import EmbeddingInterface

# Import vLLM interfaces
if Config.USE_VLLM:
    if Config.USE_VLLM_BATCH:
        pass
        # from interfaces.vllm_genai_interface_batch import VLLMGenAIInterfaceBatch
        # from extraction.vllm_skill_extractor_batch import VLLMSkillExtractorBatch
    else:
        from interfaces.vllm_genai_interface import VLLMGenAIInterface
        from extraction.vllm_skill_extractor import VLLMSkillExtractor
        from extraction.openai_skill_extractor import OpenAISkillExtractor

# logger = logging.getLogger(__name__)


class CreditTransferAnalyzer:
    """Main analyzer for credit transfer recommendations with batch processing support"""
    
    def __init__(self,
                 genai: Optional[GenAIInterface] = None,
                 embeddings: Optional[EmbeddingInterface] = None,
                 config: Optional[Dict] = None):
        """
        Initialize credit transfer analyzer with batch processing support

        The skill extractor is chosen from ``Config.USE_VLLM`` /
        ``Config.USE_VLLM_BATCH`` and from the kind of ``genai`` object passed
        in. A skill mapper and an edge case handler are then built from the
        same interfaces.

        Args:
            genai: GenAI interface for advanced extraction
            embeddings: Embedding interface for similarity
            config: Configuration dictionary. Keys read here:
                ``openai_delay_between_requests`` (default 1.0),
                ``min_alignment_score`` (default 0.5) and
                ``max_unit_combination`` (default 3)
        """
        self.genai = genai
        self.embeddings = embeddings
        self.config = config or {}

        # Initialize components based on interface type and configuration
        if Config.USE_VLLM:
            if Config.USE_VLLM_BATCH:
                # Use batch processing versions
                if isinstance(genai, VLLMGenAIInterfaceBatch):
                    self.extractor = VLLMSkillExtractorBatch(genai, embeddings)
                    logger.info("Using vLLM batch skill extractor for efficient processing")
                else:
                    logger.warning("Batch mode configured but GenAI interface is not batch-enabled")
                    # Fall back to regular vLLM extractor. The import is aliased
                    # on purpose: an un-aliased function-scope import would make
                    # ``VLLMSkillExtractor`` a local name for the whole method
                    # and break the non-batch branch below with UnboundLocalError.
                    from extraction.vllm_skill_extractor import (
                        VLLMSkillExtractor as fallback_extractor_cls,
                    )
                    self.extractor = fallback_extractor_cls(genai, embeddings)
            else:
                # Use regular vLLM versions. The interface is recognised by class
                # name rather than isinstance.
                if hasattr(genai, '__class__') and 'VLLMGenAIInterface' in str(genai.__class__):
                    self.extractor = VLLMSkillExtractor(genai, embeddings)
                    logger.info("Using vLLM skill extractor (individual processing)")
                else:
                    # Fall back to standard extractor
                    self.extractor = SkillExtractor(genai, embeddings)

        elif isinstance(genai, GenAIInterface) and hasattr(genai, 'client'):
            # Use OpenAI extractor for Azure OpenAI
            delay = self.config.get("openai_delay_between_requests", 1.0)

            # The cell-level import of OpenAISkillExtractor only runs when vLLM
            # is enabled, which is never true on this branch. Use the name if it
            # is already defined, otherwise import it on demand.
            try:
                openai_extractor_cls = OpenAISkillExtractor
            except NameError:
                from extraction.openai_skill_extractor import (
                    OpenAISkillExtractor as openai_extractor_cls,
                )

            self.extractor = openai_extractor_cls(
                genai,
                embeddings,
                delay_between_requests=delay
            )
            logger.info("Using OpenAI skill extractor")

        else:
            # Use standard extractor
            self.extractor = SkillExtractor(genai, embeddings)
            logger.info("Using standard skill extractor")

        # Initialize mapper with Gen AI support
        self.mapper = SkillMapper(embeddings, genai)

        # Initialize edge handler with Gen AI support
        self.edge_handler = EdgeCaseHandler(genai)

        # Configuration
        self.min_alignment_score = self.config.get("min_alignment_score", 0.5)
        self.combination_limit = self.config.get("max_unit_combination", 3)

        # Analysis cache: filled by _analyze_single_mappings, keyed "<unit code>_<course code>"
        self.analysis_cache = {}
    
    def analyze_transfer(self,
                        vet_qual: VETQualification,
                        uni_qual: UniQualification,
                        target_courses: Optional[List[str]] = None) -> List[CreditTransferRecommendation]:
        """
        Produce ranked credit transfer recommendations for a VET -> university pair

        Skills are extracted first: one batch call per qualification when
        ``Config.USE_VLLM_BATCH`` is set and the extractor offers
        ``extract_from_vet_qualification``; otherwise unit by unit and course
        by course, skipping items that already carry extracted skills. Each
        course in scope is then compared against single VET units and, when
        ``combination_limit`` exceeds 1, against unit combinations.

        Args:
            vet_qual: VET qualification to analyze
            uni_qual: University qualification target
            target_courses: Optional list of specific course codes to analyze

        Returns:
            Filtered recommendations ordered by descending
            (alignment_score, confidence)
        """
        logger.info(f"Starting credit transfer analysis: {vet_qual.code} -> {uni_qual.code}")

        # Batch mode needs both the config flag and an extractor that supports it.
        batch_mode = Config.USE_VLLM_BATCH and hasattr(self.extractor, 'extract_from_vet_qualification')

        if batch_mode:
            logger.info("Using batch processing for skill extraction")

            # One call covers every VET unit.
            vet_skills_map = self.extractor.extract_from_vet_qualification(vet_qual)
            logger.info(f"Batch extracted skills from {len(vet_skills_map)} VET units")

            # Narrow the university side to the requested courses, if any, by
            # wrapping them in a temporary qualification.
            if target_courses:
                scoped_qual = UniQualification(
                    code=uni_qual.code,
                    name=uni_qual.name,
                    courses=[c for c in uni_qual.courses if c.code in target_courses]
                )
            else:
                scoped_qual = uni_qual

            # One call covers every course in scope.
            uni_skills_map = self.extractor.extract_from_uni_qualification(scoped_qual)
            courses_to_analyze = scoped_qual.courses

            logger.info(f"Batch extracted skills from {len(uni_skills_map)} University courses")

        else:
            logger.info("Using individual processing for skill extraction")

            # VET units: extract only where nothing has been extracted yet.
            for unit in vet_qual.units:
                if unit.extracted_skills:
                    continue
                self.extractor.extract_from_vet_unit(unit)

            # Decide which courses are in scope.
            if target_courses:
                courses_to_analyze = [c for c in uni_qual.courses if c.code in target_courses]
            else:
                courses_to_analyze = uni_qual.courses

            # University courses: same skip-if-present rule.
            for course in courses_to_analyze:
                if course.extracted_skills:
                    continue
                self.extractor.extract_from_uni_course(course)

        candidates = []

        for course in courses_to_analyze:
            logger.info(f"Analyzing transfers for {course.code}: {course.name}")

            # Single unit -> course
            candidates += self._analyze_single_mappings(vet_qual.units, course)

            # Several units -> course, only when combinations are allowed
            if self.combination_limit > 1:
                candidates += self._analyze_combination_mappings(vet_qual.units, course)

        # De-duplicate, then rank best-first.
        ranked = sorted(
            self._filter_recommendations(candidates),
            key=lambda rec: (rec.alignment_score, rec.confidence),
            reverse=True,
        )

        self._add_analysis_metadata(ranked, vet_qual, uni_qual)

        logger.info(f"Analysis complete: {len(ranked)} recommendations generated")

        return ranked
    
    def _analyze_single_mappings(self,
                                 units: List[UnitOfCompetency],
                                 course: UniCourse) -> List[CreditTransferRecommendation]:
        """Analyze single unit to course mappings"""
        recommendations = []
        
        for unit in units:
            # Check cache
            cache_key = f"{unit.code}_{course.code}"
            if cache_key in self.analysis_cache:
                rec = self.analysis_cache[cache_key]
                if rec.alignment_score >= self.min_alignment_score:
                    recommendations.append(rec)
                continue
            
            # Perform analysis
            rec = self._analyze_single_mapping(unit, course)
            
            # Cache result
            self.analysis_cache[cache_key] = rec
            
            # Add if meets threshold
            if rec.alignment_score >= self.min_alignment_score:
                recommendations.append(rec)
                logger.debug(f"Single mapping {unit.code} -> {course.code}: {rec.alignment_score:.2%}")
        
        return recommendations
    
    def _analyze_single_mapping(self,
                               unit: UnitOfCompetency,
                               course: UniCourse) -> CreditTransferRecommendation:
        """Build the recommendation for one VET unit against one course.

        The unit's extracted skills are mapped onto the course's, edge cases
        are processed for the pair, and the resulting scores, conditions and
        evidence are packed into a ``CreditTransferRecommendation``.
        """
        mapping = self.mapper.map_skills(unit.extracted_skills, course.extracted_skills)
        edge_cases = self.edge_handler.process_edge_cases([unit], course, mapping)

        # The dict is populated in the order the individual analyses should run.
        fields = {
            "vet_units": [unit],
            "uni_course": course,
            "alignment_score": self._calculate_alignment_score(mapping, edge_cases),
            "skill_coverage": self._calculate_skill_coverage_breakdown(mapping),
            "confidence": self._calculate_confidence(mapping, edge_cases),
        }
        fields["recommendation"] = self._determine_recommendation_type(
            fields["alignment_score"], mapping, edge_cases
        )
        fields["conditions"] = self._identify_conditions(mapping, edge_cases)
        fields["evidence"] = self._generate_evidence(mapping, edge_cases)
        # University skills with no VET counterpart are reported as the gaps.
        fields["gaps"] = mapping.unmapped_uni
        fields["edge_case_results"] = edge_cases

        return CreditTransferRecommendation(**fields)
    
    def _analyze_combination_mappings(self,
                                      units: List[UnitOfCompetency],
                                      course: UniCourse) -> List[CreditTransferRecommendation]:
        """Analyze combinations of units mapping to course"""
        recommendations = []
        
        from itertools import combinations
        
        # Try combinations up to the limit
        for r in range(2, min(self.combination_limit + 1, len(units) + 1)):
            for combo in combinations(units, r):
                # Skip if too many units
                if len(combo) > self.combination_limit:
                    continue
                
                # Combine skills from all units
                combined_skills = []
                for unit in combo:
                    combined_skills.extend(unit.extracted_skills)
                
                # Map combined skills
                mapping = self.mapper.map_skills(combined_skills, course.extracted_skills)
                
                # Only proceed if combination significantly improves coverage
                if mapping.coverage_score < 0.7:
                    continue
                
                # Perform full analysis
                edge_cases = self.edge_handler.process_edge_cases(list(combo), course, mapping)
                alignment_score = self._calculate_alignment_score(mapping, edge_cases)
                
                if alignment_score >= self.min_alignment_score * 1.2:  # Higher threshold for combinations
                    rec = CreditTransferRecommendation(
                        vet_units=list(combo),
                        uni_course=course,
                        alignment_score=alignment_score,
                        skill_coverage=self._calculate_skill_coverage_breakdown(mapping),
                        gaps=mapping.unmapped_uni,
                        evidence=self._generate_evidence(mapping, edge_cases),
                        recommendation=self._determine_recommendation_type(
                            alignment_score, mapping, edge_cases
                        ),
                        conditions=self._identify_conditions(mapping, edge_cases),
                        confidence=self._calculate_confidence(mapping, edge_cases),
                        edge_case_results=edge_cases
                    )
                    recommendations.append(rec)
                    
                    logger.debug(
                        f"Combination {'+'.join(u.code for u in combo)} -> "
                        f"{course.code}: {alignment_score:.2%}"
                    )
        
        return recommendations
    
    def _calculate_alignment_score(self,
                                   mapping: SkillMapping,
                                   edge_cases: Dict[str, Any]) -> float:
        """Calculate overall alignment score"""
        
        # Base weights
        weights = {
            "coverage": 0.5,
            "context": 0.25,
            "quality": 0.15,
            "edge_penalty": 0.1
        }
        
        # Calculate quality score
        quality_score = 0.0
        if mapping.direct_matches:
            quality_scores = [m["quality"]["overall"] for m in mapping.direct_matches]
            quality_score = sum(quality_scores) / len(quality_scores)
        
        # Calculate edge case penalty
        edge_penalty = 0.0
        
        if "context_imbalance" in edge_cases:
            edge_penalty += edge_cases["context_imbalance"].get("imbalance_score", 0) * 0.3
        
        if "outdated_content" in edge_cases:
            if edge_cases["outdated_content"].get("currency_issues"):
                edge_penalty += 0.2
        
        # Calculate final score
        score = (
            mapping.coverage_score * weights["coverage"] +
            mapping.context_alignment * weights["context"] +
            quality_score * weights["quality"] -
            edge_penalty * weights["edge_penalty"]
        )
        
        return max(0.0, min(1.0, score))
    
    def _determine_recommendation_type(self,
                                       score: float,
                                       mapping: SkillMapping,
                                       edge_cases: Dict[str, Any]) -> RecommendationType:
        """Translate an alignment score and its caveats into a recommendation type.

        FULL needs a score of at least 0.8 with no unmapped university skills
        and no major issue; CONDITIONAL needs at least 0.7 and no major issue;
        PARTIAL needs at least 0.5; anything else is NONE. A major issue is a
        "high" estimated update effort or a context imbalance score above 0.5.
        """
        outdated = edge_cases.get("outdated_content", {})
        imbalance = edge_cases.get("context_imbalance", {})

        # Both flags are evaluated up front, then combined.
        effort_is_high = outdated.get("estimated_update_effort") == "high"
        badly_imbalanced = imbalance.get("imbalance_score", 0) > 0.5
        has_major_issues = any((effort_is_high, badly_imbalanced))

        has_gaps = len(mapping.unmapped_uni) > 0

        if score >= 0.8 and not has_gaps and not has_major_issues:
            return RecommendationType.FULL
        if score >= 0.7 and not has_major_issues:
            return RecommendationType.CONDITIONAL
        if score >= 0.5:
            return RecommendationType.PARTIAL
        return RecommendationType.NONE
    
    def _identify_conditions(self,
                            mapping: SkillMapping,
                            edge_cases: Dict[str, Any]) -> List[str]:
        """Identify conditions for credit transfer"""
        conditions = []
        
        # Missing skills
        if mapping.unmapped_uni:
            skill_names = [s.name for s in mapping.unmapped_uni[:3]]
            conditions.append(f"Bridge missing skills: {', '.join(skill_names)}")
        
        # Context imbalance
        if "context_imbalance" in edge_cases:
            conditions.extend(edge_cases["context_imbalance"].get("bridging_requirements", []))
        
        # Outdated content
        if "outdated_content" in edge_cases:
            conditions.extend(edge_cases["outdated_content"].get("update_requirements", []))
        
        # Prerequisites
        if "prerequisite_chain" in edge_cases:
            missing = edge_cases["prerequisite_chain"].get("missing_prerequisites", [])
            if missing:
                conditions.append(f"Complete prerequisites: {', '.join(missing[:3])}")
        
        return conditions
    
    def _generate_evidence(self,
                          mapping: SkillMapping,
                          edge_cases: Dict[str, Any]) -> List[str]:
        """Assemble the evidence statements that back a recommendation.

        Statements cover match counts, the coverage percentage, context
        balance, selected edge case findings and the processing method used.
        """
        evidence = []

        # Match counts, reported only when non-empty
        direct, partial = mapping.direct_matches, mapping.partial_matches
        if direct:
            evidence.append(f"{len(mapping.direct_matches)} direct skill matches")
        if partial:
            evidence.append(f"{len(mapping.partial_matches)} partial skill matches")

        # Coverage is always reported
        evidence.append(f"Skill coverage: {mapping.coverage_score:.1%}")

        if mapping.context_alignment > 0.8:
            evidence.append("Good practical/theoretical balance")

        # Edge case findings
        if "split_to_single" in edge_cases and edge_cases["split_to_single"].get("total_coverage", 0) > 0.8:
            evidence.append("Unit combination provides comprehensive coverage")

        if "credit_hours" in edge_cases and not edge_cases["credit_hours"].get("adjustment_needed", False):
            evidence.append("Credit hour alignment acceptable")

        # Processing method: the batch flag takes precedence over the mapping's AI flag
        if Config.USE_VLLM_BATCH:
            evidence.append("Analyzed using batch-processed skill extraction")
        elif mapping.metadata.get("use_ai"):
            evidence.append("Analyzed using AI-powered skill matching")

        return evidence
    
    def _calculate_skill_coverage_breakdown(self,
                                           mapping: SkillMapping) -> Dict[str, float]:
        """Calculate detailed skill coverage breakdown"""
        from collections import defaultdict
        
        breakdown = defaultdict(float)
        category_totals = defaultdict(int)
        category_matched = defaultdict(float)
        
        # Count totals by category
        for match in mapping.direct_matches:
            category = match["uni_skill"].category.value
            category_matched[category] += 1.0
            category_totals[category] += 1
        
        for match in mapping.partial_matches:
            category = match["uni_skill"].category.value
            category_matched[category] += 0.5
            category_totals[category] += 1
        
        for skill in mapping.unmapped_uni:
            category = skill.category.value
            category_totals[category] += 1
        
        # Calculate percentages
        for category in category_totals:
            if category_totals[category] > 0:
                breakdown[category] = category_matched[category] / category_totals[category]
        
        return dict(breakdown)
    
    def _calculate_confidence(self,
                             mapping: SkillMapping,
                             edge_cases: Dict[str, Any]) -> float:
        """Estimate how much trust a recommendation deserves.

        Starts from 1.0, subtracts for unmapped university skills, context
        imbalance and currency issues, adds a bonus for a high share of
        direct matches and for batch or AI processing, then clamps the result
        to the range 0.3-1.0.
        """
        n_direct = len(mapping.direct_matches)
        n_partial = len(mapping.partial_matches)
        n_unmapped = len(mapping.unmapped_uni)

        # Share of university skills left unmapped (0 when there are no skills at all)
        n_all = n_direct + n_partial + n_unmapped
        unmapped_ratio = n_unmapped / n_all if n_all else 0

        confidence = 1.0 - unmapped_ratio * 0.3

        # Deductions for edge case issues
        if "context_imbalance" in edge_cases:
            confidence -= edge_cases["context_imbalance"].get("imbalance_score", 0) * 0.2

        if "outdated_content" in edge_cases and edge_cases["outdated_content"].get("currency_issues"):
            confidence -= 0.15

        # Bonus when more than 70% of the matches are direct
        if n_direct and n_direct / (n_direct + n_partial) > 0.7:
            confidence += 0.1

        # Bonus for the processing method; the batch flag takes precedence
        if Config.USE_VLLM_BATCH:
            confidence += 0.07  # Higher boost for batch processing
        elif mapping.metadata.get("use_ai"):
            confidence += 0.05

        return max(0.3, min(1.0, confidence))
    
    def _filter_recommendations(self,
                               recommendations: List[CreditTransferRecommendation]
                               ) -> List[CreditTransferRecommendation]:
        """Drop duplicate mappings and recommendations of type NONE.

        Two recommendations are duplicates when they share the same set of
        VET unit codes and the same university course code; the first one
        encountered claims that key, even if it is itself of type NONE.
        """
        kept = []
        claimed_keys = set()

        for rec in recommendations:
            # Unit codes are sorted so their order does not matter.
            key = (tuple(sorted(rec.get_vet_unit_codes())), rec.uni_course.code)

            if key in claimed_keys:
                continue
            claimed_keys.add(key)

            if rec.recommendation != RecommendationType.NONE:
                kept.append(rec)

        return kept
    
    def _add_analysis_metadata(self,
                               recommendations: List[CreditTransferRecommendation],
                               vet_qual: VETQualification,
                               uni_qual: UniQualification):
        """Stamp every recommendation's ``metadata`` with run-level details.

        All recommendations receive the same timestamp, qualification codes,
        analyzer version, extractor class name and batch settings.
        """
        timestamp = datetime.now().isoformat()

        if not recommendations:
            return

        run_details = {
            "vet_qualification": vet_qual.code,
            "uni_qualification": uni_qual.code,
            "analysis_timestamp": timestamp,
            "analyzer_version": "2.1.0",  # Updated version for batch support
            "extractor_type": type(self.extractor).__name__,
            "uses_genai": self.genai is not None,
            "uses_embeddings": self.embeddings is not None,
            "uses_batch_processing": Config.USE_VLLM_BATCH,
            "batch_size": Config.VLLM_BATCH_SIZE if Config.USE_VLLM_BATCH else 1
        }

        for rec in recommendations:
            rec.metadata.update(run_details)
    
    def export_recommendations(self,
                               recommendations: List[CreditTransferRecommendation],
                               filepath: str):
        """Write the recommendations and a run summary to a JSON file.

        The payload holds each recommendation's ``to_dict()`` output, the
        aggregate summary, a timestamp, extractor statistics and the batch
        configuration. It is written to ``filepath`` with an indent of 2.
        """
        batch_enabled = Config.USE_VLLM_BATCH

        payload = {
            "recommendations": [rec.to_dict() for rec in recommendations],
            "summary": self._generate_summary(recommendations),
            "timestamp": datetime.now().isoformat(),
            "extractor_stats": self._get_extractor_stats(),
            "analysis_method": "batch_processing" if batch_enabled else "individual_processing",
            "configuration": {
                "use_batch": batch_enabled,
                "batch_size": Config.VLLM_BATCH_SIZE if batch_enabled else None
            }
        }

        with open(filepath, 'w') as out_file:
            json.dump(payload, out_file, indent=2)

        logger.info(f"Exported {len(recommendations)} recommendations to {filepath}")
    
    def _generate_summary(self,
                         recommendations: List[CreditTransferRecommendation]) -> Dict:
        """Aggregate counts and averages over a list of recommendations.

        Returns:
            Dict with the total, the number of FULL / CONDITIONAL / PARTIAL
            recommendations, the mean alignment score and confidence (0 for
            an empty list), and whether GenAI and batch processing were used.
        """
        def count_of(kind):
            return len([rec for rec in recommendations if rec.recommendation == kind])

        full_count = count_of(RecommendationType.FULL)
        conditional_count = count_of(RecommendationType.CONDITIONAL)
        partial_count = count_of(RecommendationType.PARTIAL)

        if recommendations:
            mean_alignment = sum(r.alignment_score for r in recommendations) / len(recommendations)
            mean_confidence = sum(r.confidence for r in recommendations) / len(recommendations)
        else:
            mean_alignment = 0
            mean_confidence = 0

        return {
            "total_recommendations": len(recommendations),
            "full_credit": full_count,
            "conditional_credit": conditional_count,
            "partial_credit": partial_count,
            "average_alignment": mean_alignment,
            "average_confidence": mean_confidence,
            "uses_ai": self.genai is not None,
            "uses_batch_processing": Config.USE_VLLM_BATCH
        }
    
    def _get_extractor_stats(self) -> Dict:
        """Get statistics from the extractor"""
        if hasattr(self.extractor, 'get_extraction_stats'):
            return self.extractor.get_extraction_stats()
        elif hasattr(self.extractor, 'cache'):
            return {"cache_size": len(self.extractor.cache)}
        else:
            return {"extractor_type": type(self.extractor).__name__}

# Main

In [0]:
import argparse
import json
# import logging
import sys
from pathlib import Path
from typing import List, Optional

# from config import Config
from models.base_models import VETQualification, UniQualification, UnitOfCompetency, UniCourse
from interfaces.genai_interface import GenAIInterface
from interfaces.vllm_genai_interface import VLLMGenAIInterface
# from interfaces.embedding_interface import EmbeddingInterface
# from analysis.analyzer import CreditTransferAnalyzer
from reporting.report_generator import ReportGenerator

def load_vet_data(filepath: str) -> VETQualification:
    """Load a VET qualification and its units of competency from a JSON file.

    The document must contain top-level ``code``, ``name`` and ``level`` keys.
    An optional ``units`` list describes the units; each unit must contain
    ``code`` and ``name``, while every other unit field falls back to an
    empty/zero default when absent.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A VETQualification with one UnitOfCompetency appended per entry of
        ``units``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required key is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding (which is not UTF-8 on e.g. many Windows setups).
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    vet_qual = VETQualification(
        code=data["code"],
        name=data["name"],
        level=data["level"]
    )

    # `or []` also tolerates an explicit `"units": null` in the document.
    for unit_data in data.get("units") or []:
        unit = UnitOfCompetency(
            code=unit_data["code"],
            name=unit_data["name"],
            description=unit_data.get("description", ""),
            learning_outcomes=unit_data.get("learning_outcomes", []),
            assessment_requirements=unit_data.get("assessment_requirements", ""),
            nominal_hours=unit_data.get("nominal_hours", 0),
            prerequisites=unit_data.get("prerequisites", [])
        )
        vet_qual.units.append(unit)

    logger.info(f"Loaded VET qualification: {vet_qual.code} with {len(vet_qual.units)} units")
    return vet_qual


def load_uni_data(filepath: str) -> UniQualification:
    """Load a university qualification and its courses from a JSON file.

    The document must contain top-level ``code`` and ``name`` keys. An
    optional ``courses`` list describes the courses; each course must contain
    ``code`` and ``name``. Missing optional course fields default to empty
    values, ``study_level`` defaults to ``"intermediate"`` and
    ``credit_points`` to ``0``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A UniQualification with one UniCourse appended per entry of
        ``courses``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required key is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding (which is not UTF-8 on e.g. many Windows setups).
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    uni_qual = UniQualification(
        code=data["code"],
        name=data["name"]
    )

    # `or []` also tolerates an explicit `"courses": null` in the document.
    for course_data in data.get("courses") or []:
        course = UniCourse(
            code=course_data["code"],
            name=course_data["name"],
            description=course_data.get("description", ""),
            study_level=course_data.get("study_level", "intermediate"),
            learning_outcomes=course_data.get("learning_outcomes", []),
            prerequisites=course_data.get("prerequisites", []),
            credit_points=course_data.get("credit_points", 0),
            topics=course_data.get("topics", []),
            assessment=course_data.get("assessment", "")
        )
        uni_qual.courses.append(course)

    logger.info(f"Loaded university qualification: {uni_qual.code} with {len(uni_qual.courses)} courses")
    return uni_qual


def _restore_cuda_visible_devices(previous):
    """Reset CUDA_VISIBLE_DEVICES to ``previous``; unset it when ``previous`` is None."""
    if previous is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = previous
    else:
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)


def _build_vllm_genai():
    """Construct the vLLM GenAI interface (batch variant when Config.USE_VLLM_BATCH) with gpu_id=0."""
    shared_kwargs = dict(
        model_name=Config.VLLM_MODEL_NAME,
        number_gpus=Config.VLLM_NUM_GPUS,
        max_model_len=Config.VLLM_MAX_MODEL_LEN,
        model_cache_dir=Config.MODEL_CACHE_DIR,
        external_model_dir=Config.EXTERNAL_MODEL_DIR,
        gpu_id=0,  # vLLM is always placed on GPU 0
    )
    if Config.USE_VLLM_BATCH:
        return VLLMGenAIInterfaceBatch(batch_size=Config.VLLM_BATCH_SIZE, **shared_kwargs)
    return VLLMGenAIInterface(**shared_kwargs)


def _build_web_api_genai():
    """Construct the web-API GenAI interface from the GENAI_* settings in Config."""
    return GenAIInterface(
        model_endpoint=Config.GENAI_ENDPOINT,
        api_key=Config.GENAI_API_KEY,
        timeout=Config.GENAI_TIMEOUT
    )


def _build_embedding_interface(device):
    """Construct the embedding interface on ``device`` from the EMBEDDING_* settings in Config."""
    return EmbeddingInterface(
        model_name=Config.EMBEDDING_MODEL_NAME,
        model_cache_dir=Config.MODEL_CACHE_DIR,
        external_model_dir=Config.EXTERNAL_MODEL_DIR,
        device=device,
        batch_size=Config.EMBEDDING_BATCH_SIZE
    )


def initialize_interfaces():
    """Initialize the GenAI and embedding interfaces according to Config.

    GenAI backend selection (first enabled flag wins):
      * ``Config.USE_AZURE_OPENAI`` - Azure OpenAI; if that fails and
        ``Config.USE_VLLM`` is set, fall back to vLLM.
      * ``Config.USE_VLLM`` - vLLM (batch variant when
        ``Config.USE_VLLM_BATCH``); if that fails and ``Config.USE_GENAI``
        is set, fall back to the web API.
      * ``Config.USE_GENAI`` - web API.

    Embeddings are placed on ``cuda:1`` whenever ``Config.USE_VLLM`` is set,
    otherwise on ``Config.EMBEDDING_DEVICE``.

    Initialization failures are logged as warnings and never raised.

    Returns:
        tuple: ``(genai, embeddings)``; either element is ``None`` when the
        corresponding interface is disabled or could not be initialized.
    """
    genai = None
    embeddings = None

    # Remember the caller's GPU visibility so it can be restored after vLLM
    # has been pinned to GPU 0.
    original_cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
    logger.info(f"original_cuda_visible : {original_cuda_visible}")

    # ---- GenAI ----------------------------------------------------------
    if Config.USE_AZURE_OPENAI:
        try:
            azure_config = Config.get_azure_openai_config()
            genai = GenAIInterface(
                endpoint=azure_config["endpoint"],
                deployment=azure_config["deployment"],
                api_key=azure_config["api_key"],
                api_version=azure_config["api_version"],
                timeout=azure_config["timeout"],
                max_tokens=azure_config["max_tokens"],
                temperature=azure_config["temperature"]
            )
            logger.info(f"Azure OpenAI interface initialized with deployment: {azure_config['deployment']}")
        except Exception as e:
            logger.warning(f"Failed to initialize Azure OpenAI interface: {e}")

            # Fall back to vLLM if Azure OpenAI fails
            if Config.USE_VLLM:
                try:
                    genai = _build_vllm_genai()
                    if Config.USE_VLLM_BATCH:
                        logger.info(f"Fell back to vLLM batch interface on GPU 0 with model: {Config.VLLM_MODEL_NAME}")
                    else:
                        logger.info(f"Fell back to vLLM interface on GPU 0 with model: {Config.VLLM_MODEL_NAME}")
                except Exception as e2:
                    logger.warning(f"Failed to initialize vLLM interface: {e2}")

    elif Config.USE_VLLM:
        try:
            # Expose only GPU 0 while vLLM starts up.
            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
            try:
                genai = _build_vllm_genai()
            finally:
                # Restore visibility even when vLLM fails: the embedding
                # model below targets cuda:1, which would otherwise stay
                # hidden for the rest of the process.
                _restore_cuda_visible_devices(original_cuda_visible)

            if Config.USE_VLLM_BATCH:
                logger.info(f"vLLM batch GenAI interface initialized on GPU 0 with model: {Config.VLLM_MODEL_NAME}")
                logger.info(f"Batch size: {Config.VLLM_BATCH_SIZE}")
            else:
                logger.info(f"vLLM GenAI interface initialized on GPU 0 with model: {Config.VLLM_MODEL_NAME}")

        except Exception as e:
            logger.warning(f"Failed to initialize vLLM GenAI interface: {e}")

            # Fall back to web API if vLLM fails
            if Config.USE_GENAI:
                try:
                    genai = _build_web_api_genai()
                    logger.info("Fell back to web API GenAI interface")
                except Exception as e2:
                    logger.warning(f"Failed to initialize web API GenAI interface: {e2}")

    elif Config.USE_GENAI:
        try:
            genai = _build_web_api_genai()
            logger.info("Web API GenAI interface initialized")
        except Exception as e:
            logger.warning(f"Failed to initialize GenAI interface: {e}")

    # ---- Embeddings -----------------------------------------------------
    try:
        # Use cuda:1 for embeddings when using vLLM on cuda:0
        embedding_device = "cuda:1" if Config.USE_VLLM else Config.EMBEDDING_DEVICE

        embeddings = _build_embedding_interface(embedding_device)
        logger.info(f"Embedding interface initialized with model: {Config.EMBEDDING_MODEL_NAME} on device: {embedding_device}")
    except Exception as e:
        logger.warning(f"Failed to initialize Embedding interface: {e}")
        # NOTE(review): this "legacy" attempt passes exactly the same
        # arguments as the first one, so it is effectively a single retry.
        try:
            embeddings = _build_embedding_interface(embedding_device)
            logger.info("Initialized embedding interface with legacy configuration")
        except Exception as e2:
            logger.warning(f"Failed to initialize legacy embedding interface: {e2}")

    return genai, embeddings

In [0]:
logger.info("Starting credit transfer analysis")

# Paths of the sample input files, relative to the working directory.
vet_file, uni_file = "./data/sample_vet.json", "./data/sample_uni.json"

# Parse both qualification files into their model objects.
vet_qual = load_vet_data(vet_file)
uni_qual = load_uni_data(uni_file)

In [0]:
# Build the GenAI and embedding interfaces. Either value is None when the
# corresponding backend is disabled or failed to initialize (failures are
# logged as warnings by initialize_interfaces, not raised).
genai, embeddings = initialize_interfaces()

In [0]:
# Wire the analyzer to the interfaces created above and the current settings.
analyzer = CreditTransferAnalyzer(
    genai=genai,
    embeddings=embeddings,
    config=Config.get_config_dict(),
)

logger.info("Performing credit transfer analysis...")
# target_courses=None: no explicit course subset is requested
# (presumably every course is considered - confirm against analyze_transfer).
recommendations = analyzer.analyze_transfer(vet_qual=vet_qual, uni_qual=uni_qual, target_courses=None)

In [0]:
logger.info(f"Generated {len(recommendations)} recommendations")

# Build the full report package (report, recommendations and skill exports).
report_gen = ReportGenerator()
logger.info("Exporting extracted skills...")
files = report_gen.generate_complete_report_package(recommendations, vet_qual, uni_qual)

# Log every generated artefact with its location.
logger.info("Report package generated:")
for file_type, filepath in files.items():
    logger.info(f"  {file_type}: {filepath}")

# Human-readable summary on stdout.
print("\n" + "=" * 60, "REPORT PACKAGE GENERATED", "=" * 60, sep="\n")
print(f"Processing Mode: {'Batch' if Config.USE_VLLM_BATCH else 'Individual'}")
if Config.USE_VLLM_BATCH:
    print(f"Batch Size: {Config.VLLM_BATCH_SIZE}")
# One line per expected artefact; 'N/A' when the package did not produce it.
print("\n".join(
    f"{label}: {files.get(key, 'N/A')}"
    for label, key in (
        ("Main report", "report_html"),
        ("Recommendations", "recommendations_json"),
        ("VET Skills", "vet_skills_json"),
        ("University Skills", "uni_skills_json"),
        ("Combined Skills", "combined_skills_json"),
        ("Skill Analysis", "skill_analysis"),
    )
))