# config.py


In [None]:
"""
Configuration module for ADReSSo21 Speech Analysis
Handles all paths, settings, and system configuration
"""
import os
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List
import multiprocessing

@dataclass
class SystemConfig:
    """Resource budget used when sizing parallel work.

    ``n_cores`` is computed once, when the class body is executed, not each
    time an instance is created.
    """
    # Logical CPUs reported by the OS, capped at 10.
    n_cores: int = min(multiprocessing.cpu_count(), 10)
    # Worker-pool size; kept below the core cap to leave some cores for the system.
    max_workers: int = 8
    # Number of files grouped into one unit of work.
    chunk_size: int = 2
    # Memory budget in GB (the author reserved 5 GB of a 35 GB machine for the OS).
    memory_limit_gb: int = 30

@dataclass
class PathConfig:
    """Absolute Windows paths to the ADReSSo21 data and to the output folder.

    Creating an instance has a side effect: the output directory and its
    standard sub-folders are created on disk (see ``__post_init__``).
    """
    base_path: str = r"C:\Users\Administrator\Desktop\Speech"
    output_path: str = r"C:\Users\Administrator\Desktop\Speech\output"

    # Diagnosis task, training split (audio and segmentation, per class)
    diagnosis_train_audio_ad: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad"
    diagnosis_train_audio_cn: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn"
    diagnosis_train_seg_ad: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\ad"
    diagnosis_train_seg_cn: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\cn"

    # Progression task, training split
    progression_train_audio_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline"
    progression_train_audio_no_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline"
    progression_train_seg_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\decline"
    progression_train_seg_no_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\no_decline"

    # Progression task, test split
    progression_test_audio: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio"
    progression_test_seg: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\segmentation"

    def __post_init__(self):
        """Make sure the output root and its fixed sub-folders exist."""
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        for subfolder in ("features", "transcripts", "models", "logs"):
            target = os.path.join(self.output_path, subfolder)
            Path(target).mkdir(parents=True, exist_ok=True)

@dataclass
class ModelConfig:
    """Model identifiers and shared preprocessing settings.

    Plain value container: it stores names and numbers only and loads no model.
    """
    # Whisper checkpoint size; the author lists base, small, medium, large as options.
    whisper_model_size: str = "base"
    # Presumably Hugging Face model ids handed to from_pretrained() - verify against the loading code.
    wav2vec_model: str = "facebook/wav2vec2-base-960h"
    bert_model: str = "bert-base-uncased"
    # Target audio sample rate (presumably Hz - TODO confirm with the audio loader).
    sampling_rate: int = 16000
    # Presumably the token limit applied to text inputs - verify against the caller.
    max_sequence_length: int = 512

@dataclass
class FeatureConfig:
    """Numeric settings for acoustic feature extraction.

    Plain value container; the numbers are consumed by the feature extractors.
    """
    # Number of MFCC coefficients to compute.
    n_mfcc: int = 13
    # Number of mel bands for the mel spectrogram.
    n_mels: int = 80
    # Lower / upper bound of the F0 search range (presumably Hz - TODO confirm).
    f0_min: float = 50.0
    f0_max: float = 300.0
    # Expected length of an eGeMAPS feature vector.
    egemaps_feature_count: int = 88
    # Expected length of a Wav2Vec2 embedding vector.
    wav2vec_feature_size: int = 768

# Global configuration instances
# NOTE: these are built at import time. PathConfig() runs __post_init__, so
# merely importing this module creates the output directory tree on disk.
SYSTEM_CONFIG = SystemConfig()
PATH_CONFIG = PathConfig()
MODEL_CONFIG = ModelConfig()
FEATURE_CONFIG = FeatureConfig()

def get_audio_file_paths() -> Dict[str, List[str]]:
    """Collect the WAV files of every dataset split, keyed by category.

    Each category maps to the full paths of the WAV files found directly
    inside the matching ``PATH_CONFIG`` directory (no recursion).  A location
    that is missing or is not a directory contributes an empty list.

    Returns:
        Dict with the keys ``diagnosis_ad``, ``diagnosis_cn``,
        ``progression_decline``, ``progression_no_decline`` and
        ``progression_test``; every value is a name-sorted list of paths.
    """

    def get_wav_files(path: str) -> List[str]:
        # isdir() instead of exists(): a regular file at this location would
        # otherwise reach os.listdir() and raise NotADirectoryError.
        if not os.path.isdir(path):
            return []
        # os.listdir() gives no ordering guarantee, so sort for reproducible
        # runs.  The extension test is case-insensitive so "X.WAV" is kept.
        return [
            os.path.join(path, name)
            for name in sorted(os.listdir(path))
            if name.lower().endswith('.wav')
        ]

    category_dirs = {
        'diagnosis_ad': PATH_CONFIG.diagnosis_train_audio_ad,
        'diagnosis_cn': PATH_CONFIG.diagnosis_train_audio_cn,
        'progression_decline': PATH_CONFIG.progression_train_audio_decline,
        'progression_no_decline': PATH_CONFIG.progression_train_audio_no_decline,
        'progression_test': PATH_CONFIG.progression_test_audio,
    }

    return {category: get_wav_files(directory) for category, directory in category_dirs.items()}

def print_system_info():
    """Write a short summary of the active configuration to stdout."""
    report = [
        "=== System Configuration ===",
        f"CPU Cores Available: {multiprocessing.cpu_count()}",
        f"Using Cores: {SYSTEM_CONFIG.n_cores}",
        f"Max Workers: {SYSTEM_CONFIG.max_workers}",
        f"Memory Limit: {SYSTEM_CONFIG.memory_limit_gb}GB",
        f"Output Path: {PATH_CONFIG.output_path}",
        f"Whisper Model: {MODEL_CONFIG.whisper_model_size}",
        "=" * 40,
    ]
    # One line per entry, exactly as separate print() calls would emit them.
    print("\n".join(report))

## Fix config with Claude

In [None]:
"""
Fixed Configuration module with better path handling
"""
import os
import multiprocessing
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List
import logging

@dataclass
class SystemConfig:
    """Limits on CPU and memory use for the analysis run.

    The ``n_cores`` default is evaluated once at class-definition time.
    """
    # Reported CPU count, but never more than 10.
    n_cores: int = min(multiprocessing.cpu_count(), 10)
    # Number of pool workers to start.
    max_workers: int = 8
    # Files per work unit.
    chunk_size: int = 2
    # Memory budget in GB.
    memory_limit_gb: int = 30

@dataclass
class PathConfig:
    """Hard-coded Windows locations of the ADReSSo21 corpus and the output tree.

    Constructing an instance creates the output directory and its fixed
    sub-folders on disk as a side effect.
    """
    base_path: str = r"C:\Users\Administrator\Desktop\Speech"
    output_path: str = r"C:\Users\Administrator\Desktop\Speech\output"

    # Diagnosis / train
    diagnosis_train_audio_ad: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad"
    diagnosis_train_audio_cn: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn"
    diagnosis_train_seg_ad: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\ad"
    diagnosis_train_seg_cn: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\cn"

    # Progression / train
    progression_train_audio_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline"
    progression_train_audio_no_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline"
    progression_train_seg_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\decline"
    progression_train_seg_no_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\no_decline"

    # Progression / test
    progression_test_audio: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio"
    progression_test_seg: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\segmentation"

    def __post_init__(self):
        """Create the output root, then each fixed sub-folder beneath it."""
        wanted = [self.output_path]
        wanted.extend(
            os.path.join(self.output_path, leaf)
            for leaf in ("features", "transcripts", "models", "logs")
        )
        for directory in wanted:
            Path(directory).mkdir(parents=True, exist_ok=True)

@dataclass
class ModelConfig:
    """Model identifiers and shared preprocessing settings.

    Plain value container: it stores names and numbers only and loads no model.
    """
    # Whisper checkpoint size name.
    whisper_model_size: str = "base"
    # Presumably Hugging Face model ids handed to from_pretrained() - verify against the loading code.
    wav2vec_model: str = "facebook/wav2vec2-base-960h"
    bert_model: str = "bert-base-uncased"
    # Target audio sample rate (presumably Hz - TODO confirm with the audio loader).
    sampling_rate: int = 16000
    # Presumably the token limit applied to text inputs - verify against the caller.
    max_sequence_length: int = 512

@dataclass
class FeatureConfig:
    """Numeric settings for acoustic feature extraction.

    Plain value container; the numbers are consumed by the feature extractors.
    """
    # Number of MFCC coefficients to compute.
    n_mfcc: int = 13
    # Number of mel bands for the mel spectrogram.
    n_mels: int = 80
    # Lower / upper bound of the F0 search range (presumably Hz - TODO confirm).
    f0_min: float = 50.0
    f0_max: float = 300.0
    # Expected length of an eGeMAPS feature vector.
    egemaps_feature_count: int = 88
    # Expected length of a Wav2Vec2 embedding vector.
    wav2vec_feature_size: int = 768

# Global configuration instances
# NOTE: these are built at import time. PathConfig() runs __post_init__, so
# merely importing this module creates the output directory tree on disk.
SYSTEM_CONFIG = SystemConfig()
PATH_CONFIG = PathConfig()
MODEL_CONFIG = ModelConfig()
FEATURE_CONFIG = FeatureConfig()

def get_wav_files_safe(path: str, logger: logging.Logger = None) -> List[str]:
    """Return the usable WAV files found directly inside ``path``.

    A file is usable when its name ends in ``.wav`` (case-insensitive), it is
    a regular file and it is not empty.  Every problem is logged and reported
    as "no files" rather than raised.

    Args:
        path: Directory to scan (not recursive).
        logger: Logger for diagnostics; defaults to this module's logger.

    Returns:
        Full paths of the usable WAV files, sorted by file name.  Empty when
        the directory is missing, is not a directory, cannot be read, or
        holds no usable WAV file.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    if not os.path.exists(path):
        logger.warning(f"Directory does not exist: {path}")
        return []

    if not os.path.isdir(path):
        logger.warning(f"Path is not a directory: {path}")
        return []

    try:
        # os.listdir() returns names in arbitrary order; sorting keeps the
        # result (and everything derived from it) reproducible across runs.
        all_files = sorted(os.listdir(path))
    except PermissionError:
        logger.error(f"Permission denied accessing: {path}")
        return []
    except OSError as e:
        logger.error(f"Error reading directory {path}: {e}")
        return []

    wav_files = [f for f in all_files if f.lower().endswith('.wav')]
    if not wav_files:
        logger.warning(f"No WAV files found in: {path}")
        logger.info(f"Directory contains {len(all_files)} files: {all_files[:10]}...")
        return []

    valid_files = []
    for name in wav_files:
        file_path = os.path.join(path, name)
        try:
            usable = os.path.isfile(file_path) and os.path.getsize(file_path) > 0
        except OSError:
            # The file vanished or became unreadable after listing: skip just
            # this entry instead of discarding the whole directory.
            usable = False

        if usable:
            valid_files.append(file_path)
        else:
            logger.warning(f"Invalid file skipped: {file_path}")

    logger.info(f"Found {len(valid_files)} valid WAV files in {path}")
    return valid_files

def get_audio_file_paths(logger: logging.Logger = None) -> Dict[str, List[str]]:
    """Scan every configured audio directory and group the WAV files found.

    Args:
        logger: Logger for progress and the final summary; defaults to this
            module's logger.

    Returns:
        Dict mapping each category name to the list returned by
        ``get_wav_files_safe`` for that category's directory.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    logger.info("Scanning for audio files...")

    # Category name -> directory holding that category's audio.
    category_dirs = {
        'diagnosis_ad': PATH_CONFIG.diagnosis_train_audio_ad,
        'diagnosis_cn': PATH_CONFIG.diagnosis_train_audio_cn,
        'progression_decline': PATH_CONFIG.progression_train_audio_decline,
        'progression_no_decline': PATH_CONFIG.progression_train_audio_no_decline,
        'progression_test': PATH_CONFIG.progression_test_audio
    }

    audio_files = {}
    for category, directory_path in category_dirs.items():
        logger.info(f"Scanning {category}: {directory_path}")
        audio_files[category] = get_wav_files_safe(directory_path, logger)

    # Summary: overall total first, then one line per category.
    total_files = sum(map(len, audio_files.values()))
    logger.info(f"Total audio files found: {total_files}")

    for category, files in audio_files.items():
        if not files:
            logger.warning(f"  {category}: No files found!")
            continue
        logger.info(f"  {category}: {len(files)} files")

    return audio_files

def print_system_info():
    """Print the key system, path and model settings to stdout."""
    rows = (
        ("CPU Cores Available", multiprocessing.cpu_count()),
        ("Using Cores", SYSTEM_CONFIG.n_cores),
        ("Max Workers", SYSTEM_CONFIG.max_workers),
        ("Memory Limit", f"{SYSTEM_CONFIG.memory_limit_gb}GB"),
        ("Output Path", PATH_CONFIG.output_path),
        ("Whisper Model", MODEL_CONFIG.whisper_model_size),
    )
    print("=== System Configuration ===")
    for label, value in rows:
        print(f"{label}: {value}")
    print("=" * 40)

def verify_directories():
    """Print, for each configured directory, whether it exists and its WAV count.

    Every entry gets a check mark when the path is an existing directory and
    a cross otherwise.  For existing directories the number of ``.wav`` files
    (case-insensitive) directly inside is printed, or a notice when the
    directory cannot be listed.
    """
    print("=== Directory Verification ===")

    directories_to_check = [
        ("Base Path", PATH_CONFIG.base_path),
        ("Output Path", PATH_CONFIG.output_path),
        ("Diagnosis AD Audio", PATH_CONFIG.diagnosis_train_audio_ad),
        ("Diagnosis CN Audio", PATH_CONFIG.diagnosis_train_audio_cn),
        ("Progression Decline Audio", PATH_CONFIG.progression_train_audio_decline),
        ("Progression No-Decline Audio", PATH_CONFIG.progression_train_audio_no_decline),
        ("Progression Test Audio", PATH_CONFIG.progression_test_audio),
    ]

    for name, path in directories_to_check:
        # isdir() is already False for a missing path, so no separate
        # exists() check is needed.
        is_dir = os.path.isdir(path)
        print(f"{name}: {'✓' if is_dir else '✗'} {path}")

        if not is_dir:
            continue

        try:
            file_count = sum(1 for f in os.listdir(path) if f.lower().endswith('.wav'))
        except OSError:
            # Only filesystem errors are expected here; a bare except would
            # also swallow KeyboardInterrupt and SystemExit.
            print("  -> Cannot read directory contents")
        else:
            print(f"  -> Contains {file_count} WAV files")

    print("=" * 40)

if __name__ == "__main__":
    # Manual smoke test: show the settings, check the directories, list the audio.
    print_system_info()
    verify_directories()

    # Logging is configured only at this point, for the file scan below.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    audio_files = get_audio_file_paths(logger)

    print("\n=== Audio Files Summary ===")
    for category, files in audio_files.items():
        print(f"{category}: {len(files)} files")
        if not files:
            continue
        print(f"  First file: {files[0]}")

# utils.py - Utilities Module


In [None]:
"""
Utilities module for ADReSSo21 Speech Analysis
Common utilities, logging, and helper functions
"""
import os
import json
import pickle
import logging
import psutil
import numpy as np
import pandas as pd
from typing import Dict, Any, List, Union
from pathlib import Path
from datetime import datetime
import gc
import torch

from config import PATH_CONFIG, SYSTEM_CONFIG

def setup_logging(log_level: str = "INFO") -> logging.Logger:
    """Configure root logging to a timestamped file plus the console.

    The log file is created inside ``<output_path>/logs``.

    Args:
        log_level: Name of a ``logging`` level; it is upper-cased and looked
            up as an attribute of the ``logging`` module.

    Returns:
        The ``'ADReSSoAnalyzer'`` logger.

    NOTE(review): ``logging.basicConfig`` does nothing when the root logger
    already has handlers, so repeated calls may not attach the new file.
    """
    log_dir = os.path.join(PATH_CONFIG.output_path, "logs")
    Path(log_dir).mkdir(parents=True, exist_ok=True)

    # One file per call, named after the current wall-clock time.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"adresso_analysis_{stamp}.log")

    level = getattr(logging, log_level.upper())
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    console_handler = logging.StreamHandler()

    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )

    analyzer_logger = logging.getLogger('ADReSSoAnalyzer')
    analyzer_logger.info(f"Logging initialized. Log file: {log_file}")
    return analyzer_logger

def monitor_memory_usage() -> Dict[str, float]:
    """Report system memory figures taken from ``psutil.virtual_memory()``.

    Returns:
        Dict with ``total_gb``, ``available_gb`` and ``used_gb`` (byte counts
        divided by 1024**3) and ``percent_used`` (psutil's ``percent``).
    """
    bytes_per_gb = 1024**3
    snapshot = psutil.virtual_memory()
    return {
        'total_gb': snapshot.total / bytes_per_gb,
        'available_gb': snapshot.available / bytes_per_gb,
        'used_gb': snapshot.used / bytes_per_gb,
        'percent_used': snapshot.percent
    }

def cleanup_memory():
    """Run a garbage-collection pass and, when CUDA is usable, empty its cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def check_memory_limit(threshold_percent: float = 85.0) -> bool:
    """Return True while used memory is strictly below ``threshold_percent``.

    Args:
        threshold_percent: Upper bound for the ``percent_used`` figure
            reported by ``monitor_memory_usage``.
    """
    return monitor_memory_usage()['percent_used'] < threshold_percent

def safe_save_pickle(data: Any, filepath: str, logger: logging.Logger = None):
    """Pickle ``data`` to ``filepath``, logging the outcome.

    Args:
        data: Object to serialize.
        filepath: Destination file; it is opened in binary write mode.
        logger: Optional logger for the success / failure message.

    Raises:
        Exception: Any error from opening or pickling is logged (when a
            logger is given) and then re-raised unchanged.
    """
    try:
        with open(filepath, 'wb') as handle:
            pickle.dump(data, handle)
        if logger:
            logger.info(f"Successfully saved pickle: {filepath}")
    except Exception as err:
        if logger:
            logger.error(f"Failed to save pickle {filepath}: {err!s}")
        raise

def safe_load_pickle(filepath: str, logger: logging.Logger = None) -> Any:
    """Load a pickled object from ``filepath``, returning None on any failure.

    Args:
        filepath: File to read; it is opened in binary read mode.
        logger: Optional logger for the success / failure message.

    Returns:
        The unpickled object, or ``None`` when opening or unpickling fails
        (so a stored ``None`` cannot be told apart from a failure).
    """
    # NOTE(security): unpickling can execute arbitrary code - only use this
    # on files the pipeline wrote itself.
    try:
        with open(filepath, 'rb') as handle:
            loaded = pickle.load(handle)
        if logger:
            logger.info(f"Successfully loaded pickle: {filepath}")
        return loaded
    except Exception as err:
        if logger:
            logger.error(f"Failed to load pickle {filepath}: {err!s}")
        return None

def safe_save_json(data: Dict, filepath: str, logger: logging.Logger = None):
    """Write ``data`` to ``filepath`` as indented UTF-8 JSON, logging the outcome.

    Non-ASCII characters are written as-is, and values the encoder cannot
    handle are converted with ``str`` (``default=str``).

    Args:
        data: Object to serialize.
        filepath: Destination file.
        logger: Optional logger for the success / failure message.

    Raises:
        Exception: Any error is logged (when a logger is given) and re-raised.
    """
    dump_options = {'indent': 2, 'ensure_ascii': False, 'default': str}
    try:
        with open(filepath, 'w', encoding='utf-8') as handle:
            json.dump(data, handle, **dump_options)
        if logger:
            logger.info(f"Successfully saved JSON: {filepath}")
    except Exception as err:
        if logger:
            logger.error(f"Failed to save JSON {filepath}: {err!s}")
        raise

def validate_audio_file(filepath: str) -> bool:
    """Return True when ``filepath`` is an existing, non-empty regular file.

    Only presence and size are checked; the content is not decoded.

    Args:
        filepath: Path of the candidate audio file.

    Returns:
        ``False`` for missing paths, directories, empty files, and files
        whose size cannot be read.
    """
    try:
        # isfile() rather than exists(): a directory also "exists" and
        # usually reports a non-zero size, so it used to pass as valid.
        return os.path.isfile(filepath) and os.path.getsize(filepath) > 0
    except OSError:
        # The file disappeared or became unreadable between the two calls.
        return False

def get_file_info(filepath: str) -> Dict[str, Any]:
    """Describe a file: size, modification time, name and extension.

    Args:
        filepath: Path to inspect.

    Returns:
        ``{'exists': False}`` when the path does not exist; otherwise a dict
        with ``exists``, ``size_mb`` (bytes / 1024**2), ``modified``
        (ISO-8601 string of the modification time), ``filename`` and
        ``extension`` (including the leading dot).
    """
    if not os.path.exists(filepath):
        return {'exists': False}

    file_stat = os.stat(filepath)
    modified_at = datetime.fromtimestamp(file_stat.st_mtime)
    _, extension = os.path.splitext(filepath)

    info = {'exists': True}
    info['size_mb'] = file_stat.st_size / (1024**2)
    info['modified'] = modified_at.isoformat()
    info['filename'] = os.path.basename(filepath)
    info['extension'] = extension
    return info

def create_progress_bar(total: int, desc: str = "Processing") -> Any:
    """Return a progress reporter: tqdm when importable, else a text counter.

    Args:
        total: Expected number of items.
        desc: Label shown in front of the progress output.

    Returns:
        A ``tqdm`` bar (unit ``"files"``), or a ``SimpleProgress`` object
        that offers only ``update(n=1)`` and ``close()``.
    """

    class SimpleProgress:
        """Fallback counter that rewrites one console line per update."""

        def __init__(self, total, desc):
            self.desc = desc
            self.total = total
            self.current = 0

        def update(self, n=1):
            self.current = self.current + n
            # The carriage return moves back to the line start so the
            # counter overwrites itself.
            print(f"\r{self.desc}: {self.current}/{self.total}", end="")

        def close(self):
            # Finish the in-place line.
            print()

    try:
        from tqdm import tqdm
        return tqdm(total=total, desc=desc, unit="files")
    except ImportError:
        return SimpleProgress(total, desc)

def batch_generator(items: List[Any], batch_size: int):
    """Yield consecutive slices of ``items`` holding at most ``batch_size`` elements.

    Args:
        items: Sequence to split; an empty sequence yields nothing.
        batch_size: Maximum slice length; must be positive.

    Yields:
        Slices of ``items`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive.  Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A negative step used to yield nothing (silently dropping every item)
    # and a zero step failed with an unrelated range() message.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

def flatten_dict(d: Dict, parent_key: str = '', sep: str = '_') -> Dict:
    """Collapse a nested dict into one level, joining key paths with ``sep``.

    Args:
        d: Dictionary to flatten; nested dicts are expanded recursively.
        parent_key: Prefix for every key produced from ``d``; when it is
            falsy the keys of ``d`` are used unchanged.
        sep: Separator placed between the prefix and each key.

    Returns:
        A new flat dict.  NumPy arrays become lists, or a ``float`` for
        0-dimensional arrays; every other value is kept as-is.
    """
    flat = {}
    for key, value in d.items():
        if parent_key:
            full_key = f"{parent_key}{sep}{key}"
        else:
            full_key = key

        if isinstance(value, dict):
            flat.update(flatten_dict(value, full_key, sep=sep))
        elif isinstance(value, np.ndarray):
            # Arrays are turned into plain Python values for JSON output.
            flat[full_key] = float(value) if value.ndim == 0 else value.tolist()
        else:
            flat[full_key] = value
    return flat

def create_summary_dataframe(results: Dict[str, Any], save_path: str = None) -> pd.DataFrame:
    """Build one DataFrame row per dict-valued entry of ``results``.

    Each dict result is flattened with ``flatten_dict`` and its key in
    ``results`` is stored in a ``file_id`` column.  Entries whose value is
    not a dict are skipped.

    Args:
        results: Mapping from identifier to a (possibly nested) result dict.
        save_path: When truthy, the frame is also written there as CSV
            without the index.

    Returns:
        The assembled DataFrame.
    """
    rows = [
        {**flatten_dict(result), 'file_id': key}
        for key, result in results.items()
        if isinstance(result, dict)
    ]

    summary = pd.DataFrame(rows)
    if save_path:
        summary.to_csv(save_path, index=False)
    return summary

def log_processing_stats(processed: int, failed: int, total: int, logger: logging.Logger):
    """Log how many items succeeded and, if any, how many failed.

    Args:
        processed: Number of successfully processed items.
        failed: Number of failed items; a warning is logged when above zero.
        total: Number of items attempted; a non-positive total is reported
            as a 0% success rate instead of dividing by zero.
        logger: Logger that receives the messages.
    """
    if total > 0:
        success_rate = processed / total * 100
    else:
        success_rate = 0

    logger.info(f"Processing completed: {processed}/{total} successful ({success_rate:.1f}%)")
    if failed <= 0:
        return
    logger.warning(f"Failed files: {failed}")

class ProcessingTimer:
    """Context manager that logs the start, duration and any error of an operation.

    All logging is skipped when no logger is given.  Exceptions raised inside
    the ``with`` block are logged and then propagate; they are never suppressed.
    """

    def __init__(self, operation_name: str, logger: logging.Logger = None):
        """Remember the operation label and the optional logger."""
        self.logger = logger
        self.operation_name = operation_name
        self.start_time = None

    def __enter__(self):
        """Record the start time, announce the operation and return self."""
        self.start_time = datetime.now()
        if self.logger:
            self.logger.info(f"Starting {self.operation_name}...")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Log the elapsed time and, if the block raised, the error as well."""
        elapsed = datetime.now() - self.start_time

        if self.logger:
            self.logger.info(f"Completed {self.operation_name} in {elapsed}")

        # Returning None keeps the exception (if any) propagating.
        if exc_type and self.logger:
            self.logger.error(f"Error in {self.operation_name}: {exc_val}")

def ensure_directory_exists(directory_path: str):
    """Create ``directory_path`` and any missing parents; do nothing if it exists."""
    target = Path(directory_path)
    target.mkdir(exist_ok=True, parents=True)

def get_available_models() -> Dict[str, bool]:
    """Report which optional model libraries can be imported.

    Only importability is probed; no model weights are loaded.

    Returns:
        Dict with the keys ``whisper``, ``wav2vec2``, ``bert`` and
        ``opensmile``; a value is True when the matching import succeeded.
    """
    models_status = {
        'whisper': False,
        'wav2vec2': False,
        'bert': False,
        'opensmile': False
    }

    try:
        import whisper
    except ImportError:
        pass
    else:
        models_status['whisper'] = True

    try:
        from transformers import Wav2Vec2Processor, Wav2Vec2Model
    except ImportError:
        pass
    else:
        models_status['wav2vec2'] = True

    try:
        from transformers import BertTokenizer, BertModel
    except ImportError:
        pass
    else:
        models_status['bert'] = True

    try:
        import opensmile
    except ImportError:
        pass
    else:
        models_status['opensmile'] = True

    return models_status

# acoustic_features_service.py - Acoustic Features Extraction Service


In [None]:
"""
Acoustic Features Service - Microservice for extracting acoustic features
Handles eGeMAPS, MFCC, Log-mel, Wav2Vec2, and prosodic features
"""
import os
import numpy as np
import librosa
import torch
import warnings
from typing import Dict, Any, List, Tuple, Optional
from concurrent.futures import ProcessPoolExecutor, as_completed
import logging
from dataclasses import dataclass

# Model imports with error handling
try:
    import opensmile
    OPENSMILE_AVAILABLE = True
except ImportError:
    OPENSMILE_AVAILABLE = False

try:
    from transformers import Wav2Vec2Processor, Wav2Vec2Model
    WAV2VEC_AVAILABLE = True
except ImportError:
    WAV2VEC_AVAILABLE = False

from config import MODEL_CONFIG, FEATURE_CONFIG, SYSTEM_CONFIG
from utils import setup_logging, monitor_memory_usage, cleanup_memory, safe_save_pickle

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore', category=UserWarning)

@dataclass
class AcousticFeatures:
    """Container for every acoustic feature group extracted from one audio file."""
    # Flattened OpenSMILE eGeMAPS values, or an all-zero vector when extraction failed.
    egemaps: np.ndarray
    # MFCC statistics keyed by 'mean', 'std', 'delta' and 'delta2'.
    mfccs: Dict[str, np.ndarray]
    # Log-mel spectrogram statistics keyed by 'mean' and 'std'.
    log_mel: Dict[str, np.ndarray]
    # Wav2Vec2 hidden states averaged over time, or an all-zero vector on failure.
    wav2vec2: np.ndarray
    # Scalar prosodic measures (F0, energy, spectral, duration, voicing).
    prosodic: Dict[str, float]
    # Per-group success flag keyed by 'egemaps', 'mfccs', 'log_mel', 'wav2vec2', 'prosodic'.
    extraction_success: Dict[str, bool]

class AcousticFeaturesService:
    """Service for extracting acoustic features from audio files"""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Create the service and load the optional feature models.

        Args:
            logger: Logger to use; when omitted (or falsy) a new one is
                obtained from ``setup_logging()``.
        """
        self.logger = logger if logger else setup_logging()

        # Model handles start empty; _initialize_models fills in what loads.
        self.smile = None
        self.wav2vec_processor = None
        self.wav2vec_model = None

        self._initialize_models()

    def _initialize_models(self):
        """Load OpenSMILE and Wav2Vec2 when their libraries are importable.

        A model that is unavailable or fails to load leaves its handle(s) as
        ``None``; the matching extractor then returns zero-filled features.
        """
        self.logger.info("Initializing acoustic feature extraction models...")

        # OpenSMILE extractor for eGeMAPS functionals
        if not OPENSMILE_AVAILABLE:
            self.logger.warning("OpenSMILE not available - eGeMAPS features will be skipped")
        else:
            try:
                self.smile = opensmile.Smile(
                    feature_set=opensmile.FeatureSet.eGeMAPSv02,
                    feature_level=opensmile.FeatureLevel.Functionals,
                )
                self.logger.info("✓ OpenSMILE (eGeMAPS) initialized")
            except Exception as e:
                self.logger.error(f"Failed to initialize OpenSMILE: {e}")
                self.smile = None

        # Wav2Vec2 processor + model
        if not WAV2VEC_AVAILABLE:
            self.logger.warning("Transformers not available - Wav2Vec2 features will be skipped")
            return

        model_name = MODEL_CONFIG.wav2vec_model
        try:
            self.wav2vec_processor = Wav2Vec2Processor.from_pretrained(model_name)
            self.wav2vec_model = Wav2Vec2Model.from_pretrained(model_name)
            self.logger.info("✓ Wav2Vec2 initialized")
        except Exception as e:
            self.logger.error(f"Failed to initialize Wav2Vec2: {e}")
            # Reset both so a half-loaded pair is never used.
            self.wav2vec_processor = None
            self.wav2vec_model = None

    def extract_egemaps_features(self, audio_path: str) -> Tuple[np.ndarray, bool]:
        """Compute eGeMAPS functionals for one file with OpenSMILE.

        Args:
            audio_path: Path of the audio file handed to OpenSMILE.

        Returns:
            ``(features, True)`` with the flattened OpenSMILE output, or
            ``(zeros, False)`` when OpenSMILE is not loaded or processing
            fails; the zero vector has ``egemaps_feature_count`` entries.
        """
        try:
            if self.smile is not None:
                frame = self.smile.process_file(audio_path)
                return frame.values.flatten(), True
        except Exception as e:
            self.logger.debug(f"eGeMAPS extraction failed for {os.path.basename(audio_path)}: {e}")

        # Reached when OpenSMILE is missing or processing raised.
        return np.zeros(FEATURE_CONFIG.egemaps_feature_count), False

    def extract_mfcc_features(self, y: np.ndarray, sr: int) -> Tuple[Dict[str, np.ndarray], bool]:
        """Summarize MFCCs and their first and second deltas over time.

        Args:
            y: Audio samples.
            sr: Sample rate of ``y``.

        Returns:
            ``(stats, True)`` where ``stats`` holds the per-coefficient
            ``mean`` and ``std`` of the MFCCs and the mean of their
            ``delta`` and ``delta2``; on any error ``(zeros, False)`` with
            the same keys, each of length ``n_mfcc``.
        """
        try:
            coeffs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=FEATURE_CONFIG.n_mfcc)

            mean = np.mean(coeffs, axis=1)
            std = np.std(coeffs, axis=1)
            first_delta = librosa.feature.delta(coeffs)
            second_delta = librosa.feature.delta(coeffs, order=2)

            stats = {
                'mean': mean,
                'std': std,
                'delta': np.mean(first_delta, axis=1),
                'delta2': np.mean(second_delta, axis=1)
            }
            return stats, True

        except Exception as e:
            self.logger.debug(f"MFCC extraction failed: {e}")
            size = FEATURE_CONFIG.n_mfcc
            placeholder = {name: np.zeros(size) for name in ('mean', 'std', 'delta', 'delta2')}
            return placeholder, False

    def extract_logmel_features(self, y: np.ndarray, sr: int) -> Tuple[Dict[str, np.ndarray], bool]:
        """Summarize the log-mel spectrogram per mel band.

        Args:
            y: Audio samples.
            sr: Sample rate of ``y``.

        Returns:
            ``(stats, True)`` with the per-band ``mean`` and ``std`` of the
            dB-scaled mel spectrogram; on any error ``(zeros, False)`` with
            the same keys, each of length ``n_mels``.
        """
        try:
            power_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=FEATURE_CONFIG.n_mels)
            db_spec = librosa.power_to_db(power_spec)
            return {'mean': np.mean(db_spec, axis=1), 'std': np.std(db_spec, axis=1)}, True

        except Exception as e:
            self.logger.debug(f"Log-mel extraction failed: {e}")
            bands = FEATURE_CONFIG.n_mels
            return {'mean': np.zeros(bands), 'std': np.zeros(bands)}, False

    def extract_wav2vec_features(self, y: np.ndarray, sr: int) -> Tuple[np.ndarray, bool]:
        """Embed the signal with Wav2Vec2 and average the hidden states over time.

        Args:
            y: Audio samples; resampled here when ``sr`` differs from the
                configured sampling rate.
            sr: Sample rate of ``y``.

        Returns:
            ``(embedding, True)`` on success; ``(zeros, False)`` when the
            model is not loaded, the signal is empty, or anything fails.
            The zero vector has ``wav2vec_feature_size`` entries.
        """
        try:
            if self.wav2vec_processor is None or self.wav2vec_model is None:
                return np.zeros(FEATURE_CONFIG.wav2vec_feature_size), False

            if len(y) == 0:
                raise ValueError("Empty audio signal")

            # Bring the signal to the rate named in the model configuration.
            target_sr = MODEL_CONFIG.sampling_rate
            if sr != target_sr:
                y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
                sr = target_sr

            encoded = self.wav2vec_processor(
                y,
                sampling_rate=sr,
                return_tensors="pt"
            )

            # Inference only: no gradients are needed.
            with torch.no_grad():
                hidden_states = self.wav2vec_model(encoded.input_values).last_hidden_state
                embedding = hidden_states.mean(dim=1).squeeze().numpy()

            return embedding, True

        except Exception as e:
            self.logger.debug(f"Wav2Vec2 extraction failed: {e}")
            return np.zeros(FEATURE_CONFIG.wav2vec_feature_size), False

    def extract_prosodic_features(self, y: np.ndarray, sr: int) -> Tuple[Dict[str, float], bool]:
        """Extract pitch, energy, spectral and timing descriptors.

        F0 statistics are computed over voiced frames only.  Voicing comes
        from ``librosa.pyin``: plain ``librosa.yin`` returns a frequency for
        every frame and makes no voicing decision, so filtering its output
        with ``f0 > 0`` never removed a frame and ``voicing_fraction`` was
        always 1.0.

        Args:
            y: Audio samples.
            sr: Sample rate of ``y``.

        Returns:
            ``(features, True)`` with the keys ``f0_mean``, ``f0_std``,
            ``f0_median``, ``f0_range``, ``energy_mean``, ``energy_std``,
            ``zero_crossing_rate``, ``spectral_centroid``,
            ``spectral_rolloff``, ``duration`` (``len(y) / sr``) and
            ``voicing_fraction`` (voiced frames / all frames).  F0 values
            are 0.0 when no frame is voiced.  On any error every value is
            0.0 and the flag is False.
        """
        try:
            # pYIN yields an F0 track plus a per-frame voiced flag.
            # NOTE: it is noticeably slower than plain YIN.
            f0, voiced_flag, _ = librosa.pyin(
                y, fmin=FEATURE_CONFIG.f0_min, fmax=FEATURE_CONFIG.f0_max, sr=sr
            )
            f0_clean = f0[voiced_flag]
            # Guard against NaN placeholders leaking into the statistics.
            f0_clean = f0_clean[np.isfinite(f0_clean)]
            has_voiced = len(f0_clean) > 0

            # Energy features
            rms = librosa.feature.rms(y=y)

            # Spectral features
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(y)

            features = {
                'f0_mean': float(np.mean(f0_clean)) if has_voiced else 0.0,
                'f0_std': float(np.std(f0_clean)) if has_voiced else 0.0,
                'f0_median': float(np.median(f0_clean)) if has_voiced else 0.0,
                'f0_range': float(np.max(f0_clean) - np.min(f0_clean)) if has_voiced else 0.0,
                'energy_mean': float(np.mean(rms)),
                'energy_std': float(np.std(rms)),
                'zero_crossing_rate': float(np.mean(zero_crossing_rate)),
                'spectral_centroid': float(np.mean(spectral_centroid)),
                'spectral_rolloff': float(np.mean(spectral_rolloff)),
                'duration': len(y) / sr,
                'voicing_fraction': len(f0_clean) / len(f0) if len(f0) > 0 else 0.0
            }

            return features, True

        except Exception as e:
            self.logger.debug(f"Prosodic feature extraction failed: {e}")
            default_features = {
                'f0_mean': 0.0, 'f0_std': 0.0, 'f0_median': 0.0, 'f0_range': 0.0,
                'energy_mean': 0.0, 'energy_std': 0.0, 'zero_crossing_rate': 0.0,
                'spectral_centroid': 0.0, 'spectral_rolloff': 0.0,
                'duration': 0.0, 'voicing_fraction': 0.0
            }
            return default_features, False

    def extract_features_from_file(self, audio_path: str) -> Optional[AcousticFeatures]:
        """Load one audio file and run every feature extractor on it.

        Args:
            audio_path: Path of the audio file; it is loaded at the
                configured sampling rate.

        Returns:
            An ``AcousticFeatures`` record whose ``extraction_success`` map
            tells which groups succeeded, or ``None`` when the file is empty
            or loading / extraction raised.
        """
        try:
            y, sr = librosa.load(audio_path, sr=MODEL_CONFIG.sampling_rate)

            if len(y) == 0:
                self.logger.warning(f"Empty audio file: {audio_path}")
                return None

            # Each extractor returns (features, succeeded); the flags are
            # gathered under the same names used in the result record.
            success = {}
            egemaps, success['egemaps'] = self.extract_egemaps_features(audio_path)
            mfccs, success['mfccs'] = self.extract_mfcc_features(y, sr)
            log_mel, success['log_mel'] = self.extract_logmel_features(y, sr)
            wav2vec2, success['wav2vec2'] = self.extract_wav2vec_features(y, sr)
            prosodic, success['prosodic'] = self.extract_prosodic_features(y, sr)

            return AcousticFeatures(
                egemaps=egemaps,
                mfccs=mfccs,
                log_mel=log_mel,
                wav2vec2=wav2vec2,
                prosodic=prosodic,
                extraction_success=success,
            )

        except Exception as e:
            self.logger.error(f"Failed to extract features from {audio_path}: {e}")
            return None

    def process_files_batch(self, file_paths: List[str]) -> Dict[str, Optional[AcousticFeatures]]:
        """Extract features for each path, keyed by the file's base name.

        Args:
            file_paths: Audio files to process, in order.

        Returns:
            Dict mapping base name to its ``AcousticFeatures``, or ``None``
            when extraction failed.  Paths sharing a base name overwrite
            each other.
        """
        outcome = {}

        for path in file_paths:
            name = os.path.basename(path)
            try:
                outcome[name] = self.extract_features_from_file(path)

                # Free memory each time the result count reaches a multiple of ten.
                if len(outcome) % 10 == 0:
                    cleanup_memory()

            except Exception as e:
                self.logger.error(f"Error processing {name}: {e}")
                outcome[name] = None

        return outcome

    def extract_features_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Dict[str, Optional[AcousticFeatures]]]:
        """Extract features for every category of audio files using worker processes.

        Each non-empty category is split into batches of
        ``SYSTEM_CONFIG.chunk_size`` files; the batches are submitted to a
        ``ProcessPoolExecutor`` (one pool per category) and merged as they
        finish.  A batch that raises is logged and contributes no entries.

        Args:
            audio_files: Mapping of category name to a list of audio paths.

        Returns:
            Mapping of category name to ``{file base name: features or None}``.
            Categories with no files are omitted.
        """
        self.logger.info("Starting parallel acoustic feature extraction...")

        total_files = sum(map(len, audio_files.values()))
        processed_files = 0
        all_results = {}

        for category, file_paths in audio_files.items():
            if not file_paths:
                continue

            self.logger.info(f"Processing {category}: {len(file_paths)} files")

            # Small batches bound how much audio a single worker holds at once.
            step = SYSTEM_CONFIG.chunk_size
            batches = [file_paths[start:start + step] for start in range(0, len(file_paths), step)]
            category_results = {}

            with ProcessPoolExecutor(max_workers=SYSTEM_CONFIG.max_workers) as pool:
                pending = {
                    pool.submit(process_audio_batch_worker, batch): batch
                    for batch in batches
                }

                for finished in as_completed(pending):
                    try:
                        category_results.update(finished.result())
                        processed_files += len(pending[finished])

                        percent = (processed_files / total_files) * 100
                        self.logger.info(f"Progress: {processed_files}/{total_files} ({percent:.1f}%)")

                        # Free memory early when the machine is under pressure.
                        usage = monitor_memory_usage()
                        if usage['percent_used'] > 80:
                            self.logger.warning(f"High memory usage: {usage['percent_used']:.1f}%")
                            cleanup_memory()

                    except Exception as exc:
                        self.logger.error(f"Batch processing failed: {exc}")

            all_results[category] = category_results
            self.logger.info(f"Completed {category}: {len(category_results)} files processed")

        cleanup_memory()
        self.logger.info(f"Acoustic feature extraction completed: {processed_files}/{total_files} files")

        return all_results

    def save_features(self, features: Dict[str, Dict[str, Optional[AcousticFeatures]]], output_dir: str):
        """Write extracted features to pickle files under ``output_dir``.

        One ``acoustic_features_<category>.pkl`` file is written per category,
        with every ``AcousticFeatures`` object flattened to a plain dict
        (``None`` entries are kept as ``None``).  A combined
        ``acoustic_features_all.pkl`` is written afterwards.

        Args:
            features: Mapping of category -> {filename: features or None}.
            output_dir: Destination directory; created if missing.
        """
        self.logger.info("Saving acoustic features...")

        os.makedirs(output_dir, exist_ok=True)

        exported_fields = (
            'egemaps', 'mfccs', 'log_mel', 'wav2vec2', 'prosodic', 'extraction_success',
        )

        for category, category_features in features.items():
            # Plain dicts keep the per-category files independent of the
            # AcousticFeatures class definition.
            payload = {
                filename: (
                    None
                    if feature_obj is None
                    else {field: getattr(feature_obj, field) for field in exported_fields}
                )
                for filename, feature_obj in category_features.items()
            }
            target = os.path.join(output_dir, f"acoustic_features_{category}.pkl")
            safe_save_pickle(payload, target, self.logger)

        # NOTE(review): the combined file stores the original objects, not the
        # flattened dicts used above -- confirm that readers expect this.
        safe_save_pickle(features, os.path.join(output_dir, "acoustic_features_all.pkl"), self.logger)

        self.logger.info(f"Acoustic features saved to {output_dir}")

def process_audio_batch_worker(file_paths: List[str]) -> Dict[str, Optional[AcousticFeatures]]:
    """Process one batch of audio files; used as the process-pool task.

    Args:
        file_paths: Audio file paths making up the batch.

    Returns:
        Whatever ``AcousticFeaturesService.process_files_batch`` returns for
        the batch.
    """
    # A fresh service per call means no model state is shared between workers.
    return AcousticFeaturesService().process_files_batch(file_paths)

def demonstrate_acoustic_features(audio_file_path: str, logger: Optional[logging.Logger] = None):
    """Extract features from one file and print a human-readable report.

    Args:
        audio_file_path: Audio file to analyse.
        logger: Logger to use; ``setup_logging()`` supplies one when omitted.

    Returns:
        ``None``.  An error is logged, and nothing printed, when extraction
        fails.
    """
    if logger is None:
        logger = setup_logging()

    service = AcousticFeaturesService(logger)
    display_name = os.path.basename(audio_file_path)

    logger.info(f"Demonstrating acoustic features for: {display_name}")

    features = service.extract_features_from_file(audio_file_path)
    if features is None:
        logger.error("Failed to extract features")
        return

    status = features.extraction_success

    print(f"\n=== Acoustic Features for {display_name} ===\n")

    # Section 1: eGeMAPS vector
    egemaps = features.egemaps
    print(f"1. eGeMAPS Features: {len(egemaps)} features")
    print(f"   Success: {status['egemaps']}")
    print(f"   Shape: {egemaps.shape}")
    print(f"   Sample values: {egemaps[:5]}")
    print()

    # Sections 2 and 3: dict-valued feature sets share one layout
    dict_sections = (
        ("2. MFCC Features:", 'mfccs', features.mfccs),
        ("3. Log-Mel Spectrogram Features:", 'log_mel', features.log_mel),
    )
    for heading, flag_key, table in dict_sections:
        print(heading)
        print(f"   Success: {status[flag_key]}")
        for key, values in table.items():
            print(f"   {key}: {values.shape} - {values[:5]}")
        print()

    # Section 4: Wav2Vec2 embedding
    embedding = features.wav2vec2
    print(f"4. Wav2Vec2 Features: {embedding.shape}")
    print(f"   Success: {status['wav2vec2']}")
    print(f"   Sample values: {embedding[:5]}")
    print()

    # Section 5: scalar prosodic measures
    print("5. Prosodic Features:")
    print(f"   Success: {status['prosodic']}")
    for key, value in features.prosodic.items():
        print(f"   {key}: {value:.4f}")
    print()

    # Overall tally of extractors that reported success
    successful_features = sum(status.values())
    total_features = len(status)
    print(f"Feature extraction success: {successful_features}/{total_features}")

if __name__ == "__main__":
    # Smoke test: run the demonstration on the first audio file that the
    # project configuration reports.
    from config import get_audio_file_paths

    logger = setup_logging()
    audio_files = get_audio_file_paths()

    # First file of the first non-empty category, if any.
    test_file = next((files[0] for files in audio_files.values() if files), None)

    if not test_file:
        logger.error("No audio files found for demonstration")
    else:
        demonstrate_acoustic_features(test_file, logger)

# transcription_service.py - Speech Transcription Service


In [None]:
"""
Fixed Transcription Service with better FFmpeg handling and fallback options
"""
import os
import logging
import subprocess
import sys
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from dataclasses import dataclass
from concurrent.futures import ProcessPoolExecutor, as_completed
import numpy as np
import tempfile
import shutil

@dataclass
class TranscriptionResult:
    """Outcome of transcribing one audio file, whether it succeeded or failed.

    Failed results carry empty/zero values for the transcription fields and a
    description of the problem in ``error_message``.
    """
    file_path: str  # path of the audio file as given to the transcriber
    category: str  # caller-supplied label for the file
    filename: str  # base name of ``file_path``
    transcript: str  # recognised text; "" on failure
    language: str  # language reported by the model; "" on failure
    segments: int  # number of segments in the transcription
    duration: float  # duration in seconds; 0.0 when unavailable
    confidence: float  # mean of exp(avg_logprob) over segments; 0.0 when unavailable
    success: bool  # True only when transcription completed
    error_message: Optional[str] = None  # failure reason; None on success

def check_ffmpeg_comprehensive() -> List[Tuple[str, bool, str]]:
    """Probe for FFmpeg in three independent ways.

    Returns:
        A list of ``(method_name, success, message)`` tuples, always in this
        order:

        1. ``"Direct command"`` -- runs ``ffmpeg -version``.
        2. ``"File check"``     -- looks for ``ffmpeg.exe`` in common Windows
           install locations.
        3. ``"PATH check"``     -- resolves ``ffmpeg`` through the PATH the
           same way the shell would.
    """
    methods = []

    # Method 1: actually run the binary.
    try:
        result = subprocess.run(['ffmpeg', '-version'],
                                capture_output=True, text=True, timeout=10)
        if result.returncode == 0:
            methods.append(("Direct command", True, "Working"))
        else:
            methods.append(("Direct command", False, f"Return code: {result.returncode}"))
    except FileNotFoundError:
        methods.append(("Direct command", False, "FFmpeg not found in PATH"))
    except subprocess.TimeoutExpired:
        methods.append(("Direct command", False, "Timeout"))
    except Exception as e:
        methods.append(("Direct command", False, str(e)))

    # Method 2: look for ffmpeg.exe in common install locations.
    common_paths = [
        r"C:\ffmpeg\bin\ffmpeg.exe",
        r"C:\Program Files\ffmpeg\bin\ffmpeg.exe",
        r"C:\Program Files (x86)\ffmpeg\bin\ffmpeg.exe",
    ]
    # Only add the per-user location when USERPROFILE is set; joining onto an
    # empty string would silently probe a path relative to the current
    # working directory instead.
    user_profile = os.environ.get('USERPROFILE')
    if user_profile:
        common_paths.append(os.path.join(user_profile, 'ffmpeg', 'bin', 'ffmpeg.exe'))

    found_at = next((path for path in common_paths if os.path.isfile(path)), None)
    if found_at is not None:
        methods.append(("File check", True, f"Found at {found_at}"))
    else:
        methods.append(("File check", False, "Not found in common locations"))

    # Method 3: resolve the executable through PATH.  A substring match on the
    # PATH directory names is unreliable: it reports success for any directory
    # that merely has "ffmpeg" in its name and misses installs living in
    # generically named directories.
    ffmpeg_in_path = shutil.which('ffmpeg') is not None
    methods.append(("PATH check", ffmpeg_in_path, "FFmpeg found in PATH" if ffmpeg_in_path else "No FFmpeg in PATH"))

    return methods

def install_ffmpeg_windows():
    """Try to install FFmpeg through a Windows package manager.

    Chocolatey is tried first, then winget.  A manager is used only when its
    ``--version`` probe exits with code 0.

    Returns:
        ``True`` as soon as one installer exits successfully, ``False`` when
        no manager is present or every attempted install fails.
    """
    print("Attempting to install FFmpeg automatically...")

    # (display name, probe command, install command), in order of preference
    package_managers = (
        ("Chocolatey", ['choco', '--version'], ['choco', 'install', 'ffmpeg', '-y']),
        ("Winget", ['winget', '--version'], ['winget', 'install', 'FFmpeg']),
    )

    for label, probe_cmd, install_cmd in package_managers:
        try:
            probe = subprocess.run(probe_cmd, capture_output=True, text=True)
            if probe.returncode != 0:
                continue

            print(f"{label} found. Installing FFmpeg...")
            outcome = subprocess.run(install_cmd, capture_output=True, text=True)
            if outcome.returncode == 0:
                print(f"✓ FFmpeg installed successfully via {label}!")
                return True

            print(f"{label} install failed: {outcome.stderr}")
        except FileNotFoundError:
            # The manager's executable is not on this machine.
            print(f"{label} not found.")

    return False

def try_convert_audio_manually(input_path: str, output_path: str) -> bool:
    """Convert ``input_path`` to a WAV file at ``output_path`` using pydub.

    The conversion is attempted only when pydub can locate an ``ffmpeg`` or
    ``avconv`` executable.

    Returns:
        ``True`` when the file was exported, ``False`` when pydub is not
        installed, no converter executable is found, or the conversion raises.
    """
    converted = False
    try:
        from pydub import AudioSegment
        from pydub.utils import which

        encoder = which("ffmpeg") or which("avconv")
        if encoder:
            AudioSegment.from_file(input_path).export(output_path, format="wav")
            converted = True
    except ImportError:
        # pydub is optional; without it there is nothing to try.
        pass
    except Exception as exc:
        print(f"Manual conversion failed: {exc}")

    return converted

class TranscriptionService:
    """Whisper-based transcription service with FFmpeg detection and fallbacks.

    On construction the service probes for FFmpeg (attempting an automatic
    install on Windows when it is missing), creates a temporary working
    directory when it has to fall back to FFmpeg-free processing, and loads
    the requested Whisper model.  Call :meth:`cleanup` when finished to
    remove the temporary directory.
    """

    def __init__(self, model_size: Optional[str] = None, logger: Optional[logging.Logger] = None):
        """Create the service.

        Args:
            model_size: Whisper model name; defaults to ``"base"``.
            logger: Logger to use; a basic one is configured when omitted.
        """
        self.logger = logger or self.setup_logging()
        self.model_size = model_size or "base"
        self.whisper_model = None
        self.ffmpeg_available = False
        self.temp_dir = None

        # Comprehensive FFmpeg check
        self._check_ffmpeg_comprehensive()

        # Initialize model
        self._initialize_model()

    def setup_logging(self):
        """Configure root logging at INFO level and return this module's logger."""
        logging.basicConfig(level=logging.INFO)
        return logging.getLogger(__name__)

    def _check_ffmpeg_comprehensive(self):
        """Detect FFmpeg, try to install it on Windows, and set up the fallback.

        Sets ``self.ffmpeg_available``.  When FFmpeg is still unavailable at
        the end, a temporary directory for converted audio is created.
        """
        self.logger.info("Performing comprehensive FFmpeg check...")

        methods = check_ffmpeg_comprehensive()

        for method, success, message in methods:
            status = "✓" if success else "✗"
            self.logger.info(f"{status} {method}: {message}")

        # If any method succeeds, consider FFmpeg available.
        # NOTE(review): a positive "File check" alone does not put ffmpeg on
        # PATH, so a tool that launches `ffmpeg` by name may still fail --
        # confirm whether this should rely on "Direct command" only.
        self.ffmpeg_available = any(success for _, success, _ in methods)

        if not self.ffmpeg_available:
            self.logger.warning("FFmpeg not detected. Attempting automatic installation...")

            if sys.platform.startswith('win'):
                if install_ffmpeg_windows():
                    # Re-check after installation
                    methods = check_ffmpeg_comprehensive()
                    self.ffmpeg_available = any(success for _, success, _ in methods)

                    if self.ffmpeg_available:
                        self.logger.info("✓ FFmpeg now available after installation!")
                    else:
                        self.logger.error("FFmpeg installation failed.")

            if not self.ffmpeg_available:
                self.logger.error("FFmpeg is required for Whisper. Trying fallback methods...")
                self._setup_fallback_audio_processing()

    def _setup_fallback_audio_processing(self):
        """Create the temporary directory used for converted audio files."""
        self.temp_dir = tempfile.mkdtemp()
        self.logger.info(f"Created temporary directory for audio processing: {self.temp_dir}")

    def _initialize_model(self):
        """Load the Whisper model; leave ``self.whisper_model`` as ``None`` on failure."""
        try:
            import whisper
            self.logger.info(f"Loading Whisper model: {self.model_size}")
            self.whisper_model = whisper.load_model(self.model_size)
            self.logger.info("✓ Whisper model loaded successfully")
        except Exception as e:
            self.logger.error(f"Failed to load Whisper model: {e}")
            self.whisper_model = None

    def validate_audio_file(self, file_path: str) -> Tuple[bool, str]:
        """Check that ``file_path`` is an existing, non-trivial audio file.

        Returns:
            ``(True, "Valid")`` when the path is a regular file of at least
            1000 bytes with a supported extension; otherwise ``(False, reason)``.
        """
        if not os.path.exists(file_path):
            return False, f"File does not exist: {file_path}"

        if not os.path.isfile(file_path):
            return False, f"Path is not a file: {file_path}"

        try:
            size = os.path.getsize(file_path)
            if size == 0:
                return False, f"File is empty: {file_path}"
            if size < 1000:
                return False, f"File too small ({size} bytes): {file_path}"
        except OSError as e:
            return False, f"Cannot access file: {e}"

        valid_extensions = {'.wav', '.mp3', '.flac', '.m4a', '.mp4', '.avi', '.mov'}
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in valid_extensions:
            return False, f"Unsupported file format: {file_ext}"

        return True, "Valid"

    def _remove_temp_file(self, candidate_path: Optional[str], original_path: str) -> None:
        """Best-effort removal of a converted file; never touches the original."""
        if not candidate_path or candidate_path == original_path:
            return
        try:
            os.remove(candidate_path)
        except FileNotFoundError:
            pass
        except OSError as e:
            self.logger.warning(f"Could not remove temporary file {candidate_path}: {e}")

    def _prepare_audio_for_whisper(self, audio_path: str) -> Tuple[bool, str, str]:
        """Return a path Whisper should be given for ``audio_path``.

        Returns:
            ``(ok, path, method)``.  ``path`` equals ``audio_path`` unless a
            converted copy was written to the temporary directory, in which
            case the caller is responsible for removing it.
        """
        # If FFmpeg is available, use original path
        if self.ffmpeg_available:
            return True, audio_path, "Using original file with FFmpeg"

        # Fallback: try to process without FFmpeg
        self.logger.warning("Attempting to process audio without FFmpeg...")

        # For WAV files, try direct processing first
        if audio_path.lower().endswith('.wav'):
            try:
                with open(audio_path, 'rb') as f:
                    header = f.read(44)  # WAV header is typically 44 bytes
                if header[:4] == b'RIFF' and header[8:12] == b'WAVE':
                    self.logger.info("Valid WAV file detected, attempting direct processing")
                    return True, audio_path, "Direct WAV processing"
            except Exception as e:
                self.logger.warning(f"Cannot read WAV header: {e}")

        # Try pydub fallback
        temp_path = None
        try:
            from pydub import AudioSegment
            self.logger.info("Attempting conversion using pydub...")

            # Load audio with pydub
            audio = AudioSegment.from_file(audio_path)

            # The working directory may be missing (e.g. cleanup() already
            # ran, or availability changed after construction); recreate it.
            if not self.temp_dir or not os.path.isdir(self.temp_dir):
                self._setup_fallback_audio_processing()

            # Convert to standard WAV format
            temp_path = os.path.join(self.temp_dir, f"temp_{os.path.basename(audio_path)}")
            if not temp_path.endswith('.wav'):
                temp_path = os.path.splitext(temp_path)[0] + '.wav'

            # Export as standard WAV
            audio.export(temp_path, format="wav", parameters=["-ar", "16000", "-ac", "1"])

            if os.path.exists(temp_path) and os.path.getsize(temp_path) > 0:
                self.logger.info(f"Successfully converted audio to: {temp_path}")
                return True, temp_path, "Converted using pydub"

            # An empty or missing export is useless; do not leave it behind.
            self._remove_temp_file(temp_path, audio_path)

        except ImportError:
            self.logger.error("pydub not available for audio conversion")
        except Exception as e:
            self.logger.error(f"Audio conversion failed: {e}")
            self._remove_temp_file(temp_path, audio_path)

        return False, audio_path, "No conversion method available"

    @staticmethod
    def _mean_confidence(segments) -> float:
        """Average ``exp(avg_logprob)`` over segments that report it; 0.0 if none do."""
        log_probs = [seg["avg_logprob"] for seg in segments if "avg_logprob" in seg]
        if not log_probs:
            return 0.0
        return float(np.mean(np.exp(log_probs)))

    @staticmethod
    def _result_duration(result, segments) -> float:
        """Duration in seconds for a transcription result.

        Uses ``result["duration"]`` when present; otherwise falls back to the
        latest segment end time, because the Whisper result dictionary is
        documented with ``text``, ``segments`` and ``language`` only.
        """
        duration = result.get("duration")
        if duration is None and segments:
            duration = max(seg.get("end", 0.0) for seg in segments)
        return float(duration or 0.0)

    def _failure_result(self, audio_path: str, category: str, error_message: str) -> "TranscriptionResult":
        """Build the ``TranscriptionResult`` returned for any failed attempt."""
        return TranscriptionResult(
            file_path=audio_path,
            category=category,
            filename=os.path.basename(audio_path),
            transcript="",
            language="",
            segments=0,
            duration=0.0,
            confidence=0.0,
            success=False,
            error_message=error_message
        )

    def transcribe_audio_file(self, audio_path: str, category: str = "") -> "TranscriptionResult":
        """Transcribe one audio file.

        Args:
            audio_path: Path of the audio file.
            category: Label copied into the result.

        Returns:
            A ``TranscriptionResult``.  This method does not raise for
            validation, preparation, or transcription errors; they are
            reported through ``success=False`` and ``error_message``.
        """
        filename = os.path.basename(audio_path)

        self.logger.debug(f"Attempting to transcribe: {audio_path}")

        # Validate file
        is_valid, validation_message = self.validate_audio_file(audio_path)
        if not is_valid:
            self.logger.error(f"Validation failed for {filename}: {validation_message}")
            return self._failure_result(audio_path, category, validation_message)

        if self.whisper_model is None:
            return self._failure_result(audio_path, category, "Whisper model not available")

        # Prepare audio file
        processed, processed_path, method = self._prepare_audio_for_whisper(audio_path)
        if not processed:
            return self._failure_result(
                audio_path, category, f"Could not prepare audio file: {method}"
            )

        try:
            self.logger.info(f"Transcribing {filename} using {method}...")

            # Transcribe with Whisper
            result = self.whisper_model.transcribe(
                processed_path,
                fp16=False,
                language=None,
                task="transcribe",
                verbose=False
            )

            transcript_text = result["text"].strip()
            segments = result.get("segments") or []
            confidence = self._mean_confidence(segments)

            self.logger.info(f"✓ Successfully transcribed {filename}")

            return TranscriptionResult(
                file_path=audio_path,
                category=category,
                filename=filename,
                transcript=transcript_text,
                language=result.get("language", "unknown"),
                segments=len(segments),
                duration=self._result_duration(result, segments),
                confidence=confidence,
                success=True
            )

        except Exception as e:
            self.logger.error(f"Transcription failed for {filename}: {e}")
            self.logger.error(f"Error type: {type(e).__name__}")
            return self._failure_result(audio_path, category, str(e))

        finally:
            # Remove the converted copy (if one was made) on every exit path.
            self._remove_temp_file(processed_path, audio_path)

    def cleanup(self):
        """Remove the temporary directory, if one exists.  Safe to call repeatedly."""
        # getattr: __del__ may run on an instance whose __init__ never got as
        # far as assigning temp_dir.
        temp_dir = getattr(self, "temp_dir", None)
        if not temp_dir:
            return

        if os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                self.logger.warning(f"Failed to cleanup temporary directory: {e}")
                return
            self.logger.info("Cleaned up temporary directory")

        # Forget the directory so a later conversion recreates it.
        self.temp_dir = None

    def __del__(self):
        """Destructor to ensure cleanup"""
        try:
            self.cleanup()
        except Exception:
            # Destructors can run during interpreter shutdown when module
            # globals are already torn down; nothing useful can be done here.
            pass

def demonstrate_fixed_transcription(audio_file_path: str, logger: Optional[logging.Logger] = None):
    """Transcribe one file and print a summary of the outcome.

    Args:
        audio_file_path: Audio file to transcribe.
        logger: Logger to use; a basic INFO-level logger is configured when
            omitted.

    The service's temporary resources are released before returning, even
    when transcription raises.
    """
    if logger is None:
        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger(__name__)

    logger.info("=== FIXED TRANSCRIPTION DEMONSTRATION ===")

    # Basic facts about the input, useful when diagnosing path problems.
    file_exists = os.path.exists(audio_file_path)
    size_text = os.path.getsize(audio_file_path) if file_exists else 'N/A'
    logger.info(f"Input file: {audio_file_path}")
    logger.info(f"Exists: {file_exists}")
    logger.info(f"Size: {size_text}")

    service = TranscriptionService(logger=logger)

    try:
        result = service.transcribe_audio_file(audio_file_path, "demo")

        print(f"\n=== Transcription Result for {result.filename} ===")
        print(f"Success: {result.success}")
        print(f"Language: {result.language}")
        print(f"Duration: {result.duration:.2f} seconds")
        print(f"Segments: {result.segments}")
        print(f"Confidence: {result.confidence:.3f}")

        if not result.success:
            print(f"Error: {result.error_message}")

            # Errors mentioning FFmpeg or a missing file get extra guidance.
            error_text = result.error_message.lower()
            if "ffmpeg" in error_text or "file specified" in error_text:
                guidance = (
                    "\n=== TROUBLESHOOTING GUIDANCE ===",
                    "This appears to be an FFmpeg issue. Try:",
                    "1. Install FFmpeg using the instructions above",
                    "2. Install pydub for fallback processing: pip install pydub",
                    "3. Restart your Python environment after installing FFmpeg",
                )
                for line in guidance:
                    print(line)
        else:
            transcript = result.transcript
            print(f"Transcript ({len(transcript)} chars, {len(transcript.split())} words):")
            print(f'"{transcript}"')

    finally:
        # Release the temporary directory no matter how we got here.
        service.cleanup()

if __name__ == "__main__":
    # Manual test entry point: pick an audio file from the project config when
    # possible, otherwise ask the user for one.
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

    try:
        import importlib.util

        # config.py is expected next to this script.
        config_path = os.path.join(os.path.dirname(__file__), 'config.py')

        if not os.path.exists(config_path):
            logger.info("Config file not found, asking for manual input...")
            print("Please enter the full path to an audio file:")
            test_file = input().strip().strip('"')

            if test_file and os.path.exists(test_file):
                demonstrate_fixed_transcription(test_file, logger)
            else:
                logger.error(f"File not found: {test_file}")

        else:
            spec = importlib.util.spec_from_file_location("config", config_path)
            config = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(config)

            logger.info("Getting audio file paths from config...")
            audio_files = config.get_audio_file_paths()

            # First listed file, across all categories, that exists on disk.
            test_file = next(
                (
                    candidate
                    for files in audio_files.values()
                    for candidate in files
                    if os.path.exists(candidate)
                ),
                None,
            )

            if test_file:
                logger.info(f"Found test file: {test_file}")
                demonstrate_fixed_transcription(test_file, logger)
            else:
                logger.error("No valid audio files found in config!")

    except Exception as e:
        logger.error(f"Error during execution: {e}")
        logger.info("Falling back to manual file input...")

        print("Please enter the full path to an audio file:")
        test_file = input().strip().strip('"')

        if test_file and os.path.exists(test_file):
            demonstrate_fixed_transcription(test_file, logger)
        else:
            logger.error(f"File not found: {test_file}")

 python .\transcription_service.py
2025-07-24 09:51:08,495 - __main__ - INFO - Getting audio file paths...
2025-07-24 09:51:08,496 - config - INFO - Scanning for audio files...
2025-07-24 09:51:08,497 - config - INFO - Scanning diagnosis_ad: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad
2025-07-24 09:51:08,532 - config - INFO - Found 87 valid WAV files in C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad
2025-07-24 09:51:08,533 - config - INFO - Scanning diagnosis_cn: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn
2025-07-24 09:51:08,565 - config - INFO - Found 79 valid WAV files in C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn
2025-07-24 09:51:08,566 - config - INFO - Scanning progression_decline: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline
2025-07-24 09:51:08,574 - config - INFO - Found 15 valid WAV files in C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline
2025-07-24 09:51:08,575 - config - INFO - Scanning progression_no_decline: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline
2025-07-24 09:51:08,599 - config - INFO - Found 58 valid WAV files in C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline
2025-07-24 09:51:08,600 - config - INFO - Scanning progression_test: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio
2025-07-24 09:51:08,614 - config - INFO - Found 32 valid WAV files in C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio
2025-07-24 09:51:08,615 - config - INFO - Total audio files found: 271
2025-07-24 09:51:08,615 - config - INFO -   diagnosis_ad: 87 files
2025-07-24 09:51:08,616 - config - INFO -   diagnosis_cn: 79 files
2025-07-24 09:51:08,616 - config - INFO -   progression_decline: 15 files
2025-07-24 09:51:08,617 - config - INFO -   progression_no_decline: 58 files
2025-07-24 09:51:08,617 - config - INFO -   progression_test: 32 files
2025-07-24 09:51:08,618 - __main__ - INFO - === DEBUGGING FILE PATHS ===
2025-07-24 09:51:08,618 - __main__ - INFO -
Category: diagnosis_ad
2025-07-24 09:51:08,619 - __main__ - INFO - Number of files: 87
2025-07-24 09:51:08,620 - __main__ - INFO -   File 1: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso024.wav
2025-07-24 09:51:08,621 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,621 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,623 - __main__ - INFO -     Size: 20802638 bytes
2025-07-24 09:51:08,623 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso024.wav
2025-07-24 09:51:08,624 - __main__ - INFO -   File 2: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso025.wav
2025-07-24 09:51:08,625 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,626 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,627 - __main__ - INFO -     Size: 33719272 bytes
2025-07-24 09:51:08,627 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso025.wav
2025-07-24 09:51:08,628 - __main__ - INFO -   File 3: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso027.wav
2025-07-24 09:51:08,629 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,630 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,631 - __main__ - INFO -     Size: 14549992 bytes
2025-07-24 09:51:08,631 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso027.wav
2025-07-24 09:51:08,632 - __main__ - INFO -   ... and 84 more files
2025-07-24 09:51:08,632 - __main__ - INFO -
Category: diagnosis_cn
2025-07-24 09:51:08,633 - __main__ - INFO - Number of files: 79
2025-07-24 09:51:08,634 - __main__ - INFO -   File 1: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn\adrso002.wav
2025-07-24 09:51:08,635 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,636 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,637 - __main__ - INFO -     Size: 6884840 bytes
2025-07-24 09:51:08,637 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn\adrso002.wav
2025-07-24 09:51:08,638 - __main__ - INFO -   File 2: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn\adrso003.wav
2025-07-24 09:51:08,639 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,640 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,641 - __main__ - INFO -     Size: 2965714 bytes
2025-07-24 09:51:08,641 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn\adrso003.wav
2025-07-24 09:51:08,642 - __main__ - INFO -   File 3: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn\adrso005.wav
2025-07-24 09:51:08,643 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,644 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,645 - __main__ - INFO -     Size: 5447768 bytes
2025-07-24 09:51:08,645 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn\adrso005.wav
2025-07-24 09:51:08,646 - __main__ - INFO -   ... and 76 more files
2025-07-24 09:51:08,647 - __main__ - INFO -
Category: progression_decline
2025-07-24 09:51:08,647 - __main__ - INFO - Number of files: 15
2025-07-24 09:51:08,648 - __main__ - INFO -   File 1: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline\adrsp003.wav
2025-07-24 09:51:08,649 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,649 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,650 - __main__ - INFO -     Size: 50457688 bytes
2025-07-24 09:51:08,651 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline\adrsp003.wav
2025-07-24 09:51:08,652 - __main__ - INFO -   File 2: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline\adrsp051.wav
2025-07-24 09:51:08,653 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,654 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,655 - __main__ - INFO -     Size: 8027224 bytes
2025-07-24 09:51:08,655 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline\adrsp051.wav
2025-07-24 09:51:08,656 - __main__ - INFO -   File 3: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline\adrsp055.wav
2025-07-24 09:51:08,657 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,658 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,659 - __main__ - INFO -     Size: 42928216 bytes
2025-07-24 09:51:08,660 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline\adrsp055.wav
2025-07-24 09:51:08,661 - __main__ - INFO -   ... and 12 more files
2025-07-24 09:51:08,662 - __main__ - INFO -
Category: progression_no_decline
2025-07-24 09:51:08,664 - __main__ - INFO - Number of files: 58
2025-07-24 09:51:08,665 - __main__ - INFO -   File 1: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline\adrsp001.wav
2025-07-24 09:51:08,666 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,667 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,669 - __main__ - INFO -     Size: 54982804 bytes
2025-07-24 09:51:08,669 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline\adrsp001.wav
2025-07-24 09:51:08,670 - __main__ - INFO -   File 2: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline\adrsp007.wav
2025-07-24 09:51:08,671 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,672 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,674 - __main__ - INFO -     Size: 46605400 bytes
2025-07-24 09:51:08,674 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline\adrsp007.wav
2025-07-24 09:51:08,675 - __main__ - INFO -   File 3: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline\adrsp019.wav
2025-07-24 09:51:08,677 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,678 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,679 - __main__ - INFO -     Size: 43020376 bytes
2025-07-24 09:51:08,679 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline\adrsp019.wav
2025-07-24 09:51:08,680 - __main__ - INFO -   ... and 55 more files
2025-07-24 09:51:08,681 - __main__ - INFO -
Category: progression_test
2025-07-24 09:51:08,682 - __main__ - INFO - Number of files: 32
2025-07-24 09:51:08,683 - __main__ - INFO -   File 1: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio\adrspt1.wav
2025-07-24 09:51:08,684 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,685 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,686 - __main__ - INFO -     Size: 45951124 bytes
2025-07-24 09:51:08,687 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio\adrspt1.wav
2025-07-24 09:51:08,688 - __main__ - INFO -   File 2: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio\adrspt10.wav
2025-07-24 09:51:08,689 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,691 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,692 - __main__ - INFO -     Size: 57692248 bytes
2025-07-24 09:51:08,692 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio\adrspt10.wav
2025-07-24 09:51:08,693 - __main__ - INFO -   File 3: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio\adrspt11.wav
2025-07-24 09:51:08,694 - __main__ - INFO -     Exists: True
2025-07-24 09:51:08,696 - __main__ - INFO -     Is file: True
2025-07-24 09:51:08,697 - __main__ - INFO -     Size: 45315160 bytes
2025-07-24 09:51:08,698 - __main__ - INFO -     Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio\adrspt11.wav
2025-07-24 09:51:08,699 - __main__ - INFO -   ... and 29 more files
2025-07-24 09:51:08,700 - __main__ - INFO - === END DEBUG ===

2025-07-24 09:51:08,701 - __main__ - INFO - Found valid test file: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso024.wav
2025-07-24 09:51:08,702 - __main__ - INFO - === TRANSCRIPTION DEMONSTRATION WITH DEBUG ===
2025-07-24 09:51:08,703 - __main__ - INFO - Input file path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso024.wav
2025-07-24 09:51:08,704 - __main__ - INFO - Absolute path: C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso024.wav
2025-07-24 09:51:08,705 - __main__ - INFO - File exists: True
2025-07-24 09:51:08,706 - __main__ - INFO - Is file: True
2025-07-24 09:51:08,707 - __main__ - INFO - File size: 20802638 bytes
2025-07-24 09:51:08,708 - __main__ - INFO - File extension: .wav
2025-07-24 09:51:08,709 - __main__ - INFO - Current working directory: C:\Users\Administrator\Desktop\Speech
2025-07-24 09:51:08,710 - __main__ - INFO - Contents of directory C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad:
2025-07-24 09:51:08,711 - __main__ - INFO -   adrso024.wav (FILE)
2025-07-24 09:51:08,712 - __main__ - INFO -   adrso025.wav (FILE)
2025-07-24 09:51:08,714 - __main__ - INFO -   adrso027.wav (FILE)
2025-07-24 09:51:08,715 - __main__ - INFO -   adrso028.wav (FILE)
2025-07-24 09:51:08,716 - __main__ - INFO -   adrso031.wav (FILE)
2025-07-24 09:51:08,717 - __main__ - INFO -   adrso032.wav (FILE)
2025-07-24 09:51:08,718 - __main__ - INFO -   adrso033.wav (FILE)
2025-07-24 09:51:08,718 - __main__ - INFO -   adrso035.wav (FILE)
2025-07-24 09:51:08,719 - __main__ - INFO -   adrso036.wav (FILE)
2025-07-24 09:51:08,720 - __main__ - INFO -   adrso039.wav (FILE)
2025-07-24 09:51:08,721 - __main__ - INFO - Checking prerequisites...
2025-07-24 09:51:16,740 - __main__ - INFO - ✓ Whisper is available
2025-07-24 09:51:16,746 - __main__ - ERROR - ✗ FFmpeg not found or not working properly
2025-07-24 09:51:16,746 - __main__ - ERROR -
    FFmpeg is required for Whisper to work properly. Please install it:

    Option 1 - Using Chocolatey (Recommended for Windows):
    1. Open PowerShell as Administrator
    2. Install Chocolatey if not already installed:
       Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
    3. Install FFmpeg: choco install ffmpeg
    4. Restart your command prompt/IDE

    Option 2 - Manual Installation:
    1. Download FFmpeg from https://ffmpeg.org/download.html
    2. Extract to a folder (e.g., C:\ffmpeg)
    3. Add C:\ffmpeg\bin to your system PATH environment variable
    4. Restart your command prompt/IDE

    Option 3 - Using conda:
    conda install ffmpeg

2025-07-24 09:51:16,750 - __main__ - INFO - Loading Whisper model: base
2025-07-24 09:51:18,696 - __main__ - INFO - ✓ Whisper model loaded successfully
2025-07-24 09:51:18,698 - __main__ - INFO - Transcribing adrso024.wav...
2025-07-24 09:51:18,710 - __main__ - ERROR - Transcription failed for adrso024.wav: [WinError 2] The system cannot find the file specified      
2025-07-24 09:51:18,710 - __main__ - ERROR - Error type: FileNotFoundError

=== Transcription Result for adrso024.wav ===
Success: False
Language:
Duration: 0.00 seconds
Segments: 0
Confidence: 0.000
Error: [WinError 2] The system cannot find the file specified

(env) PS C:\Users\Administrator\Desktop\Speech>

## Revised - Grok

In [None]:
import os
import json
import numpy as np
from typing import Dict, List, Optional, Any
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
import logging

# Whisper import with error handling
try:
    import whisper
    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False

from config import MODEL_CONFIG, SYSTEM_CONFIG, PATH_CONFIG
from utils import (setup_logging, monitor_memory_usage, cleanup_memory,
                   safe_save_json, safe_save_pickle, ProcessingTimer,
                   create_progress_bar, validate_audio_file)

@dataclass
class TranscriptionResult:
    """Outcome of transcribing a single audio file.

    ``TranscriptionService`` builds one of these for every file it is asked
    to transcribe, whether the attempt succeeded or failed.  On failure the
    text and metric fields hold empty/zero placeholders and
    ``error_message`` carries the reason.
    """
    file_path: str  # path of the audio file exactly as it was passed in
    category: str  # caller-supplied label for the file's group
    filename: str  # basename of ``file_path``
    transcript: str  # stripped transcript text; "" on failure
    language: str  # language reported by Whisper; "" on failure
    segments: int  # number of segments in the Whisper result
    duration: float  # duration value taken from the Whisper result; 0.0 on failure
    confidence: float  # mean of exp(avg_logprob) over segments; 0.0 if unavailable
    success: bool  # True when transcription completed without an exception
    error_message: Optional[str] = None  # failure reason; stays None on success

class TranscriptionService:
    """Transcribe audio files with an OpenAI Whisper model.

    The model is loaded once in the constructor.  If Whisper is not
    installed or the model cannot be loaded, ``whisper_model`` stays ``None``
    and every transcription request returns a failed ``TranscriptionResult``
    instead of raising.
    """

    def __init__(self, model_size: Optional[str] = None, logger: Optional[logging.Logger] = None):
        """Create the service and load the Whisper model.

        Args:
            model_size: Whisper model name; falls back to
                ``MODEL_CONFIG.whisper_model_size`` when omitted.
            logger: Logger to use; a new one from ``setup_logging()`` is
                created when omitted.
        """
        self.logger = logger or setup_logging()
        self.model_size = model_size or MODEL_CONFIG.whisper_model_size
        self.whisper_model = None

        self._initialize_model()

    def _initialize_model(self):
        """Load the Whisper model, leaving ``whisper_model`` as None on failure."""
        if not WHISPER_AVAILABLE:
            self.logger.error("Whisper not available. Please install: pip install openai-whisper")
            return

        try:
            self.logger.info(f"Loading Whisper model: {self.model_size}")
            self.whisper_model = whisper.load_model(self.model_size)
            self.logger.info("✓ Whisper model loaded successfully")
        except Exception as e:
            self.logger.error(f"Failed to load Whisper model: {e}")
            self.whisper_model = None

    @staticmethod
    def _failure_result(audio_path: str, category: str, error_message: str) -> TranscriptionResult:
        """Build the placeholder result used for every kind of failure."""
        return TranscriptionResult(
            file_path=audio_path,
            category=category,
            filename=os.path.basename(audio_path),
            transcript="",
            language="",
            segments=0,
            duration=0.0,
            confidence=0.0,
            success=False,
            error_message=error_message
        )

    @staticmethod
    def _mean_confidence(segments: List[Dict[str, Any]]) -> float:
        """Return the mean of exp(avg_logprob) over segments, or 0.0 if none report it."""
        log_probs = [seg["avg_logprob"] for seg in segments if "avg_logprob" in seg]
        if not log_probs:
            return 0.0
        # exp() turns the average log-probability into a rough 0..1 score.
        return float(np.mean(np.exp(log_probs)))

    @staticmethod
    def _audio_duration(result: Dict[str, Any], segments: List[Dict[str, Any]]) -> float:
        """Return the transcribed duration in seconds.

        Whisper's ``transcribe()`` result carries ``text``, ``segments`` and
        ``language`` but no top-level ``duration``, so the end time of the
        latest segment is used when that key is absent.  This is the end of
        the last transcribed speech, which can be slightly shorter than the
        audio file itself.
        """
        duration = result.get("duration")
        if duration is None:
            duration = max((seg.get("end", 0.0) for seg in segments), default=0.0)
        return float(duration)

    def transcribe_audio_file(self, audio_path: str, category: str = "") -> TranscriptionResult:
        """Transcribe a single audio file.

        Args:
            audio_path: Path of the audio file.
            category: Label stored on the result.

        Returns:
            A ``TranscriptionResult``; ``success`` is False (with
            ``error_message`` set) when the file is invalid, the model is
            unavailable, or Whisper raises.  This method does not raise for
            transcription errors.
        """
        filename = os.path.basename(audio_path)

        # Validate file with enhanced logging
        if not validate_audio_file(audio_path):
            self.logger.error(f"Invalid audio file: {audio_path} (Exists: {os.path.exists(audio_path)}, Size: {os.path.getsize(audio_path) if os.path.exists(audio_path) else 0})")
            return self._failure_result(audio_path, category, "Invalid or missing audio file")

        if self.whisper_model is None:
            return self._failure_result(audio_path, category, "Whisper model not available")

        try:
            self.logger.debug(f"Transcribing {filename}...")

            result = self.whisper_model.transcribe(
                audio_path,
                fp16=False,  # Use fp32 for better compatibility
                language=None,  # Auto-detect language
                task="transcribe"
            )

            segments = result.get("segments", [])

            return TranscriptionResult(
                file_path=audio_path,
                category=category,
                filename=filename,
                transcript=result["text"].strip(),
                language=result.get("language", "unknown"),
                segments=len(segments),
                duration=self._audio_duration(result, segments),
                confidence=self._mean_confidence(segments),
                success=True
            )

        except FileNotFoundError as e:
            # The input file passed validation above, so a missing-file error
            # here most likely comes from the ffmpeg executable that Whisper
            # spawns to decode audio.  Say so instead of blaming the audio file.
            message = f"{e} (is the FFmpeg executable installed and on PATH? Whisper needs it to decode audio)"
            self.logger.error(f"Transcription failed for {filename}: {message}")
            return self._failure_result(audio_path, category, message)
        except Exception as e:
            self.logger.error(f"Transcription failed for {filename}: {e}")
            return self._failure_result(audio_path, category, str(e))

    def transcribe_files_batch(self, file_paths: List[str], category: str = "") -> Dict[str, TranscriptionResult]:
        """Transcribe files one after another in the current process.

        Args:
            file_paths: Audio files to transcribe.
            category: Label stored on every result.

        Returns:
            Mapping of file basename to its ``TranscriptionResult``.  Files
            sharing a basename overwrite each other.
        """
        results = {}

        for file_path in file_paths:
            filename = os.path.basename(file_path)

            try:
                results[filename] = self.transcribe_audio_file(file_path, category)

                # Memory management
                if len(results) % 5 == 0:  # More frequent cleanup for transcription
                    cleanup_memory()

            except Exception as e:
                self.logger.error(f"Error processing {filename}: {e}")
                results[filename] = self._failure_result(file_path, category, str(e))

        return results

    def transcribe_all_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Dict[str, TranscriptionResult]]:
        """Transcribe every category of files using worker processes.

        Each category is split into small batches that are submitted to a
        ``ProcessPoolExecutor``.  A batch whose worker fails is recorded as
        one failed result per file, so no input silently disappears from the
        output.

        Args:
            audio_files: Mapping of category name to the audio paths in it.

        Returns:
            Mapping of category name to ``{basename: TranscriptionResult}``.
            Categories with no files are omitted.
        """
        self.logger.info("Starting parallel transcription...")

        all_results = {}
        total_files = sum(len(files) for files in audio_files.values())

        # Use smaller batches for transcription to manage memory better
        batch_size = max(1, SYSTEM_CONFIG.chunk_size // 2)
        # Use fewer workers for transcription as it's memory intensive; the
        # outer max() keeps the pool valid when max_workers is configured as 1.
        max_workers = max(1, min(SYSTEM_CONFIG.max_workers // 2, 4))

        with ProcessingTimer("Complete transcription process", self.logger):
            for category, file_paths in audio_files.items():
                if not file_paths:
                    continue

                self.logger.info(f"Transcribing {category}: {len(file_paths)} files")
                category_results = {}

                batches = [file_paths[i:i + batch_size] for i in range(0, len(file_paths), batch_size)]

                with ProcessPoolExecutor(max_workers=max_workers) as executor:
                    # Submit batch jobs
                    future_to_batch = {
                        executor.submit(transcribe_batch_worker, batch, category, self.model_size): batch
                        for batch in batches
                    }

                    # Progress tracking
                    progress_bar = create_progress_bar(len(batches), f"Transcribing {category}")

                    try:
                        # Collect results
                        for future in as_completed(future_to_batch):
                            batch = future_to_batch[future]
                            try:
                                batch_results = future.result()
                            except Exception as e:
                                self.logger.error(f"Batch transcription failed: {e}")
                                # Keep the files of a failed batch visible in the output.
                                for path in batch:
                                    category_results[os.path.basename(path)] = self._failure_result(
                                        path, category, f"Batch transcription failed: {e}"
                                    )
                            else:
                                category_results.update(batch_results)

                            progress_bar.update(1)

                            # Memory monitoring
                            memory_info = monitor_memory_usage()
                            if memory_info['percent_used'] > 75:
                                self.logger.warning(f"High memory usage: {memory_info['percent_used']:.1f}%")
                                cleanup_memory()
                    finally:
                        progress_bar.close()

                all_results[category] = category_results

                # Log category completion stats
                successful = sum(1 for result in category_results.values() if result.success)
                self.logger.info(f"Completed {category}: {successful}/{len(category_results)} successful")

        # Final cleanup
        cleanup_memory()

        # Log overall stats
        total_successful = sum(
            sum(1 for result in category_results.values() if result.success)
            for category_results in all_results.values()
        )
        self.logger.info(f"Transcription completed: {total_successful}/{total_files} successful")

        return all_results

    def save_transcriptions(self, transcriptions: Dict[str, Dict[str, TranscriptionResult]], output_dir: str):
        """Write transcriptions to disk in several formats.

        Under ``<output_dir>/transcripts`` this writes one ``.txt`` file per
        successful non-empty transcript, one JSON file per category, a
        consolidated ``all_transcripts.json`` and ``transcripts.pkl``.  A
        ``transcript_summary.csv`` is written directly in ``output_dir`` when
        pandas is importable.

        Args:
            transcriptions: Mapping of category to ``{filename: result}``.
            output_dir: Directory that receives the output files.

        Returns:
            Dict keyed by ``"<category>_<filename>"`` holding the plain-dict
            form of every result.
        """
        self.logger.info("Saving transcriptions...")

        # Create output directories
        transcripts_dir = os.path.join(output_dir, "transcripts")
        os.makedirs(transcripts_dir, exist_ok=True)

        all_transcripts = {}
        transcript_summary = []

        for category, category_results in transcriptions.items():
            category_transcripts = {}

            for filename, result in category_results.items():
                file_id = f"{category}_{filename}"

                # Convert to dictionary for JSON serialization
                transcript_dict = {
                    'file_path': result.file_path,
                    'category': result.category,
                    'filename': result.filename,
                    'transcript': result.transcript,
                    'language': result.language,
                    'segments': result.segments,
                    'duration': result.duration,
                    'confidence': result.confidence,
                    'success': result.success,
                    'error_message': result.error_message
                }

                category_transcripts[filename] = transcript_dict
                all_transcripts[file_id] = transcript_dict

                # Add to summary
                transcript_summary.append({
                    'File_ID': file_id,
                    'Category': result.category,
                    'Filename': result.filename,
                    'Success': result.success,
                    'Language': result.language,
                    'Duration': result.duration,
                    'Transcript_Length': len(result.transcript),
                    'Word_Count': len(result.transcript.split()) if result.transcript else 0,
                    'Segments': result.segments,
                    'Confidence': result.confidence,
                    'Error': result.error_message if result.error_message else "",
                    'Transcript_Preview': (result.transcript[:100] + "...") if len(result.transcript) > 100 else result.transcript
                })

                # Save individual transcript file
                if result.success and result.transcript:
                    transcript_file = os.path.join(transcripts_dir, f"{file_id}_transcript.txt")
                    try:
                        with open(transcript_file, 'w', encoding='utf-8') as f:
                            f.write(result.transcript)
                    except Exception as e:
                        # Best effort: one unwritable file must not abort the whole export.
                        self.logger.warning(f"Failed to save individual transcript {transcript_file}: {e}")

            # Save category JSON
            category_json_path = os.path.join(transcripts_dir, f"transcripts_{category}.json")
            safe_save_json(category_transcripts, category_json_path, self.logger)

        # Save consolidated files
        all_transcripts_path = os.path.join(transcripts_dir, "all_transcripts.json")
        safe_save_json(all_transcripts, all_transcripts_path, self.logger)

        # Save as pickle
        transcripts_pickle_path = os.path.join(transcripts_dir, "transcripts.pkl")
        safe_save_pickle(all_transcripts, transcripts_pickle_path, self.logger)

        # Save summary CSV (pandas is optional)
        try:
            import pandas as pd
            summary_df = pd.DataFrame(transcript_summary)
            summary_csv_path = os.path.join(output_dir, "transcript_summary.csv")
            summary_df.to_csv(summary_csv_path, index=False)
            self.logger.info(f"Transcript summary saved: {summary_csv_path}")
        except ImportError:
            self.logger.warning("Pandas not available, skipping CSV summary")

        self.logger.info(f"All transcriptions saved to {output_dir}")
        return all_transcripts

def transcribe_batch_worker(file_paths: List[str], category: str, model_size: str) -> Dict[str, TranscriptionResult]:
    """Transcribe one batch inside a worker process.

    Loading a Whisper model is expensive, and batches are small, so the
    service is cached per process (keyed by model size) and reused by later
    batches that land on the same worker.  A service whose model failed to
    load is not cached, so the next batch retries the load.

    Args:
        file_paths: Audio files in this batch.
        category: Label stored on every result.
        model_size: Whisper model name to load.

    Returns:
        Mapping of file basename to its ``TranscriptionResult``.
    """
    # A function attribute lives in the worker process's own copy of this
    # module, which makes it a per-process cache without a new global.
    cache = transcribe_batch_worker.__dict__.setdefault("_service_cache", {})

    service = cache.get(model_size)
    if service is None:
        service = TranscriptionService(model_size=model_size)
        if service.whisper_model is not None:
            cache[model_size] = service

    return service.transcribe_files_batch(file_paths, category)

def demonstrate_transcription(audio_file_path: str, logger: Optional[logging.Logger] = None):
    """Transcribe one file and print a short human-readable report.

    Args:
        audio_file_path: Audio file to transcribe.
        logger: Logger to use; one is created with ``setup_logging()`` when
            omitted.
    """
    log = setup_logging() if logger is None else logger
    service = TranscriptionService(logger=log)

    log.info(f"Demonstrating transcription for: {os.path.basename(audio_file_path)}")

    outcome = service.transcribe_audio_file(audio_file_path, "demo")

    # Collect the report first, then emit it in one go.
    report = [
        f"\n=== Transcription Result for {outcome.filename} ===",
        f"Success: {outcome.success}",
        f"Language: {outcome.language}",
        f"Duration: {outcome.duration:.2f} seconds",
        f"Segments: {outcome.segments}",
        f"Confidence: {outcome.confidence:.3f}",
    ]

    if outcome.success:
        text = outcome.transcript
        report.append(f"Transcript ({len(text)} chars, {len(text.split())} words):")
        report.append(f'"{text}"')
    else:
        report.append(f"Error: {outcome.error_message}")

    print("\n".join(report))
    print()

if __name__ == "__main__":
    # Smoke-test the service on a single file.
    from config import get_audio_file_paths

    logger = setup_logging()
    audio_files = get_audio_file_paths()

    # Prioritize known valid file
    test_file = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso024.wav"
    if validate_audio_file(test_file):
        demonstrate_transcription(test_file, logger)
    else:
        logger.error(f"Test file not valid: {test_file}. Falling back to available files.")

        # First valid file across all categories, or None.  A single lazy
        # search avoids reading a loop variable after its loop, which is
        # unbound when the first category is empty and stale when a later
        # one is.
        fallback_file = next(
            (
                path
                for files in audio_files.values()
                for path in files
                if validate_audio_file(path)
            ),
            None,
        )

        if fallback_file is not None:
            logger.info(f"Using fallback file: {fallback_file}")
            demonstrate_transcription(fallback_file, logger)
        else:
            logger.error("No valid audio files found for demonstration")
            # Log all scanned files for debugging
            for category, files in audio_files.items():
                logger.debug(f"Category {category}: {files}")


C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad\adrso024.wav

## Revised - Claude

In [None]:
"""
Enhanced Transcription Service with automatic FFmpeg installation and robust fallbacks
"""
import os
import logging
import subprocess
import sys
import platform
from pathlib import Path
from typing import Optional, Dict, List, Tuple, Any
from dataclasses import dataclass
import tempfile
import shutil
import json

@dataclass
class TranscriptionResult:
    """Record describing the outcome of transcribing one audio file.

    NOTE(review): the code that fills these fields in this revision is not
    visible from here, so the per-field notes are inferred from the names
    and types -- confirm against the producer.
    """
    file_path: str  # presumably the path of the source audio file
    category: str  # presumably a caller-supplied grouping label
    filename: str  # presumably the basename of ``file_path``
    transcript: str  # transcribed text
    language: str  # presumably the detected language -- TODO confirm format
    segments: int  # a count (int), not the segment list itself
    duration: float  # presumably seconds -- TODO confirm unit
    confidence: float  # presumably a 0..1 score -- TODO confirm range
    success: bool  # whether the transcription attempt succeeded
    error_message: Optional[str] = None  # defaults to None; presumably set only on failure

class FFmpegInstaller:
    """Detect FFmpeg and try to install it with whatever package manager exists.

    Every public method is a static method returning ``(success, message)``
    and is written not to raise for the expected failure modes (tool missing,
    non-zero exit, timeout).
    """

    # One entry per Linux package manager; each entry is the ordered sequence
    # of commands to run.  Single-command entries need the trailing comma so
    # they stay a tuple of commands rather than collapsing to a bare list.
    _LINUX_INSTALL_PLANS = (
        (['apt-get', 'update'], ['apt-get', 'install', '-y', 'ffmpeg']),
        (['yum', 'install', '-y', 'ffmpeg'],),
        (['dnf', 'install', '-y', 'ffmpeg'],),
        (['pacman', '-S', '--noconfirm', 'ffmpeg'],),
    )

    @staticmethod
    def is_ffmpeg_available() -> Tuple[bool, str]:
        """Check whether ``ffmpeg -version`` runs successfully.

        Returns:
            ``(True, "FFmpeg found: <first version line>")`` on success,
            otherwise ``(False, <reason>)``.
        """
        try:
            result = subprocess.run(
                ['ffmpeg', '-version'],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode == 0:
                version_line = result.stdout.split('\n')[0]
                return True, f"FFmpeg found: {version_line}"
        except FileNotFoundError:
            return False, "FFmpeg not found in PATH"
        except subprocess.TimeoutExpired:
            return False, "FFmpeg check timed out"
        except Exception as e:
            return False, f"FFmpeg check failed: {str(e)}"

        # ffmpeg ran but exited non-zero.
        return False, "FFmpeg not available"

    @staticmethod
    def _run_captured(cmd: List[str], timeout: int) -> "subprocess.CompletedProcess":
        """Run ``cmd`` with captured text output, resolving the executable first.

        ``shutil.which`` honours PATHEXT on Windows, so shims shipped as
        ``.cmd``/``.bat`` files (scoop, conda) are found; a bare name handed
        straight to ``subprocess.run`` would miss them.

        Raises:
            FileNotFoundError: the executable is not on PATH.
            subprocess.TimeoutExpired: the command exceeded ``timeout``.
        """
        executable = shutil.which(cmd[0])
        if executable is None:
            raise FileNotFoundError(f"{cmd[0]} not found in PATH")
        return subprocess.run(
            [executable, *cmd[1:]],
            capture_output=True,
            text=True,
            timeout=timeout
        )

    @staticmethod
    def _try_package_manager(label: str, install_cmd: List[str]) -> Tuple[bool, List[str]]:
        """Probe one package manager and, if it responds, run its install command.

        Args:
            label: Display name used in messages (e.g. ``"Chocolatey"``).
            install_cmd: Full install command; its first element is the tool.

        Returns:
            ``(installed, notes)`` where ``notes`` are the diagnostic lines to
            append to the caller's "methods tried" list.
        """
        notes: List[str] = []
        title = label[0].upper() + label[1:]  # "winget" -> "Winget" for sentence starts

        try:
            check = FFmpegInstaller._run_captured([install_cmd[0], '--version'], timeout=5)
            if check.returncode != 0:
                notes.append(f"{title} not available")
                return False, notes

            notes.append(f"Found {label}")
            print(f"Installing FFmpeg via {label}...")
            install_result = FFmpegInstaller._run_captured(install_cmd, timeout=300)  # 5 minutes timeout
            if install_result.returncode == 0:
                return True, notes
            notes.append(f"{title} failed: {install_result.stderr[:200]}")
        except (OSError, subprocess.TimeoutExpired):
            notes.append(f"{title} not available")

        return False, notes

    @staticmethod
    def install_ffmpeg_windows() -> Tuple[bool, str]:
        """Install FFmpeg on Windows via Chocolatey, then winget, then scoop.

        Returns:
            ``(True, <which manager succeeded>)`` or ``(False, <summary of
            everything that was tried>)``.
        """
        methods_tried: List[str] = []

        managers = (
            ("Chocolatey", ['choco', 'install', 'ffmpeg', '-y']),
            ("winget", ['winget', 'install', 'Gyan.FFmpeg']),
            ("scoop", ['scoop', 'install', 'ffmpeg']),
        )

        for label, install_cmd in managers:
            installed, notes = FFmpegInstaller._try_package_manager(label, install_cmd)
            if installed:
                return True, f"Successfully installed FFmpeg via {label}"
            methods_tried.extend(notes)

        return False, f"All installation methods failed. Tried: {'; '.join(methods_tried)}"

    @staticmethod
    def install_ffmpeg_conda() -> Tuple[bool, str]:
        """Install FFmpeg from conda-forge using conda.

        Always returns a ``(success, message)`` tuple, including when
        ``conda --version`` exits non-zero.
        """
        try:
            conda_check = FFmpegInstaller._run_captured(['conda', '--version'], timeout=5)
            if conda_check.returncode != 0:
                return False, "Conda not available"

            print("Installing FFmpeg via conda...")
            install_result = FFmpegInstaller._run_captured(
                ['conda', 'install', '-c', 'conda-forge', 'ffmpeg', '-y'],
                timeout=300
            )
        except (OSError, subprocess.TimeoutExpired):
            return False, "Conda not available"

        if install_result.returncode == 0:
            return True, "Successfully installed FFmpeg via conda"
        return False, f"Conda install failed: {install_result.stderr[:200]}"

    @staticmethod
    def auto_install_ffmpeg() -> Tuple[bool, str]:
        """Install FFmpeg with the method appropriate for the current platform.

        Windows tries conda first and then the Windows package managers;
        macOS uses Homebrew; Linux tries apt-get, yum, dnf and pacman in turn.

        Returns:
            ``(success, message)``; unsupported platforms yield
            ``(False, "Unsupported platform: <name>")``.
        """
        system = platform.system().lower()

        if system == "windows":
            # Try conda first (often available in data science environments)
            success, message = FFmpegInstaller.install_ffmpeg_conda()
            if success:
                return success, message

            # Then try Windows package managers
            return FFmpegInstaller.install_ffmpeg_windows()

        if system == "darwin":  # macOS
            try:
                subprocess.run(['brew', 'install', 'ffmpeg'], check=True, timeout=300)
                return True, "Successfully installed FFmpeg via Homebrew"
            except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
                return False, "Could not install FFmpeg on macOS. Please install Homebrew and run: brew install ffmpeg"

        if system == "linux":
            for plan in FFmpegInstaller._LINUX_INSTALL_PLANS:
                try:
                    for cmd in plan:
                        subprocess.run(cmd, check=True, timeout=300)
                    return True, f"Successfully installed FFmpeg via {plan[0][0]}"
                except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
                    # Manager missing, refused, or hung: move on to the next one.
                    continue

            return False, "Could not install FFmpeg on Linux. Please install manually."

        return False, f"Unsupported platform: {system}"

class AudioProcessor:
    """Prepare audio for Whisper when FFmpeg is not available.

    Audio is decoded with librosa or soundfile, converted to mono at the
    requested sample rate, and written as a WAV file into a private temporary
    directory that is removed by :meth:`cleanup`.
    """

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Create the processor and its private temporary directory.

        Args:
            logger: Logger to use; defaults to this module's logger.
        """
        self.logger = logger or logging.getLogger(__name__)
        self.temp_dir = tempfile.mkdtemp()

    def __del__(self):
        """Cleanup temporary directory"""
        self.cleanup()

    def cleanup(self):
        """Remove the temporary directory; safe to call more than once."""
        if hasattr(self, 'temp_dir') and self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
                self.logger.debug(f"Cleaned up temp directory: {self.temp_dir}")
            except Exception as e:
                # Best effort: a leftover temp directory must not break callers.
                self.logger.warning(f"Failed to cleanup temp directory: {e}")

    @staticmethod
    def _resample_linear(samples: Any, original_sr: int, target_sr: int) -> Any:
        """Linearly resample a 1-D signal from ``original_sr`` to ``target_sr``.

        Output sample ``k`` is read at input position ``k * original_sr /
        target_sr``, so the time axis is preserved. No anti-aliasing filter is
        applied; this is a dependency-free fallback, not a high-quality
        resampler.
        """
        import numpy as np

        ratio = target_sr / original_sr
        new_length = int(len(samples) * ratio)
        if len(samples) == 0 or new_length == 0:
            return samples[:0]

        positions = np.arange(new_length) / ratio
        # np.interp holds the last sample for positions past the final index.
        return np.interp(positions, np.arange(len(samples)), samples)

    def load_audio_with_librosa(self, file_path: str, sr: int = 16000) -> Tuple[bool, Any, str]:
        """Load ``file_path`` as mono audio at ``sr`` Hz using librosa.

        Returns:
            ``(True, samples, "Success")`` on success, otherwise
            ``(False, None, reason)``. Never raises.
        """
        try:
            import librosa

            # librosa resamples to `sr` itself; the second value is that rate.
            y, _ = librosa.load(file_path, sr=sr, mono=True)

            if len(y) == 0:
                return False, None, "Audio file is empty"

            self.logger.info(f"Loaded audio with librosa: {len(y)} samples at {sr}Hz")
            return True, y, "Success"

        except ImportError:
            return False, None, "librosa not available"
        except Exception as e:
            return False, None, f"librosa failed: {str(e)}"

    def load_audio_with_soundfile(self, file_path: str, sr: int = 16000) -> Tuple[bool, Any, str]:
        """Load ``file_path`` as mono audio at ``sr`` Hz using soundfile.

        Multi-channel audio is averaged to mono and, if the file's rate
        differs from ``sr``, linearly resampled.

        Returns:
            ``(True, samples, "Success")`` on success, otherwise
            ``(False, None, reason)``. Never raises.
        """
        try:
            import soundfile as sf
            import numpy as np

            # Read audio file
            y, original_sr = sf.read(file_path, dtype='float32')

            # Convert to mono if stereo
            if len(y.shape) > 1:
                y = np.mean(y, axis=1)

            if len(y) == 0:
                return False, None, "Audio file is empty"

            # Resample if necessary (simple linear interpolation, no scipy)
            if original_sr != sr:
                y = self._resample_linear(y, original_sr, sr)

            if len(y) == 0:
                return False, None, "Audio file is empty"

            self.logger.info(f"Loaded audio with soundfile: {len(y)} samples at {sr}Hz")
            return True, y, "Success"

        except ImportError:
            return False, None, "soundfile not available"
        except Exception as e:
            return False, None, f"soundfile failed: {str(e)}"

    def save_audio_as_wav(self, audio_data: Any, output_path: str, sr: int = 16000) -> bool:
        """Write mono float audio to ``output_path`` as a WAV file.

        soundfile is used when installed; otherwise the standard-library
        ``wave`` module writes 16-bit PCM.

        Returns:
            True on success, False if writing failed (the error is logged).
        """
        try:
            import soundfile as sf
            sf.write(output_path, audio_data, sr)
            return True
        except ImportError:
            try:
                import wave
                import numpy as np

                # Clip to [-1, 1] before scaling: without it, out-of-range
                # samples wrap around when cast to int16 and become loud
                # clicks. '<i2' pins little-endian, as WAV PCM requires.
                clipped = np.clip(np.asarray(audio_data, dtype=np.float64), -1.0, 1.0)
                audio_16bit = (clipped * 32767).astype('<i2')

                with wave.open(output_path, 'wb') as wav_file:
                    wav_file.setnchannels(1)  # mono
                    wav_file.setsampwidth(2)  # 16-bit
                    wav_file.setframerate(sr)
                    wav_file.writeframes(audio_16bit.tobytes())

                return True
            except Exception as e:
                self.logger.error(f"Failed to save audio: {e}")
                return False
        except Exception as e:
            self.logger.error(f"Failed to save audio with soundfile: {e}")
            return False

    def process_audio_for_whisper(self, file_path: str) -> Tuple[bool, str, str]:
        """Decode ``file_path`` and re-encode it as a WAV in the temp directory.

        Loaders are tried in order (librosa, then soundfile); the first one
        whose output can be saved wins.

        Returns:
            ``(True, wav_path, description)`` on success, or
            ``(False, file_path, "All audio processing methods failed")``.
        """
        self.logger.info(f"Processing audio without FFmpeg: {file_path}")

        # Try different loading methods
        methods = [
            ("librosa", self.load_audio_with_librosa),
            ("soundfile", self.load_audio_with_soundfile)
        ]

        # The output is always "<temp_dir>/processed_<stem>.wav".
        stem = os.path.splitext(f"processed_{os.path.basename(file_path)}")[0]
        temp_path = os.path.join(self.temp_dir, stem + '.wav')

        for method_name, method_func in methods:
            self.logger.info(f"Trying {method_name}...")
            success, audio_data, message = method_func(file_path)

            if not success:
                self.logger.warning(f"{method_name} failed: {message}")
                continue

            if self.save_audio_as_wav(audio_data, temp_path):
                self.logger.info(f"Successfully processed audio with {method_name}")
                return True, temp_path, f"Processed with {method_name}"

            self.logger.warning(f"Failed to save processed audio from {method_name}")

        return False, file_path, "All audio processing methods failed"

class EnhancedTranscriptionService:
    """Whisper-based transcription with automatic FFmpeg handling.

    On construction the service checks for FFmpeg (attempting an automatic
    install if it is missing) and loads the Whisper model. When FFmpeg is
    unavailable, audio is first converted to WAV by :class:`AudioProcessor`.
    """

    def __init__(self, model_size: str = "base", logger: Optional[logging.Logger] = None):
        """Set up logging, the fallback audio processor, FFmpeg and Whisper.

        Args:
            model_size: Whisper model name passed to ``whisper.load_model``.
            logger: Logger to use; a basic one is configured when omitted.
        """
        self.logger = logger or self._setup_logging()
        self.model_size = model_size
        self.whisper_model = None
        self.audio_processor = AudioProcessor(logger=self.logger)
        self.ffmpeg_available = False

        # Check and install FFmpeg if needed
        self._ensure_ffmpeg()

        # Initialize Whisper model
        self._initialize_whisper()

    def _setup_logging(self) -> logging.Logger:
        """Configure basic INFO-level logging and return this module's logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        return logging.getLogger(__name__)

    def _ensure_ffmpeg(self):
        """Set ``self.ffmpeg_available``, attempting an install if FFmpeg is missing."""
        self.logger.info("Checking FFmpeg availability...")

        available, message = FFmpegInstaller.is_ffmpeg_available()

        if available:
            self.logger.info(f"✓ {message}")
            self.ffmpeg_available = True
            return

        self.logger.warning(f"✗ {message}")
        self.logger.info("Attempting to install FFmpeg automatically...")

        # Try automatic installation
        success, install_message = FFmpegInstaller.auto_install_ffmpeg()

        if success:
            self.logger.info(f"✓ {install_message}")

            # Verify installation
            available, verify_message = FFmpegInstaller.is_ffmpeg_available()
            if available:
                self.logger.info("✓ FFmpeg installation verified")
                self.ffmpeg_available = True
                return
            self.logger.warning(f"FFmpeg installation verification failed: {verify_message}")
        else:
            self.logger.error(f"✗ {install_message}")

        self.logger.warning("FFmpeg not available. Will use fallback audio processing.")
        self.ffmpeg_available = False

    def _initialize_whisper(self):
        """Load the Whisper model; leaves ``self.whisper_model`` as None on failure."""
        try:
            import whisper
            self.logger.info(f"Loading Whisper model: {self.model_size}")
            self.whisper_model = whisper.load_model(self.model_size)
            self.logger.info("✓ Whisper model loaded successfully")
        except ImportError:
            self.logger.error("✗ Whisper not installed. Install with: pip install openai-whisper")
            self.whisper_model = None
        except Exception as e:
            self.logger.error(f"✗ Failed to load Whisper model: {e}")
            self.whisper_model = None

    def validate_audio_file(self, file_path: str) -> Tuple[bool, str]:
        """Check that ``file_path`` is an existing, non-trivial audio/video file.

        Returns:
            ``(True, "Valid")`` or ``(False, reason)``.
        """
        if not os.path.exists(file_path):
            return False, f"File does not exist: {file_path}"

        if not os.path.isfile(file_path):
            return False, f"Path is not a file: {file_path}"

        try:
            size = os.path.getsize(file_path)
            if size == 0:
                return False, f"File is empty: {file_path}"
            if size < 1000:  # Very small files are likely corrupted
                return False, f"File too small ({size} bytes): {file_path}"
        except OSError as e:
            return False, f"Cannot access file: {e}"

        # Check file extension
        valid_extensions = {'.wav', '.mp3', '.flac', '.m4a', '.mp4', '.avi', '.mov', '.aac'}
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in valid_extensions:
            return False, f"Unsupported file format: {file_ext}"

        return True, "Valid"

    @staticmethod
    def _failure_result(audio_path: str, category: str, filename: str,
                        error_message: str) -> TranscriptionResult:
        """Build the empty TranscriptionResult returned for every failure path."""
        return TranscriptionResult(
            file_path=audio_path,
            category=category,
            filename=filename,
            transcript="",
            language="",
            segments=0,
            duration=0.0,
            confidence=0.0,
            success=False,
            error_message=error_message
        )

    @staticmethod
    def _mean_confidence(segments) -> float:
        """Average per-segment probability derived from Whisper's ``avg_logprob``.

        ``avg_logprob`` is a natural-log probability, so it is converted with
        ``exp`` (not ``2 **``). Values are clamped to at most 0 before
        exponentiation so each probability lies in [0, 1]. Segments without
        the key are ignored; returns 0.0 when none carry it.
        """
        import math

        probabilities = [
            math.exp(min(0.0, seg["avg_logprob"]))
            for seg in segments
            if "avg_logprob" in seg
        ]
        if not probabilities:
            return 0.0
        return float(sum(probabilities) / len(probabilities))

    @staticmethod
    def _infer_duration(result, segments) -> float:
        """Return the audio duration in seconds covered by the transcription.

        Whisper's ``transcribe`` result is documented to carry ``text``,
        ``segments`` and ``language``; a ``duration`` key is honoured if some
        backend provides it, otherwise the latest segment ``end`` timestamp
        is used. Returns 0.0 when neither is available.
        """
        reported = result.get("duration")
        if reported:
            return float(reported)
        ends = [seg["end"] for seg in segments if "end" in seg]
        return float(max(ends)) if ends else 0.0

    def _discard_temp_audio(self, processed_path: str, audio_path: str) -> None:
        """Delete the fallback WAV made for one file; never touches the source."""
        if processed_path == audio_path:
            return
        try:
            os.remove(processed_path)
        except OSError as e:
            # Best effort: cleanup() removes the whole temp directory later.
            self.logger.debug(f"Could not remove temp audio {processed_path}: {e}")

    def transcribe_audio_file(self, audio_path: str, category: str = "") -> TranscriptionResult:
        """Transcribe one audio file.

        Args:
            audio_path: Path to the audio file.
            category: Free-form label copied into the result.

        Returns:
            A TranscriptionResult. Failures (invalid file, missing model,
            audio conversion or Whisper errors) are reported with
            ``success=False`` and ``error_message`` set; nothing is raised.
        """
        filename = os.path.basename(audio_path)

        self.logger.info(f"Transcribing: {filename}")

        # Validate file
        is_valid, validation_message = self.validate_audio_file(audio_path)
        if not is_valid:
            self.logger.error(f"Validation failed: {validation_message}")
            return self._failure_result(audio_path, category, filename, validation_message)

        # Check if Whisper is available
        if self.whisper_model is None:
            return self._failure_result(audio_path, category, filename, "Whisper model not available")

        # Determine processing method
        processed_path = audio_path
        processing_method = "direct"

        if not self.ffmpeg_available:
            # Use fallback audio processing
            success, processed_path, processing_method = self.audio_processor.process_audio_for_whisper(audio_path)
            if not success:
                return self._failure_result(
                    audio_path, category, filename,
                    f"Audio processing failed: {processing_method}"
                )

        # Transcribe with Whisper
        try:
            self.logger.info(f"Starting Whisper transcription using {processing_method}...")

            result = self.whisper_model.transcribe(
                processed_path,
                fp16=False,
                language=None,
                task="transcribe",
                verbose=False
            )

            transcript_text = result["text"].strip()
            segments = result.get("segments", [])
            language = result.get("language", "unknown")
            confidence = self._mean_confidence(segments)
            duration = self._infer_duration(result, segments)

            self.logger.info(f"✓ Successfully transcribed {filename}")
            self.logger.info(f"  Language: {language}")
            self.logger.info(f"  Duration: {duration:.2f}s")
            self.logger.info(f"  Segments: {len(segments)}")
            self.logger.info(f"  Confidence: {confidence:.3f}")
            self.logger.info(f"  Text length: {len(transcript_text)} characters")

            return TranscriptionResult(
                file_path=audio_path,
                category=category,
                filename=filename,
                transcript=transcript_text,
                language=language,
                segments=len(segments),
                duration=duration,
                confidence=confidence,
                success=True
            )

        except Exception as e:
            self.logger.error(f"✗ Transcription failed for {filename}: {e}")
            self.logger.error(f"  Error type: {type(e).__name__}")
            return self._failure_result(audio_path, category, filename, str(e))

        finally:
            # The converted WAV is only needed for this call; removing it now
            # keeps large batches from filling the temp directory.
            self._discard_temp_audio(processed_path, audio_path)

    def transcribe_batch(self, file_paths: List[str], category: str = "") -> List[TranscriptionResult]:
        """Transcribe ``file_paths`` sequentially, logging progress after each file.

        Returns:
            One TranscriptionResult per input path, in input order.
        """
        results = []
        total_files = len(file_paths)
        success_count = 0  # running total; avoids rescanning results each step

        self.logger.info(f"Starting batch transcription of {total_files} files...")

        for i, file_path in enumerate(file_paths, 1):
            self.logger.info(f"Processing file {i}/{total_files}: {os.path.basename(file_path)}")

            result = self.transcribe_audio_file(file_path, category)
            results.append(result)
            if result.success:
                success_count += 1

            # Log progress
            self.logger.info(f"Progress: {i}/{total_files} processed, {success_count} successful")

        # Final summary
        self.logger.info(f"Batch transcription completed: {success_count}/{total_files} successful")

        return results

    def save_results(self, results: List[TranscriptionResult], output_dir: str):
        """Write all results to JSON and each successful transcript to a text file.

        Creates ``<output_dir>/transcription_results.json`` and one
        ``<output_dir>/transcripts/<filename>_<category>.txt`` per successful,
        non-empty transcript.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Save as JSON
        json_path = os.path.join(output_dir, "transcription_results.json")
        json_data = [
            {
                "file_path": result.file_path,
                "category": result.category,
                "filename": result.filename,
                "transcript": result.transcript,
                "language": result.language,
                "segments": result.segments,
                "duration": result.duration,
                "confidence": result.confidence,
                "success": result.success,
                "error_message": result.error_message
            }
            for result in results
        ]

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)

        self.logger.info(f"Results saved to {json_path}")

        # Save successful transcripts as text files
        transcript_dir = os.path.join(output_dir, "transcripts")
        os.makedirs(transcript_dir, exist_ok=True)

        successful_results = [r for r in results if r.success and r.transcript.strip()]

        for result in successful_results:
            # Create safe filename
            safe_filename = "".join(c for c in result.filename if c.isalnum() or c in "._-")
            txt_filename = f"{safe_filename}_{result.category}.txt"
            txt_path = os.path.join(transcript_dir, txt_filename)

            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(f"File: {result.filename}\n")
                f.write(f"Category: {result.category}\n")
                f.write(f"Language: {result.language}\n")
                f.write(f"Duration: {result.duration:.2f}s\n")
                f.write(f"Confidence: {result.confidence:.3f}\n")
                f.write(f"Segments: {result.segments}\n")
                f.write("-" * 50 + "\n")
                f.write(result.transcript)

        self.logger.info(f"Transcripts saved to {transcript_dir}")

    def cleanup(self):
        """Release resources held by the fallback audio processor."""
        if hasattr(self, 'audio_processor'):
            self.audio_processor.cleanup()

def main():
    """Smoke-test the transcription service on a single audio file.

    The file comes from ``config.get_audio_file_paths`` when a ``config``
    module can be imported, otherwise from a path typed by the user. The
    result is printed and, on success, saved next to the input file in a
    ``transcription_output`` directory.
    """
    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(__name__)

    test_file = None
    test_category = None

    # Try to get config
    try:
        # __file__ is not defined when this code runs inside a notebook cell;
        # fall back to the working directory instead of raising NameError.
        try:
            module_dir = os.path.dirname(os.path.abspath(__file__))
        except NameError:
            module_dir = os.getcwd()
        if module_dir not in sys.path:
            sys.path.append(module_dir)
        import config

        logger.info("Loading audio files from config...")
        audio_files = config.get_audio_file_paths(logger)

        # Take the first file of the first non-empty category
        for category, files in audio_files.items():
            if files:
                test_file = files[0]
                test_category = category
                break

        if not test_file:
            logger.error("No audio files found in config")
            return

    except ImportError:
        logger.info("Config not available, requesting manual input...")
        test_file = input("Enter path to audio file: ").strip().strip('"')
        test_category = "manual_test"

        if not os.path.exists(test_file):
            logger.error(f"File not found: {test_file}")
            return

    # Test transcription service
    logger.info("=== Testing Enhanced Transcription Service ===")

    service = EnhancedTranscriptionService(logger=logger)

    try:
        # Test single file
        result = service.transcribe_audio_file(test_file, test_category)

        print("\n=== Transcription Result ===")
        print(f"File: {result.filename}")
        print(f"Success: {result.success}")

        if result.success:
            print(f"Language: {result.language}")
            print(f"Duration: {result.duration:.2f}s")
            print(f"Segments: {result.segments}")
            print(f"Confidence: {result.confidence:.3f}")
            print(f"Transcript length: {len(result.transcript)} characters")
            print(f"Word count: {len(result.transcript.split())}")
            print("\nFirst 200 characters:")
            print(f'"{result.transcript[:200]}..."')

            # Save results next to the input file
            output_dir = os.path.join(os.path.dirname(test_file), "transcription_output")
            service.save_results([result], output_dir)
            print(f"\nResults saved to: {output_dir}")
        else:
            print(f"Error: {result.error_message}")

            # Provide installation guidance
            if "ffmpeg" in str(result.error_message).lower():
                print("\n=== FFmpeg Installation Guide ===")
                print("FFmpeg is required. The service attempted automatic installation.")
                print("If it failed, please install manually:")
                print("1. Windows: choco install ffmpeg")
                print("2. macOS: brew install ffmpeg")
                print("3. Linux: sudo apt install ffmpeg")
                print("4. Conda: conda install -c conda-forge ffmpeg")

    finally:
        # Always remove the service's temporary files, even after an error.
        service.cleanup()

if __name__ == "__main__":
    main()

## fix FFmpeg

In [None]:
# Install Chocolatey (if not already installed).
# NOTE(review): Chocolatey's install guide calls for an elevated
# (Administrator) PowerShell session - confirm before running.
# The one-liner below does three things, in order:
#   1. Set-ExecutionPolicy Bypass -Scope Process: relaxes the script execution
#      policy for this PowerShell process only.
#   2. -bor 3072: adds the 3072 flag to the allowed security protocols
#      (3072 is the SecurityProtocolType value for TLS 1.2).
#   3. iex (...DownloadString(...)): downloads install.ps1 from
#      community.chocolatey.org and executes it. This runs remote code, so
#      only use it on a machine where that is acceptable.
Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))

# Install FFmpeg through Chocolatey.
choco install ffmpeg

# Restart your terminal/IDE after installation
# (presumably so the updated PATH containing ffmpeg is picked up).

# linguistic_features_service.py - Linguistic Features Service


In [None]:
"""
Linguistic Features Service - Microservice for extracting linguistic features and BERT embeddings
Handles text analysis and BERT preprocessing
"""
import os
import re
import numpy as np
import torch
from typing import Dict, List, Optional, Any, Tuple
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
import logging
from collections import Counter
import string

# BERT imports with error handling
try:
    from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
    BERT_AVAILABLE = True
except ImportError:
    BERT_AVAILABLE = False

from config import MODEL_CONFIG, SYSTEM_CONFIG
from utils import (setup_logging, monitor_memory_usage, cleanup_memory,
                   safe_save_pickle, ProcessingTimer, batch_generator)
from transcription_service import TranscriptionResult

@dataclass
class LinguisticFeatures:
    """Container for all linguistic features extracted from one transcript.

    Field names mirror the dictionary keys returned by the ``_extract_*``
    helpers of ``LinguisticFeaturesService``. Field order is part of the
    positional constructor signature, so do not reorder.
    """
    # Basic text statistics
    raw_text: str  # presumably the transcript the features were computed from
    word_count: int  # whitespace-separated tokens
    sentence_count: int  # non-empty pieces after splitting on runs of . ! ?
    char_count: int  # characters excluding spaces
    avg_word_length: float  # mean token length in characters
    avg_sentence_length: float  # words per sentence

    # Vocabulary features
    unique_words: int  # distinct lowercased, punctuation-stripped words
    lexical_diversity: float  # unique_words / total words
    function_words_ratio: float  # share of words found in the function-word list
    content_words_ratio: float  # share of words not in the function-word list

    # Syntactic features (heuristic suffix / word-list tagging, not a real POS tagger)
    noun_ratio: float
    verb_ratio: float
    adjective_ratio: float
    pronoun_ratio: float

    # Semantic complexity (syllables estimated by counting vowel groups)
    syllable_count: int
    avg_syllables_per_word: float
    complex_words_ratio: float  # Words with 3+ syllables

    # Discourse features
    repetition_ratio: float  # repeated word occurrences / total words
    pause_indicators: int  # number of pause markers found in the text
    filler_words: int  # number of filler words/phrases found in the text

    # BERT features (empty lists / None when BERT is unavailable)
    bert_tokens: List[str]
    bert_input_ids: List[int]
    bert_attention_mask: List[int]
    bert_embeddings: Optional[np.ndarray]

    # Processing metadata
    processing_success: bool
    error_message: Optional[str] = None  # presumably set only when processing failed

class LinguisticFeaturesService:
    """Service for extracting linguistic features from transcripts"""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Prepare lexicons and tag sets, then attempt to load BERT.

        Args:
            logger: Logger to use; when omitted, ``setup_logging()`` provides one.
        """
        if logger:
            self.logger = logger
        else:
            self.logger = setup_logging()

        # BERT handles stay None unless _initialize_bert() succeeds below.
        self.bert_tokenizer = None
        self.bert_model = None

        # Word lists consulted by the feature extractors.
        self.function_words = self._load_function_words()
        self.filler_words = {
            'um', 'uh', 'er', 'ah', 'hmm', 'well', 'like',
            'you know', 'sort of', 'kind of',
        }
        self.pause_indicators = {'[pause]', '[silence]', '...', '--'}

        # Simple POS tag groupings for basic analysis.
        self.noun_patterns = {'NN', 'NNS', 'NNP', 'NNPS'}
        self.verb_patterns = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
        self.adjective_patterns = {'JJ', 'JJR', 'JJS'}
        self.pronoun_patterns = {'PRP', 'PRP$', 'WP', 'WP$'}

        self._initialize_bert()

    def _initialize_bert(self):
        """Load the BERT tokenizer and model named by ``MODEL_CONFIG.bert_model``.

        On any failure (or when transformers is not installed) both
        ``self.bert_tokenizer`` and ``self.bert_model`` end up as None.
        """
        if not BERT_AVAILABLE:
            self.logger.warning("Transformers not available - BERT features will be limited")
            return

        tokenizer = None
        model = None
        try:
            checkpoint = MODEL_CONFIG.bert_model
            self.logger.info(f"Loading BERT model: {checkpoint}")
            tokenizer = BertTokenizer.from_pretrained(checkpoint)
            model = BertModel.from_pretrained(checkpoint)
            model.eval()  # inference only: switch to evaluation mode
            self.logger.info("✓ BERT model loaded successfully")
        except Exception as exc:
            self.logger.error(f"Failed to load BERT model: {exc}")
            tokenizer = None
            model = None

        self.bert_tokenizer = tokenizer
        self.bert_model = model

    def _load_function_words(self) -> set:
        """Load common function words"""
        function_words = {
            # Articles
            'a', 'an', 'the',
            # Prepositions
            'in', 'on', 'at', 'by', 'to', 'from', 'of', 'with', 'about', 'into', 'through', 'during',
            'before', 'after', 'above', 'below', 'over', 'under', 'between', 'among', 'against',
            # Conjunctions
            'and', 'or', 'but', 'nor', 'for', 'yet', 'so', 'because', 'since', 'although', 'while',
            'if', 'unless', 'until', 'when', 'where', 'how', 'why',
            # Pronouns
            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
            'my', 'your', 'his', 'her', 'its', 'our', 'their', 'mine', 'yours', 'ours', 'theirs',
            'this', 'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'whose',
            # Auxiliary verbs
            'am', 'is', 'are', 'was', 'were', 'be', 'being', 'been',
            'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
            'will', 'would', 'shall', 'should', 'may', 'might', 'can', 'could', 'must',
            # Others
            'not', 'no', 'yes', 'there', 'here'
        }
        return function_words

    def _count_syllables(self, word: str) -> int:
        """Count syllables in a word (simple heuristic)"""
        word = word.lower().strip(".,!?;:")
        if not word:
            return 0

        # Remove silent 'e' at the end
        if word.endswith('e') and len(word) > 1:
            word = word[:-1]

        # Count vowel groups
        vowels = "aeiouy"
        syllable_count = 0
        prev_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not prev_was_vowel:
                syllable_count += 1
            prev_was_vowel = is_vowel

        # Every word has at least one syllable
        return max(1, syllable_count)

    def _extract_basic_features(self, text: str) -> Dict[str, Any]:
        """Extract basic text statistics"""
        if not text.strip():
            return {
                'word_count': 0,
                'sentence_count': 0,
                'char_count': 0,
                'avg_word_length': 0.0,
                'avg_sentence_length': 0.0
            }

        # Clean text
        clean_text = text.strip()

        # Word analysis
        words = clean_text.split()
        word_count = len(words)

        # Sentence analysis - improved sentence detection
        sentences = re.split(r'[.!?]+', clean_text)
        sentences = [s.strip() for s in sentences if s.strip()]
        sentence_count = len(sentences)

        # Character count (excluding spaces)
        char_count = len(clean_text.replace(' ', ''))

        # Averages
        avg_word_length = np.mean([len(word) for word in words]) if words else 0.0
        avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0.0

        return {
            'word_count': word_count,
            'sentence_count': sentence_count,
            'char_count': char_count,
            'avg_word_length': avg_word_length,
            'avg_sentence_length': avg_sentence_length
        }

    def _extract_vocabulary_features(self, text: str) -> Dict[str, Any]:
        """Extract vocabulary and lexical features"""
        if not text.strip():
            return {
                'unique_words': 0,
                'lexical_diversity': 0.0,
                'function_words_ratio': 0.0,
                'content_words_ratio': 0.0
            }

        # Tokenize and clean words
        words = text.lower().split()
        words = [word.strip(string.punctuation) for word in words if word.strip(string.punctuation)]

        if not words:
            return {
                'unique_words': 0,
                'lexical_diversity': 0.0,
                'function_words_ratio': 0.0,
                'content_words_ratio': 0.0
            }

        unique_words = len(set(words))
        lexical_diversity = unique_words / len(words)

        # Function vs content words
        function_word_count = sum(1 for word in words if word in self.function_words)
        content_word_count = len(words) - function_word_count

        function_words_ratio = function_word_count / len(words)
        content_words_ratio = content_word_count / len(words)

        return {
            'unique_words': unique_words,
            'lexical_diversity': lexical_diversity,
            'function_words_ratio': function_words_ratio,
            'content_words_ratio': content_words_ratio
        }

    def _extract_syntactic_features(self, text: str) -> Dict[str, Any]:
        """Extract syntactic features (simplified POS analysis)"""
        if not text.strip():
            return {
                'noun_ratio': 0.0,
                'verb_ratio': 0.0,
                'adjective_ratio': 0.0,
                'pronoun_ratio': 0.0
            }

        words = text.lower().split()
        words = [word.strip(string.punctuation) for word in words if word.strip(string.punctuation)]

        if not words:
            return {
                'noun_ratio': 0.0,
                'verb_ratio': 0.0,
                'adjective_ratio': 0.0,
                'pronoun_ratio': 0.0
            }

        # Simple heuristic-based POS tagging
        noun_count = 0
        verb_count = 0
        adjective_count = 0
        pronoun_count = 0

        # Common verb endings and forms
        verb_endings = {'ed', 'ing', 'es', 's'}
        common_verbs = {'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did',
                       'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must'}

        # Common adjective endings
        adj_endings = {'ly', 'ful', 'less', 'ous', 'ive', 'able', 'ible'}

        # Common pronouns (already in function words, but specific ones)
        pronouns = {'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
                   'my', 'your', 'his', 'her', 'its', 'our', 'their', 'this', 'that', 'these', 'those'}

        for word in words:
            # Check pronouns first
            if word in pronouns:
                pronoun_count += 1
            # Check common verbs
            elif word in common_verbs or any(word.endswith(ending) for ending in verb_endings if len(word) > 3):
                verb_count += 1
            # Check adjectives (simple heuristic)
            elif any(word.endswith(ending) for ending in adj_endings):
                adjective_count += 1
            # Default to noun if capitalized or doesn't match other patterns
            else:
                noun_count += 1

        total_words = len(words)
        return {
            'noun_ratio': noun_count / total_words,
            'verb_ratio': verb_count / total_words,
            'adjective_ratio': adjective_count / total_words,
            'pronoun_ratio': pronoun_count / total_words
        }

    def _extract_semantic_features(self, text: str) -> Dict[str, Any]:
        """Extract semantic complexity features"""
        if not text.strip():
            return {
                'syllable_count': 0,
                'avg_syllables_per_word': 0.0,
                'complex_words_ratio': 0.0
            }

        words = text.split()
        words = [word.strip(string.punctuation) for word in words if word.strip(string.punctuation)]

        if not words:
            return {
                'syllable_count': 0,
                'avg_syllables_per_word': 0.0,
                'complex_words_ratio': 0.0
            }

        syllable_counts = [self._count_syllables(word) for word in words]
        total_syllables = sum(syllable_counts)
        complex_words = sum(1 for count in syllable_counts if count >= 3)

        return {
            'syllable_count': total_syllables,
            'avg_syllables_per_word': total_syllables / len(words),
            'complex_words_ratio': complex_words / len(words)
        }

    def _extract_discourse_features(self, text: str) -> Dict[str, Any]:
        """Extract discourse and disfluency features"""
        if not text.strip():
            return {
                'repetition_ratio': 0.0,
                'pause_indicators': 0,
                'filler_words': 0
            }

        # Count pause indicators
        pause_count = 0
        text_lower = text.lower()
        for indicator in self.pause_indicators:
            pause_count += text_lower.count(indicator)

        # Count filler words
        words = text.lower().split()
        filler_count = 0
        for filler in self.filler_words:
            if ' ' in filler:  # Multi-word fillers
                filler_count += text_lower.count(filler)
            else:  # Single word fillers
                filler_count += words.count(filler)

        # Calculate repetition ratio (simple word repetition)
        word_counts = Counter(words)
        repeated_words = sum(count - 1 for count in word_counts.values() if count > 1)
        repetition_ratio = repeated_words / len(words) if words else 0.0

        return {
            'repetition_ratio': repetition_ratio,
            'pause_indicators': pause_count,
            'filler_words': filler_count
        }

    def _extract_bert_features(self, text: str, max_length: int = 512) -> Dict[str, Any]:
        """Tokenize ``text`` with the BERT tokenizer and, if a model is loaded, embed it.

        The text is padded/truncated to ``max_length`` tokens. When a BERT
        model is present, the hidden state of the first token of the first
        sequence is returned as a NumPy array under ``bert_embeddings``.

        Returns:
            A dict with ``bert_tokens``, ``bert_input_ids``,
            ``bert_attention_mask`` and ``bert_embeddings``. Empty lists and
            ``None`` are returned when BERT is unavailable or any step fails
            (failures are logged, not raised).
        """
        def _unavailable() -> Dict[str, Any]:
            # Fresh placeholder result for every "no BERT output" situation
            return {
                'bert_tokens': [],
                'bert_input_ids': [],
                'bert_attention_mask': [],
                'bert_embeddings': None
            }

        if not BERT_AVAILABLE or not self.bert_tokenizer:
            return _unavailable()

        try:
            encoded = self.bert_tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

            # Human-readable tokens, kept for later analysis
            token_strings = self.bert_tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])

            sentence_embedding = None
            if self.bert_model:
                with torch.no_grad():
                    model_output = self.bert_model(**encoded)
                    # The [CLS] position serves as the sentence representation
                    sentence_embedding = model_output.last_hidden_state[0][0].cpu().numpy()

            return {
                'bert_tokens': token_strings,
                'bert_input_ids': encoded['input_ids'][0].tolist(),
                'bert_attention_mask': encoded['attention_mask'][0].tolist(),
                'bert_embeddings': sentence_embedding
            }

        except Exception as e:
            self.logger.error(f"Error extracting BERT features: {e}")
            return _unavailable()

    def extract_features(self, text: str) -> LinguisticFeatures:
        """Run every feature extractor on ``text`` and bundle the results.

        Blank or missing text yields an "empty" features object with
        ``processing_success=False``; so does any exception raised while
        extracting (the exception message is stored as the error message).
        """
        try:
            text_is_blank = not text or not text.strip()
            if text_is_blank:
                self.logger.warning("Empty text provided for feature extraction")
                return self._create_empty_features(text, "Empty text provided")

            self.logger.info(f"Extracting linguistic features from text ({len(text)} characters)")

            with ProcessingTimer() as timer:
                # One dict per feature family, computed in a fixed order
                basic = self._extract_basic_features(text)
                vocabulary = self._extract_vocabulary_features(text)
                syntactic = self._extract_syntactic_features(text)
                semantic = self._extract_semantic_features(text)
                discourse = self._extract_discourse_features(text)
                bert = self._extract_bert_features(text)

                # The dict keys map one-to-one onto LinguisticFeatures fields
                extracted = LinguisticFeatures(
                    raw_text=text,
                    processing_success=True,
                    **basic,
                    **vocabulary,
                    **syntactic,
                    **semantic,
                    **discourse,
                    **bert
                )

            self.logger.info(f"✓ Feature extraction completed in {timer.elapsed:.2f}s")
            return extracted

        except Exception as e:
            self.logger.error(f"Error extracting linguistic features: {e}")
            return self._create_empty_features(text, str(e))

    def _create_empty_features(self, text: str, error_message: str) -> LinguisticFeatures:
        """Build a failed-result placeholder with every feature zeroed out.

        Args:
            text: The text that could not be processed (``None`` becomes ``""``).
            error_message: Explanation stored on the returned object.
        """
        count_fields = (
            'word_count', 'sentence_count', 'char_count', 'unique_words',
            'syllable_count', 'pause_indicators', 'filler_words',
        )
        ratio_fields = (
            'avg_word_length', 'avg_sentence_length', 'lexical_diversity',
            'function_words_ratio', 'content_words_ratio', 'noun_ratio',
            'verb_ratio', 'adjective_ratio', 'pronoun_ratio',
            'avg_syllables_per_word', 'complex_words_ratio', 'repetition_ratio',
        )

        # Integer-valued features default to 0, float-valued ones to 0.0
        zeroed = {name: 0 for name in count_fields}
        zeroed.update({name: 0.0 for name in ratio_fields})

        return LinguisticFeatures(
            raw_text=text or "",
            bert_tokens=[],
            bert_input_ids=[],
            bert_attention_mask=[],
            bert_embeddings=None,
            processing_success=False,
            error_message=error_message,
            **zeroed
        )

    def process_transcription_result(self, result: TranscriptionResult) -> LinguisticFeatures:
        """Extract linguistic features from the text of a transcription result.

        A failed transcription, or one without text, produces an empty
        features object whose error message quotes the transcription error.
        Unexpected exceptions are logged and also mapped to an empty result.
        """
        try:
            if result.success and result.text:
                return self.extract_features(result.text)

            return self._create_empty_features(
                result.text or "",
                f"Transcription failed: {result.error_message}"
            )

        except Exception as e:
            self.logger.error(f"Error processing transcription result: {e}")
            return self._create_empty_features("", str(e))

    def batch_process_texts(self, texts: List[str], batch_size: int = 32) -> List[LinguisticFeatures]:
        """Extract features for many texts, cleaning up memory after each batch.

        Returns one features object per input text, in input order. If the
        batching itself fails, the error is logged and every text is mapped
        to an empty features object carrying the error message.
        """
        try:
            self.logger.info(f"Processing {len(texts)} texts in batches of {batch_size}")
            collected = []

            with ProcessingTimer() as timer:
                for chunk in batch_generator(texts, batch_size):
                    chunk_features = [self.extract_features(item) for item in chunk]
                    collected.extend(chunk_features)

                    # Free memory before starting the next batch
                    cleanup_memory()

            self.logger.info(f"✓ Batch processing completed in {timer.elapsed:.2f}s")
            return collected

        except Exception as e:
            self.logger.error(f"Error in batch processing: {e}")
            failure_reason = str(e)
            return [self._create_empty_features(text, failure_reason) for text in texts]

    def get_feature_summary(self, features: LinguisticFeatures) -> Dict[str, Any]:
        """Condense a features object into a small, rounded report dict.

        Failed extractions yield only the status, the error message and the
        raw text length. Successful ones are grouped into basic statistics,
        vocabulary, complexity and discourse sections plus BERT availability.
        """
        if features.processing_success:
            basic_stats = {
                'words': features.word_count,
                'sentences': features.sentence_count,
                'characters': features.char_count,
                'avg_word_length': round(features.avg_word_length, 2),
                'avg_sentence_length': round(features.avg_sentence_length, 2)
            }
            vocabulary = {
                'unique_words': features.unique_words,
                'lexical_diversity': round(features.lexical_diversity, 3),
                'function_words_ratio': round(features.function_words_ratio, 3),
                'content_words_ratio': round(features.content_words_ratio, 3)
            }
            complexity = {
                'avg_syllables_per_word': round(features.avg_syllables_per_word, 2),
                'complex_words_ratio': round(features.complex_words_ratio, 3),
                'total_syllables': features.syllable_count
            }
            discourse = {
                'repetition_ratio': round(features.repetition_ratio, 3),
                'pause_indicators': features.pause_indicators,
                'filler_words': features.filler_words
            }
            return {
                'status': 'success',
                'basic_stats': basic_stats,
                'vocabulary': vocabulary,
                'complexity': complexity,
                'discourse': discourse,
                'bert_available': features.bert_embeddings is not None,
                'bert_tokens_count': len(features.bert_tokens)
            }

        return {
            'status': 'failed',
            'error': features.error_message,
            'text_length': len(features.raw_text)
        }

    def save_features(self, features: LinguisticFeatures, filepath: str) -> bool:
        """Pickle ``features`` to ``filepath`` via ``safe_save_pickle``.

        Returns whatever ``safe_save_pickle`` reports; an exception is logged
        and turned into ``False``.
        """
        try:
            saved = safe_save_pickle(features, filepath)
        except Exception as e:
            self.logger.error(f"Error saving features: {e}")
            return False
        return saved

    def cleanup(self):
        """Release the BERT model and tokenizer and free memory.

        The attributes are reset to ``None`` instead of being deleted so the
        instance stays usable afterwards: BERT feature extraction then reports
        "no BERT available" rather than raising ``AttributeError``, and calling
        ``cleanup`` more than once is harmless. Errors are logged, never raised.
        """
        try:
            # Dropping the references is what allows the objects to be freed
            self.bert_model = None
            self.bert_tokenizer = None
            cleanup_memory()
            self.logger.info("✓ Linguistic features service cleanup completed")
        except Exception as e:
            self.logger.error(f"Error during cleanup: {e}")

# Convenience constructor so callers need not reference the class directly
def create_linguistic_features_service(logger: Optional[logging.Logger] = None) -> LinguisticFeaturesService:
    """Build a ``LinguisticFeaturesService``, optionally with a custom logger."""
    service = LinguisticFeaturesService(logger=logger)
    return service

# Data_Manager_Service.py

In [None]:
"""
Data Manager Service - Handles ADReSSo21 dataset loading and file management
Optimized for Windows 10 with parallel processing
"""
import os
import glob
import json
import pickle
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import pandas as pd
import logging
from pathlib import Path
import shutil

from config import SYSTEM_CONFIG, MODEL_CONFIG
from utils import setup_logging, ProcessingTimer, safe_save_pickle, cleanup_memory

@dataclass
class AudioFile:
    """Data class for audio file information

    One record describes a single audio file found while scanning the dataset.
    """
    file_path: str  # full path to the audio file, as a string
    filename: str  # file name including extension
    category: str  # the scanner stores "<dataset_type>_<label>" here
    label: str  # class key of the source folder (e.g. 'ad', 'cn', 'decline', 'test')
    dataset_type: str  # 'diagnosis' or 'progression'
    split: str  # 'train' or 'test'
    segmentation_path: Optional[str] = None  # matching segmentation CSV, if one was found
    file_size: Optional[int] = None  # size in bytes, when known
    duration: Optional[float] = None  # not populated in this module; presumably seconds - TODO confirm

@dataclass
class DatasetInfo:
    """Data class for dataset information

    Aggregated statistics over a list of AudioFile records.
    """
    total_files: int  # number of audio files summarised
    categories: Dict[str, int]  # file count per AudioFile.category
    dataset_types: Dict[str, int]  # file count per AudioFile.dataset_type
    splits: Dict[str, int]  # file count per AudioFile.split
    total_size_mb: float  # summed file sizes in MiB (bytes / 1024 / 1024)
    audio_files: List[AudioFile]  # the records the statistics were computed from

class DataManagerService:
    """Service for managing ADReSSo21 dataset files and metadata.

    Locates the audio (``*.wav``) and segmentation (``*.csv``) files of the
    diagnosis and progression sub-datasets below ``base_path`` and offers
    helpers to summarise, validate, filter, split, batch and cache the
    resulting ``AudioFile`` records. All artefacts are written below
    ``<base_path>/output``.
    """

    def __init__(self, base_path: str, logger: Optional[logging.Logger] = None):
        """Record the dataset layout and create the output directory tree.

        Args:
            base_path: Root directory containing the extracted dataset archives.
            logger: Logger to use; defaults to the one from ``setup_logging()``.
        """
        self.base_path = Path(base_path)
        self.logger = logger or setup_logging()

        # Dataset layout, relative to base_path. Keys of the inner dicts are
        # the class labels; values are Windows-style relative directories.
        self.dataset_paths = {
            'diagnosis_train': {
                'audio': {
                    'ad': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad",
                    'cn': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn"
                },
                'segmentation': {
                    'ad': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\ad",
                    'cn': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\cn"
                }
            },
            'progression_train': {
                'audio': {
                    'decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline",
                    'no_decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline"
                },
                'segmentation': {
                    'decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\decline",
                    'no_decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\no_decline"
                }
            },
            'progression_test': {
                'audio': {
                    'test': r"ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio"
                },
                'segmentation': {
                    'test': r"ADReSSo21-progression-test\ADReSSo21\progression\test-dist\segmentation"
                }
            }
        }

        # Create output directories
        self.output_dir = self.base_path / "output"
        self.create_output_directories()

    def create_output_directories(self):
        """Create the output directory and its standard sub-directories.

        Existing directories are left untouched.
        """
        directories = [
            self.output_dir,
            self.output_dir / "features",
            self.output_dir / "transcripts",
            self.output_dir / "models",
            self.output_dir / "results",
            self.output_dir / "logs",
            self.output_dir / "cache"
        ]

        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)

        self.logger.info(f"✓ Output directories created at {self.output_dir}")

    def scan_audio_files(self) -> List[AudioFile]:
        """Scan and catalog all audio files in the dataset.

        Every ``*.wav`` file in the configured audio directories becomes one
        ``AudioFile``. A segmentation file is attached when a CSV with the
        same stem exists in the matching segmentation directory. Audio
        directories that do not exist are skipped with a warning.

        Returns:
            The records, ordered by dataset, label and file name.
        """
        self.logger.info("Scanning audio files in dataset...")
        audio_files = []

        with ProcessingTimer() as timer:
            for dataset_name, paths in self.dataset_paths.items():
                self.logger.info(f"Scanning {dataset_name}...")

                # Derive dataset type and split from the dataset key
                if 'diagnosis' in dataset_name:
                    dataset_type = 'diagnosis'
                    split = 'train'
                else:  # progression
                    dataset_type = 'progression'
                    split = 'test' if 'test' in dataset_name else 'train'

                segmentation_dirs = paths.get('segmentation', {})

                for label, audio_path in paths['audio'].items():
                    full_audio_path = self.base_path / audio_path

                    if not full_audio_path.exists():
                        self.logger.warning(f"Audio path not found: {full_audio_path}")
                        continue

                    # Segmentation directory for this label, falling back to 'test'
                    seg_key = label if label in segmentation_dirs else 'test'
                    seg_path = None
                    if seg_key in segmentation_dirs:
                        seg_path = self.base_path / segmentation_dirs[seg_key]
                    seg_dir_exists = seg_path is not None and seg_path.exists()

                    # glob() yields files in arbitrary order; sorting keeps the
                    # catalogue (and any seeded split built from it) reproducible.
                    for wav_file in sorted(full_audio_path.glob("*.wav")):
                        seg_file = None
                        if seg_dir_exists:
                            seg_file_path = seg_path / f"{wav_file.stem}.csv"
                            if seg_file_path.exists():
                                seg_file = str(seg_file_path)

                        audio_files.append(AudioFile(
                            file_path=str(wav_file),
                            filename=wav_file.name,
                            category=f"{dataset_type}_{label}",
                            label=label,
                            dataset_type=dataset_type,
                            split=split,
                            segmentation_path=seg_file,
                            file_size=wav_file.stat().st_size if wav_file.exists() else None
                        ))

        self.logger.info(f"✓ Found {len(audio_files)} audio files in {timer.elapsed:.2f}s")
        return audio_files

    def get_dataset_info(self, audio_files: List[AudioFile]) -> DatasetInfo:
        """Aggregate per-category, per-type and per-split counts and total size.

        Returns an all-zero ``DatasetInfo`` for an empty list.
        """
        if not audio_files:
            return DatasetInfo(0, {}, {}, {}, 0.0, [])

        categories = {}
        dataset_types = {}
        splits = {}
        total_size = 0

        for af in audio_files:
            categories[af.category] = categories.get(af.category, 0) + 1
            dataset_types[af.dataset_type] = dataset_types.get(af.dataset_type, 0) + 1
            splits[af.split] = splits.get(af.split, 0) + 1

            # Unknown sizes (None) contribute nothing
            if af.file_size:
                total_size += af.file_size

        return DatasetInfo(
            total_files=len(audio_files),
            categories=categories,
            dataset_types=dataset_types,
            splits=splits,
            total_size_mb=total_size / (1024 * 1024),
            audio_files=audio_files
        )

    def create_file_manifest(self, audio_files: List[AudioFile]) -> pd.DataFrame:
        """Build a per-file manifest and save it as ``file_manifest.csv``.

        Returns:
            The manifest as a DataFrame with one row per audio file.
        """
        data = []

        for af in audio_files:
            data.append({
                'filename': af.filename,
                'file_path': af.file_path,
                'category': af.category,
                'label': af.label,
                'dataset_type': af.dataset_type,
                'split': af.split,
                'segmentation_path': af.segmentation_path,
                'has_segmentation': af.segmentation_path is not None,
                # Only an unknown size maps to None; an empty file is 0.0 MB
                'file_size_mb': af.file_size / (1024 * 1024) if af.file_size is not None else None,
                'file_exists': os.path.exists(af.file_path)
            })

        df = pd.DataFrame(data)

        # Save manifest
        manifest_path = self.output_dir / "file_manifest.csv"
        df.to_csv(manifest_path, index=False)
        self.logger.info(f"✓ File manifest saved to {manifest_path}")

        return df

    def validate_dataset(self, audio_files: List[AudioFile]) -> Dict[str, Any]:
        """Check that audio and segmentation files exist and write a JSON report.

        Every existing audio file is counted either in
        ``files_with_segmentation`` or in ``missing_segmentation``; the latter
        includes files for which no segmentation was recorded at all.
        ``errors`` lists missing audio files and recorded segmentation paths
        that no longer exist.

        Returns:
            The validation summary, also saved as ``validation_report.json``.
        """
        self.logger.info("Validating dataset integrity...")

        validation_results = {
            'total_files': len(audio_files),
            'valid_files': 0,
            'missing_files': 0,
            'files_with_segmentation': 0,
            'missing_segmentation': 0,
            'errors': []
        }

        for af in audio_files:
            # Check if audio file exists
            if not os.path.exists(af.file_path):
                validation_results['missing_files'] += 1
                validation_results['errors'].append(f"Missing audio: {af.file_path}")
                continue

            validation_results['valid_files'] += 1

            # Check segmentation file
            if af.segmentation_path and os.path.exists(af.segmentation_path):
                validation_results['files_with_segmentation'] += 1
            else:
                # Covers both "never found" (None) and "recorded but gone"
                validation_results['missing_segmentation'] += 1
                if af.segmentation_path:
                    validation_results['errors'].append(f"Missing segmentation: {af.segmentation_path}")

        # Save validation report
        report_path = self.output_dir / "validation_report.json"
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(validation_results, f, indent=2)

        self.logger.info(f"✓ Dataset validation complete. Report saved to {report_path}")
        return validation_results

    def get_files_by_category(self, audio_files: List[AudioFile],
                            category: Optional[str] = None,
                            dataset_type: Optional[str] = None,
                            split: Optional[str] = None) -> List[AudioFile]:
        """Filter audio files by category, dataset type, or split.

        Criteria left as ``None`` (or empty) are ignored; the others must all match.
        """
        filtered_files = audio_files

        if category:
            filtered_files = [af for af in filtered_files if af.category == category]

        if dataset_type:
            filtered_files = [af for af in filtered_files if af.dataset_type == dataset_type]

        if split:
            filtered_files = [af for af in filtered_files if af.split == split]

        return filtered_files

    def create_train_test_splits(self, audio_files: List[AudioFile],
                                test_size: float = 0.2) -> Tuple[List[AudioFile], List[AudioFile]]:
        """Create train/test splits for datasets that don't have predefined splits.

        Files are split separately within each category so that both parts
        keep the category proportions. Categories with fewer than two files
        go entirely into the training part. The split is seeded
        (``random_state=42``).

        Returns:
            ``(train_files, test_files)``.
        """
        from sklearn.model_selection import train_test_split

        # Group by category to ensure balanced splits
        category_files = {}
        for af in audio_files:
            category_files.setdefault(af.category, []).append(af)

        train_files = []
        test_files = []

        for files in category_files.values():
            if len(files) < 2:
                # If too few files, put all in training
                train_files.extend(files)
                continue

            cat_train, cat_test = train_test_split(
                files,
                test_size=test_size,
                random_state=42,
                stratify=None  # Can't stratify on single category
            )

            train_files.extend(cat_train)
            test_files.extend(cat_test)

        self.logger.info(f"✓ Created splits: {len(train_files)} train, {len(test_files)} test")
        return train_files, test_files

    def batch_load_files(self, audio_files: List[AudioFile],
                        batch_size: int = 32) -> List[List[AudioFile]]:
        """Split ``audio_files`` into consecutive batches of at most ``batch_size``.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        batches = [
            audio_files[start:start + batch_size]
            for start in range(0, len(audio_files), batch_size)
        ]

        self.logger.info(f"✓ Created {len(batches)} batches of size {batch_size}")
        return batches

    def save_dataset_cache(self, audio_files: List[AudioFile],
                          dataset_info: DatasetInfo) -> bool:
        """Save dataset information to cache for faster loading.

        Returns:
            ``True`` when the cache was written, ``False`` otherwise (errors are logged).
        """
        try:
            cache_data = {
                'audio_files': audio_files,
                'dataset_info': dataset_info,
                'scan_timestamp': pd.Timestamp.now().isoformat(),
                'base_path': str(self.base_path)
            }

            cache_path = self.output_dir / "cache" / "dataset_cache.pkl"
            success = safe_save_pickle(cache_data, cache_path)

            if success:
                self.logger.info(f"✓ Dataset cache saved to {cache_path}")

            return success

        except Exception as e:
            self.logger.error(f"Error saving dataset cache: {e}")
            return False

    def load_dataset_cache(self) -> Optional[Tuple[List[AudioFile], DatasetInfo]]:
        """Load dataset information from cache.

        Returns:
            ``(audio_files, dataset_info)``, or ``None`` when no cache exists,
            the cache belongs to a different base path, or loading fails.
        """
        try:
            cache_path = self.output_dir / "cache" / "dataset_cache.pkl"

            if not cache_path.exists():
                return None

            # NOTE(review): unpickling executes arbitrary code from the file.
            # Acceptable only while the cache directory is trusted and written
            # solely by this tool.
            with open(cache_path, 'rb') as f:
                cache_data = pickle.load(f)

            # Verify cache is for same base path
            if cache_data.get('base_path') != str(self.base_path):
                self.logger.warning("Cache base path mismatch, ignoring cache")
                return None

            self.logger.info("✓ Loaded dataset from cache")
            return cache_data['audio_files'], cache_data['dataset_info']

        except Exception as e:
            self.logger.error(f"Error loading dataset cache: {e}")
            return None

    def print_dataset_summary(self, dataset_info: DatasetInfo):
        """Print totals and per-type, per-split and per-category counts to stdout."""
        print("\n" + "="*60)
        print("ADRESSO21 DATASET SUMMARY")
        print("="*60)

        print(f"Total Files: {dataset_info.total_files}")
        print(f"Total Size: {dataset_info.total_size_mb:.2f} MB")
        print()

        print("By Dataset Type:")
        for dtype, count in dataset_info.dataset_types.items():
            print(f"  {dtype}: {count} files")
        print()

        print("By Split:")
        for split, count in dataset_info.splits.items():
            print(f"  {split}: {count} files")
        print()

        print("By Category:")
        for category, count in dataset_info.categories.items():
            print(f"  {category}: {count} files")
        print()

    def cleanup_output_directory(self, keep_cache: bool = True):
        """Delete everything in the output directory and recreate its layout.

        Args:
            keep_cache: When true, the ``cache`` sub-directory is preserved.

        Errors are logged, never raised.
        """
        try:
            for item in self.output_dir.iterdir():
                if item.is_dir():
                    if item.name == 'cache' and keep_cache:
                        continue
                    shutil.rmtree(item)
                else:
                    item.unlink()

            # Recreate directories
            self.create_output_directories()
            self.logger.info("✓ Output directory cleaned")

        except Exception as e:
            self.logger.error(f"Error cleaning output directory: {e}")

# Convenience constructor for the data manager
def create_data_manager(base_path: str, logger: Optional[logging.Logger] = None) -> DataManagerService:
    """Build a ``DataManagerService`` rooted at ``base_path``, optionally with a custom logger."""
    manager = DataManagerService(base_path=base_path, logger=logger)
    return manager

# model_architecture_service.py - Neural Network Models Service


In [None]:
"""
model_architecture_service.py - Neural Network Models Service
Implements multi-modal deep learning architecture for ADReSSo21
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool
from torch_geometric.data import Data, Batch
from transformers import BertModel
import numpy as np
from typing import Dict, List, Tuple, Optional
import logging

# Configure logging
# NOTE: basicConfig runs at import time and acts on the root logger; it has
# no effect if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module
logger = logging.getLogger(__name__)


class GraphAttentionModule(nn.Module):
    """Graph-based attention module for semantic relationships.

    A stack of multi-head graph attention layers followed by mean pooling
    over nodes and a linear projection.

    Args:
        input_dim: Feature size of the input nodes.
        hidden_dim: Feature size produced by every attention layer and by the
            final projection.
        num_heads: Attention heads per layer.
        num_layers: Number of attention layers.
        dropout: Dropout probability used inside the attention layers and
            after each activation.
    """

    def __init__(self, input_dim: int = 768, hidden_dim: int = 256,
                 num_heads: int = 8, num_layers: int = 3, dropout: float = 0.2):
        super(GraphAttentionModule, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # Graph attention layers. concat=False averages the heads so every
        # layer emits hidden_dim features; with the default concat=True the
        # output would be hidden_dim * num_heads wide and would not match the
        # input size of the next layer or of the projection below.
        self.gat_layers = nn.ModuleList([
            GATConv(input_dim if i == 0 else hidden_dim,
                   hidden_dim, heads=num_heads, concat=False, dropout=dropout)
            for i in range(num_layers)
        ])

        # Final projection
        self.projection = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index, batch=None):
        """Encode a graph (or a batch of graphs) into one vector per graph.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity as expected by ``GATConv``.
            batch: Optional assignment of nodes to graphs. When omitted, all
                nodes are treated as one graph.

        Returns:
            Projected, mean-pooled graph representation(s).
        """
        # Apply GAT layers
        for gat_layer in self.gat_layers:
            x = gat_layer(x, edge_index)
            x = F.relu(x)
            x = self.dropout(x)

        # Pool per graph when a batch vector is given, else over all nodes
        if batch is not None:
            x = global_mean_pool(x, batch)
        else:
            x = torch.mean(x, dim=0, keepdim=True)

        return self.projection(x)


class VisionTransformerModule(nn.Module):
    """Vision Transformer for processing spectrograms.

    The single-channel input is cut into non-overlapping square patches,
    embedded, enriched with learned positional encodings, passed through a
    transformer encoder, mean-pooled over patches and projected to 256
    features.

    Args:
        input_dim: Accepted for interface compatibility; not used by the module.
        patch_size: Side length of the square patches.
        embed_dim: Patch embedding size.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        max_patches: Number of learned positional encodings. Longer patch
            sequences are supported by interpolating this table.
    """

    def __init__(self, input_dim: int = 80, patch_size: int = 8,
                 embed_dim: int = 768, num_heads: int = 12, num_layers: int = 6,
                 max_patches: int = 1000):
        super(VisionTransformerModule, self).__init__()
        self.patch_size = patch_size
        self.embed_dim = embed_dim

        # Patch embedding
        self.patch_embed = nn.Conv2d(1, embed_dim, kernel_size=patch_size, stride=patch_size)

        # Learned positional encoding table
        self.pos_embed = nn.Parameter(torch.randn(1, max_patches, embed_dim))

        # Transformer layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=embed_dim*4,
            dropout=0.1, activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head
        self.classifier = nn.Linear(embed_dim, 256)

    def _positional_encoding(self, num_patches: int) -> torch.Tensor:
        """Return positional encodings of shape (1, num_patches, embed_dim)."""
        table_size = self.pos_embed.shape[1]
        if num_patches <= table_size:
            return self.pos_embed[:, :num_patches, :]

        # More patches than table entries (long recordings): stretch the table
        # along the sequence axis instead of failing on a shape mismatch.
        stretched = F.interpolate(
            self.pos_embed.transpose(1, 2),  # (1, embed_dim, table_size)
            size=num_patches, mode='linear', align_corners=False
        )
        return stretched.transpose(1, 2)

    def forward(self, x):
        """Encode spectrograms of shape (batch_size, 1, height, width).

        Returns:
            Tensor of shape (batch_size, 256).
        """
        # Unpacking also rejects inputs that are not 4-dimensional
        B, C, H, W = x.shape

        # Create patches
        x = self.patch_embed(x)  # (B, embed_dim, H', W')
        x = x.flatten(2).transpose(1, 2)  # (B, num_patches, embed_dim)

        # Add positional encoding
        x = x + self._positional_encoding(x.shape[1])

        # The encoder is sequence-first
        x = x.transpose(0, 1)  # (num_patches, B, embed_dim)
        x = self.transformer(x)
        x = x.transpose(0, 1)  # (B, num_patches, embed_dim)

        # Global average pooling
        x = torch.mean(x, dim=1)  # (B, embed_dim)

        return self.classifier(x)


class UNetModule(nn.Module):
    """1-D convolutional encoder/decoder for audio feature processing.

    Four pooled encoder stages and a bottleneck are followed by four
    transposed-convolution decoder stages, a 1x1 convolution to
    ``out_channels`` and global average pooling over time.
    """

    def __init__(self, in_channels: int = 1, out_channels: int = 128):
        super().__init__()

        # Contracting path
        self.enc1 = self.conv_block(in_channels, 64)
        self.enc2 = self.conv_block(64, 128)
        self.enc3 = self.conv_block(128, 256)
        self.enc4 = self.conv_block(256, 512)

        # Deepest stage
        self.bottleneck = self.conv_block(512, 1024)

        # Expanding path
        self.dec4 = self.upconv_block(1024, 512)
        self.dec3 = self.upconv_block(512, 256)
        self.dec2 = self.upconv_block(256, 128)
        self.dec1 = self.upconv_block(128, 64)

        # Output head
        self.final = nn.Conv1d(64, out_channels, kernel_size=1)
        self.pool = nn.AdaptiveAvgPool1d(1)

    def conv_block(self, in_ch: int, out_ch: int):
        """Two length-preserving Conv1d -> BatchNorm1d -> ReLU stages."""
        layers = []
        for source_ch in (in_ch, out_ch):
            layers.append(nn.Conv1d(source_ch, out_ch, kernel_size=3, padding=1))
            layers.append(nn.BatchNorm1d(out_ch))
            layers.append(nn.ReLU(inplace=True))
        return nn.Sequential(*layers)

    def upconv_block(self, in_ch: int, out_ch: int):
        """Transposed convolution that doubles the length, then BatchNorm1d and ReLU."""
        upsample = nn.ConvTranspose1d(in_ch, out_ch, kernel_size=2, stride=2)
        return nn.Sequential(upsample, nn.BatchNorm1d(out_ch), nn.ReLU(inplace=True))

    def forward(self, x):
        """Map (batch, in_channels, length) input to (batch, out_channels)."""
        hidden = self.enc1(x)

        # Each later encoder stage (and the bottleneck) halves the length first
        for stage in (self.enc2, self.enc3, self.enc4, self.bottleneck):
            hidden = stage(F.max_pool1d(hidden, 2))

        # Decoder stages restore the length step by step
        for stage in (self.dec4, self.dec3, self.dec2, self.dec1):
            hidden = stage(hidden)

        # 1x1 convolution, then average over the time axis
        return self.pool(self.final(hidden)).squeeze(-1)


class AlexNetModule(nn.Module):
    """Fully connected feature extractor modelled on AlexNet's dense layers.

    Three Linear -> ReLU -> Dropout(0.5) stages (4096, 4096 and 2048 units)
    followed by a linear layer producing ``num_classes`` outputs.
    """

    def __init__(self, input_dim: int = 768, num_classes: int = 256):
        super().__init__()

        layer_widths = (input_dim, 4096, 4096, 2048)

        stages = []
        for fan_in, fan_out in zip(layer_widths[:-1], layer_widths[1:]):
            stages.extend([
                nn.Linear(fan_in, fan_out),
                nn.ReLU(inplace=True),
                nn.Dropout(0.5),
            ])
        self.features = nn.Sequential(*stages)

        self.classifier = nn.Linear(layer_widths[-1], num_classes)

    def forward(self, x):
        """Map (..., input_dim) features to (..., num_classes) outputs."""
        return self.classifier(self.features(x))


class MultiModalADReSSoModel(nn.Module):
    """Multi-modal classifier fusing graph, ViT, U-Net and AlexNet branches.

    Text is encoded with BERT and paired with the audio embedding in a
    two-node graph for the graph-attention branch. Spectrograms feed the
    vision-transformer branch, and the audio embedding feeds both the U-Net
    and the AlexNet branches. The four branch outputs are concatenated,
    passed through a fusion MLP and classified.
    """

    def __init__(self,
                 audio_feature_dim: int = 768,
                 text_feature_dim: int = 768,
                 spectrogram_height: int = 80,
                 num_classes: int = 2,
                 device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
        """Build all branches and move the model to ``device``.

        Args:
            audio_feature_dim: Input width of the AlexNet branch.
            text_feature_dim: Input width of the graph-attention branch.
            spectrogram_height: Input dimension given to the ViT branch.
            num_classes: Number of output logits.
            device: Device the module is moved to and graphs are built on.
        """
        super(MultiModalADReSSoModel, self).__init__()

        self.device = device
        self.num_classes = num_classes

        # Initialize modules
        self.graph_attention = GraphAttentionModule(input_dim=text_feature_dim)
        self.vision_transformer = VisionTransformerModule(input_dim=spectrogram_height)
        self.unet = UNetModule()
        self.alexnet = AlexNetModule(input_dim=audio_feature_dim)

        # BERT for text processing
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Fusion layers
        self.fusion_layer = nn.Sequential(
            nn.Linear(256 + 256 + 128 + 256, 512),  # Graph + ViT + UNet + AlexNet
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Final classifier
        self.classifier = nn.Linear(256, num_classes)

        # Move to device
        self.to(device)

    @staticmethod
    def _pair_edge_index(connected: bool, device=None) -> torch.Tensor:
        """Return the (2, num_edges) edge index of a two-node graph.

        Row 0 holds source nodes and row 1 target nodes.

        Args:
            connected: When true, link node 0 and node 1 in both directions;
                otherwise give each node only a self-loop.
            device: Device on which the tensor is created.
        """
        if connected:
            edges = [[0, 1], [1, 0]]  # 0 -> 1 and 1 -> 0
        else:
            # Self-loops 0 -> 0 and 1 -> 1. The previous code transposed this
            # matrix, which produced a duplicated 0 -> 1 edge instead.
            edges = [[0, 1], [0, 1]]
        return torch.tensor(edges, dtype=torch.long, device=device)

    def create_semantic_graph(self, text_features, audio_features,
                              similarity_threshold: float = 0.1):
        """Create one two-node (text, audio) graph per sample.

        Args:
            text_features: Text embeddings, one row per sample.
            audio_features: Audio embeddings with the same shape as
                ``text_features`` (the rows are stacked as graph nodes).
            similarity_threshold: Cosine similarity above which the two
                nodes are connected bidirectionally; at or below it each
                node only gets a self-loop.

        Returns:
            A ``Batch`` of the per-sample graphs on ``self.device``.
        """
        # One vectorised similarity and a single host transfer for the whole
        # batch instead of an ``.item()`` synchronisation per sample.
        similarities = F.cosine_similarity(text_features, audio_features, dim=1)
        connected_flags = (similarities > similarity_threshold).tolist()

        graphs = []
        for text_feat, audio_feat, connected in zip(
                text_features, audio_features, connected_flags):
            # Node 0 is the text embedding, node 1 the audio embedding.
            node_features = torch.stack([text_feat, audio_feat], dim=0)  # (2, dim)
            edge_index = self._pair_edge_index(connected, self.device)
            graphs.append(Data(x=node_features, edge_index=edge_index))

        return Batch.from_data_list(graphs).to(self.device)

    def forward(self, audio_features, text_input_ids, text_attention_mask, spectrograms):
        """Compute class logits for a batch.

        Args:
            audio_features: Audio embeddings, presumably
                (batch, audio_feature_dim).
            text_input_ids: BERT token ids.
            text_attention_mask: BERT attention mask.
            spectrograms: Input for the vision-transformer branch.

        Returns:
            Logits produced by the final classifier.
        """
        # Process text with BERT
        bert_outputs = self.bert(input_ids=text_input_ids, attention_mask=text_attention_mask)
        # NOTE(review): the mean runs over every token position, padding
        # included; confirm whether a mask-weighted mean is intended.
        text_features = bert_outputs.last_hidden_state.mean(dim=1)  # (batch_size, 768)

        # Create semantic graph
        graph_batch = self.create_semantic_graph(text_features, audio_features)

        # Process through different modules
        graph_out = self.graph_attention(graph_batch.x, graph_batch.edge_index, graph_batch.batch)
        vit_out = self.vision_transformer(spectrograms)

        # Prepare audio for U-Net (add channel dimension)
        audio_1d = audio_features.unsqueeze(1)  # (batch_size, 1, features)
        unet_out = self.unet(audio_1d)

        alexnet_out = self.alexnet(audio_features)

        # Fusion
        fused_features = torch.cat([graph_out, vit_out, unet_out, alexnet_out], dim=1)
        fused_features = self.fusion_layer(fused_features)

        # Final classification
        return self.classifier(fused_features)


class ModelArchitectureService:
    """Service for creating, inspecting, saving and loading models."""

    def __init__(self, config: Dict):
        """Store the configuration and pick CUDA when available, else CPU."""
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"ModelArchitectureService initialized on device: {self.device}")

    @staticmethod
    def _count_parameters(model: nn.Module):
        """Return ``(total, trainable)`` parameter counts of ``model``."""
        total = sum(p.numel() for p in model.parameters())
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return total, trainable

    @staticmethod
    def _format_or_unknown(value, spec: str) -> str:
        """Format a numeric ``value`` with ``spec``, or return ``'Unknown'``.

        Numeric format specs such as ``','`` or ``'.2f'`` raise ValueError
        when applied to a string placeholder, so non-numeric values
        (including a missing entry) are rendered as the literal text.
        """
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return format(value, spec)
        return 'Unknown'

    def create_model(self,
                    audio_feature_dim: int = 768,
                    text_feature_dim: int = 768,
                    spectrogram_height: int = 80,
                    num_classes: int = 2) -> MultiModalADReSSoModel:
        """Create the multi-modal model on ``self.device`` and log its size.

        Raises:
            Exception: Any construction error is logged and re-raised.
        """
        try:
            model = MultiModalADReSSoModel(
                audio_feature_dim=audio_feature_dim,
                text_feature_dim=text_feature_dim,
                spectrogram_height=spectrogram_height,
                num_classes=num_classes,
                device=self.device
            )

            # Count parameters
            total_params, trainable_params = self._count_parameters(model)

            logger.info("Model created successfully:")
            logger.info(f"  - Total parameters: {total_params:,}")
            logger.info(f"  - Trainable parameters: {trainable_params:,}")
            # Size estimate assumes 4 bytes per parameter.
            logger.info(f"  - Model size: {total_params * 4 / 1024 / 1024:.2f} MB")

            return model

        except Exception as e:
            logger.error(f"Error creating model: {str(e)}")
            raise

    def get_model_info(self, model: nn.Module) -> Dict:
        """Collect parameter counts, size estimate and device of ``model``.

        Returns:
            A dict with ``total_parameters``, ``trainable_parameters``,
            ``model_size_mb`` (assuming 4 bytes per parameter),
            ``module_parameters`` (count per direct child) and ``device``.
            ``device`` is ``'Unknown'`` for a module without parameters.
            An empty dict is returned if inspection fails.
        """
        try:
            total_params, trainable_params = self._count_parameters(model)

            # Get module-wise parameter count
            module_params = {
                name: sum(p.numel() for p in module.parameters())
                for name, module in model.named_children()
            }

            # A parameter-less module has no device to report; without the
            # default, next() would raise StopIteration and discard the rest.
            first_param = next(model.parameters(), None)
            device = str(first_param.device) if first_param is not None else 'Unknown'

            return {
                'total_parameters': total_params,
                'trainable_parameters': trainable_params,
                'model_size_mb': total_params * 4 / 1024 / 1024,
                'module_parameters': module_params,
                'device': device
            }

        except Exception as e:
            logger.error(f"Error getting model info: {str(e)}")
            return {}

    def save_model(self, model: nn.Module, filepath: str,
                   additional_info: Optional[Dict] = None) -> bool:
        """Save the state dict plus metadata to ``filepath``.

        Args:
            model: Model whose ``state_dict`` is stored.
            filepath: Destination path handed to ``torch.save``.
            additional_info: Extra entries merged into the checkpoint dict
                (they override the default keys on collision).

        Returns:
            True on success, False if saving failed (the error is logged).
        """
        try:
            save_dict = {
                'model_state_dict': model.state_dict(),
                'model_info': self.get_model_info(model),
                'architecture': model.__class__.__name__
            }

            if additional_info:
                save_dict.update(additional_info)

            torch.save(save_dict, filepath)
            logger.info(f"Model saved successfully to {filepath}")
            return True

        except Exception as e:
            logger.error(f"Error saving model: {str(e)}")
            return False

    def load_model(self, filepath: str,
                   model_class: type = MultiModalADReSSoModel,
                   **model_kwargs) -> Optional[nn.Module]:
        """Instantiate ``model_class`` and load weights from ``filepath``.

        Args:
            filepath: Checkpoint written by ``save_model``.
            model_class: Class instantiated with ``model_kwargs``.
            **model_kwargs: Constructor arguments for ``model_class``.

        Returns:
            The loaded model on ``self.device``, or None if loading failed
            (the error is logged).
        """
        try:
            # SECURITY: torch.load unpickles the file; only load checkpoints
            # from trusted sources.
            checkpoint = torch.load(filepath, map_location=self.device)

            # Create model instance
            model = model_class(**model_kwargs)

            # Load state dict
            model.load_state_dict(checkpoint['model_state_dict'])
            model.to(self.device)

            logger.info(f"Model loaded successfully from {filepath}")

            # Print model info if available. The stored info may be empty
            # (get_model_info returns {} on failure), so missing values are
            # rendered as 'Unknown' rather than pushed through a numeric
            # format spec, which would raise and make this method return
            # None for a model that loaded correctly.
            if 'model_info' in checkpoint:
                info = checkpoint['model_info'] or {}
                total_text = self._format_or_unknown(info.get('total_parameters'), ',')
                size_text = self._format_or_unknown(info.get('model_size_mb'), '.2f')
                logger.info(f"  - Total parameters: {total_text}")
                logger.info(f"  - Model size: {size_text} MB")

            return model

        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return None

    def print_model_summary(self, model: nn.Module):
        """Print a human-readable summary of ``model`` to stdout."""
        info = self.get_model_info(model)
        separator = "=" * 60

        print("\n" + separator)
        print("MODEL ARCHITECTURE SUMMARY")
        print(separator)
        print(f"Model Class: {model.__class__.__name__}")
        print(f"Device: {info.get('device', 'Unknown')}")
        print(f"Total Parameters: {info.get('total_parameters', 0):,}")
        print(f"Trainable Parameters: {info.get('trainable_parameters', 0):,}")
        print(f"Model Size: {info.get('model_size_mb', 0):.2f} MB")

        print("\nMODULE-WISE PARAMETER COUNT:")
        for module_name, param_count in info.get('module_parameters', {}).items():
            print(f"  - {module_name}: {param_count:,} parameters")
        print(separator)

# training_service.py - Model Training Service


In [None]:
"""
Training Service for ADReSSo21 Multi-Modal Model
Handles training, validation, evaluation, and semantic relationship analysis
"""

import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                           roc_auc_score, confusion_matrix)
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from transformers import BertModel
import logging
from typing import Dict, List, Tuple, Optional, Any
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import multiprocessing as mp
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from config import Config
from utils import setup_logging, create_directory, log_system_info
from model_architecture_service import MultiModalADReSSoModel


class ADReSSoDataset(Dataset):
    """Custom dataset for ADReSSo multi-modal data."""

    def __init__(self, features_dict: Dict, linguistic_features: Dict,
                 labels: Dict, transform=None):
        """
        Initialize dataset

        Args:
            features_dict: Dictionary of acoustic features keyed by file id;
                each entry provides ``'wav2vec2'`` and ``'log_mel'``
                (with ``'mean'`` and ``'std'``).
            linguistic_features: Dictionary keyed by file id providing
                ``'bert_input_ids'`` and ``'bert_attention_mask'``.
            labels: Dictionary mapping file id to an integer label.
            transform: Optional callable applied to each sample dict.
        """
        self.features_dict = features_dict
        self.linguistic_features = linguistic_features
        self.labels = labels
        self.transform = transform
        # Sample order follows the key order of features_dict.
        self.file_ids = list(features_dict.keys())

        self.logger = logging.getLogger(__name__)
        self.logger.info(f"Dataset initialized with {len(self.file_ids)} samples")

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.file_ids)

    @staticmethod
    def _square_spectrogram(log_mel_mean, log_mel_std) -> np.ndarray:
        """Tile the averaged log-mel statistics into a (1, n, n) image.

        The mean and std vectors (each of length n) are averaged into one
        profile, and that profile is repeated as every row of an n x n
        single-channel image.
        """
        profile = np.stack([log_mel_mean, log_mel_std]).mean(axis=0)  # (n,)
        profile = profile.reshape(1, 1, -1)  # (1, 1, n)
        side = profile.shape[-1]
        # Repeat along the row axis. Tiling a (1, n) array with reps
        # (1, 1, n) -- the previous code -- yields (1, 1, n * n) instead of
        # the square (1, n, n) image.
        return np.tile(profile, (1, side, 1))

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return the sample at ``idx`` as a dict of tensors plus its file id.

        Raises:
            Exception: Any error while assembling the sample is logged and
                re-raised.
        """
        file_id = self.file_ids[idx]

        try:
            # Get features
            features = self.features_dict[file_id]
            linguistic = self.linguistic_features[file_id]

            # Prepare audio features (Wav2Vec2)
            audio_features = torch.as_tensor(features['wav2vec2'], dtype=torch.float32)

            # Prepare text features
            text_input_ids = torch.as_tensor(linguistic['bert_input_ids'], dtype=torch.long)
            text_attention_mask = torch.as_tensor(
                linguistic['bert_attention_mask'], dtype=torch.long
            )

            # Prepare spectrogram from log-mel features (square format for ViT)
            log_mel = features['log_mel']
            spectrogram = torch.as_tensor(
                self._square_spectrogram(log_mel['mean'], log_mel['std']),
                dtype=torch.float32,
            )

            # Get label (kept as a length-1 vector)
            label = torch.as_tensor([self.labels[file_id]], dtype=torch.long)

            sample = {
                'audio_features': audio_features,
                'text_input_ids': text_input_ids,
                'text_attention_mask': text_attention_mask,
                'spectrogram': spectrogram,
                'label': label,
                'file_id': file_id
            }

            if self.transform:
                sample = self.transform(sample)

            return sample

        except Exception as e:
            self.logger.error(f"Error loading sample {file_id}: {str(e)}")
            raise


class ModelTrainer:
    """Main trainer class for multi-modal ADReSSo model."""

    def __init__(self, model: nn.Module, config: "Config"):
        """
        Initialize trainer

        Args:
            model: The multi-modal model to train
            config: Configuration object; ``LEARNING_RATE``, ``WEIGHT_DECAY``,
                ``MODEL_OUTPUT_PATH``, ``RESULTS_PATH`` and
                ``EARLY_STOPPING_PATIENCE`` are read from it.
        """
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)

        # Setup logging
        self.logger = logging.getLogger(__name__)
        log_system_info(self.logger, self.device)

        # Training components
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=config.LEARNING_RATE,
            weight_decay=config.WEIGHT_DECAY
        )
        # ``verbose`` is deprecated on the scheduler; learning-rate changes
        # are logged explicitly in train_model instead.
        self.scheduler = ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.5,
            patience=5
        )

        # Training history
        self.train_losses = []
        self.val_losses = []
        self.train_accuracies = []
        self.val_accuracies = []

        # Create output directories
        create_directory(config.MODEL_OUTPUT_PATH)
        create_directory(config.RESULTS_PATH)

        self.logger.info(f"Trainer initialized on {self.device}")
        self.logger.info(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

    def _unpack_batch(self, batch):
        """Move one collated batch to the device.

        Returns:
            ``(inputs, labels)`` where ``inputs`` is the tuple of model
            arguments in call order and ``labels`` is a 1-D tensor.
        """
        inputs = tuple(
            batch[key].to(self.device)
            for key in ('audio_features', 'text_input_ids',
                        'text_attention_mask', 'spectrogram')
        )
        # reshape(-1) keeps a batch dimension even for a single sample;
        # squeeze() would collapse a batch of one to a 0-dim tensor.
        labels = batch['label'].reshape(-1).to(self.device)
        return inputs, labels

    def train_epoch(self, train_loader: DataLoader) -> Tuple[float, float]:
        """Train for one epoch.

        Batches that raise are logged and skipped. The returned loss is
        averaged over the batches that completed.

        Returns:
            ``(average_loss, accuracy_percent)``; ``(0.0, 0.0)`` when no
            batch completed.
        """
        self.model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        completed_batches = 0

        for batch_idx, batch in enumerate(train_loader):
            try:
                inputs, labels = self._unpack_batch(batch)

                # Forward pass
                self.optimizer.zero_grad()
                outputs = self.model(*inputs)
                loss = self.criterion(outputs, labels)

                # Backward pass
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()

                # Statistics
                total_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
                completed_batches += 1

                # Log progress
                if batch_idx % 10 == 0:
                    self.logger.info(
                        f'Batch {batch_idx}/{len(train_loader)}, '
                        f'Loss: {loss.item():.4f}'
                    )

            except Exception as e:
                self.logger.error(f"Error in training batch {batch_idx}: {str(e)}")
                continue

        if completed_batches == 0 or total == 0:
            self.logger.warning("No training batch completed in this epoch")
            return 0.0, 0.0

        avg_loss = total_loss / completed_batches
        accuracy = 100.0 * correct / total

        return avg_loss, accuracy

    def validate(self, val_loader: DataLoader) -> Tuple[float, float, List, List, List]:
        """Validate the model.

        Batches that raise are logged and skipped.

        Returns:
            ``(average_loss, accuracy_percent, predictions, labels,
            probabilities)``; loss and accuracy are 0.0 when no batch
            completed.
        """
        self.model.eval()
        total_loss = 0.0
        correct = 0
        total = 0
        completed_batches = 0
        all_preds = []
        all_labels = []
        all_probs = []

        with torch.no_grad():
            for batch in val_loader:
                try:
                    inputs, labels = self._unpack_batch(batch)

                    # Forward pass
                    outputs = self.model(*inputs)
                    loss = self.criterion(outputs, labels)

                    # Statistics
                    total_loss += loss.item()
                    _, predicted = outputs.max(1)
                    total += labels.size(0)
                    correct += predicted.eq(labels).sum().item()
                    completed_batches += 1

                    # Store for metrics
                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())
                    all_probs.extend(F.softmax(outputs, dim=1).cpu().numpy())

                except Exception as e:
                    self.logger.error(f"Error in validation batch: {str(e)}")
                    continue

        if completed_batches == 0 or total == 0:
            self.logger.warning("No validation batch completed")
            return 0.0, 0.0, all_preds, all_labels, all_probs

        avg_loss = total_loss / completed_batches
        accuracy = 100.0 * correct / total

        return avg_loss, accuracy, all_preds, all_labels, all_probs

    def train_model(self, train_loader: DataLoader, val_loader: DataLoader,
                   num_epochs: int = 30) -> Dict[str, Any]:
        """Main training loop with LR scheduling and early stopping.

        The checkpoint ``best_adresso_model.pth`` is written whenever the
        validation accuracy improves. Training stops early after
        ``config.EARLY_STOPPING_PATIENCE`` epochs without improvement.

        Returns:
            Dict with ``best_val_accuracy``, ``total_time`` (seconds) and
            ``epochs_completed``.
        """
        self.logger.info(f"Starting training for {num_epochs} epochs")

        best_val_acc = 0.0
        patience_counter = 0
        epochs_completed = 0  # stays 0 when num_epochs is 0
        start_time = datetime.now()

        for epoch in range(num_epochs):
            epoch_start = datetime.now()
            self.logger.info(f'\nEpoch {epoch+1}/{num_epochs}')
            self.logger.info('-' * 50)

            # Training
            train_loss, train_acc = self.train_epoch(train_loader)
            self.train_losses.append(train_loss)
            self.train_accuracies.append(train_acc)

            # Validation
            val_loss, val_acc, _, _, _ = self.validate(val_loader)
            self.val_losses.append(val_loss)
            self.val_accuracies.append(val_acc)

            epoch_time = datetime.now() - epoch_start
            epochs_completed = epoch + 1

            self.logger.info(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
            self.logger.info(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
            self.logger.info(f'Epoch Time: {epoch_time.total_seconds():.2f}s')

            # Learning rate scheduling; report any reduction ourselves.
            previous_lrs = [group['lr'] for group in self.optimizer.param_groups]
            self.scheduler.step(val_loss)
            for group_idx, (old_lr, group) in enumerate(
                    zip(previous_lrs, self.optimizer.param_groups)):
                if group['lr'] != old_lr:
                    self.logger.info(
                        f"Learning rate of group {group_idx} changed "
                        f"from {old_lr:.4e} to {group['lr']:.4e}"
                    )

            # Save best model
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                self.save_model('best_adresso_model.pth')
                patience_counter = 0
                self.logger.info(f'New best validation accuracy: {best_val_acc:.2f}%')
            else:
                patience_counter += 1
                if patience_counter >= self.config.EARLY_STOPPING_PATIENCE:
                    self.logger.info('Early stopping triggered')
                    break

        total_time = datetime.now() - start_time
        self.logger.info(f'\nTraining completed in {total_time}')
        self.logger.info(f'Best validation accuracy: {best_val_acc:.2f}%')

        return {
            'best_val_accuracy': best_val_acc,
            'total_time': total_time.total_seconds(),
            'epochs_completed': epochs_completed
        }

    def save_model(self, filename: str):
        """Save model, optimizer, scheduler and history under MODEL_OUTPUT_PATH."""
        filepath = os.path.join(self.config.MODEL_OUTPUT_PATH, filename)
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
            'train_accuracies': self.train_accuracies,
            'val_accuracies': self.val_accuracies,
        }, filepath)
        self.logger.info(f"Model saved to {filepath}")

    def load_model(self, filename: str):
        """Restore a checkpoint written by ``save_model``.

        A missing file is reported with a warning and leaves the trainer
        unchanged.
        """
        filepath = os.path.join(self.config.MODEL_OUTPUT_PATH, filename)
        if not os.path.exists(filepath):
            self.logger.warning(f"Model file {filepath} not found")
            return

        # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(filepath, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.train_losses = checkpoint.get('train_losses', [])
        self.val_losses = checkpoint.get('val_losses', [])
        self.train_accuracies = checkpoint.get('train_accuracies', [])
        self.val_accuracies = checkpoint.get('val_accuracies', [])
        self.logger.info(f"Model loaded from {filepath}")


class ModelEvaluator:
    """Model evaluation and analysis class."""

    def __init__(self, trainer: "ModelTrainer", config: "Config"):
        """
        Initialize evaluator

        Args:
            trainer: Trained ModelTrainer instance; its ``model``, ``device``
                and training history are read.
            config: Configuration object; ``RESULTS_PATH`` is read from it.
        """
        self.trainer = trainer
        self.config = config
        self.logger = logging.getLogger(__name__)

    def evaluate_detailed(self, test_loader: DataLoader,
                         class_names: Optional[List[str]] = None) -> Dict[str, Any]:
        """Comprehensive model evaluation.

        Args:
            test_loader: Loader yielding the batch dicts the model expects.
            class_names: Display names of the classes; defaults to
                ``['CN', 'AD']``.

        Returns:
            Dict with ``accuracy``, weighted ``precision``/``recall``/``f1``,
            ``auc`` (None unless there are exactly two class names and the
            metric is defined), ``confusion_matrix`` and a per-sample
            ``results_df``.
        """
        if class_names is None:
            class_names = ['CN', 'AD']

        self.logger.info("Starting detailed evaluation...")

        device = self.trainer.device
        self.trainer.model.eval()
        all_preds = []
        all_labels = []
        all_probs = []
        all_file_ids = []

        with torch.no_grad():
            for batch in test_loader:
                try:
                    # Move to device
                    audio_features = batch['audio_features'].to(device)
                    text_input_ids = batch['text_input_ids'].to(device)
                    text_attention_mask = batch['text_attention_mask'].to(device)
                    spectrograms = batch['spectrogram'].to(device)
                    # reshape(-1) keeps the batch axis for a batch of one,
                    # which squeeze() would drop.
                    labels = batch['label'].reshape(-1).to(device)

                    # Forward pass
                    outputs = self.trainer.model(
                        audio_features, text_input_ids,
                        text_attention_mask, spectrograms
                    )
                    probs = F.softmax(outputs, dim=1)
                    _, predicted = outputs.max(1)

                    # Store results
                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())
                    all_probs.extend(probs.cpu().numpy())
                    all_file_ids.extend(batch['file_id'])

                except Exception as e:
                    self.logger.error(f"Error in evaluation batch: {str(e)}")
                    continue

        # Calculate metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average='weighted'
        )

        # ROC AUC for binary classification
        auc = None
        if len(class_names) == 2:
            probs_positive = [prob[1] for prob in all_probs]
            try:
                auc = roc_auc_score(all_labels, probs_positive)
            except ValueError as e:
                # Undefined e.g. when only one class is present in the
                # labels; keep the remaining metrics instead of aborting.
                self.logger.warning(f"ROC AUC could not be computed: {str(e)}")

        # Confusion matrix
        cm = confusion_matrix(all_labels, all_preds)

        # Results DataFrame
        results_df = pd.DataFrame({
            'file_id': all_file_ids,
            'true_label': all_labels,
            'predicted_label': all_preds,
            'confidence': [max(prob) for prob in all_probs],
            'prob_CN': [prob[0] for prob in all_probs],
            'prob_AD': [prob[1] if len(prob) > 1 else 0 for prob in all_probs]
        })

        # Log results
        self.logger.info("="*60)
        self.logger.info("DETAILED EVALUATION RESULTS")
        self.logger.info("="*60)
        self.logger.info(f"Accuracy: {accuracy:.4f}")
        self.logger.info(f"Precision: {precision:.4f}")
        self.logger.info(f"Recall: {recall:.4f}")
        self.logger.info(f"F1-Score: {f1:.4f}")
        # Explicit None check: an AUC of exactly 0.0 is a valid result.
        if auc is not None:
            self.logger.info(f"ROC AUC: {auc:.4f}")

        # Visualizations
        self.plot_confusion_matrix(cm, class_names)
        self.plot_training_curves()

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc,
            'confusion_matrix': cm,
            'results_df': results_df
        }

    def plot_confusion_matrix(self, cm: np.ndarray, class_names: List[str]):
        """Plot the confusion matrix and save it as ``confusion_matrix.png``."""
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_names, yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()

        # Save plot
        save_path = os.path.join(self.config.RESULTS_PATH, 'confusion_matrix.png')
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
        plt.close()  # release the figure so repeated calls do not accumulate

        self.logger.info(f"Confusion matrix saved to {save_path}")

    def plot_training_curves(self):
        """Plot the trainer's loss/accuracy history and save ``training_curves.png``."""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Loss curves
        ax1.plot(self.trainer.train_losses, label='Training Loss', color='blue')
        ax1.plot(self.trainer.val_losses, label='Validation Loss', color='red')
        ax1.set_title('Training and Validation Loss')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.legend()
        ax1.grid(True)

        # Accuracy curves
        ax2.plot(self.trainer.train_accuracies, label='Training Accuracy', color='blue')
        ax2.plot(self.trainer.val_accuracies, label='Validation Accuracy', color='red')
        ax2.set_title('Training and Validation Accuracy')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('Accuracy (%)')
        ax2.legend()
        ax2.grid(True)

        plt.tight_layout()

        # Save plot
        save_path = os.path.join(self.config.RESULTS_PATH, 'training_curves.png')
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)

        self.logger.info(f"Training curves saved to {save_path}")

    def analyze_semantic_relationships(self, test_loader: DataLoader,
                                     num_samples: int = 5):
        """Visualize and log text/audio relationships for the first samples.

        Args:
            test_loader: Loader yielding the batch dicts the model expects.
            num_samples: Maximum number of samples to analyze.
        """
        self.logger.info(f"Analyzing semantic relationships for {num_samples} samples...")

        device = self.trainer.device
        self.trainer.model.eval()
        sample_count = 0

        with torch.no_grad():
            for batch in test_loader:
                if sample_count >= num_samples:
                    break

                try:
                    # Process batch
                    audio_features = batch['audio_features'].to(device)
                    text_input_ids = batch['text_input_ids'].to(device)
                    text_attention_mask = batch['text_attention_mask'].to(device)
                    file_ids = batch['file_id']

                    # Get BERT features (mean over token positions, matching
                    # how the model's bert output is pooled here)
                    bert_outputs = self.trainer.model.bert(
                        input_ids=text_input_ids,
                        attention_mask=text_attention_mask
                    )
                    text_features = bert_outputs.last_hidden_state.mean(dim=1)

                    # Analyze each sample until the quota is reached
                    for file_id, text_feat, audio_feat in zip(
                            file_ids, text_features, audio_features):
                        if sample_count >= num_samples:
                            break

                        text_cpu = text_feat.cpu()
                        audio_cpu = audio_feat.cpu()

                        # Create and visualize semantic graph
                        self.visualize_semantic_graph(text_cpu, audio_cpu, file_id)

                        # Analyze relationship
                        self.analyze_relationship_metrics(text_cpu, audio_cpu, file_id)

                        sample_count += 1

                except Exception as e:
                    self.logger.error(f"Error in semantic analysis: {str(e)}")
                    continue

    def visualize_semantic_graph(self, text_features: torch.Tensor,
                                audio_features: torch.Tensor, file_id: str):
        """Draw the two-node text/audio graph and save it as a PNG.

        An edge is drawn only when the cosine similarity exceeds 0.1.

        Returns:
            ``(graph, similarity)``: the networkx graph and the cosine
            similarity of the two feature vectors.
        """
        # Compute similarity
        similarity = F.cosine_similarity(
            text_features, audio_features, dim=0
        ).item()

        # Create networkx graph
        G = nx.Graph()
        G.add_node("Text", type="text")
        G.add_node("Audio", type="audio")

        # Add edge if similarity is significant
        if similarity > 0.1:
            G.add_edge("Text", "Audio", weight=similarity, similarity=similarity)

        # Visualize
        plt.figure(figsize=(10, 8))
        pos = nx.spring_layout(G, k=3, iterations=50)

        # Draw nodes
        node_colors = ['lightblue', 'lightcoral']
        nx.draw_networkx_nodes(G, pos, node_color=node_colors,
                              node_size=3000, alpha=0.7)

        # Draw edges and their labels
        if G.edges():
            edge_widths = [G[u][v]['weight'] * 10 for u, v in G.edges()]
            nx.draw_networkx_edges(G, pos, width=edge_widths,
                                 alpha=0.6, edge_color='gray')

        # Draw labels
        nx.draw_networkx_labels(G, pos, font_size=12, font_weight='bold')

        # Add edge labels
        if G.edges():
            edge_labels = {(u, v): f"Sim: {G[u][v]['similarity']:.3f}"
                          for u, v in G.edges()}
            nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=10)

        plt.title(f'Semantic Relationship Graph - {file_id}\n'
                 f'Similarity: {similarity:.3f}')
        plt.axis('off')
        plt.tight_layout()

        # Save plot
        save_path = os.path.join(
            self.config.RESULTS_PATH, f'semantic_graph_{file_id}.png'
        )
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
        plt.close()  # one figure per sample; close it to avoid piling them up

        return G, similarity

    def analyze_relationship_metrics(self, text_features: torch.Tensor,
                                   audio_features: torch.Tensor, file_id: str):
        """Compute and log similarity metrics between two 1-D feature vectors.

        Returns:
            Dict with ``cosine_similarity``, ``l2_distance`` (the L2
            distance divided by the sum of both norms, 0.0 when both
            vectors are zero), ``dot_product`` and a textual
            ``relationship`` interpretation of the cosine similarity.
        """
        # Compute similarity metrics
        cosine_sim = F.cosine_similarity(
            text_features, audio_features, dim=0
        ).item()

        # L2 distance (normalized); guard the all-zero case against 0/0
        l2_distance = torch.norm(text_features - audio_features).item()
        norm_sum = (
            torch.norm(text_features) + torch.norm(audio_features)
        ).item()
        normalized_l2 = l2_distance / norm_sum if norm_sum > 0 else 0.0

        # Dot product similarity
        dot_product = torch.dot(text_features, audio_features).item()

        # Interpretation
        if cosine_sim > 0.7:
            relationship = "Strong positive correlation"
        elif cosine_sim > 0.3:
            relationship = "Moderate positive correlation"
        elif cosine_sim > 0.1:
            relationship = "Weak positive correlation"
        elif cosine_sim > -0.1:
            relationship = "No significant correlation"
        else:
            relationship = "Negative correlation"

        self.logger.info(f"Semantic Analysis for {file_id}:")
        self.logger.info(f"  - Cosine Similarity: {cosine_sim:.4f}")
        self.logger.info(f"  - Normalized L2 Distance: {normalized_l2:.4f}")
        self.logger.info(f"  - Dot Product: {dot_product:.4f}")
        self.logger.info(f"  - Interpretation: {relationship}")

        return {
            'cosine_similarity': cosine_sim,
            'l2_distance': normalized_l2,
            'dot_product': dot_product,
            'relationship': relationship
        }

    @staticmethod
    def _as_float(value):
        """Convert a metric to a built-in float, passing None through."""
        return None if value is None else float(value)

    def generate_evaluation_report(self, evaluation_results: Dict[str, Any]):
        """Write ``evaluation_report.json`` and ``detailed_results.csv``.

        Args:
            evaluation_results: Dict as returned by ``evaluate_detailed``.

        Returns:
            The report dict that was written to JSON.
        """
        results_df = evaluation_results['results_df']

        # Counts from pandas are numpy integers, which json cannot encode;
        # convert everything to built-in numbers before dumping.
        correct_predictions = int(
            (results_df['true_label'] == results_df['predicted_label']).sum()
        )
        high_confidence = int((results_df['confidence'] > 0.9).sum())

        # Create report
        report = {
            'timestamp': datetime.now().isoformat(),
            'performance_metrics': {
                'accuracy': self._as_float(evaluation_results['accuracy']),
                'precision': self._as_float(evaluation_results['precision']),
                'recall': self._as_float(evaluation_results['recall']),
                'f1_score': self._as_float(evaluation_results['f1']),
                'roc_auc': self._as_float(evaluation_results['auc'])
            },
            'dataset_summary': {
                'total_samples': len(results_df),
                'correct_predictions': correct_predictions,
                'high_confidence_predictions': high_confidence
            }
        }

        # Save report
        report_path = os.path.join(
            self.config.RESULTS_PATH, 'evaluation_report.json'
        )
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2)

        # Save detailed results
        results_path = os.path.join(
            self.config.RESULTS_PATH, 'detailed_results.csv'
        )
        results_df.to_csv(results_path, index=False)

        self.logger.info(f"Evaluation report saved to {report_path}")
        self.logger.info(f"Detailed results saved to {results_path}")

        return report


class TrainingService:
    """
    Main training service orchestrating all components.

    Owns the multi-modal model together with its trainer and evaluator and
    exposes data preparation, training, evaluation, semantic analysis,
    persistence and inference as individual steps or as one pipeline.
    """

    # Keywords that are long enough to be matched as plain substrings.
    _POSITIVE_KEYWORDS = ('alzheimer', 'decline', 'impaired')

    def __init__(self, config: "Config"):
        """
        Initialize training service

        Args:
            config: Configuration object; ``config.LOG_PATH`` is the
                directory that receives ``training_service.log``
        """
        self.config = config
        self.logger = setup_logging(
            os.path.join(config.LOG_PATH, 'training_service.log')
        )

        # Initialize model
        self.model = MultiModalADReSSoModel(
            audio_feature_dim=768,
            text_feature_dim=768,
            spectrogram_height=80,
            num_classes=2
        )

        # Initialize trainer and evaluator
        self.trainer = ModelTrainer(self.model, config)
        self.evaluator = ModelEvaluator(self.trainer, config)

        # Training state
        self.is_trained = False
        self.training_results = None
        self.evaluation_results = None
        # Filled by train_model(). Defined up front so that a service whose
        # model was only loaded reports "no test data" instead of failing
        # with an AttributeError.
        self.test_loader = None

        self.logger.info("Training service initialized")

    @staticmethod
    def _infer_label(file_id: str) -> int:
        """
        Derive the class label from a file identifier.

        Returns 1 (AD/decline) or 0 (CN/no decline). The long keywords are
        matched as substrings once any negated "no decline" form has been
        removed; the two-letter marker ``ad`` only counts as a whole token,
        because as a substring it also occurs in names such as ``adrso024``.
        """
        import re  # local import: the file-level import block is outside this class

        name = file_id.lower()
        # "no_decline" contains "decline" - drop the negated form first.
        name = re.sub(r'(?<![a-z0-9])no[\W_]*decline', ' ', name)
        if any(keyword in name for keyword in TrainingService._POSITIVE_KEYWORDS):
            return 1

        tokens = re.split(r'[\W_]+', name)
        return 1 if 'ad' in tokens else 0

    def _make_loader(self, dataset, batch_size: int, shuffle: bool) -> DataLoader:
        """Wrap *dataset* in a DataLoader using the settings shared by all loaders."""
        return DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle,
            num_workers=min(4, mp.cpu_count()), pin_memory=torch.cuda.is_available()
        )

    def _resolve_test_loader(self, custom_test_loader, purpose: str):
        """
        Return the caller's loader, else the stored one, else raise ValueError.

        The choice is made with ``is None`` checks rather than truthiness so
        that an explicitly supplied loader is never replaced just because it
        evaluates as false (for example when it is empty).
        """
        test_loader = custom_test_loader
        if test_loader is None:
            test_loader = getattr(self, 'test_loader', None)
        if test_loader is None:
            raise ValueError(f"No test data available for {purpose}")
        return test_loader

    def prepare_data(self, features_dict: Dict, linguistic_features: Dict,
                    batch_size: int = 8) -> Tuple[DataLoader, DataLoader, DataLoader]:
        """
        Prepare data loaders for training

        Labels are inferred from the keys of ``features_dict`` and the ids
        are split 80/20 into train+val/test, then 80/20 into train/val
        (stratified where possible, otherwise a plain random split).

        Args:
            features_dict: Dictionary of acoustic features keyed by file id
            linguistic_features: Dictionary of linguistic features keyed by
                file id; ids missing here are simply left out of that subset
            batch_size: Batch size used by all three loaders

        Returns:
            Tuple of (train_loader, val_loader, test_loader)
        """
        self.logger.info("Preparing data loaders...")

        # Create labels based on file naming convention
        file_ids = list(features_dict)
        labels = {file_id: self._infer_label(file_id) for file_id in file_ids}

        positives = sum(labels.values())
        negatives = len(labels) - positives

        self.logger.info("Dataset summary:")
        self.logger.info(f"- Total files: {len(features_dict)}")
        self.logger.info(f"- AD/Decline cases: {positives}")
        self.logger.info(f"- CN/No decline cases: {negatives}")

        # Validate that we have both classes
        if positives == 0 or negatives == 0:
            self.logger.warning("Dataset appears to have only one class! Check labeling logic.")

        # Split data stratified
        try:
            train_ids, test_ids = train_test_split(
                file_ids, test_size=0.2,
                stratify=[labels[f] for f in file_ids],
                random_state=42
            )
            train_ids, val_ids = train_test_split(
                train_ids, test_size=0.2,
                stratify=[labels[f] for f in train_ids],
                random_state=42
            )
        except ValueError as e:
            # Stratification needs enough members per class in each split.
            self.logger.warning(f"Stratified split failed: {e}. Using random split.")
            train_ids, test_ids = train_test_split(file_ids, test_size=0.2, random_state=42)
            train_ids, val_ids = train_test_split(train_ids, test_size=0.2, random_state=42)

        self.logger.info(f"Data split - Train: {len(train_ids)}, "
                        f"Val: {len(val_ids)}, Test: {len(test_ids)}")

        def build_dataset(ids):
            # Every id originates from features_dict (and therefore labels);
            # only the linguistic features may be missing for an id.
            return ADReSSoDataset(
                {fid: features_dict[fid] for fid in ids},
                {fid: linguistic_features[fid] for fid in ids if fid in linguistic_features},
                {fid: labels[fid] for fid in ids}
            )

        # Only the training loader shuffles.
        train_loader = self._make_loader(build_dataset(train_ids), batch_size, shuffle=True)
        val_loader = self._make_loader(build_dataset(val_ids), batch_size, shuffle=False)
        test_loader = self._make_loader(build_dataset(test_ids), batch_size, shuffle=False)

        self.logger.info("Data loaders created successfully")
        return train_loader, val_loader, test_loader

    def train_model(self, features_dict: Dict, linguistic_features: Dict,
                   num_epochs: int = 30, batch_size: int = 8) -> Dict[str, Any]:
        """
        Train the multi-modal model

        Args:
            features_dict: Dictionary of acoustic features
            linguistic_features: Dictionary of linguistic features
            num_epochs: Number of training epochs
            batch_size: Batch size for training

        Returns:
            Training results dictionary

        Raises:
            Exception: Any error raised during preparation or training is
                logged and re-raised unchanged
        """
        self.logger.info("Starting model training...")

        try:
            # Prepare data
            train_loader, val_loader, test_loader = self.prepare_data(
                features_dict, linguistic_features, batch_size
            )

            # Store test loader for later evaluation
            self.test_loader = test_loader

            # Train the model
            self.training_results = self.trainer.train_model(
                train_loader, val_loader, num_epochs
            )

            self.is_trained = True
            self.logger.info("Model training completed successfully")

            return self.training_results

        except Exception as e:
            self.logger.error(f"Error during model training: {str(e)}")
            raise

    def evaluate_model(self, custom_test_loader: Optional[DataLoader] = None) -> Dict[str, Any]:
        """
        Evaluate the trained model

        Args:
            custom_test_loader: Optional custom test data loader; when
                omitted, the loader stored by ``train_model`` is used

        Returns:
            Evaluation results dictionary

        Raises:
            ValueError: If the model is not trained/loaded, or if no test
                loader is available
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        self.logger.info("Starting model evaluation...")

        try:
            # Use provided test loader or stored one
            test_loader = self._resolve_test_loader(custom_test_loader, "evaluation")

            # Perform detailed evaluation
            self.evaluation_results = self.evaluator.evaluate_detailed(test_loader)

            # Generate comprehensive report
            self.evaluator.generate_evaluation_report(self.evaluation_results)

            self.logger.info("Model evaluation completed successfully")
            return self.evaluation_results

        except Exception as e:
            self.logger.error(f"Error during model evaluation: {str(e)}")
            raise

    def analyze_semantics(self, custom_test_loader: Optional[DataLoader] = None,
                         num_samples: int = 5) -> None:
        """
        Analyze semantic relationships between modalities

        Args:
            custom_test_loader: Optional custom test data loader; when
                omitted, the loader stored by ``train_model`` is used
            num_samples: Number of samples to analyze

        Raises:
            ValueError: If the model is not trained/loaded, or if no test
                loader is available
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before semantic analysis")

        self.logger.info("Starting semantic relationship analysis...")

        try:
            # Use provided test loader or stored one
            test_loader = self._resolve_test_loader(custom_test_loader, "semantic analysis")

            # Perform semantic analysis
            self.evaluator.analyze_semantic_relationships(test_loader, num_samples)

            self.logger.info("Semantic analysis completed successfully")

        except Exception as e:
            self.logger.error(f"Error during semantic analysis: {str(e)}")
            raise

    def save_model(self, filename: str = "final_adresso_model.pth") -> None:
        """
        Save the trained model

        Args:
            filename: Name of the model file

        Raises:
            ValueError: If the model has not been trained or loaded
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before saving")

        try:
            self.trainer.save_model(filename)
            self.logger.info(f"Model saved successfully as {filename}")
        except Exception as e:
            self.logger.error(f"Error saving model: {str(e)}")
            raise

    def load_model(self, filename: str = "best_adresso_model.pth") -> None:
        """
        Load a pre-trained model

        A successful load marks the service as trained, so evaluation and
        prediction become available (evaluation then needs a custom loader).

        Args:
            filename: Name of the model file to load
        """
        try:
            self.trainer.load_model(filename)
            self.is_trained = True
            self.logger.info(f"Model loaded successfully from {filename}")
        except Exception as e:
            self.logger.error(f"Error loading model: {str(e)}")
            raise

    def predict(self, features_dict: Dict, linguistic_features: Dict,
               batch_size: int = 8) -> Dict[str, Any]:
        """
        Make predictions on new data

        Args:
            features_dict: Dictionary of acoustic features
            linguistic_features: Dictionary of linguistic features
            batch_size: Batch size for prediction

        Returns:
            Dictionary with ``predictions`` (one DataFrame row per sample)
            and ``summary`` (counts as built-in ints)

        Raises:
            ValueError: If the model has not been trained or loaded
        """
        if not self.is_trained:
            raise ValueError("Model must be trained or loaded before making predictions")

        self.logger.info("Making predictions on new data...")

        try:
            # The dataset requires labels; zeros act as ignored placeholders.
            temp_labels = {fid: 0 for fid in features_dict}

            dataset = ADReSSoDataset(features_dict, linguistic_features, temp_labels)
            data_loader = self._make_loader(dataset, batch_size, shuffle=False)

            device = self.trainer.device
            model = self.trainer.model
            model.eval()

            all_preds = []
            all_probs = []
            all_file_ids = []

            with torch.no_grad():
                for batch in data_loader:
                    # Move to device
                    audio_features = batch['audio_features'].to(device)
                    text_input_ids = batch['text_input_ids'].to(device)
                    text_attention_mask = batch['text_attention_mask'].to(device)
                    spectrograms = batch['spectrogram'].to(device)

                    # Forward pass
                    outputs = model(
                        audio_features, text_input_ids,
                        text_attention_mask, spectrograms
                    )
                    probs = F.softmax(outputs, dim=1)
                    _, predicted = outputs.max(1)

                    # Store results
                    all_preds.extend(predicted.cpu().numpy())
                    all_probs.extend(probs.cpu().numpy())
                    all_file_ids.extend(batch['file_id'])

            # Create results DataFrame
            results_df = pd.DataFrame({
                'file_id': all_file_ids,
                'predicted_label': all_preds,
                'predicted_class': ['CN' if pred == 0 else 'AD' for pred in all_preds],
                'confidence': [max(prob) for prob in all_probs],
                'prob_CN': [prob[0] for prob in all_probs],
                'prob_AD': [prob[1] if len(prob) > 1 else 0 for prob in all_probs]
            })

            self.logger.info(f"Predictions completed for {len(results_df)} samples")

            # int() turns the NumPy counts into plain, JSON-friendly integers.
            return {
                'predictions': results_df,
                'summary': {
                    'total_samples': len(results_df),
                    'predicted_CN': int((results_df['predicted_label'] == 0).sum()),
                    'predicted_AD': int((results_df['predicted_label'] == 1).sum()),
                    'high_confidence': int((results_df['confidence'] > 0.9).sum())
                }
            }

        except Exception as e:
            self.logger.error(f"Error during prediction: {str(e)}")
            raise

    def get_training_summary(self) -> Optional[Dict[str, Any]]:
        """
        Get summary of training results

        Returns:
            Training summary dictionary, or None when no training results
            exist (never trained, or the model was only loaded)
        """
        if not self.is_trained or self.training_results is None:
            return None

        parameters = list(self.model.parameters())

        return {
            'model_info': {
                'total_parameters': sum(p.numel() for p in parameters),
                'trainable_parameters': sum(p.numel() for p in parameters if p.requires_grad),
                'model_size_mb': sum(p.numel() * p.element_size() for p in parameters) / (1024 * 1024)
            },
            'training_results': self.training_results,
            'evaluation_results': self.evaluation_results,
            'training_curves': {
                'train_losses': self.trainer.train_losses,
                'val_losses': self.trainer.val_losses,
                'train_accuracies': self.trainer.train_accuracies,
                'val_accuracies': self.trainer.val_accuracies
            }
        }

    def run_complete_pipeline(self, features_dict: Dict, linguistic_features: Dict,
                             num_epochs: int = 30, batch_size: int = 8,
                             semantic_samples: int = 5) -> Dict[str, Any]:
        """
        Run the complete training and evaluation pipeline

        Steps: train, evaluate, analyze semantics, save the final model.

        Args:
            features_dict: Dictionary of acoustic features
            linguistic_features: Dictionary of linguistic features
            num_epochs: Number of training epochs
            batch_size: Batch size for training
            semantic_samples: Number of samples for semantic analysis

        Returns:
            Complete pipeline results

        Raises:
            Exception: The first error raised by any step is logged and
                re-raised unchanged
        """
        self.logger.info("="*60)
        self.logger.info("STARTING COMPLETE TRAINING PIPELINE")
        self.logger.info("="*60)

        pipeline_start = datetime.now()

        try:
            # Step 1: Train model (results are kept on self.training_results)
            self.logger.info("Step 1: Training model...")
            self.train_model(
                features_dict, linguistic_features, num_epochs, batch_size
            )

            # Step 2: Evaluate model
            self.logger.info("Step 2: Evaluating model...")
            evaluation_results = self.evaluate_model()

            # Step 3: Analyze semantics
            self.logger.info("Step 3: Analyzing semantic relationships...")
            self.analyze_semantics(num_samples=semantic_samples)

            # Step 4: Save model
            self.logger.info("Step 4: Saving final model...")
            self.save_model("final_adresso_model.pth")

            pipeline_time = datetime.now() - pipeline_start

            # Compile results
            complete_results = {
                'pipeline_summary': {
                    'total_time': pipeline_time.total_seconds(),
                    'completion_time': datetime.now().isoformat(),
                    'status': 'success'
                },
                'training_summary': self.get_training_summary(),
                'best_performance': {
                    'accuracy': evaluation_results['accuracy'],
                    'f1_score': evaluation_results['f1'],
                    'auc': evaluation_results['auc']
                }
            }

            self.logger.info("="*60)
            self.logger.info("PIPELINE COMPLETED SUCCESSFULLY")
            self.logger.info(f"Total time: {pipeline_time}")
            self.logger.info(f"Best accuracy: {evaluation_results['accuracy']:.4f}")
            self.logger.info("="*60)

            return complete_results

        except Exception as e:
            self.logger.error(f"Pipeline failed: {str(e)}")
            raise

    def __del__(self):
        """Cleanup resources"""
        # The logger is missing when __init__ failed before creating it.
        if hasattr(self, 'logger'):
            self.logger.info("Training service cleanup completed")

# Extra

# pipeline_service.py - Main Pipeline Orchestrator


In [None]:
"""
Pipeline Service - Main orchestrator for ADReSSo21 speech analysis pipeline
Coordinates all microservices for complete analysis workflow
"""

import os
import json
import pickle
import pandas as pd
from typing import Dict, List, Any, Optional
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import multiprocessing as mp
from datetime import datetime
import logging

# Import your microservices
from config import Config
from utils import setup_logging, ensure_directory
from data_manager_service import DataManagerService
from acoustic_features_service import AcousticFeaturesService
from transcription_service import TranscriptionService
from linguistic_features_service import LinguisticFeaturesService


class PipelineService:
    """
    Main pipeline service that orchestrates all analysis components
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        Set up configuration, logging and the four analysis services.

        Args:
            config_path: Location of the configuration file; handed to
                ``Config`` unchanged (including the ``None`` default)
        """
        self.config = Config(config_path)

        # Logger settings are read from the configuration that was just built.
        level = self.config.get('logging.level', 'INFO')
        log_target = self.config.get('logging.file')
        self.logger = setup_logging(log_level=level, log_file=log_target)

        # Each service is constructed with the same configuration object.
        shared_config = self.config
        self.data_manager = DataManagerService(shared_config)
        self.acoustic_service = AcousticFeaturesService(shared_config)
        self.transcription_service = TranscriptionService(shared_config)
        self.linguistic_service = LinguisticFeaturesService(shared_config)

        # Run state: an empty results container; the two timestamps stay
        # None until run_complete_pipeline sets them.
        self.results = {}
        self.start_time = self.end_time = None

        self.logger.info("Pipeline initialized successfully")

    def run_complete_pipeline(self, parallel: bool = True) -> Dict[str, Any]:
        """
        Run the complete analysis pipeline

        Args:
            parallel: Whether to use parallel processing where possible

        Returns:
            Dictionary containing all pipeline results
        """
        self.start_time = datetime.now()
        self.logger.info("=== Starting ADReSSo21 Speech Analysis Pipeline ===")

        try:
            # Step 1: Load dataset and get audio files
            self.logger.info("Step 1: Loading dataset...")
            audio_files = self._load_dataset()

            # Step 2: Extract acoustic features
            self.logger.info("Step 2: Extracting acoustic features...")
            acoustic_features = self._extract_acoustic_features(audio_files, parallel)

            # Step 3: Extract transcripts
            self.logger.info("Step 3: Extracting transcripts...")
            transcripts = self._extract_transcripts(audio_files, parallel)

            # Step 4: Extract linguistic features
            self.logger.info("Step 4: Extracting linguistic features...")
            linguistic_features = self._extract_linguistic_features(transcripts)

            # Step 5: Combine and save results
            self.logger.info("Step 5: Combining and saving results...")
            final_results = self._combine_and_save_results(
                audio_files, acoustic_features, transcripts, linguistic_features
            )

            self.end_time = datetime.now()
            duration = self.end_time - self.start_time

            self.logger.info(f"Pipeline completed successfully in {duration}")
            self.logger.info(f"Results saved to: {self.config.output_path}")

            return final_results

        except Exception as e:
            self.logger.error(f"Pipeline failed: {str(e)}")
            raise

    def _load_dataset(self) -> Dict[str, List[str]]:
        """Load dataset and get audio file paths"""
        audio_files = self.data_manager.get_audio_files()

        total_files = sum(len(files) for files in audio_files.values())
        self.logger.info(f"Found {total_files} audio files across all categories")

        for category, files in audio_files.items():
            self.logger.info(f"  {category}: {len(files)} files")

        if total_files == 0:
            raise ValueError("No audio files found. Please check the dataset path.")

        return audio_files

    def _extract_acoustic_features(self, audio_files: Dict[str, List[str]],
                                 parallel: bool = True) -> Dict[str, Any]:
        """Extract acoustic features from all audio files"""
        all_features = {}

        if parallel:
            all_features = self._extract_acoustic_features_parallel(audio_files)
        else:
            all_features = self._extract_acoustic_features_sequential(audio_files)

        # Save acoustic features
        features_path = os.path.join(self.config.output_path, "acoustic_features.pkl")
        with open(features_path, 'wb') as f:
            pickle.dump(all_features, f)

        self.logger.info(f"Acoustic features saved to {features_path}")
        return all_features

    def _extract_acoustic_features_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract acoustic features using parallel processing"""
        all_features = {}
        max_workers = min(self.config.get('processing.max_workers', mp.cpu_count()), mp.cpu_count())

        # Flatten all files with their categories
        file_tasks = []
        for category, files in audio_files.items():
            for file_path in files:
                file_tasks.append((file_path, category))

        self.logger.info(f"Processing {len(file_tasks)} files with {max_workers} workers")

        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_file = {
                executor.submit(self.acoustic_service.extract_features, file_path): (file_path, category)
                for file_path, category in file_tasks
            }

            # Collect results
            completed = 0
            for future in as_completed(future_to_file):
                file_path, category = future_to_file[future]
                filename = os.path.basename(file_path)

                try:
                    features = future.result()
                    if features is not None:
                        all_features[f"{category}_{filename}"] = {
                            'file_path': file_path,
                            'category': category,
                            'filename': filename,
                            'features': features
                        }
                    else:
                        self.logger.warning(f"Failed to extract features from {filename}")

                except Exception as e:
                    self.logger.error(f"Error processing {filename}: {str(e)}")

                completed += 1
                if completed % 10 == 0:
                    self.logger.info(f"Completed acoustic feature extraction for {completed}/{len(file_tasks)} files")

        return all_features

    def _extract_acoustic_features_sequential(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract acoustic features sequentially"""
        all_features = {}
        total_files = sum(len(files) for files in audio_files.values())
        processed = 0

        for category, files in audio_files.items():
            self.logger.info(f"Processing acoustic features for {category}...")

            for file_path in files:
                filename = os.path.basename(file_path)

                try:
                    features = self.acoustic_service.extract_features(file_path)
                    if features is not None:
                        all_features[f"{category}_{filename}"] = {
                            'file_path': file_path,
                            'category': category,
                            'filename': filename,
                            'features': features
                        }
                    else:
                        self.logger.warning(f"Failed to extract features from {filename}")

                except Exception as e:
                    self.logger.error(f"Error processing {filename}: {str(e)}")

                processed += 1
                if processed % 10 == 0:
                    self.logger.info(f"Completed {processed}/{total_files} files")

        return all_features

    def _extract_transcripts(self, audio_files: Dict[str, List[str]],
                           parallel: bool = True) -> Dict[str, Any]:
        """Extract transcripts from all audio files"""
        if parallel:
            transcripts = self._extract_transcripts_parallel(audio_files)
        else:
            transcripts = self._extract_transcripts_sequential(audio_files)

        # Save transcripts
        self._save_transcripts(transcripts)

        return transcripts

    def _extract_transcripts_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract transcripts using parallel processing"""
        transcripts = {}
        max_workers = min(self.config.get('processing.transcription_workers', 2), 4)  # Limit for memory

        # Flatten all files with their categories
        file_tasks = []
        for category, files in audio_files.items():
            for file_path in files:
                file_tasks.append((file_path, category))

        self.logger.info(f"Transcribing {len(file_tasks)} files with {max_workers} workers")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_file = {
                executor.submit(self.transcription_service.transcribe_audio, file_path): (file_path, category)
                for file_path, category in file_tasks
            }

            # Collect results
            completed = 0
            for future in as_completed(future_to_file):
                file_path, category = future_to_file[future]
                filename = os.path.basename(file_path)

                try:
                    transcript_data = future.result()
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        **transcript_data
                    }

                except Exception as e:
                    self.logger.error(f"Error transcribing {filename}: {str(e)}")
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        'transcript': '',
                        'error': str(e)
                    }

                completed += 1
                if completed % 5 == 0:
                    self.logger.info(f"Completed transcription for {completed}/{len(file_tasks)} files")

        return transcripts

    def _extract_transcripts_sequential(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract transcripts sequentially"""
        transcripts = {}

        for category, files in audio_files.items():
            self.logger.info(f"Transcribing {category}...")

            for file_path in files:
                filename = os.path.basename(file_path)

                try:
                    transcript_data = self.transcription_service.transcribe_audio(file_path)
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        **transcript_data
                    }

                except Exception as e:
                    self.logger.error(f"Error transcribing {filename}: {str(e)}")
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        'transcript': '',
                        'error': str(e)
                    }

        return transcripts

    def _extract_linguistic_features(self, transcripts: Dict[str, Any]) -> Dict[str, Any]:
        """Extract linguistic features from transcripts"""
        linguistic_features = self.linguistic_service.extract_features(transcripts)

        # Save linguistic features
        features_path = os.path.join(self.config.output_path, "linguistic_features.pkl")
        with open(features_path, 'wb') as f:
            pickle.dump(linguistic_features, f)

        self.logger.info(f"Linguistic features saved to {features_path}")
        return linguistic_features

    def _save_transcripts(self, transcripts: Dict[str, Any]):
        """
        Write the transcripts to the ``transcripts`` folder in three forms.

        Inside ``<config.output_path>/transcripts`` this creates one
        ``<key>_transcript.txt`` per record with a non-empty transcript,
        ``all_transcripts.json`` with every record, and ``transcripts.pkl``
        with the pickled dictionary.

        Args:
            transcripts: Mapping of record key -> transcript record
        """
        transcripts_dir = os.path.join(self.config.output_path, "transcripts")
        ensure_directory(transcripts_dir)

        # Plain-text files: records without transcript text are skipped.
        for key, record in transcripts.items():
            if 'transcript' not in record or not record['transcript']:
                continue

            text_path = os.path.join(transcripts_dir, f"{key}_transcript.txt")
            with open(text_path, 'w', encoding='utf-8') as text_file:
                text_file.write(record['transcript'])

        # Consolidated JSON, keeping non-ASCII characters readable.
        json_path = os.path.join(transcripts_dir, "all_transcripts.json")
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(transcripts, json_file, indent=2, ensure_ascii=False)

        # Pickle of the same dictionary.
        pkl_path = os.path.join(transcripts_dir, "transcripts.pkl")
        with open(pkl_path, 'wb') as pickle_file:
            pickle.dump(transcripts, pickle_file)

        self.logger.info(f"Transcripts saved to {transcripts_dir}")

    def _combine_and_save_results(self, audio_files: Dict[str, List[str]],
                                acoustic_features: Dict[str, Any],
                                transcripts: Dict[str, Any],
                                linguistic_features: Dict[str, Any]) -> Dict[str, Any]:
        """Combine all results and save comprehensive dataset"""

        # Create comprehensive results dictionary
        final_results = {
            'pipeline_info': {
                'start_time': self.start_time.isoformat(),
                'end_time': self.end_time.isoformat() if self.end_time else None,
                'total_files': sum(len(files) for files in audio_files.values()),
                'categories': list(audio_files.keys()),
                'config': self.config.to_dict()
            },
            'audio_files': audio_files,
            'acoustic_features': acoustic_features,
            'transcripts': transcripts,
            'linguistic_features': linguistic_features
        }

        # Create summary DataFrame
        summary_data = []
        for key in set(acoustic_features.keys()) | set(transcripts.keys()):
            row = {'file_id': key}

            # Add acoustic info
            if key in acoustic_features:
                row.update({
                    'category': acoustic_features[key]['category'],
                    'filename': acoustic_features[key]['filename'],
                    'has_acoustic_features': True
                })

            # Add transcript info
            if key in transcripts:
                row.update({
                    'has_transcript': True,
                    'transcript_length': len(transcripts[key].get('transcript', '')),
                    'word_count': len(transcripts[key].get('transcript', '').split()),
                    'language': transcripts[key].get('language', 'unknown'),
                    'has_transcript_error': 'error' in transcripts[key]
                })
            else:
                row.update({
                    'has_transcript': False,
                    'transcript_length': 0,
                    'word_count': 0
                })

            # Add linguistic info
            if key in linguistic_features:
                row.update({
                    'has_linguistic_features': True,
                    'unique_words': linguistic_features[key].get('unique_words', 0),
                    'lexical_diversity': linguistic_features[key].get('lexical_diversity', 0)
                })
            else:
                row.update({
                    'has_linguistic_features': False,
                    'unique_words': 0,
                    'lexical_diversity': 0
                })

            summary_data.append(row)

        summary_df = pd.DataFrame(summary_data)

        # Save summary
        summary_path = os.path.join(self.config.output_path, "pipeline_summary.csv")
        summary_df.to_csv(summary_path, index=False)

        # Save complete results
        results_path = os.path.join(self.config.output_path, "complete_results.pkl")
        with open(results_path, 'wb') as f:
            pickle.dump(final_results, f)

        self.logger.info("="*50)
        self.logger.info("PIPELINE SUMMARY")
        self.logger.info("="*50)
        self.logger.info(f"Total files processed: {len(summary_data)}")
        self.logger.info(f"Files with acoustic features: {summary_df['has_acoustic_features'].sum()}")
        self.logger.info(f"Files with transcripts: {summary_df['has_transcript'].sum()}")
        self.logger.info(f"Files with linguistic features: {summary_df['has_linguistic_features'].sum()}")
        self.logger.info(f"Average words per transcript: {summary_df['word_count'].mean():.1f}")
        self.logger.info("="*50)
        self.logger.info("Output files:")
        self.logger.info(f"  - Complete results: {results_path}")
        self.logger.info(f"  - Pipeline summary: {summary_path}")
        self.logger.info(f"  - Acoustic features: {os.path.join(self.config.output_path, 'acoustic_features.pkl')}")
        self.logger.info(f"  - Transcripts: {os.path.join(self.config.output_path, 'transcripts/')}")
        self.logger.info(f"  - Linguistic features: {os.path.join(self.config.output_path, 'linguistic_features.pkl')}")

        return final_results

    def run_sample_analysis(self, max_files_per_category: int = 2):
        """Run pipeline on a small sample for testing"""
        self.logger.info(f"Running sample analysis with max {max_files_per_category} files per category")

        # Get limited audio files
        all_audio_files = self.data_manager.get_audio_files()
        sample_audio_files = {}

        for category, files in all_audio_files.items():
            sample_audio_files[category] = files[:max_files_per_category]

        # Run pipeline on sample
        return self.run_complete_pipeline(parallel=False)

    def get_pipeline_status(self) -> Dict[str, Any]:
        """Get current pipeline status"""
        return {
            'start_time': self.start_time.isoformat() if self.start_time else None,
            'end_time': self.end_time.isoformat() if self.end_time else None,
            'is_running': self.start_time is not None and self.end_time is None,
            'output_path': self.config.output_path,
            'results_available': bool(self.results)
        }


if __name__ == "__main__":
    # Example usage when this module is executed directly: build a pipeline
    # with its default arguments and run it twice.
    pipeline = PipelineService()

    # Run sample analysis first (at most one file per category requested)
    print("Running sample analysis...")
    sample_results = pipeline.run_sample_analysis(max_files_per_category=1)

    # Then run the full pipeline with parallel processing enabled
    print("\nRunning full pipeline...")
    results = pipeline.run_complete_pipeline(parallel=True)

# main.py - Main Application Entry Point

In [None]:
"""
Main Application Entry Point for ADReSSo21 Speech Analysis
Command-line interface for running the complete analysis pipeline
"""

import argparse
import sys
import os
from pathlib import Path
import json
import traceback
from datetime import datetime

# Add current directory to Python path
current_dir = Path(__file__).parent
sys.path.append(str(current_dir))

from pipeline_service import PipelineService
from config import Config
from utils import setup_logging


def create_sample_config(config_path: str = "config.json"):
    """Write a sample configuration file for first-time setup.

    Args:
        config_path: Destination of the JSON file. Defaults to
            ``config.json`` in the current working directory, which was the
            previously hard-coded location. An existing file is overwritten.

    Returns:
        The path the configuration was written to.
    """
    sample_config = {
        "dataset": {
            "base_path": "C:/Users/Administrator/Desktop/Speech/ADReSSo21",
            "diagnosis_train_path": "ADReSSo21-diagnosis-train/ADReSSo21/diagnosis/train",
            "progression_train_path": "ADReSSo21-progression-train/ADReSSo21/progression/train",
            "progression_test_path": "ADReSSo21-progression-test/ADReSSo21/progression/test-dist"
        },
        "output": {
            "base_path": "C:/Users/Administrator/Desktop/Speech/output",
            "create_timestamped_folders": True
        },
        "processing": {
            "max_workers": 8,
            "transcription_workers": 2,
            "batch_size": 10,
            "enable_parallel": True
        },
        "models": {
            "whisper_model": "base",
            "wav2vec_model": "facebook/wav2vec2-base-960h",
            "bert_model": "bert-base-uncased"
        },
        "features": {
            "acoustic": {
                "sample_rate": 16000,
                "n_mfcc": 13,
                "n_mels": 80,
                "extract_egemaps": True,
                "extract_prosodic": True
            },
            "linguistic": {
                "max_sequence_length": 512,
                "extract_basic_stats": True,
                "extract_bert_features": True
            }
        },
        "logging": {
            "level": "INFO",
            "file": "adresso_pipeline.log",
            "console": True
        }
    }

    # Explicit encoding keeps the file identical regardless of the platform's
    # default locale encoding.
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(sample_config, f, indent=2)

    print(f"Sample configuration created: {config_path}")
    print(f"Please edit the paths in {config_path} to match your setup before running the pipeline.")
    return config_path


def validate_paths(config: "Config") -> bool:
    """Check that the dataset base path and at least one dataset directory exist.

    Args:
        config: Configuration object exposing ``get(dotted_key)``; the keys
            read are ``dataset.base_path``, ``dataset.diagnosis_train_path``,
            ``dataset.progression_train_path`` and
            ``dataset.progression_test_path``.

    Returns:
        ``True`` when the base path exists and at least one configured
        sub-directory exists beneath it, otherwise ``False`` (after printing
        the reason).
    """
    base_path = config.get('dataset.base_path')

    # A missing setting yields None, which os.path.exists() rejects with a
    # TypeError; report it as a configuration error instead.
    if not base_path:
        print("Error: Dataset base path is not configured (dataset.base_path)")
        return False

    if not os.path.exists(base_path):
        print(f"Error: Dataset base path does not exist: {base_path}")
        return False

    # Only one of the dataset directories is required, so unset entries are
    # skipped rather than passed to os.path.join().
    configured_subdirs = [
        subdir
        for subdir in (
            config.get('dataset.diagnosis_train_path'),
            config.get('dataset.progression_train_path'),
            config.get('dataset.progression_test_path'),
        )
        if subdir
    ]

    found_dirs = [
        subdir
        for subdir in configured_subdirs
        if os.path.exists(os.path.join(base_path, subdir))
    ]

    if not found_dirs:
        print("Error: No valid dataset directories found!")
        print(f"Checked paths under {base_path}:")
        for subdir in configured_subdirs:
            print(f"  - {subdir}")
        return False

    print(f"Found dataset directories: {found_dirs}")
    return True


def run_pipeline_command(args):
    """Handle the ``run`` subcommand.

    Builds a ``PipelineService`` from ``args.config``, validates the dataset
    paths, then runs either the sample analysis (``args.sample``) or the
    complete pipeline and prints a short completion report.

    Returns:
        ``0`` on success, ``1`` when path validation fails or any exception
        is raised (the traceback is printed only when ``args.debug`` is set).
    """
    try:
        pipeline = PipelineService(args.config)

        # Stop early when the configured dataset location is unusable.
        if not validate_paths(pipeline.config):
            return 1

        if not args.sample:
            print("Running complete pipeline...")
            results = pipeline.run_complete_pipeline(parallel=args.parallel)
        else:
            print("Running sample analysis...")
            results = pipeline.run_sample_analysis(max_files_per_category=args.sample_size)

        print("\n" + "="*50)
        print("PIPELINE COMPLETED SUCCESSFULLY")
        print("="*50)

        # Completion report
        total_files = results['pipeline_info']['total_files']
        print(f"Total files processed: {total_files}")
        print(f"Output directory: {pipeline.config.output_path}")

    except Exception as exc:
        print(f"\nPipeline failed with error: {str(exc)}")
        if args.debug:
            print("\nFull traceback:")
            traceback.print_exc()
        return 1

    return 0


def run_status_command(args):
    """Handle the ``status`` subcommand.

    Creates a ``PipelineService`` from ``args.config`` and prints the values
    reported by its ``get_pipeline_status()``.

    Returns:
        ``0`` when the status was printed, ``1`` if anything raised.
    """
    try:
        status = PipelineService(args.config).get_pipeline_status()

        print("Pipeline Status:")
        print(f"  Output Path: {status['output_path']}")
        print(f"  Is Running: {status['is_running']}")
        print(f"  Results Available: {status['results_available']}")

        # Timestamps are printed only when they are set.
        started = status['start_time']
        if started:
            print(f"  Last Start Time: {status['start_time']}")
        finished = status['end_time']
        if finished:
            print(f"  Last End Time: {status['end_time']}")

    except Exception as exc:
        print(f"Error checking status: {str(exc)}")
        return 1

    return 0


def run_demo_command(args):
    """Handle the ``demo`` subcommand.

    Creates a ``PipelineService`` from ``args.config`` and runs its sample
    analysis with ``max_files_per_category=1``.

    Returns:
        ``0`` on success, ``1`` if anything raised (the traceback is printed
        only when ``args.debug`` is set).
    """
    try:
        pipeline = PipelineService(args.config)

        print("Running demo analysis...")
        print("This will process 1 file from each available category")

        # Minimal run; the returned results are not needed here.
        pipeline.run_sample_analysis(max_files_per_category=1)

        print("\nDemo completed successfully!")

    except Exception as exc:
        print(f"Demo failed: {str(exc)}")
        if args.debug:
            traceback.print_exc()
        return 1

    return 0


def main():
    """Main application entry point"""
    parser = argparse.ArgumentParser(
        description="ADReSSo21 Speech Analysis Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Create sample configuration
  python main.py init

  # Run demo analysis
  python main.py demo

  # Run complete pipeline
  python main.py run

  # Run with custom config
  python main.py run --config my_config.json

  # Run sample analysis only
  python main.py run --sample --sample-size 2

  # Run without parallel processing
  python main.py run --no-parallel

  # Check status
  python main.py status
        """
    )

    # Global arguments
    parser.add_argument('--config', '-c', default='config.json',
                       help='Configuration file path (default: config.json)')
    parser.add_argument('--debug', action='store_true',
                       help='Enable debug mode with full error traces')

    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Init command
    init_parser = subparsers.add_parser('init', help='Create sample configuration file')

    # Run command
    run_parser = subparsers.add_parser('run', help='Run the analysis pipeline')
    run_parser.add_argument('--sample', action='store_true',
                           help='Run on sample data only')
    run_parser.add_argument('--sample-size', type=int, default=2,
                           help='Number of files per category for sample run (default: 2)')
    run_parser.add_argument('--no-parallel', dest='parallel', action='store_false',
                           help='Disable parallel processing')

    # Demo command
    demo_parser = subparsers.add_parser('demo', help='Run demo analysis')

    # Status command
    status_parser = subparsers.add_parser('status', help='Check pipeline status')

    # Parse arguments
    args = parser.parse_args()

    # Handle commands
    if args.command == 'init':
        create_sample_config()
        return 0

    elif args.command == 'run':
        return run_pipeline_command(args)

    elif args.command == 'demo':
        return run_demo_command(args)

    elif args.command == 'status':
        return run_status_command(args)

    else:
        # No command specified, show help
        parser.print_help()
        return 0


if __name__ == "__main__":
    # Hand the command's return code to the shell as the process exit status.
    sys.exit(main())


# requirements.txt - Project Dependencies


In [None]:
# Core dependencies for ADReSSo21 Speech Analysis Pipeline

# Audio processing
librosa>=0.10.0
soundfile>=0.12.1
opensmile>=2.4.2

# Speech recognition and transcription
openai-whisper>=20231117
transformers>=4.35.0
torch>=2.0.0
torchaudio>=2.0.0

# NLP and language models
tokenizers>=0.14.0
numpy>=1.24.0
scipy>=1.10.0

# Data handling and processing
pandas>=2.0.0
scikit-learn>=1.3.0

# Parallel processing
joblib>=1.3.0

# Configuration and utilities
pyyaml>=6.0
python-dotenv>=1.0.0

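# Optional GPU support: CUDA builds of torch/torchaudio are selected through the
# PyTorch wheel index rather than a "+cu118" suffix on a ">=" requirement, e.g.
#   pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118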

# Development and testing (optional)
pytest>=7.4.0
jupyter>=1.0.0
matplotlib>=3.7.0
seaborn>=0.12.0




# setup.py - Project Setup Script



In [None]:
"""
Setup script for ADReSSo21 Speech Analysis Pipeline
Handles installation, environment setup, and model downloads
"""

import os
import sys
import subprocess
import platform
from pathlib import Path
import urllib.request
import zipfile
import json


class PipelineSetup:
    """Setup and installation handler for the pipeline.

    Every step method prints its progress and returns ``True`` on success or
    ``False`` on failure; :meth:`run_setup` runs the steps in order and stops
    at the first one that fails.
    """

    def __init__(self):
        # All files and directories are created next to this script.
        self.project_root = Path(__file__).parent
        self.system_info = {
            'os': platform.system(),
            'python_version': sys.version,
            'architecture': platform.architecture()[0]
        }

    def check_system_requirements(self):
        """Check the Python version and report available memory and disk space.

        Only an interpreter older than 3.8 fails the check; low memory or low
        disk space produce warnings.

        Returns:
            ``False`` when the Python version is too old, otherwise ``True``.
        """
        import shutil  # local import: stdlib, used only for the disk check

        print("Checking system requirements...")

        # Check Python version
        if sys.version_info < (3, 8):
            print("❌ Python 3.8+ required. Current version:", sys.version)
            return False
        print("✅ Python version:", sys.version.split()[0])

        # Check available memory (approximate); psutil is optional.
        try:
            import psutil
        except ImportError:
            print("⚠️  Cannot check memory (psutil not available)")
        else:
            memory_gb = psutil.virtual_memory().total / (1024**3)
            if memory_gb < 8:
                print(f"⚠️  Warning: Low memory detected ({memory_gb:.1f}GB). 16GB+ recommended.")
            else:
                print(f"✅ Memory: {memory_gb:.1f}GB")

        # Check disk space with the standard library so the check also works
        # when psutil is missing.
        try:
            disk_space = shutil.disk_usage('.').free / (1024**3)
        except OSError:
            print("⚠️  Cannot check disk space")
        else:
            if disk_space < 10:
                print(f"⚠️  Warning: Low disk space ({disk_space:.1f}GB). 20GB+ recommended.")
            else:
                print(f"✅ Disk space: {disk_space:.1f}GB available")

        return True

    def install_dependencies(self):
        """Upgrade pip and install ``requirements.txt`` from the project root.

        Returns:
            ``True`` when both pip invocations succeed; ``False`` when
            ``requirements.txt`` is missing or pip exits with an error.
        """
        print("\nInstalling Python dependencies...")

        requirements_file = self.project_root / "requirements.txt"

        if not requirements_file.exists():
            print("❌ requirements.txt not found!")
            return False

        try:
            # Upgrade pip first
            subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"],
                         check=True)

            # Install requirements
            subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(requirements_file)],
                         check=True)

            print("✅ Dependencies installed successfully")
            return True

        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to install dependencies: {e}")
            return False

    def setup_directories(self):
        """Create the working directories under the project root.

        Returns:
            ``True`` once every directory exists.
        """
        print("\nSetting up directories...")

        directories = [
            "output",
            "logs",
            "models",
            "temp",
            "data"
        ]

        for directory in directories:
            dir_path = self.project_root / directory
            dir_path.mkdir(exist_ok=True)
            print(f"✅ Created/verified: {directory}/")

        return True

    def download_sample_data(self):
        """Create an empty sample dataset directory tree and a README.

        Nothing is downloaded: only the directory structure under
        ``data/sample`` and a ``README.md`` describing it are written.

        Returns:
            ``True`` once the structure and README exist.
        """
        print("\nSetting up sample data...")

        # Create a minimal sample structure for testing
        sample_dir = self.project_root / "data" / "sample"
        sample_dir.mkdir(parents=True, exist_ok=True)

        # Create sample directory structure
        sample_structure = [
            "diagnosis/train/audio/ad",
            "diagnosis/train/audio/cn",
            "diagnosis/train/segmentation/ad",
            "diagnosis/train/segmentation/cn",
            "progression/train/audio/decline",
            "progression/train/audio/no_decline",
            "progression/train/segmentation/decline",
            "progression/train/segmentation/no_decline",
            "progression/test-dist/audio",
            "progression/test-dist/segmentation"
        ]

        for structure in sample_structure:
            (sample_dir / structure).mkdir(parents=True, exist_ok=True)

        # Create a sample README
        readme_content = """
# Sample Data Directory Structure

This directory contains the expected structure for ADReSSo21 dataset.

## Directory Structure:
- diagnosis/train/audio/ad/          - Alzheimer's audio files
- diagnosis/train/audio/cn/          - Control audio files
- diagnosis/train/segmentation/      - Segmentation files
- progression/train/audio/           - Progression training audio
- progression/test-dist/audio/       - Progression test audio

## Usage:
Place your actual ADReSSo21 dataset files in this structure, or update
the paths in config.json to point to your dataset location.
"""

        # Explicit encoding so the file does not depend on the locale default.
        with open(sample_dir / "README.md", "w", encoding="utf-8") as f:
            f.write(readme_content)

        print("✅ Sample directory structure created")
        return True

    def create_default_config(self):
        """Write ``config.json`` in the project root, pointing at the sample data.

        Returns:
            ``True`` once the file has been written.
        """
        print("\nCreating default configuration...")

        config = {
            "dataset": {
                "base_path": str(self.project_root / "data" / "sample"),
                "diagnosis_train_path": "diagnosis/train",
                "progression_train_path": "progression/train",
                "progression_test_path": "progression/test-dist"
            },
            "output": {
                "base_path": str(self.project_root / "output"),
                "create_timestamped_folders": True
            },
            "processing": {
                # os.cpu_count() returns None when the count cannot be
                # determined; fall back to a single worker in that case.
                "max_workers": min(os.cpu_count() or 1, 8),
                "transcription_workers": 2,
                "batch_size": 10,
                "enable_parallel": True
            },
            "models": {
                "whisper_model": "base",
                "wav2vec_model": "facebook/wav2vec2-base-960h",
                "bert_model": "bert-base-uncased"
            },
            "features": {
                "acoustic": {
                    "sample_rate": 16000,
                    "n_mfcc": 13,
                    "n_mels": 80,
                    "extract_egemaps": True,
                    "extract_prosodic": True
                },
                "linguistic": {
                    "max_sequence_length": 512,
                    "extract_basic_stats": True,
                    "extract_bert_features": True
                }
            },
            "logging": {
                "level": "INFO",
                "file": "adresso_pipeline.log",
                "console": True
            }
        }

        config_path = self.project_root / "config.json"
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

        print(f"✅ Default configuration created: {config_path}")
        return True

    def verify_installation(self):
        """Try to import the core third-party modules and report the outcome.

        Returns:
            ``True`` when every module imports, ``False`` otherwise.
        """
        print("\nVerifying installation...")

        # Test imports
        test_imports = [
            'librosa',
            'whisper',
            'transformers',
            'torch',
            'opensmile',
            'pandas',
            'numpy'
        ]

        failed_imports = []
        for module in test_imports:
            try:
                __import__(module)
                print(f"✅ {module}")
            except Exception as e:
                # A broken installation can fail with errors other than
                # ImportError (for example a native library that will not
                # load); report it instead of aborting the verification.
                print(f"❌ {module}: {e}")
                failed_imports.append(module)

        if failed_imports:
            print(f"\n❌ Failed to import: {failed_imports}")
            print("Please check the installation and try running:")
            print("pip install -r requirements.txt")
            return False

        print("\n✅ All modules imported successfully!")
        return True

    def run_setup(self):
        """Run all setup steps in order, stopping at the first failure.

        Returns:
            ``True`` when every step succeeded, ``False`` otherwise.
        """
        print("="*60)
        print("ADReSSo21 Speech Analysis Pipeline Setup")
        print("="*60)

        steps = [
            ("System Requirements", self.check_system_requirements),
            ("Dependencies", self.install_dependencies),
            ("Directories", self.setup_directories),
            ("Sample Data", self.download_sample_data),
            ("Configuration", self.create_default_config),
            ("Verification", self.verify_installation)
        ]

        for step_name, step_func in steps:
            print(f"\n{'='*20} {step_name} {'='*20}")
            if not step_func():
                print(f"\n❌ Setup failed at step: {step_name}")
                return False

        print("\n" + "="*60)
        print("🎉 SETUP COMPLETED SUCCESSFULLY!")
        print("="*60)
        print("\nNext steps:")
        print("1. Update config.json with your dataset paths")
        print("2. Run: python main.py demo")
        print("3. Run: python main.py run")
        print("\nFor help: python main.py --help")

        return True


def main():
    """Main setup function.

    Reads the first command-line argument to choose between the full setup,
    ``--verify-only``, ``--deps-only`` and ``--help``.

    Returns:
        ``0`` when the selected action succeeded (or help was shown), ``1``
        when it reported a failure. Previously the result of the setup steps
        was discarded and the script always exited with status 0.
    """
    option = sys.argv[1] if len(sys.argv) > 1 else None

    if option == "--help":
        print("""
ADReSSo21 Pipeline Setup

Usage:
    python setup.py                 - Run complete setup
    python setup.py --help          - Show this help
    python setup.py --verify-only   - Only verify installation
    python setup.py --deps-only     - Only install dependencies
        """)
        return 0

    setup = PipelineSetup()

    if option == "--verify-only":
        succeeded = setup.verify_installation()
    elif option == "--deps-only":
        succeeded = setup.install_dependencies()
    else:
        succeeded = setup.run_setup()

    return 0 if succeeded else 1


if __name__ == "__main__":
    # Propagate failures to the shell so scripts and CI can detect them.
    sys.exit(main())

# README.md - Project Documentation

# ADReSSo21 Speech Analysis Pipeline

A modular, high-performance pipeline for analyzing speech data from the ADReSSo21 dataset (Alzheimer's Dementia Recognition through Spontaneous Speech). This pipeline extracts comprehensive acoustic, linguistic, and semantic features for dementia detection and progression analysis.

## Features

🎯 **Comprehensive Analysis**
- Acoustic feature extraction (eGeMAPS, MFCCs, Mel-spectrograms, Wav2Vec2)
- Speech-to-text transcription (Whisper)
- Linguistic feature analysis (BERT embeddings, lexical diversity)
- Prosodic analysis (F0, energy, spectral features)

⚡ **High Performance**
- Multi-core parallel processing
- Optimized for Windows 10 with 35GB RAM, 10 cores
- Memory-efficient batch processing
- Modular microservice architecture

🔧 **Easy to Use**
- Command-line interface
- Configurable via JSON
- Sample data support
- Comprehensive logging

## System Requirements

- **OS**: Windows 10/11, Linux, macOS
- **Python**: 3.8+
- **RAM**: 16GB+ recommended (35GB optimal)
- **CPU**: Multi-core processor (10 cores optimal)
- **Storage**: 20GB+ free space
- **GPU**: Optional (CUDA-compatible for faster processing)

## Installation

### Quick Setup

```bash
# Clone or download the project
git clone <repository-url>
cd adresso21-pipeline

# Run setup script
python setup.py
```

### Manual Installation

```bash
# Install dependencies
pip install -r requirements.txt

# Create configuration
python main.py init

# Setup directories
mkdir output logs models temp data
```

## Configuration

Edit `config.json` to match your setup:

```json
{
  "dataset": {
    "base_path": "C:/Users/Administrator/Desktop/Speech/ADReSSo21",
    "diagnosis_train_path": "ADReSSo21-diagnosis-train/ADReSSo21/diagnosis/train",
    "progression_train_path": "ADReSSo21-progression-train/ADReSSo21/progression/train",
    "progression_test_path": "ADReSSo21-progression-test/ADReSSo21/progression/test-dist"
  },
  "output": {
    "base_path": "C:/Users/Administrator/Desktop/Speech/output",
    "create_timestamped_folders": true
  },
  "processing": {
    "max_workers": 8,
    "transcription_workers": 2,
    "enable_parallel": true
  }
}
```

## Dataset Structure

Ensure your ADReSSo21 dataset follows this structure:

```
ADReSSo21/
├── diagnosis/train/
│   ├── audio/
│   │   ├── ad/*.wav          # Alzheimer's audio files
│   │   └── cn/*.wav          # Control audio files
│   └── segmentation/
│       ├── ad/*.csv          # Alzheimer's segmentation
│       └── cn/*.csv          # Control segmentation
├── progression/train/
│   ├── audio/
│   │   ├── decline/*.wav     # Decline audio files
│   │   └── no_decline/*.wav  # No decline audio files
│   └── segmentation/
│       ├── decline/*.csv     # Decline segmentation
│       └── no_decline/*.csv  # No decline segmentation
└── progression/test-dist/
    ├── audio/*.wav           # Test audio files
    └── segmentation/*.csv    # Test segmentation
```

## Usage

### Command Line Interface

```bash
# Initialize configuration
python main.py init

# Run demo analysis (1 file per category)
python main.py demo

# Run sample analysis (2 files per category)  
python main.py run --sample --sample-size 2

# Run complete pipeline
python main.py run

# Run without parallel processing
python main.py run --no-parallel

# Check pipeline status
python main.py status

# Custom configuration (global options go before the subcommand)
python main.py --config my_config.json run
```

### Python API

```python
from pipeline_service import PipelineService

# Initialize pipeline
pipeline = PipelineService('config.json')

# Run complete analysis
results = pipeline.run_complete_pipeline(parallel=True)

# Run sample analysis  
results = pipeline.run_sample_analysis(max_files_per_category=2)

# Check status
status = pipeline.get_pipeline_status()
```

## Architecture

The pipeline follows a modular microservice architecture:

```
main.py                     # Entry point and CLI
├── pipeline_service.py     # Main orchestrator
├── config.py              # Configuration management
├── utils.py               # Utilities and helpers
├── data_manager_service.py          # Dataset loading
├── acoustic_features_service.py     # Audio feature extraction
├── transcription_service.py         # Speech-to-text
└── linguistic_features_service.py   # Text analysis
```

### Key Components

1. **PipelineService**: Main orchestrator that coordinates all services
2. **DataManagerService**: Handles dataset loading and file management
3. **AcousticFeaturesService**: Extracts audio features (eGeMAPS, MFCCs, etc.)
4. **TranscriptionService**: Converts speech to text using Whisper
5. **LinguisticFeaturesService**: Analyzes text features and BERT embeddings

## Output

The pipeline generates comprehensive outputs:

```
output/
├── acoustic_features.pkl        # All acoustic features
├── transcripts/
│   ├── all_transcripts.json    # All transcriptions
│   ├── transcripts.pkl         # Pickle format
│   └── *_transcript.txt        # Individual transcripts
├── linguistic_features.pkl     # Text analysis results
├── pipeline_summary.csv        # Processing summary
├── complete_results.pkl        # Combined results
└── adresso_pipeline.log        # Processing logs
```

### Feature Types

**Acoustic Features:**
- eGeMAPS (88 features)
- MFCCs (13 coefficients + deltas)
- Mel-spectrograms (80 bands)
- Wav2Vec2 embeddings (768 dimensions)
- Prosodic features (F0, energy, spectral)

**Linguistic Features:**
- Basic statistics (word count, sentence count)
- Lexical diversity measures
- BERT embeddings (768 dimensions)
- Language detection
- Segmentation analysis

## Performance

Typical processing times on recommended hardware:

- **Demo** (5 files): ~2-3 minutes
- **Sample** (20 files): ~5-10 minutes  
- **Complete dataset** (500+ files): ~2-4 hours

Memory usage:
- Base: ~2-4 GB
- With parallel processing: ~8-12 GB
- Peak (large files): ~16-20 GB

## Troubleshooting

### Common Issues

**1. Import Errors**
```bash
# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

**2. Memory Issues**
- Reduce `max_workers` in config
- Disable parallel processing: `--no-parallel`
- Process in smaller batches

**3. Model Download Issues**
```bash
# Pre-download models
python -c "import whisper; whisper.load_model('base')"
python -c "from transformers import AutoModel; AutoModel.from_pretrained('facebook/wav2vec2-base-960h')"
```

**4. Path Issues**
- Use absolute paths in config.json
- Check file permissions
- Verify dataset structure

### Performance Optimization

**For Limited RAM:**
```json
{
  "processing": {
    "max_workers": 4,
    "transcription_workers": 1,
    "enable_parallel": false
  }
}
```

**For High Performance:**
```json
{
  "processing": {
    "max_workers": 10,
    "transcription_workers": 4,
    "batch_size": 20,
    "enable_parallel": true
  }
}
```

## Development

### Adding New Features

1. Create a new `*_service.py` module alongside the existing services (see Architecture)
2. Add configuration options
3. Update `pipeline_service.py`
4. Add tests and documentation

### Testing

```bash
# Run demo for testing
python main.py demo

# Run with debug output (global options go before the subcommand)
python main.py --debug run

# Test specific components
python -c "from acoustic_features_service import AcousticFeaturesService; service = AcousticFeaturesService()"
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make changes with tests
4. Submit a pull request

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Citation

If you use this pipeline in your research, please cite:

```bibtex
@software{adresso21_pipeline,
  title={ADReSSo21 Speech Analysis Pipeline},
  author={Your Name},
  year={2024},
  url={https://github.com/your-repo/adresso21-pipeline}
}
```

## Acknowledgments

- ADReSSo21 Dataset creators
- OpenAI Whisper team
- Hugging Face Transformers
- OpenSMILE developers

## Support

For support and questions:
- Check the troubleshooting section
- Review logs in `output/adresso_pipeline.log`
- Open an issue on GitHub
- Contact: your.email@domain.com