# config.py


In [None]:
"""
Configuration module for ADReSSo21 Speech Analysis
Handles all paths, settings, and system configuration
"""
import os
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List
import multiprocessing

@dataclass
class SystemConfig:
    """System resource configuration.

    Defaults are computed once, when the class body is executed (i.e. at
    import time); ``n_cores`` therefore reflects the CPU count of the machine
    that imported the module.
    """
    n_cores: int = min(10, multiprocessing.cpu_count())  # Use available cores, max 10
    # Size of the process pool used for parallel feature extraction.
    # NOTE(review): fixed at 8 and not clamped to n_cores / cpu_count() — confirm
    # this is intended on machines with fewer cores.
    max_workers: int = 8  # Leave some cores for system
    # Number of files handed to a worker per batch.
    chunk_size: int = 2  # Process files in chunks
    # NOTE(review): within this file the value is only reported by
    # print_system_info(); no code shown here enforces it.
    memory_limit_gb: int = 30  # Leave 5GB for system from your 35GB

@dataclass
class PathConfig:
    """Filesystem layout of the ADReSSo21 data on a Windows 10 host.

    Every field is an absolute path with a hard-coded default.  The dataset
    paths are literals, not derived from ``base_path``, so overriding
    ``base_path`` alone does not relocate them.  Creating an instance has a
    side effect: the output directory and its working sub-folders are created
    on disk (see ``__post_init__``).
    """
    # Project root and the root of everything the pipeline writes
    base_path: str = r"C:\Users\Administrator\Desktop\Speech"
    output_path: str = r"C:\Users\Administrator\Desktop\Speech\output"

    # Diagnosis task, training split: audio and segmentation for the "ad" / "cn" groups
    diagnosis_train_audio_ad: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad"
    diagnosis_train_audio_cn: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn"
    diagnosis_train_seg_ad: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\ad"
    diagnosis_train_seg_cn: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\cn"

    # Progression task, training split: "decline" / "no_decline" groups
    progression_train_audio_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline"
    progression_train_audio_no_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline"
    progression_train_seg_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\decline"
    progression_train_seg_no_decline: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\no_decline"

    # Progression task, test distribution
    progression_test_audio: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio"
    progression_test_seg: str = r"C:\Users\Administrator\Desktop\Speech\ADReSSo21-progression-test\ADReSSo21\progression\test-dist\segmentation"

    def __post_init__(self):
        """Ensure the output directory and its working sub-folders exist."""
        output_root = Path(self.output_path)
        output_root.mkdir(parents=True, exist_ok=True)
        # One folder per artefact type written by the pipeline.
        for subfolder in ("features", "transcripts", "models", "logs"):
            (output_root / subfolder).mkdir(parents=True, exist_ok=True)

@dataclass
class ModelConfig:
    """Model configuration: which pretrained models to load and shared audio settings."""
    # Whisper checkpoint name used as the transcription default
    whisper_model_size: str = "base"  # base, small, medium, large
    # Identifier passed to Wav2Vec2Processor / Wav2Vec2Model.from_pretrained
    wav2vec_model: str = "facebook/wav2vec2-base-960h"
    # NOTE(review): not referenced by the code visible in this file — presumably
    # consumed by a text-feature module; confirm.
    bert_model: str = "bert-base-uncased"
    # Sample rate (Hz) audio is loaded at and resampled to before Wav2Vec2
    sampling_rate: int = 16000
    # NOTE(review): not referenced by the code visible in this file — confirm usage.
    max_sequence_length: int = 512

@dataclass
class FeatureConfig:
    """Feature extraction configuration.

    The ``*_count`` / ``*_size`` values are also used as the lengths of the
    all-zero fallback vectors returned when an extractor fails.
    """
    n_mfcc: int = 13  # Number of MFCC coefficients requested from librosa
    n_mels: int = 80  # Number of mel bands in the log-mel spectrogram
    f0_min: float = 50.0  # Lower bound of the pitch search range
    f0_max: float = 300.0  # Upper bound of the pitch search range
    egemaps_feature_count: int = 88  # Length of the eGeMAPS fallback vector
    wav2vec_feature_size: int = 768  # Length of the Wav2Vec2 fallback vector

# Global configuration instances, created once at import time and shared by
# every module that imports this one.
SYSTEM_CONFIG = SystemConfig()
# Instantiating PathConfig runs its __post_init__, which creates the output
# directory tree on disk — importing this module therefore touches the filesystem.
PATH_CONFIG = PathConfig()
MODEL_CONFIG = ModelConfig()
FEATURE_CONFIG = FeatureConfig()

def get_audio_file_paths() -> Dict[str, List[str]]:
    """Collect the ``.wav`` files of every dataset category.

    Returns:
        A dict with the keys ``diagnosis_ad``, ``diagnosis_cn``,
        ``progression_decline``, ``progression_no_decline`` and
        ``progression_test``.  Each value is the sorted list of full paths of
        the WAV files found directly inside the corresponding ``PATH_CONFIG``
        directory; a missing directory yields an empty list.
    """

    def get_wav_files(path: str) -> List[str]:
        # isdir (not exists): a path that points at a regular file would make
        # os.listdir raise NotADirectoryError.
        if not os.path.isdir(path):
            return []
        # Match the extension case-insensitively so "X.WAV" is not silently
        # skipped, and sort because os.listdir order is unspecified —
        # downstream batching should be reproducible between runs.
        return sorted(
            os.path.join(path, name)
            for name in os.listdir(path)
            if name.lower().endswith('.wav')
        )

    category_dirs = {
        'diagnosis_ad': PATH_CONFIG.diagnosis_train_audio_ad,
        'diagnosis_cn': PATH_CONFIG.diagnosis_train_audio_cn,
        'progression_decline': PATH_CONFIG.progression_train_audio_decline,
        'progression_no_decline': PATH_CONFIG.progression_train_audio_no_decline,
        'progression_test': PATH_CONFIG.progression_test_audio,
    }

    return {
        category: get_wav_files(directory)
        for category, directory in category_dirs.items()
    }

def print_system_info():
    """Write a short report of the active configuration to stdout."""
    report_lines = (
        "=== System Configuration ===",
        f"CPU Cores Available: {multiprocessing.cpu_count()}",
        f"Using Cores: {SYSTEM_CONFIG.n_cores}",
        f"Max Workers: {SYSTEM_CONFIG.max_workers}",
        f"Memory Limit: {SYSTEM_CONFIG.memory_limit_gb}GB",
        f"Output Path: {PATH_CONFIG.output_path}",
        f"Whisper Model: {MODEL_CONFIG.whisper_model_size}",
        "=" * 40,
    )
    for line in report_lines:
        print(line)

# utils.py - Utilities Module


In [None]:
"""
Utilities module for ADReSSo21 Speech Analysis
Common utilities, logging, and helper functions
"""
import os
import json
import pickle
import logging
import psutil
import numpy as np
import pandas as pd
from typing import Dict, Any, List, Union
from pathlib import Path
from datetime import datetime
import gc
import torch

from config import PATH_CONFIG, SYSTEM_CONFIG

def setup_logging(log_level: str = "INFO") -> logging.Logger:
    """Configure root logging once and return the ``ADReSSoAnalyzer`` logger.

    On the first call in a process, a timestamped log file is created under
    ``<output_path>/logs`` and both a file handler and a stream handler are
    attached to the root logger.  If the root logger already has handlers
    (an earlier call, or logging configured by the host application) the
    existing configuration is left untouched and the named logger is returned.

    Args:
        log_level: Name of a ``logging`` level, case-insensitive.

    Returns:
        The logger named ``'ADReSSoAnalyzer'``.

    Raises:
        AttributeError: If ``log_level`` is not an attribute of ``logging``.
    """
    logger = logging.getLogger('ADReSSoAnalyzer')

    # logging.basicConfig() silently does nothing once the root logger has
    # handlers.  Carrying on regardless would announce a log file that is
    # never created, so a repeated call just returns the logger.
    if logging.getLogger().handlers:
        return logger

    log_dir = os.path.join(PATH_CONFIG.output_path, "logs")
    Path(log_dir).mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"adresso_analysis_{timestamp}.log")

    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )

    logger.info(f"Logging initialized. Log file: {log_file}")
    return logger

def monitor_memory_usage() -> Dict[str, float]:
    """Return a snapshot of system memory as reported by ``psutil``.

    Returns:
        A dict with ``total_gb``, ``available_gb`` and ``used_gb`` (byte
        counts divided by 1024**3) plus ``percent_used``.
    """
    snapshot = psutil.virtual_memory()
    bytes_per_gb = 1024**3
    return {
        'total_gb': snapshot.total / bytes_per_gb,
        'available_gb': snapshot.available / bytes_per_gb,
        'used_gb': snapshot.used / bytes_per_gb,
        'percent_used': snapshot.percent,
    }

def cleanup_memory():
    """Run a garbage-collection pass and, when CUDA is usable, empty its cache."""
    gc.collect()
    cuda = torch.cuda
    if not cuda.is_available():
        return
    cuda.empty_cache()

def check_memory_limit(threshold_percent: float = 85.0) -> bool:
    """Return True while the percentage of memory in use is below the threshold."""
    percent_in_use = monitor_memory_usage()['percent_used']
    return percent_in_use < threshold_percent

def safe_save_pickle(data: Any, filepath: str, logger: logging.Logger = None):
    """Pickle ``data`` to ``filepath`` without ever leaving a partial file behind.

    The payload is written to ``<filepath>.tmp`` first and renamed over the
    target only after the dump succeeded.  A failure part-way through
    (unpicklable object, disk full, ...) therefore neither truncates an
    existing file at ``filepath`` nor leaves a corrupt one in its place.

    Args:
        data: Object to serialize.
        filepath: Destination path.
        logger: Optional logger that receives a success or failure message.

    Raises:
        Exception: Whatever the write or the pickling raised, after logging.
    """
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        # os.replace overwrites an existing target, including on Windows.
        os.replace(tmp_path, filepath)
        if logger:
            logger.info(f"Successfully saved pickle: {filepath}")
    except Exception as e:
        # Best-effort removal of the temp file; the original error is what
        # the caller needs to see, so a failed cleanup is deliberately ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        if logger:
            logger.error(f"Failed to save pickle {filepath}: {str(e)}")
        raise

def safe_load_pickle(filepath: str, logger: logging.Logger = None) -> Any:
    """Load a pickle file, returning ``None`` instead of raising on failure.

    Args:
        filepath: Path of the pickle file to read.
        logger: Optional logger that receives a success or failure message.

    Returns:
        The unpickled object, or ``None`` when reading or unpickling failed.
    """
    try:
        with open(filepath, 'rb') as handle:
            loaded = pickle.load(handle)
        if logger:
            logger.info(f"Successfully loaded pickle: {filepath}")
    except Exception as exc:
        if logger:
            logger.error(f"Failed to load pickle {filepath}: {str(exc)}")
        loaded = None
    return loaded

def safe_save_json(data: Dict, filepath: str, logger: logging.Logger = None):
    """Write ``data`` as UTF-8 JSON without ever leaving a partial file behind.

    The document is written to ``<filepath>.tmp`` first and renamed over the
    target only after serialization succeeded, so a failure part-way through
    neither truncates an existing file nor leaves an invalid JSON document.
    Values the encoder does not know are converted with ``str`` (``default=str``).

    Args:
        data: JSON-serializable mapping.
        filepath: Destination path.
        logger: Optional logger that receives a success or failure message.

    Raises:
        Exception: Whatever the write or the serialization raised, after logging.
    """
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=str)
        # os.replace overwrites an existing target, including on Windows.
        os.replace(tmp_path, filepath)
        if logger:
            logger.info(f"Successfully saved JSON: {filepath}")
    except Exception as e:
        # Best-effort removal of the temp file; the original error is what
        # the caller needs to see, so a failed cleanup is deliberately ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        if logger:
            logger.error(f"Failed to save JSON {filepath}: {str(e)}")
        raise

def validate_audio_file(filepath: str) -> bool:
    """Return True when ``filepath`` is an existing, non-empty regular file.

    This is a cheap sanity check only: the content is not opened or decoded,
    so a True result does not guarantee the file is valid audio.
    """
    try:
        # isfile rather than exists: a directory has a non-zero size on many
        # filesystems and would otherwise be accepted as an audio file.
        return os.path.isfile(filepath) and os.path.getsize(filepath) > 0
    except OSError:
        # The file vanished or became unreadable between the two checks.
        return False

def get_file_info(filepath: str) -> Dict[str, Any]:
    """Describe a file on disk.

    Returns:
        ``{'exists': False}`` when the path does not exist; otherwise a dict
        with ``exists``, ``size_mb`` (bytes / 1024**2), ``modified`` (ISO
        timestamp of the modification time), ``filename`` and ``extension``.
    """
    info: Dict[str, Any] = {'exists': os.path.exists(filepath)}
    if not info['exists']:
        return info

    file_stat = os.stat(filepath)
    modified_at = datetime.fromtimestamp(file_stat.st_mtime)
    _, extension = os.path.splitext(filepath)

    info['size_mb'] = file_stat.st_size / (1024**2)
    info['modified'] = modified_at.isoformat()
    info['filename'] = os.path.basename(filepath)
    info['extension'] = extension
    return info

def create_progress_bar(total: int, desc: str = "Processing") -> Any:
    """Return a progress reporter offering ``update(n)`` and ``close()``.

    A ``tqdm`` bar is used when the package can be imported; otherwise a
    minimal stand-in that rewrites a single console line is returned.
    """

    class SimpleProgress:
        """Console counter used when tqdm is not installed."""

        def __init__(self, total, desc):
            self.total = total
            self.current = 0
            self.desc = desc

        def update(self, n=1):
            self.current += n
            # Carriage return without newline: keep overwriting the same line.
            print(f"\r{self.desc}: {self.current}/{self.total}", end="")

        def close(self):
            print()  # Terminate the progress line

    try:
        from tqdm import tqdm
        return tqdm(total=total, desc=desc, unit="files")
    except ImportError:
        return SimpleProgress(total, desc)

def batch_generator(items: List[Any], batch_size: int):
    """Yield consecutive slices of ``items`` holding at most ``batch_size`` elements.

    Args:
        items: Sequence to split; it is sliced, never modified.
        batch_size: Maximum number of elements per batch; must be >= 1.

    Yields:
        Slices of ``items`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.  Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

def flatten_dict(d: Dict, parent_key: str = '', sep: str = '_') -> Dict:
    """Collapse a nested dict into a single level.

    Keys of nested dicts are joined to their parent's key with ``sep``.
    NumPy arrays are converted so the result is JSON-friendly: arrays with at
    least one dimension become lists, zero-dimensional arrays become floats.
    Every other value is kept as is.
    """
    flat = {}
    for key, value in d.items():
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, full_key, sep=sep))
        elif isinstance(value, np.ndarray):
            flat[full_key] = value.tolist() if value.ndim > 0 else float(value)
        else:
            flat[full_key] = value
    return flat

def create_summary_dataframe(results: Dict[str, Any], save_path: str = None) -> pd.DataFrame:
    """Build one DataFrame row per dict-valued entry of ``results``.

    Each dict is flattened with ``flatten_dict`` and tagged with a ``file_id``
    column holding its key; entries whose value is not a dict are skipped.

    Args:
        results: Mapping of file identifier to a (possibly nested) result dict.
        save_path: When truthy, the frame is also written there as CSV
            without the index.

    Returns:
        The assembled DataFrame.
    """
    rows = [
        {**flatten_dict(result), 'file_id': key}
        for key, result in results.items()
        if isinstance(result, dict)
    ]
    summary = pd.DataFrame(rows)

    if save_path:
        summary.to_csv(save_path, index=False)

    return summary

def log_processing_stats(processed: int, failed: int, total: int, logger: logging.Logger):
    """Log how many of ``total`` items succeeded, plus a warning when any failed."""
    if total > 0:
        success_rate = processed / total * 100
    else:
        # Nothing to process: report 0% rather than dividing by zero.
        success_rate = 0

    logger.info(f"Processing completed: {processed}/{total} successful ({success_rate:.1f}%)")

    if failed <= 0:
        return
    logger.warning(f"Failed files: {failed}")

class ProcessingTimer:
    """Context manager that times a block and logs how it ended.

    On entry a "Starting ..." message is logged.  On exit either a
    "Completed ..." message (the block finished normally) or an
    "Error in ..." message (the block raised) is logged — never both, so the
    log does not claim completion for an operation that failed.  Exceptions
    are not suppressed.

    Attributes:
        operation_name: Label used in the log messages.
        logger: Optional logger; when falsy nothing is logged.
        start_time: ``datetime`` taken on entry (``None`` before).
        end_time: ``datetime`` taken on exit (``None`` before).
        duration: ``end_time - start_time`` as a ``timedelta`` (``None``
            until the block has exited).
    """

    def __init__(self, operation_name: str, logger: logging.Logger = None):
        self.operation_name = operation_name
        self.logger = logger
        self.start_time = None
        self.end_time = None
        self.duration = None

    def __enter__(self):
        self.start_time = datetime.now()
        if self.logger:
            self.logger.info(f"Starting {self.operation_name}...")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end_time = datetime.now()
        self.duration = self.end_time - self.start_time

        if self.logger:
            if exc_type is None:
                self.logger.info(f"Completed {self.operation_name} in {self.duration}")
            else:
                self.logger.error(
                    f"Error in {self.operation_name} after {self.duration}: {exc_val}"
                )
        # Falling through returns None, so any exception keeps propagating.

def ensure_directory_exists(directory_path: str):
    """Create ``directory_path`` and any missing parents; a no-op if it already exists."""
    target = Path(directory_path)
    target.mkdir(parents=True, exist_ok=True)

def get_available_models() -> Dict[str, bool]:
    """Report which optional model back-ends can be imported.

    Returns:
        A dict with the keys ``whisper``, ``wav2vec2``, ``bert`` and
        ``opensmile``; a value is True when the corresponding import
        succeeded.  Only ``ImportError`` is treated as "not available".
    """
    models_status = dict.fromkeys(('whisper', 'wav2vec2', 'bert', 'opensmile'), False)

    try:
        import whisper
    except ImportError:
        pass
    else:
        models_status['whisper'] = True

    try:
        from transformers import Wav2Vec2Processor, Wav2Vec2Model
    except ImportError:
        pass
    else:
        models_status['wav2vec2'] = True

    try:
        from transformers import BertTokenizer, BertModel
    except ImportError:
        pass
    else:
        models_status['bert'] = True

    try:
        import opensmile
    except ImportError:
        pass
    else:
        models_status['opensmile'] = True

    return models_status

# acoustic_features_service.py - Acoustic Features Extraction Service


In [None]:
"""
Acoustic Features Service - Microservice for extracting acoustic features
Handles eGeMAPS, MFCC, Log-mel, Wav2Vec2, and prosodic features
"""
import os
import numpy as np
import librosa
import torch
import warnings
from typing import Dict, Any, List, Tuple, Optional
from concurrent.futures import ProcessPoolExecutor, as_completed
import logging
from dataclasses import dataclass

# Model imports with error handling
try:
    import opensmile
    OPENSMILE_AVAILABLE = True
except ImportError:
    OPENSMILE_AVAILABLE = False

try:
    from transformers import Wav2Vec2Processor, Wav2Vec2Model
    WAV2VEC_AVAILABLE = True
except ImportError:
    WAV2VEC_AVAILABLE = False

from config import MODEL_CONFIG, FEATURE_CONFIG, SYSTEM_CONFIG
from utils import setup_logging, monitor_memory_usage, cleanup_memory, safe_save_pickle

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore', category=UserWarning)

@dataclass
class AcousticFeatures:
    """Container for all acoustic features extracted from one audio file.

    When an individual extractor fails, its slot holds an all-zero
    placeholder and the matching ``extraction_success`` flag is False.
    """
    # Flattened OpenSMILE eGeMAPSv02 functionals
    egemaps: np.ndarray
    # Per-coefficient statistics under the keys 'mean', 'std', 'delta', 'delta2'
    mfccs: Dict[str, np.ndarray]
    # Per-mel-band statistics under the keys 'mean', 'std'
    log_mel: Dict[str, np.ndarray]
    # Wav2Vec2 last hidden state averaged over dim=1
    wav2vec2: np.ndarray
    # Scalar pitch / energy / spectral statistics keyed by feature name
    prosodic: Dict[str, float]
    # One flag per feature family: 'egemaps', 'mfccs', 'log_mel', 'wav2vec2', 'prosodic'
    extraction_success: Dict[str, bool]

class AcousticFeaturesService:
    """Service for extracting acoustic features from audio files"""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Create the service and load whichever extraction models are available.

        Args:
            logger: Logger to report to; when omitted (or falsy) one is
                obtained from ``setup_logging()``.
        """
        self.logger = logger if logger else setup_logging()

        # Model handles start empty; _initialize_models() fills in the ones
        # that load successfully.
        self.smile = self.wav2vec_processor = self.wav2vec_model = None

        self._initialize_models()

    def _initialize_models(self):
        """Load OpenSMILE and Wav2Vec2 where possible.

        A missing package is logged as a warning and a failed load as an
        error; in both cases the corresponding handle(s) stay ``None`` so the
        extractors fall back to zero vectors.
        """
        self.logger.info("Initializing acoustic feature extraction models...")

        # --- OpenSMILE (eGeMAPS functionals) ---
        if not OPENSMILE_AVAILABLE:
            self.logger.warning("OpenSMILE not available - eGeMAPS features will be skipped")
        else:
            try:
                self.smile = opensmile.Smile(
                    feature_set=opensmile.FeatureSet.eGeMAPSv02,
                    feature_level=opensmile.FeatureLevel.Functionals,
                )
                self.logger.info("✓ OpenSMILE (eGeMAPS) initialized")
            except Exception as exc:
                self.logger.error(f"Failed to initialize OpenSMILE: {exc}")
                self.smile = None

        # --- Wav2Vec2 (processor and model are only useful as a pair) ---
        if not WAV2VEC_AVAILABLE:
            self.logger.warning("Transformers not available - Wav2Vec2 features will be skipped")
        else:
            model_name = MODEL_CONFIG.wav2vec_model
            try:
                self.wav2vec_processor = Wav2Vec2Processor.from_pretrained(model_name)
                self.wav2vec_model = Wav2Vec2Model.from_pretrained(model_name)
                self.logger.info("✓ Wav2Vec2 initialized")
            except Exception as exc:
                self.logger.error(f"Failed to initialize Wav2Vec2: {exc}")
                self.wav2vec_processor = self.wav2vec_model = None

    def extract_egemaps_features(self, audio_path: str) -> Tuple[np.ndarray, bool]:
        """Compute the eGeMAPS functionals of an audio file with OpenSMILE.

        Returns:
            ``(features, True)`` with the flattened OpenSMILE output, or
            ``(zeros, False)`` when OpenSMILE is not loaded or processing
            failed.  The zero vector has ``FEATURE_CONFIG.egemaps_feature_count``
            entries.
        """
        try:
            if self.smile is not None:
                functionals = self.smile.process_file(audio_path)
                return functionals.values.flatten(), True

            return np.zeros(FEATURE_CONFIG.egemaps_feature_count), False

        except Exception as exc:
            self.logger.debug(f"eGeMAPS extraction failed for {os.path.basename(audio_path)}: {exc}")
            return np.zeros(FEATURE_CONFIG.egemaps_feature_count), False

    def extract_mfcc_features(self, y: np.ndarray, sr: int) -> Tuple[Dict[str, np.ndarray], bool]:
        """Summarize MFCCs and their first/second-order deltas over time.

        Returns:
            ``(stats, True)`` where ``stats`` maps ``'mean'``, ``'std'``,
            ``'delta'`` and ``'delta2'`` to one value per coefficient
            (averaged along axis 1), or zero vectors of length
            ``FEATURE_CONFIG.n_mfcc`` and ``False`` when extraction failed.
        """
        try:
            coefficients = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=FEATURE_CONFIG.n_mfcc)

            mean_per_coeff = np.mean(coefficients, axis=1)
            std_per_coeff = np.std(coefficients, axis=1)
            first_delta = librosa.feature.delta(coefficients)
            second_delta = librosa.feature.delta(coefficients, order=2)

            summary = {
                'mean': mean_per_coeff,
                'std': std_per_coeff,
                'delta': np.mean(first_delta, axis=1),
                'delta2': np.mean(second_delta, axis=1),
            }
            return summary, True

        except Exception as exc:
            self.logger.debug(f"MFCC extraction failed: {exc}")
            placeholder = {
                stat: np.zeros(FEATURE_CONFIG.n_mfcc)
                for stat in ('mean', 'std', 'delta', 'delta2')
            }
            return placeholder, False

    def extract_logmel_features(self, y: np.ndarray, sr: int) -> Tuple[Dict[str, np.ndarray], bool]:
        """Summarize the log-mel spectrogram of a waveform.

        Returns:
            ``(stats, True)`` where ``stats`` maps ``'mean'`` and ``'std'`` to
            one value per mel band (reduced along axis 1), or zero vectors of
            length ``FEATURE_CONFIG.n_mels`` and ``False`` when extraction
            failed.
        """
        try:
            power_spectrogram = librosa.feature.melspectrogram(
                y=y, sr=sr, n_mels=FEATURE_CONFIG.n_mels
            )
            decibels = librosa.power_to_db(power_spectrogram)

            summary = {
                'mean': np.mean(decibels, axis=1),
                'std': np.std(decibels, axis=1),
            }
            return summary, True

        except Exception as exc:
            self.logger.debug(f"Log-mel extraction failed: {exc}")
            placeholder = {
                stat: np.zeros(FEATURE_CONFIG.n_mels)
                for stat in ('mean', 'std')
            }
            return placeholder, False

    def extract_wav2vec_features(self, y: np.ndarray, sr: int) -> Tuple[np.ndarray, bool]:
        """Embed a waveform with Wav2Vec2 and average the hidden states.

        The signal is resampled to ``MODEL_CONFIG.sampling_rate`` when needed,
        run through the model without gradients, and the last hidden state is
        averaged over dim=1.

        Returns:
            ``(embedding, True)`` on success, otherwise a zero vector of
            length ``FEATURE_CONFIG.wav2vec_feature_size`` and ``False``
            (model not loaded, empty signal, or any runtime failure).
        """
        try:
            processor, model = self.wav2vec_processor, self.wav2vec_model
            if processor is None or model is None:
                return np.zeros(FEATURE_CONFIG.wav2vec_feature_size), False

            if len(y) == 0:
                raise ValueError("Empty audio signal")

            # Bring the signal to the configured sampling rate
            target_sr = MODEL_CONFIG.sampling_rate
            if sr != target_sr:
                y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
                sr = target_sr

            encoded = processor(y, sampling_rate=sr, return_tensors="pt")

            with torch.no_grad():
                hidden_states = model(encoded.input_values).last_hidden_state
                pooled = torch.mean(hidden_states, dim=1).squeeze().numpy()

            return pooled, True

        except Exception as exc:
            self.logger.debug(f"Wav2Vec2 extraction failed: {exc}")
            return np.zeros(FEATURE_CONFIG.wav2vec_feature_size), False

    def extract_prosodic_features(self, y: np.ndarray, sr: int) -> Tuple[Dict[str, float], bool]:
        """Extract pitch, energy and spectral summary statistics from a waveform.

        Pitch is tracked with ``librosa.pyin``, which reports a voiced flag
        per frame.  Plain ``librosa.yin`` returns an in-range F0 estimate for
        every frame and makes no voicing decision, so filtering its output
        with ``f0 > 0`` removed nothing: the pitch statistics included
        silence and noise frames and ``voicing_fraction`` was always 1.0.
        pYIN is noticeably slower than YIN; that cost is accepted in exchange
        for meaningful pitch and voicing features.

        Args:
            y: Audio samples.
            sr: Sampling rate of ``y``.

        Returns:
            ``(features, True)`` on success, where ``features`` holds
            ``f0_mean``, ``f0_std``, ``f0_median``, ``f0_range`` (computed
            over voiced frames only; 0.0 when no frame is voiced),
            ``energy_mean``, ``energy_std``, ``zero_crossing_rate``,
            ``spectral_centroid``, ``spectral_rolloff``, ``duration``
            (``len(y) / sr``) and ``voicing_fraction`` (voiced frames / all
            frames).  On any failure every value is 0.0 and the flag is False.
        """
        try:
            # F0 extraction with a per-frame voicing decision
            f0, voiced_flag, _ = librosa.pyin(
                y,
                fmin=FEATURE_CONFIG.f0_min,
                fmax=FEATURE_CONFIG.f0_max,
                sr=sr,
            )
            # Unvoiced frames come back as NaN; keep only voiced, finite estimates.
            f0_clean = f0[voiced_flag & np.isfinite(f0)]
            has_voiced = f0_clean.size > 0

            # Energy features
            rms = librosa.feature.rms(y=y)

            # Spectral features
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(y)

            features = {
                'f0_mean': float(np.mean(f0_clean)) if has_voiced else 0.0,
                'f0_std': float(np.std(f0_clean)) if has_voiced else 0.0,
                'f0_median': float(np.median(f0_clean)) if has_voiced else 0.0,
                'f0_range': float(np.max(f0_clean) - np.min(f0_clean)) if has_voiced else 0.0,
                'energy_mean': float(np.mean(rms)),
                'energy_std': float(np.std(rms)),
                'zero_crossing_rate': float(np.mean(zero_crossing_rate)),
                'spectral_centroid': float(np.mean(spectral_centroid)),
                'spectral_rolloff': float(np.mean(spectral_rolloff)),
                'duration': float(len(y) / sr),
                'voicing_fraction': float(f0_clean.size / f0.size) if f0.size > 0 else 0.0
            }

            return features, True

        except Exception as e:
            self.logger.debug(f"Prosodic feature extraction failed: {e}")
            default_features = {
                'f0_mean': 0.0, 'f0_std': 0.0, 'f0_median': 0.0, 'f0_range': 0.0,
                'energy_mean': 0.0, 'energy_std': 0.0, 'zero_crossing_rate': 0.0,
                'spectral_centroid': 0.0, 'spectral_rolloff': 0.0,
                'duration': 0.0, 'voicing_fraction': 0.0
            }
            return default_features, False

    def extract_features_from_file(self, audio_path: str) -> Optional[AcousticFeatures]:
        """Run every extractor on one audio file.

        The file is loaded at ``MODEL_CONFIG.sampling_rate``.  Individual
        extractor failures do not abort the call; they are recorded in
        ``extraction_success``.

        Returns:
            The populated ``AcousticFeatures``, or ``None`` when the file is
            empty or could not be processed at all.
        """
        try:
            waveform, sample_rate = librosa.load(audio_path, sr=MODEL_CONFIG.sampling_rate)

            if len(waveform) == 0:
                self.logger.warning(f"Empty audio file: {audio_path}")
                return None

            # eGeMAPS reads the file itself; the other extractors share the loaded signal.
            egemaps, ok_egemaps = self.extract_egemaps_features(audio_path)
            mfccs, ok_mfccs = self.extract_mfcc_features(waveform, sample_rate)
            log_mel, ok_logmel = self.extract_logmel_features(waveform, sample_rate)
            wav2vec2, ok_wav2vec = self.extract_wav2vec_features(waveform, sample_rate)
            prosodic, ok_prosodic = self.extract_prosodic_features(waveform, sample_rate)

            success_flags = {
                'egemaps': ok_egemaps,
                'mfccs': ok_mfccs,
                'log_mel': ok_logmel,
                'wav2vec2': ok_wav2vec,
                'prosodic': ok_prosodic,
            }

            return AcousticFeatures(
                egemaps=egemaps,
                mfccs=mfccs,
                log_mel=log_mel,
                wav2vec2=wav2vec2,
                prosodic=prosodic,
                extraction_success=success_flags,
            )

        except Exception as exc:
            self.logger.error(f"Failed to extract features from {audio_path}: {exc}")
            return None

    def process_files_batch(self, file_paths: List[str]) -> Dict[str, Optional[AcousticFeatures]]:
        """Process a batch of audio files"""
        results = {}

        for file_path in file_paths:
            filename = os.path.basename(file_path)
            try:
                features = self.extract_features_from_file(file_path)
                results[filename] = features

                # Memory cleanup for large batches
                if len(results) % 10 == 0:
                    cleanup_memory()

            except Exception as e:
                self.logger.error(f"Error processing {filename}: {e}")
                results[filename] = None

        return results

    def extract_features_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Dict[str, Optional[AcousticFeatures]]]:
        """Extract features for every category using a pool of worker processes.

        Each category's files are split into batches of
        ``SYSTEM_CONFIG.chunk_size`` and submitted to a
        ``ProcessPoolExecutor`` with ``SYSTEM_CONFIG.max_workers`` workers.

        Args:
            audio_files: Mapping of category name to a list of audio paths.

        Returns:
            ``{category: {filename: features_or_None}}``.  Categories without
            files are omitted.  Every submitted file gets an entry: files
            whose batch failed as a whole are recorded as ``None``, the same
            way ``process_files_batch`` records a single failed file, instead
            of silently disappearing from the result.
        """
        self.logger.info("Starting parallel acoustic feature extraction...")

        all_results = {}
        total_files = sum(len(files) for files in audio_files.values())
        processed_files = 0

        for category, file_paths in audio_files.items():
            if not file_paths:
                continue

            self.logger.info(f"Processing {category}: {len(file_paths)} files")
            category_results = {}

            # Process files in batches to manage memory
            batch_size = SYSTEM_CONFIG.chunk_size
            batches = [file_paths[i:i + batch_size] for i in range(0, len(file_paths), batch_size)]

            with ProcessPoolExecutor(max_workers=SYSTEM_CONFIG.max_workers) as executor:
                # Submit batch jobs
                future_to_batch = {
                    executor.submit(process_audio_batch_worker, batch): batch
                    for batch in batches
                }

                # Collect results as the batches finish
                for future in as_completed(future_to_batch):
                    batch = future_to_batch[future]
                    try:
                        batch_results = future.result()
                        category_results.update(batch_results)
                        processed_files += len(batch)

                        # Log progress (total_files > 0 here: this category is non-empty)
                        progress = (processed_files / total_files) * 100
                        self.logger.info(f"Progress: {processed_files}/{total_files} ({progress:.1f}%)")

                        # Check memory usage
                        memory_info = monitor_memory_usage()
                        if memory_info['percent_used'] > 80:
                            self.logger.warning(f"High memory usage: {memory_info['percent_used']:.1f}%")
                            cleanup_memory()

                    except Exception as e:
                        failed_names = [os.path.basename(path) for path in batch]
                        self.logger.error(
                            f"Batch processing failed: {e} (files: {', '.join(failed_names)})"
                        )
                        # Keep failures visible to callers.  setdefault leaves
                        # results already merged for this batch untouched.
                        for name in failed_names:
                            category_results.setdefault(name, None)

            all_results[category] = category_results
            self.logger.info(f"Completed {category}: {len(category_results)} files processed")

        # Final cleanup
        cleanup_memory()
        self.logger.info(f"Acoustic feature extraction completed: {processed_files}/{total_files} files")

        return all_results

    def save_features(self, features: Dict[str, Dict[str, Optional[AcousticFeatures]]], output_dir: str):
        """Pickle the extracted features into ``output_dir``.

        One ``acoustic_features_<category>.pkl`` file is written per
        category, holding plain dicts (or ``None`` for failed files), plus a
        combined ``acoustic_features_all.pkl``.

        NOTE(review): the combined file pickles ``features`` as passed in,
        i.e. the ``AcousticFeatures`` objects themselves rather than the
        plain-dict form used for the per-category files — confirm that this
        difference is intended.
        """
        self.logger.info("Saving acoustic features...")

        os.makedirs(output_dir, exist_ok=True)

        def as_plain_dict(feature_obj):
            # Failed files are stored as None; everything else becomes a dict of its fields.
            if feature_obj is None:
                return None
            return {
                'egemaps': feature_obj.egemaps,
                'mfccs': feature_obj.mfccs,
                'log_mel': feature_obj.log_mel,
                'wav2vec2': feature_obj.wav2vec2,
                'prosodic': feature_obj.prosodic,
                'extraction_success': feature_obj.extraction_success,
            }

        # One file per category
        for category, category_features in features.items():
            target = os.path.join(output_dir, f"acoustic_features_{category}.pkl")
            payload = {
                filename: as_plain_dict(feature_obj)
                for filename, feature_obj in category_features.items()
            }
            safe_save_pickle(payload, target, self.logger)

        # Everything in a single file
        safe_save_pickle(
            features,
            os.path.join(output_dir, "acoustic_features_all.pkl"),
            self.logger,
        )

        self.logger.info(f"Acoustic features saved to {output_dir}")

def process_audio_batch_worker(file_paths: List[str]) -> Dict[str, Optional[AcousticFeatures]]:
    """Worker entry point: extract features for one batch of audio files.

    Each process builds its own ``AcousticFeaturesService`` on first use and
    then reuses it.  Constructing the service loads the OpenSMILE and
    Wav2Vec2 models and sets up logging; doing that again for every batch
    (batches hold only ``SYSTEM_CONFIG.chunk_size`` files) would spend far
    more time loading models than extracting features.

    Args:
        file_paths: Audio files belonging to this batch.

    Returns:
        Mapping of file name to extracted features (``None`` on failure).
    """
    # Cached on the function object, so there is one service per process;
    # model state is still never shared between worker processes.
    service = getattr(process_audio_batch_worker, "_service", None)
    if service is None:
        service = AcousticFeaturesService()
        process_audio_batch_worker._service = service
    return service.process_files_batch(file_paths)

def demonstrate_acoustic_features(audio_file_path: str, logger: Optional[logging.Logger] = None):
    """Extract all acoustic features from one file and print a readable summary.

    Args:
        audio_file_path: Audio file to analyse.
        logger: Logger to use; one is created with ``setup_logging()`` when omitted.
    """
    if logger is None:
        logger = setup_logging()

    service = AcousticFeaturesService(logger)
    display_name = os.path.basename(audio_file_path)

    logger.info(f"Demonstrating acoustic features for: {display_name}")

    features = service.extract_features_from_file(audio_file_path)
    if features is None:
        logger.error("Failed to extract features")
        return

    succeeded = features.extraction_success

    def print_array_stats(title, success_key, stats):
        # Shared layout for the sections that hold a dict of statistic arrays.
        print(title)
        print(f"   Success: {succeeded[success_key]}")
        for key, values in stats.items():
            print(f"   {key}: {values.shape} - {values[:5]}")
        print()

    print(f"\n=== Acoustic Features for {display_name} ===\n")

    # eGeMAPS
    print(f"1. eGeMAPS Features: {len(features.egemaps)} features")
    print(f"   Success: {succeeded['egemaps']}")
    print(f"   Shape: {features.egemaps.shape}")
    print(f"   Sample values: {features.egemaps[:5]}")
    print()

    # MFCCs and log-mel
    print_array_stats("2. MFCC Features:", 'mfccs', features.mfccs)
    print_array_stats("3. Log-Mel Spectrogram Features:", 'log_mel', features.log_mel)

    # Wav2Vec2
    print(f"4. Wav2Vec2 Features: {features.wav2vec2.shape}")
    print(f"   Success: {succeeded['wav2vec2']}")
    print(f"   Sample values: {features.wav2vec2[:5]}")
    print()

    # Prosodic
    print("5. Prosodic Features:")
    print(f"   Success: {succeeded['prosodic']}")
    for key, value in features.prosodic.items():
        print(f"   {key}: {value:.4f}")
    print()

    # Overall tally of extractor successes
    successful_features = sum(succeeded.values())
    total_features = len(succeeded)
    print(f"Feature extraction success: {successful_features}/{total_features}")

if __name__ == "__main__":
    # Smoke test: run the demonstration on the first audio file that can be found
    from config import get_audio_file_paths

    logger = setup_logging()
    audio_files = get_audio_file_paths()

    # First file of the first non-empty category, or None when there is none
    test_file = next(
        (paths[0] for paths in audio_files.values() if paths),
        None,
    )

    if not test_file:
        logger.error("No audio files found for demonstration")
    else:
        demonstrate_acoustic_features(test_file, logger)

# transcription_service.py - Speech Transcription Service


In [None]:
"""
Transcription Service - Microservice for speech-to-text transcription using Whisper
Handles parallel transcription with memory management
"""
import os
import json
import numpy as np
from typing import Dict, List, Optional, Any
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
import logging

# Whisper import with error handling
try:
    import whisper
    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False

from config import MODEL_CONFIG, SYSTEM_CONFIG, PATH_CONFIG
from utils import (setup_logging, monitor_memory_usage, cleanup_memory,
                   safe_save_json, safe_save_pickle, ProcessingTimer,
                   create_progress_bar, validate_audio_file)

@dataclass
class TranscriptionResult:
    """Outcome of transcribing one audio file.

    Failed transcriptions are represented with ``success=False``, empty
    strings / zeros in the result fields and a message in ``error_message``.
    """
    file_path: str  # Path of the audio file as passed to the service
    category: str  # Category label supplied by the caller (may be empty)
    filename: str  # Base name of file_path
    transcript: str  # Transcribed text; "" on failure
    language: str  # "" on failure
    segments: int  # 0 on failure — presumably the number of transcript segments; confirm
    duration: float  # 0.0 on failure — presumably seconds; confirm
    confidence: float  # 0.0 on failure — NOTE(review): how it is derived is not visible here
    success: bool  # False when validation or transcription failed
    error_message: Optional[str] = None  # Reason for the failure, when there is one

class TranscriptionService:
    """Service for transcribing audio files using Whisper"""

    def __init__(self, model_size: str = None, logger: Optional[logging.Logger] = None):
        """Prepare the service and eagerly try to load the Whisper model.

        Args:
            model_size: Whisper model name; ``MODEL_CONFIG.whisper_model_size``
                is used when this is falsy.
            logger: Logger to use; ``setup_logging()`` is called when falsy.
        """
        # Model handle stays None until _initialize_model succeeds
        self.whisper_model = None
        self.logger = logger or setup_logging()
        self.model_size = model_size or MODEL_CONFIG.whisper_model_size

        self._initialize_model()

    def _initialize_model(self):
        """Load the configured Whisper model into ``self.whisper_model``.

        Logs an error and returns without touching the model handle when the
        ``whisper`` package is not importable; resets the handle to ``None``
        when loading raises.
        """
        if WHISPER_AVAILABLE:
            try:
                self.logger.info(f"Loading Whisper model: {self.model_size}")
                self.whisper_model = whisper.load_model(self.model_size)
                self.logger.info("✓ Whisper model loaded successfully")
            except Exception as e:
                self.logger.error(f"Failed to load Whisper model: {e}")
                self.whisper_model = None
        else:
            self.logger.error("Whisper not available. Please install: pip install openai-whisper")

    def transcribe_audio_file(self, audio_path: str, category: str = "") -> TranscriptionResult:
        """Transcribe a single audio file with the loaded Whisper model.

        Args:
            audio_path: Path of the audio file to transcribe.
            category: Label copied verbatim into the result.

        Returns:
            A ``TranscriptionResult``. Validation failures, a missing model and
            exceptions raised by Whisper all yield ``success=False`` with an
            ``error_message`` instead of raising.
        """
        filename = os.path.basename(audio_path)

        def _failure(message: str) -> TranscriptionResult:
            # Single place that defines what a failed result looks like
            return TranscriptionResult(
                file_path=audio_path,
                category=category,
                filename=filename,
                transcript="",
                language="",
                segments=0,
                duration=0.0,
                confidence=0.0,
                success=False,
                error_message=message
            )

        # Validate file
        if not validate_audio_file(audio_path):
            return _failure("Invalid or missing audio file")

        if self.whisper_model is None:
            return _failure("Whisper model not available")

        try:
            self.logger.debug(f"Transcribing {filename}...")

            # Transcribe with Whisper
            result = self.whisper_model.transcribe(
                audio_path,
                fp16=False,  # Use fp32 for better compatibility
                language=None,  # Auto-detect language
                task="transcribe"
            )

            transcript_text = result["text"].strip()
            segments = result.get("segments", [])

            # Convert per-segment average log probabilities to a rough
            # confidence score in (0, 1]
            log_probs = [seg["avg_logprob"] for seg in segments if "avg_logprob" in seg]
            confidence = float(np.mean(np.exp(log_probs))) if log_probs else 0.0

            # Whisper's result dict carries no "duration" entry, so the old
            # lookup always produced 0.0. Fall back to the end time of the
            # last transcribed segment (approximate: trailing silence is not
            # covered by any segment).
            duration = result.get("duration")
            if duration is None:
                duration = max(
                    (float(seg.get("end", 0.0)) for seg in segments),
                    default=0.0
                )

            return TranscriptionResult(
                file_path=audio_path,
                category=category,
                filename=filename,
                transcript=transcript_text,
                language=result.get("language", "unknown"),
                segments=len(segments),
                duration=float(duration),
                confidence=confidence,
                success=True
            )

        except Exception as e:
            self.logger.error(f"Transcription failed for {filename}: {e}")
            return _failure(str(e))

    def transcribe_files_batch(self, file_paths: List[str], category: str = "") -> Dict[str, TranscriptionResult]:
        """Transcribe a batch of audio files"""
        results = {}

        for file_path in file_paths:
            filename = os.path.basename(file_path)

            try:
                result = self.transcribe_audio_file(file_path, category)
                results[filename] = result

                # Memory management
                if len(results) % 5 == 0:  # More frequent cleanup for transcription
                    cleanup_memory()

            except Exception as e:
                self.logger.error(f"Error processing {filename}: {e}")
                results[filename] = TranscriptionResult(
                    file_path=file_path,
                    category=category,
                    filename=filename,
                    transcript="",
                    language="",
                    segments=0,
                    duration=0.0,
                    confidence=0.0,
                    success=False,
                    error_message=str(e)
                )

        return results

    def transcribe_all_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Dict[str, TranscriptionResult]]:
        """Transcribe every category of audio files using worker processes.

        The files of each category are split into small batches which are
        handed to ``transcribe_batch_worker`` through a
        ``ProcessPoolExecutor``. When a batch's worker raises, the error is
        logged and every file of that batch is recorded as a failed
        ``TranscriptionResult`` so that no input silently disappears from the
        output.

        Args:
            audio_files: Mapping of category name to audio file paths.

        Returns:
            Mapping of category name to ``{basename: TranscriptionResult}``.
            Categories without files are omitted.
        """
        self.logger.info("Starting parallel transcription...")

        all_results = {}
        total_files = sum(len(files) for files in audio_files.values())

        # Transcription is memory intensive: use small batches and few
        # workers. Both values are clamped to at least 1 because
        # ProcessPoolExecutor rejects max_workers <= 0.
        batch_size = max(1, SYSTEM_CONFIG.chunk_size // 2)
        max_workers = max(1, min(SYSTEM_CONFIG.max_workers // 2, 4))

        with ProcessingTimer("Complete transcription process", self.logger):
            for category, file_paths in audio_files.items():
                if not file_paths:
                    continue

                self.logger.info(f"Transcribing {category}: {len(file_paths)} files")
                category_results = {}

                batches = [file_paths[i:i + batch_size] for i in range(0, len(file_paths), batch_size)]

                with ProcessPoolExecutor(max_workers=max_workers) as executor:
                    # Submit batch jobs
                    future_to_batch = {
                        executor.submit(transcribe_batch_worker, batch, category, self.model_size): batch
                        for batch in batches
                    }

                    # Progress tracking
                    progress_bar = create_progress_bar(len(batches), f"Transcribing {category}")

                    try:
                        # Collect results as batches finish
                        for future in as_completed(future_to_batch):
                            batch = future_to_batch[future]
                            try:
                                batch_results = future.result()
                            except Exception as e:
                                self.logger.error(f"Batch transcription failed: {e}")
                                # Keep failed files visible in the output
                                batch_results = {
                                    os.path.basename(file_path): TranscriptionResult(
                                        file_path=file_path,
                                        category=category,
                                        filename=os.path.basename(file_path),
                                        transcript="",
                                        language="",
                                        segments=0,
                                        duration=0.0,
                                        confidence=0.0,
                                        success=False,
                                        error_message=str(e)
                                    )
                                    for file_path in batch
                                }

                            category_results.update(batch_results)
                            progress_bar.update(1)

                            # Memory monitoring is best effort and must not
                            # turn a finished batch into a failure
                            try:
                                memory_info = monitor_memory_usage()
                                if memory_info['percent_used'] > 75:
                                    self.logger.warning(f"High memory usage: {memory_info['percent_used']:.1f}%")
                                    cleanup_memory()
                            except Exception as e:
                                self.logger.warning(f"Memory monitoring failed: {e}")
                    finally:
                        progress_bar.close()

                all_results[category] = category_results

                # Log category completion stats
                successful = sum(1 for result in category_results.values() if result.success)
                self.logger.info(f"Completed {category}: {successful}/{len(category_results)} successful")

        # Final cleanup
        cleanup_memory()

        # Log overall stats against the number of files requested
        total_successful = sum(
            sum(1 for result in category_results.values() if result.success)
            for category_results in all_results.values()
        )
        self.logger.info(f"Transcription completed: {total_successful}/{total_files} successful")

        return all_results

    def save_transcriptions(self, transcriptions: Dict[str, Dict[str, TranscriptionResult]], output_dir: str):
        """Persist transcription results as text, JSON, pickle and CSV.

        Writes, below ``<output_dir>/transcripts``: one ``.txt`` per
        successful non-empty transcript, one JSON per category, a combined
        JSON and a combined pickle. A summary CSV is written directly into
        ``output_dir`` when pandas can be imported.

        Args:
            transcriptions: Mapping category -> {filename: TranscriptionResult}.
            output_dir: Root directory for all outputs.

        Returns:
            The combined ``{"<category>_<filename>": dict}`` mapping.
        """
        self.logger.info("Saving transcriptions...")

        # Create output directories
        transcripts_dir = os.path.join(output_dir, "transcripts")
        os.makedirs(transcripts_dir, exist_ok=True)

        combined = {}
        summary_rows = []

        for category, category_results in transcriptions.items():
            per_category = {}

            for filename, result in category_results.items():
                file_id = f"{category}_{filename}"
                text = result.transcript

                # Plain-dict view of the result for JSON serialization
                record = {
                    'file_path': result.file_path,
                    'category': result.category,
                    'filename': result.filename,
                    'transcript': text,
                    'language': result.language,
                    'segments': result.segments,
                    'duration': result.duration,
                    'confidence': result.confidence,
                    'success': result.success,
                    'error_message': result.error_message
                }
                per_category[filename] = record
                combined[file_id] = record

                # One summary row per file; long transcripts are previewed
                preview = (text[:100] + "...") if len(text) > 100 else text
                summary_rows.append({
                    'File_ID': file_id,
                    'Category': result.category,
                    'Filename': result.filename,
                    'Success': result.success,
                    'Language': result.language,
                    'Duration': result.duration,
                    'Transcript_Length': len(text),
                    'Word_Count': len(text.split()) if text else 0,
                    'Segments': result.segments,
                    'Confidence': result.confidence,
                    'Error': result.error_message or "",
                    'Transcript_Preview': preview
                })

                # Individual transcript file, only for usable transcripts
                if result.success and text:
                    transcript_file = os.path.join(transcripts_dir, f"{file_id}_transcript.txt")
                    try:
                        with open(transcript_file, 'w', encoding='utf-8') as f:
                            f.write(text)
                    except Exception as e:
                        self.logger.warning(f"Failed to save individual transcript {transcript_file}: {e}")

            # Category JSON
            safe_save_json(
                per_category,
                os.path.join(transcripts_dir, f"transcripts_{category}.json"),
                self.logger,
            )

        # Consolidated JSON
        safe_save_json(combined, os.path.join(transcripts_dir, "all_transcripts.json"), self.logger)

        # Consolidated pickle
        safe_save_pickle(combined, os.path.join(transcripts_dir, "transcripts.pkl"), self.logger)

        # Summary CSV (optional: needs pandas)
        try:
            import pandas as pd
            summary_csv_path = os.path.join(output_dir, "transcript_summary.csv")
            pd.DataFrame(summary_rows).to_csv(summary_csv_path, index=False)
            self.logger.info(f"Transcript summary saved: {summary_csv_path}")
        except ImportError:
            self.logger.warning("Pandas not available, skipping CSV summary")

        self.logger.info(f"All transcriptions saved to {output_dir}")
        return combined

def transcribe_batch_worker(file_paths: List[str], category: str, model_size: str) -> Dict[str, TranscriptionResult]:
    """Entry point executed in a worker process for one batch of files.

    Builds a fresh ``TranscriptionService`` (and therefore loads its own
    model) on every call, then transcribes the batch sequentially.
    """
    return TranscriptionService(model_size=model_size).transcribe_files_batch(file_paths, category)

def demonstrate_transcription(audio_file_path: str, logger: Optional[logging.Logger] = None):
    """Transcribe one file and print a human-readable report to stdout.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        logger: Logger to use; ``setup_logging()`` is called when ``None``.
    """
    logger = setup_logging() if logger is None else logger

    service = TranscriptionService(logger=logger)
    logger.info(f"Demonstrating transcription for: {os.path.basename(audio_file_path)}")
    result = service.transcribe_audio_file(audio_file_path, "demo")

    # Assemble the report first, then emit it line by line
    report = [
        f"\n=== Transcription Result for {result.filename} ===",
        f"Success: {result.success}",
        f"Language: {result.language}",
        f"Duration: {result.duration:.2f} seconds",
        f"Segments: {result.segments}",
        f"Confidence: {result.confidence:.3f}",
    ]

    if result.success:
        text = result.transcript
        report.append(f"Transcript ({len(text)} chars, {len(text.split())} words):")
        report.append(f'"{text}"')
    else:
        report.append(f"Error: {result.error_message}")

    report.append("")  # trailing blank line
    for line in report:
        print(line)

if __name__ == "__main__":
    # Smoke test: transcribe one discovered audio file
    from config import get_audio_file_paths

    logger = setup_logging()
    audio_files = get_audio_file_paths()

    # Take the first file of the first category that has any files
    test_file = next(
        (files[0] for files in audio_files.values() if files),
        None,
    )

    if not test_file:
        logger.error("No audio files found for demonstration")
    else:
        demonstrate_transcription(test_file, logger)

# linguistic_features_service.py - Linguistic Features Service


In [None]:
"""
Linguistic Features Service - Microservice for extracting linguistic features and BERT embeddings
Handles text analysis and BERT preprocessing
"""
import os
import re
import numpy as np
import torch
from typing import Dict, List, Optional, Any, Tuple
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
import logging
from collections import Counter
import string

# BERT imports with error handling
try:
    from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
    BERT_AVAILABLE = True
except ImportError:
    BERT_AVAILABLE = False

from config import MODEL_CONFIG, SYSTEM_CONFIG
from utils import (setup_logging, monitor_memory_usage, cleanup_memory,
                   safe_save_pickle, ProcessingTimer, batch_generator)
from transcription_service import TranscriptionResult

@dataclass
class LinguisticFeatures:
    """Container for all linguistic features extracted from one text.

    Instances are produced by ``LinguisticFeaturesService``; on failure every
    numeric field is zero, the BERT fields are empty/``None`` and
    ``processing_success`` is ``False``.
    """
    # Basic text statistics
    raw_text: str  # the analysed text as given
    word_count: int  # whitespace-separated tokens
    sentence_count: int  # non-empty pieces after splitting on . ! ?
    char_count: int  # characters excluding spaces
    avg_word_length: float  # mean token length in characters
    avg_sentence_length: float  # words per sentence

    # Vocabulary features
    unique_words: int  # distinct lower-cased, punctuation-stripped tokens
    lexical_diversity: float  # unique_words / total tokens
    function_words_ratio: float  # share of tokens found in the function-word list
    content_words_ratio: float  # share of all remaining tokens

    # Syntactic features (suffix/word-list heuristics, not a real POS tagger)
    noun_ratio: float
    verb_ratio: float
    adjective_ratio: float
    pronoun_ratio: float

    # Semantic complexity (syllables counted by a vowel-group heuristic)
    syllable_count: int
    avg_syllables_per_word: float
    complex_words_ratio: float  # Words with 3+ syllables

    # Discourse features
    repetition_ratio: float  # repeated token occurrences / total tokens
    pause_indicators: int  # occurrences of the configured pause markers
    filler_words: int  # occurrences of the configured filler words/phrases

    # BERT features (empty lists / None when BERT is unavailable)
    bert_tokens: List[str]
    bert_input_ids: List[int]
    bert_attention_mask: List[int]
    bert_embeddings: Optional[np.ndarray]  # [CLS] vector of the last hidden state

    # Processing metadata
    processing_success: bool
    error_message: Optional[str] = None  # set only for failed extractions

class LinguisticFeaturesService:
    """Service for extracting linguistic features from transcripts"""

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Set up lexicons and eagerly try to load the BERT model.

        Args:
            logger: Logger to use; ``setup_logging()`` is called when falsy.
        """
        self.logger = logger or setup_logging()

        # BERT handles stay None until _initialize_bert succeeds
        self.bert_tokenizer = None
        self.bert_model = None

        # Lexicons used by the vocabulary and discourse extractors
        self.function_words = self._load_function_words()
        self.filler_words = {
            'um', 'uh', 'er', 'ah', 'hmm', 'well', 'like',
            'you know', 'sort of', 'kind of',
        }
        self.pause_indicators = {'[pause]', '[silence]', '...', '--'}

        # Penn-Treebank style tag groups kept for basic POS analysis
        self.noun_patterns = {'NN', 'NNS', 'NNP', 'NNPS'}
        self.verb_patterns = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
        self.adjective_patterns = {'JJ', 'JJR', 'JJS'}
        self.pronoun_patterns = {'PRP', 'PRP$', 'WP', 'WP$'}

        self._initialize_bert()

    def _initialize_bert(self):
        """Load the BERT tokenizer and model named by ``MODEL_CONFIG.bert_model``.

        Only warns when ``transformers`` is not importable; resets both
        handles to ``None`` when loading raises.
        """
        if BERT_AVAILABLE:
            try:
                self.logger.info(f"Loading BERT model: {MODEL_CONFIG.bert_model}")
                self.bert_tokenizer = BertTokenizer.from_pretrained(MODEL_CONFIG.bert_model)
                self.bert_model = BertModel.from_pretrained(MODEL_CONFIG.bert_model)
                # Inference only: switch to evaluation mode
                self.bert_model.eval()
                self.logger.info("✓ BERT model loaded successfully")
            except Exception as e:
                self.logger.error(f"Failed to load BERT model: {e}")
                self.bert_tokenizer = None
                self.bert_model = None
        else:
            self.logger.warning("Transformers not available - BERT features will be limited")

    def _load_function_words(self) -> set:
        """Load common function words"""
        function_words = {
            # Articles
            'a', 'an', 'the',
            # Prepositions
            'in', 'on', 'at', 'by', 'to', 'from', 'of', 'with', 'about', 'into', 'through', 'during',
            'before', 'after', 'above', 'below', 'over', 'under', 'between', 'among', 'against',
            # Conjunctions
            'and', 'or', 'but', 'nor', 'for', 'yet', 'so', 'because', 'since', 'although', 'while',
            'if', 'unless', 'until', 'when', 'where', 'how', 'why',
            # Pronouns
            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
            'my', 'your', 'his', 'her', 'its', 'our', 'their', 'mine', 'yours', 'ours', 'theirs',
            'this', 'that', 'these', 'those', 'what', 'which', 'who', 'whom', 'whose',
            # Auxiliary verbs
            'am', 'is', 'are', 'was', 'were', 'be', 'being', 'been',
            'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
            'will', 'would', 'shall', 'should', 'may', 'might', 'can', 'could', 'must',
            # Others
            'not', 'no', 'yes', 'there', 'here'
        }
        return function_words

    def _count_syllables(self, word: str) -> int:
        """Count syllables in a word (simple heuristic)"""
        word = word.lower().strip(".,!?;:")
        if not word:
            return 0

        # Remove silent 'e' at the end
        if word.endswith('e') and len(word) > 1:
            word = word[:-1]

        # Count vowel groups
        vowels = "aeiouy"
        syllable_count = 0
        prev_was_vowel = False

        for char in word:
            is_vowel = char in vowels
            if is_vowel and not prev_was_vowel:
                syllable_count += 1
            prev_was_vowel = is_vowel

        # Every word has at least one syllable
        return max(1, syllable_count)

    def _extract_basic_features(self, text: str) -> Dict[str, Any]:
        """Extract basic text statistics"""
        if not text.strip():
            return {
                'word_count': 0,
                'sentence_count': 0,
                'char_count': 0,
                'avg_word_length': 0.0,
                'avg_sentence_length': 0.0
            }

        # Clean text
        clean_text = text.strip()

        # Word analysis
        words = clean_text.split()
        word_count = len(words)

        # Sentence analysis - improved sentence detection
        sentences = re.split(r'[.!?]+', clean_text)
        sentences = [s.strip() for s in sentences if s.strip()]
        sentence_count = len(sentences)

        # Character count (excluding spaces)
        char_count = len(clean_text.replace(' ', ''))

        # Averages
        avg_word_length = np.mean([len(word) for word in words]) if words else 0.0
        avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0.0

        return {
            'word_count': word_count,
            'sentence_count': sentence_count,
            'char_count': char_count,
            'avg_word_length': avg_word_length,
            'avg_sentence_length': avg_sentence_length
        }

    def _extract_vocabulary_features(self, text: str) -> Dict[str, Any]:
        """Extract vocabulary and lexical features"""
        if not text.strip():
            return {
                'unique_words': 0,
                'lexical_diversity': 0.0,
                'function_words_ratio': 0.0,
                'content_words_ratio': 0.0
            }

        # Tokenize and clean words
        words = text.lower().split()
        words = [word.strip(string.punctuation) for word in words if word.strip(string.punctuation)]

        if not words:
            return {
                'unique_words': 0,
                'lexical_diversity': 0.0,
                'function_words_ratio': 0.0,
                'content_words_ratio': 0.0
            }

        unique_words = len(set(words))
        lexical_diversity = unique_words / len(words)

        # Function vs content words
        function_word_count = sum(1 for word in words if word in self.function_words)
        content_word_count = len(words) - function_word_count

        function_words_ratio = function_word_count / len(words)
        content_words_ratio = content_word_count / len(words)

        return {
            'unique_words': unique_words,
            'lexical_diversity': lexical_diversity,
            'function_words_ratio': function_words_ratio,
            'content_words_ratio': content_words_ratio
        }

    def _extract_syntactic_features(self, text: str) -> Dict[str, Any]:
        """Extract syntactic features (simplified POS analysis)"""
        if not text.strip():
            return {
                'noun_ratio': 0.0,
                'verb_ratio': 0.0,
                'adjective_ratio': 0.0,
                'pronoun_ratio': 0.0
            }

        words = text.lower().split()
        words = [word.strip(string.punctuation) for word in words if word.strip(string.punctuation)]

        if not words:
            return {
                'noun_ratio': 0.0,
                'verb_ratio': 0.0,
                'adjective_ratio': 0.0,
                'pronoun_ratio': 0.0
            }

        # Simple heuristic-based POS tagging
        noun_count = 0
        verb_count = 0
        adjective_count = 0
        pronoun_count = 0

        # Common verb endings and forms
        verb_endings = {'ed', 'ing', 'es', 's'}
        common_verbs = {'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did',
                       'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must'}

        # Common adjective endings
        adj_endings = {'ly', 'ful', 'less', 'ous', 'ive', 'able', 'ible'}

        # Common pronouns (already in function words, but specific ones)
        pronouns = {'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
                   'my', 'your', 'his', 'her', 'its', 'our', 'their', 'this', 'that', 'these', 'those'}

        for word in words:
            # Check pronouns first
            if word in pronouns:
                pronoun_count += 1
            # Check common verbs
            elif word in common_verbs or any(word.endswith(ending) for ending in verb_endings if len(word) > 3):
                verb_count += 1
            # Check adjectives (simple heuristic)
            elif any(word.endswith(ending) for ending in adj_endings):
                adjective_count += 1
            # Default to noun if capitalized or doesn't match other patterns
            else:
                noun_count += 1

        total_words = len(words)
        return {
            'noun_ratio': noun_count / total_words,
            'verb_ratio': verb_count / total_words,
            'adjective_ratio': adjective_count / total_words,
            'pronoun_ratio': pronoun_count / total_words
        }

    def _extract_semantic_features(self, text: str) -> Dict[str, Any]:
        """Extract semantic complexity features"""
        if not text.strip():
            return {
                'syllable_count': 0,
                'avg_syllables_per_word': 0.0,
                'complex_words_ratio': 0.0
            }

        words = text.split()
        words = [word.strip(string.punctuation) for word in words if word.strip(string.punctuation)]

        if not words:
            return {
                'syllable_count': 0,
                'avg_syllables_per_word': 0.0,
                'complex_words_ratio': 0.0
            }

        syllable_counts = [self._count_syllables(word) for word in words]
        total_syllables = sum(syllable_counts)
        complex_words = sum(1 for count in syllable_counts if count >= 3)

        return {
            'syllable_count': total_syllables,
            'avg_syllables_per_word': total_syllables / len(words),
            'complex_words_ratio': complex_words / len(words)
        }

    def _extract_discourse_features(self, text: str) -> Dict[str, Any]:
        """Extract discourse and disfluency features"""
        if not text.strip():
            return {
                'repetition_ratio': 0.0,
                'pause_indicators': 0,
                'filler_words': 0
            }

        # Count pause indicators
        pause_count = 0
        text_lower = text.lower()
        for indicator in self.pause_indicators:
            pause_count += text_lower.count(indicator)

        # Count filler words
        words = text.lower().split()
        filler_count = 0
        for filler in self.filler_words:
            if ' ' in filler:  # Multi-word fillers
                filler_count += text_lower.count(filler)
            else:  # Single word fillers
                filler_count += words.count(filler)

        # Calculate repetition ratio (simple word repetition)
        word_counts = Counter(words)
        repeated_words = sum(count - 1 for count in word_counts.values() if count > 1)
        repetition_ratio = repeated_words / len(words) if words else 0.0

        return {
            'repetition_ratio': repetition_ratio,
            'pause_indicators': pause_count,
            'filler_words': filler_count
        }

    def _extract_bert_features(self, text: str, max_length: int = 512) -> Dict[str, Any]:
        """Tokenize ``text`` for BERT and compute its [CLS] embedding.

        The encoding is truncated and padded to ``max_length``, so the
        returned token, id and mask lists include padding positions.

        Returns:
            Dict with ``bert_tokens``, ``bert_input_ids``,
            ``bert_attention_mask`` and ``bert_embeddings``. The lists are
            empty and the embedding is ``None`` when no tokenizer is
            available or when extraction raises; the embedding alone is
            ``None`` when only the model is missing.
        """
        def _no_bert() -> Dict[str, Any]:
            return {
                'bert_tokens': [],
                'bert_input_ids': [],
                'bert_attention_mask': [],
                'bert_embeddings': None
            }

        if not BERT_AVAILABLE or not self.bert_tokenizer:
            return _no_bert()

        try:
            encoded = self.bert_tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

            # Batch of one: work with the first (only) row
            input_ids = encoded['input_ids'][0]
            tokens = self.bert_tokenizer.convert_ids_to_tokens(input_ids)

            cls_embedding = None
            if self.bert_model:
                with torch.no_grad():
                    outputs = self.bert_model(**encoded)
                    # First position of the last hidden state is the [CLS]
                    # token, used as the sentence representation
                    cls_embedding = outputs.last_hidden_state[0][0].cpu().numpy()

            return {
                'bert_tokens': tokens,
                'bert_input_ids': input_ids.tolist(),
                'bert_attention_mask': encoded['attention_mask'][0].tolist(),
                'bert_embeddings': cls_embedding
            }

        except Exception as e:
            self.logger.error(f"Error extracting BERT features: {e}")
            return _no_bert()

    def extract_features(self, text: str) -> LinguisticFeatures:
        """Extract all linguistic features from text"""
        try:
            if not text or not text.strip():
                self.logger.warning("Empty text provided for feature extraction")
                return self._create_empty_features(text, "Empty text provided")

            self.logger.info(f"Extracting linguistic features from text ({len(text)} characters)")

            with ProcessingTimer() as timer:
                # Extract different feature categories
                basic_features = self._extract_basic_features(text)
                vocabulary_features = self._extract_vocabulary_features(text)
                syntactic_features = self._extract_syntactic_features(text)
                semantic_features = self._extract_semantic_features(text)
                discourse_features = self._extract_discourse_features(text)
                bert_features = self._extract_bert_features(text)

                # Create LinguisticFeatures object
                features = LinguisticFeatures(
                    raw_text=text,
                    processing_success=True,
                    **basic_features,
                    **vocabulary_features,
                    **syntactic_features,
                    **semantic_features,
                    **discourse_features,
                    **bert_features
                )

            self.logger.info(f"✓ Feature extraction completed in {timer.elapsed:.2f}s")
            return features

        except Exception as e:
            self.logger.error(f"Error extracting linguistic features: {e}")
            return self._create_empty_features(text, str(e))

    def _create_empty_features(self, text: str, error_message: str) -> LinguisticFeatures:
        """Build the zeroed ``LinguisticFeatures`` used for failures.

        Args:
            text: Original input; ``None``/empty becomes ``""``.
            error_message: Reason stored on the result.
        """
        count_fields = (
            'word_count', 'sentence_count', 'char_count', 'unique_words',
            'syllable_count', 'pause_indicators', 'filler_words',
        )
        ratio_fields = (
            'avg_word_length', 'avg_sentence_length', 'lexical_diversity',
            'function_words_ratio', 'content_words_ratio', 'noun_ratio',
            'verb_ratio', 'adjective_ratio', 'pronoun_ratio',
            'avg_syllables_per_word', 'complex_words_ratio', 'repetition_ratio',
        )

        # Integer statistics default to 0, averages and ratios to 0.0
        zeroed = dict.fromkeys(count_fields, 0)
        zeroed.update(dict.fromkeys(ratio_fields, 0.0))

        return LinguisticFeatures(
            raw_text=text or "",
            bert_tokens=[],
            bert_input_ids=[],
            bert_attention_mask=[],
            bert_embeddings=None,
            processing_success=False,
            error_message=error_message,
            **zeroed
        )

    def process_transcription_result(self, result: TranscriptionResult) -> LinguisticFeatures:
        """Process a TranscriptionResult to extract linguistic features"""
        try:
            if not result.success or not result.text:
                return self._create_empty_features(
                    result.text or "",
                    f"Transcription failed: {result.error_message}"
                )

            return self.extract_features(result.text)

        except Exception as e:
            self.logger.error(f"Error processing transcription result: {e}")
            return self._create_empty_features("", str(e))

    def batch_process_texts(self, texts: List[str], batch_size: int = 32) -> List[LinguisticFeatures]:
        """Extract features for many texts, cleaning up memory per batch.

        Args:
            texts: Texts to analyse, in order.
            batch_size: Number of texts handled between memory cleanups.

        Returns:
            One ``LinguisticFeatures`` per input text, in input order. If
            batching itself raises, every text gets a zeroed failure object.
        """
        try:
            self.logger.info(f"Processing {len(texts)} texts in batches of {batch_size}")
            collected = []

            with ProcessingTimer() as timer:
                for chunk in batch_generator(texts, batch_size):
                    collected.extend([self.extract_features(item) for item in chunk])

                    # Memory cleanup between batches
                    cleanup_memory()

            self.logger.info(f"✓ Batch processing completed in {timer.elapsed:.2f}s")
            return collected

        except Exception as e:
            self.logger.error(f"Error in batch processing: {e}")
            return [self._create_empty_features(item, str(e)) for item in texts]

    def get_feature_summary(self, features: LinguisticFeatures) -> Dict[str, Any]:
        """Get a summary of extracted features"""
        if not features.processing_success:
            return {
                'status': 'failed',
                'error': features.error_message,
                'text_length': len(features.raw_text)
            }

        return {
            'status': 'success',
            'basic_stats': {
                'words': features.word_count,
                'sentences': features.sentence_count,
                'characters': features.char_count,
                'avg_word_length': round(features.avg_word_length, 2),
                'avg_sentence_length': round(features.avg_sentence_length, 2)
            },
            'vocabulary': {
                'unique_words': features.unique_words,
                'lexical_diversity': round(features.lexical_diversity, 3),
                'function_words_ratio': round(features.function_words_ratio, 3),
                'content_words_ratio': round(features.content_words_ratio, 3)
            },
            'complexity': {
                'avg_syllables_per_word': round(features.avg_syllables_per_word, 2),
                'complex_words_ratio': round(features.complex_words_ratio, 3),
                'total_syllables': features.syllable_count
            },
            'discourse': {
                'repetition_ratio': round(features.repetition_ratio, 3),
                'pause_indicators': features.pause_indicators,
                'filler_words': features.filler_words
            },
            'bert_available': features.bert_embeddings is not None,
            'bert_tokens_count': len(features.bert_tokens)
        }

    def save_features(self, features: LinguisticFeatures, filepath: str) -> bool:
        """Pickle ``features`` to ``filepath`` via ``safe_save_pickle``.

        Returns whatever ``safe_save_pickle`` returns, or ``False`` when it
        raises.
        """
        # NOTE(review): other call sites in this file pass a logger as a
        # third argument to safe_save_pickle - confirm it is optional.
        try:
            outcome = safe_save_pickle(features, filepath)
        except Exception as e:
            self.logger.error(f"Error saving features: {e}")
            return False
        return outcome

    def cleanup(self):
        """Cleanup resources"""
        try:
            if self.bert_model:
                del self.bert_model
            if self.bert_tokenizer:
                del self.bert_tokenizer
            cleanup_memory()
            self.logger.info("✓ Linguistic features service cleanup completed")
        except Exception as e:
            self.logger.error(f"Error during cleanup: {e}")

# Factory function for easy service creation
def create_linguistic_features_service(logger: Optional[logging.Logger] = None) -> LinguisticFeaturesService:
    """Build a ``LinguisticFeaturesService``, forwarding the optional ``logger``."""
    service = LinguisticFeaturesService(logger=logger)
    return service

In [None]:
# Data_Manager_Service.py

In [None]:
"""
Data Manager Service - Handles ADReSSo21 dataset loading and file management
Optimized for Windows 10 with parallel processing
"""
import os
import glob
import json
import pickle
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import pandas as pd
import logging
from pathlib import Path
import shutil

from config import SYSTEM_CONFIG, MODEL_CONFIG
from utils import setup_logging, ProcessingTimer, safe_save_pickle, cleanup_memory

@dataclass
class AudioFile:
    """Metadata record describing one audio file of the dataset.

    Instances are created by ``DataManagerService.scan_audio_files``; the
    per-field notes below describe what that method stores.
    """
    file_path: str  # path of the audio file, stored as a string
    filename: str  # file name component of file_path (e.g. "x.wav")
    category: str  # built as "<dataset_type>_<label>"
    label: str  # key of the audio directory the file was found in
    dataset_type: str  # 'diagnosis' or 'progression'
    split: str  # 'train' or 'test'
    segmentation_path: Optional[str] = None  # matching "<stem>.csv", only when it exists
    file_size: Optional[int] = None  # size in bytes (st_size) when known
    duration: Optional[float] = None  # not filled in by scan_audio_files; units not specified here

@dataclass
class DatasetInfo:
    """Aggregate statistics over a list of ``AudioFile`` records.

    Produced by ``DataManagerService.get_dataset_info``.
    """
    total_files: int  # number of records summarised
    categories: Dict[str, int]  # record count per AudioFile.category
    dataset_types: Dict[str, int]  # record count per AudioFile.dataset_type
    splits: Dict[str, int]  # record count per AudioFile.split
    total_size_mb: float  # sum of known file sizes, in MiB (bytes / 1024**2)
    audio_files: List[AudioFile]  # the records the statistics were computed from

class DataManagerService:
    """Service for managing ADReSSo21 dataset files and metadata"""

    def __init__(self, base_path: str, logger: Optional[logging.Logger] = None):
        self.base_path = Path(base_path)
        self.logger = logger or setup_logging()

        # Define dataset structure based on your paths
        self.dataset_paths = {
            'diagnosis_train': {
                'audio': {
                    'ad': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\ad",
                    'cn': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\audio\cn"
                },
                'segmentation': {
                    'ad': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\ad",
                    'cn': r"ADReSSo21-diagnosis-train\ADReSSo21\diagnosis\train\segmentation\cn"
                }
            },
            'progression_train': {
                'audio': {
                    'decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\audio\decline",
                    'no_decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\audio\no_decline"
                },
                'segmentation': {
                    'decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\decline",
                    'no_decline': r"ADReSSo21-progression-train\ADReSSo21\progression\train\segmentation\no_decline"
                }
            },
            'progression_test': {
                'audio': {
                    'test': r"ADReSSo21-progression-test\ADReSSo21\progression\test-dist\audio"
                },
                'segmentation': {
                    'test': r"ADReSSo21-progression-test\ADReSSo21\progression\test-dist\segmentation"
                }
            }
        }

        # Create output directories
        self.output_dir = self.base_path / "output"
        self.create_output_directories()

    def create_output_directories(self):
        """Create necessary output directories"""
        directories = [
            self.output_dir,
            self.output_dir / "features",
            self.output_dir / "transcripts",
            self.output_dir / "models",
            self.output_dir / "results",
            self.output_dir / "logs",
            self.output_dir / "cache"
        ]

        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)

        self.logger.info(f"✓ Output directories created at {self.output_dir}")

    def scan_audio_files(self) -> List[AudioFile]:
        """Scan and catalog all audio files in the dataset.

        For every audio directory listed in ``self.dataset_paths`` the
        ``*.wav`` files directly inside it (non-recursive) are collected and
        paired with ``<stem>.csv`` from the matching segmentation directory
        when that file exists. Audio directories that are missing on disk are
        logged as warnings and skipped.

        Files are visited in sorted path order. Directory listing order is
        otherwise OS/filesystem dependent, which would make the catalog, and
        the seeded split made by ``create_train_test_splits``, differ between
        machines or runs.

        Returns:
            One ``AudioFile`` per WAV file found (empty list if none).
        """
        self.logger.info("Scanning audio files in dataset...")
        audio_files = []

        with ProcessingTimer() as timer:
            for dataset_name, paths in self.dataset_paths.items():
                self.logger.info(f"Scanning {dataset_name}...")

                # Dataset type and split are encoded in the table key
                if 'diagnosis' in dataset_name:
                    dataset_type, split = 'diagnosis', 'train'
                else:  # progression
                    dataset_type = 'progression'
                    split = 'test' if 'test' in dataset_name else 'train'

                seg_dirs = paths.get('segmentation', {})

                for label, audio_path in paths['audio'].items():
                    full_audio_path = self.base_path / audio_path

                    if not full_audio_path.exists():
                        self.logger.warning(f"Audio path not found: {full_audio_path}")
                        continue

                    # Segmentation folder for this label; unlabeled sets use 'test'
                    seg_key = label if label in seg_dirs else 'test'
                    seg_path = self.base_path / seg_dirs[seg_key] if seg_key in seg_dirs else None
                    # Checked once per directory instead of once per file
                    seg_dir_exists = seg_path is not None and seg_path.exists()

                    for wav_file in sorted(full_audio_path.glob("*.wav")):
                        # Find corresponding segmentation file
                        seg_file = None
                        if seg_dir_exists:
                            seg_file_path = seg_path / f"{wav_file.stem}.csv"
                            if seg_file_path.exists():
                                seg_file = str(seg_file_path)

                        audio_files.append(AudioFile(
                            file_path=str(wav_file),
                            filename=wav_file.name,
                            category=f"{dataset_type}_{label}",
                            label=label,
                            dataset_type=dataset_type,
                            split=split,
                            segmentation_path=seg_file,
                            file_size=wav_file.stat().st_size if wav_file.exists() else None
                        ))

        self.logger.info(f"✓ Found {len(audio_files)} audio files in {timer.elapsed:.2f}s")
        return audio_files

    def get_dataset_info(self, audio_files: List[AudioFile]) -> DatasetInfo:
        """Summarise a file catalog: counts per category/type/split and total size.

        An empty (or falsy) ``audio_files`` yields an all-zero ``DatasetInfo``.
        Files whose ``file_size`` is missing or zero do not contribute to the
        total size.
        """
        if not audio_files:
            return DatasetInfo(0, {}, {}, {}, 0.0, [])

        def count_by(attribute: str) -> Dict[str, int]:
            # Occurrences of each distinct value of the given attribute
            tally = {}
            for entry in audio_files:
                value = getattr(entry, attribute)
                tally[value] = tally.get(value, 0) + 1
            return tally

        byte_total = sum(entry.file_size for entry in audio_files if entry.file_size)

        return DatasetInfo(
            total_files=len(audio_files),
            categories=count_by('category'),
            dataset_types=count_by('dataset_type'),
            splits=count_by('split'),
            total_size_mb=byte_total / (1024 * 1024),
            audio_files=audio_files
        )

    def create_file_manifest(self, audio_files: List[AudioFile]) -> pd.DataFrame:
        """Create a detailed file manifest"""
        data = []

        for af in audio_files:
            data.append({
                'filename': af.filename,
                'file_path': af.file_path,
                'category': af.category,
                'label': af.label,
                'dataset_type': af.dataset_type,
                'split': af.split,
                'segmentation_path': af.segmentation_path,
                'has_segmentation': af.segmentation_path is not None,
                'file_size_mb': af.file_size / (1024 * 1024) if af.file_size else None,
                'file_exists': os.path.exists(af.file_path)
            })

        df = pd.DataFrame(data)

        # Save manifest
        manifest_path = self.output_dir / "file_manifest.csv"
        df.to_csv(manifest_path, index=False)
        self.logger.info(f"✓ File manifest saved to {manifest_path}")

        return df

    def validate_dataset(self, audio_files: List[AudioFile]) -> Dict[str, Any]:
        """Validate dataset integrity"""
        self.logger.info("Validating dataset integrity...")

        validation_results = {
            'total_files': len(audio_files),
            'valid_files': 0,
            'missing_files': 0,
            'files_with_segmentation': 0,
            'missing_segmentation': 0,
            'errors': []
        }

        for af in audio_files:
            # Check if audio file exists
            if not os.path.exists(af.file_path):
                validation_results['missing_files'] += 1
                validation_results['errors'].append(f"Missing audio: {af.file_path}")
                continue

            validation_results['valid_files'] += 1

            # Check segmentation file
            if af.segmentation_path:
                if os.path.exists(af.segmentation_path):
                    validation_results['files_with_segmentation'] += 1
                else:
                    validation_results['missing_segmentation'] += 1
                    validation_results['errors'].append(f"Missing segmentation: {af.segmentation_path}")

        # Save validation report
        report_path = self.output_dir / "validation_report.json"
        with open(report_path, 'w') as f:
            json.dump(validation_results, f, indent=2)

        self.logger.info(f"✓ Dataset validation complete. Report saved to {report_path}")
        return validation_results

    def get_files_by_category(self, audio_files: List[AudioFile],
                            category: Optional[str] = None,
                            dataset_type: Optional[str] = None,
                            split: Optional[str] = None) -> List[AudioFile]:
        """Filter audio files by category, dataset type, or split"""
        filtered_files = audio_files

        if category:
            filtered_files = [af for af in filtered_files if af.category == category]

        if dataset_type:
            filtered_files = [af for af in filtered_files if af.dataset_type == dataset_type]

        if split:
            filtered_files = [af for af in filtered_files if af.split == split]

        return filtered_files

    def create_train_test_splits(self, audio_files: List[AudioFile],
                                test_size: float = 0.2) -> Tuple[List[AudioFile], List[AudioFile]]:
        """Create train/test splits for datasets that don't have predefined splits"""
        from sklearn.model_selection import train_test_split

        # Group by category to ensure balanced splits
        category_files = {}
        for af in audio_files:
            if af.category not in category_files:
                category_files[af.category] = []
            category_files[af.category].append(af)

        train_files = []
        test_files = []

        for category, files in category_files.items():
            if len(files) < 2:
                # If too few files, put all in training
                train_files.extend(files)
                continue

            cat_train, cat_test = train_test_split(
                files,
                test_size=test_size,
                random_state=42,
                stratify=None  # Can't stratify on single category
            )

            train_files.extend(cat_train)
            test_files.extend(cat_test)

        self.logger.info(f"✓ Created splits: {len(train_files)} train, {len(test_files)} test")
        return train_files, test_files

    def batch_load_files(self, audio_files: List[AudioFile],
                        batch_size: int = 32) -> List[List[AudioFile]]:
        """Create batches of files for parallel processing"""
        batches = []
        for i in range(0, len(audio_files), batch_size):
            batch = audio_files[i:i + batch_size]
            batches.append(batch)

        self.logger.info(f"✓ Created {len(batches)} batches of size {batch_size}")
        return batches

    def save_dataset_cache(self, audio_files: List[AudioFile],
                          dataset_info: DatasetInfo) -> bool:
        """Write the scan result to ``cache/dataset_cache.pkl`` for later reuse.

        The payload also records the scan timestamp and the base path it
        belongs to. Returns the result of ``safe_save_pickle``, or ``False``
        when anything raises (the error is logged).
        """
        try:
            cache_path = self.output_dir / "cache" / "dataset_cache.pkl"
            payload = dict(
                audio_files=audio_files,
                dataset_info=dataset_info,
                scan_timestamp=pd.Timestamp.now().isoformat(),
                base_path=str(self.base_path),
            )

            success = safe_save_pickle(payload, cache_path)
            if not success:
                return success

            self.logger.info(f"✓ Dataset cache saved to {cache_path}")
            return success

        except Exception as exc:
            self.logger.error(f"Error saving dataset cache: {exc}")
            return False

    def load_dataset_cache(self) -> Optional[Tuple[List[AudioFile], DatasetInfo]]:
        """Load dataset information from cache"""
        try:
            cache_path = self.output_dir / "cache" / "dataset_cache.pkl"

            if not cache_path.exists():
                return None

            with open(cache_path, 'rb') as f:
                cache_data = pickle.load(f)

            # Verify cache is for same base path
            if cache_data.get('base_path') != str(self.base_path):
                self.logger.warning("Cache base path mismatch, ignoring cache")
                return None

            self.logger.info("✓ Loaded dataset from cache")
            return cache_data['audio_files'], cache_data['dataset_info']

        except Exception as e:
            self.logger.error(f"Error loading dataset cache: {e}")
            return None

    def print_dataset_summary(self, dataset_info: DatasetInfo):
        """Print a comprehensive dataset summary"""
        print("\n" + "="*60)
        print("ADRESSO21 DATASET SUMMARY")
        print("="*60)

        print(f"Total Files: {dataset_info.total_files}")
        print(f"Total Size: {dataset_info.total_size_mb:.2f} MB")
        print()

        print("By Dataset Type:")
        for dtype, count in dataset_info.dataset_types.items():
            print(f"  {dtype}: {count} files")
        print()

        print("By Split:")
        for split, count in dataset_info.splits.items():
            print(f"  {split}: {count} files")
        print()

        print("By Category:")
        for category, count in dataset_info.categories.items():
            print(f"  {category}: {count} files")
        print()

    def cleanup_output_directory(self, keep_cache: bool = True):
        """Clean up output directory"""
        try:
            for item in self.output_dir.iterdir():
                if item.is_dir():
                    if item.name == 'cache' and keep_cache:
                        continue
                    shutil.rmtree(item)
                else:
                    item.unlink()

            # Recreate directories
            self.create_output_directories()
            self.logger.info("✓ Output directory cleaned")

        except Exception as e:
            self.logger.error(f"Error cleaning output directory: {e}")

# Factory function
def create_data_manager(base_path: str, logger: Optional[logging.Logger] = None) -> DataManagerService:
    """Build a ``DataManagerService`` rooted at ``base_path`` with an optional logger."""
    manager = DataManagerService(base_path, logger)
    return manager

# pipeline_service.py - Main Pipeline Orchestrator


In [None]:
"""
Pipeline Service - Main orchestrator for ADReSSo21 speech analysis pipeline
Coordinates all microservices for complete analysis workflow
"""

import os
import json
import pickle
import pandas as pd
from typing import Dict, List, Any, Optional
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import multiprocessing as mp
from datetime import datetime
import logging

# Import your microservices
from config import Config
from utils import setup_logging, ensure_directory
from data_manager_service import DataManagerService
from acoustic_features_service import AcousticFeaturesService
from transcription_service import TranscriptionService
from linguistic_features_service import LinguisticFeaturesService


class PipelineService:
    """
    Main pipeline service that orchestrates all analysis components
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        Build the configuration, the logger and the four worker services.

        Args:
            config_path: Path to configuration file, forwarded to ``Config``
        """
        self.config = Config(config_path)

        # Logging settings come from the configuration
        log_level = self.config.get('logging.level', 'INFO')
        log_file = self.config.get('logging.file')
        self.logger = setup_logging(log_level=log_level, log_file=log_file)

        # Every service receives the configuration object.
        # NOTE(review): DataManagerService.__init__ in this code base is declared
        # as (base_path, logger=None) and wraps its first argument in Path(), so
        # passing the Config object here looks inconsistent. Confirm which
        # signature is intended.
        self.data_manager = DataManagerService(self.config)
        self.acoustic_service = AcousticFeaturesService(self.config)
        self.transcription_service = TranscriptionService(self.config)
        self.linguistic_service = LinguisticFeaturesService(self.config)

        # Run bookkeeping
        self.results = {}
        self.start_time = self.end_time = None

        self.logger.info("Pipeline initialized successfully")

    def run_complete_pipeline(self, parallel: bool = True) -> Dict[str, Any]:
        """
        Run the complete analysis pipeline

        Args:
            parallel: Whether to use parallel processing where possible

        Returns:
            Dictionary containing all pipeline results
        """
        self.start_time = datetime.now()
        self.logger.info("=== Starting ADReSSo21 Speech Analysis Pipeline ===")

        try:
            # Step 1: Load dataset and get audio files
            self.logger.info("Step 1: Loading dataset...")
            audio_files = self._load_dataset()

            # Step 2: Extract acoustic features
            self.logger.info("Step 2: Extracting acoustic features...")
            acoustic_features = self._extract_acoustic_features(audio_files, parallel)

            # Step 3: Extract transcripts
            self.logger.info("Step 3: Extracting transcripts...")
            transcripts = self._extract_transcripts(audio_files, parallel)

            # Step 4: Extract linguistic features
            self.logger.info("Step 4: Extracting linguistic features...")
            linguistic_features = self._extract_linguistic_features(transcripts)

            # Step 5: Combine and save results
            self.logger.info("Step 5: Combining and saving results...")
            final_results = self._combine_and_save_results(
                audio_files, acoustic_features, transcripts, linguistic_features
            )

            self.end_time = datetime.now()
            duration = self.end_time - self.start_time

            self.logger.info(f"Pipeline completed successfully in {duration}")
            self.logger.info(f"Results saved to: {self.config.output_path}")

            return final_results

        except Exception as e:
            self.logger.error(f"Pipeline failed: {str(e)}")
            raise

    def _load_dataset(self) -> Dict[str, List[str]]:
        """Load dataset and get audio file paths"""
        audio_files = self.data_manager.get_audio_files()

        total_files = sum(len(files) for files in audio_files.values())
        self.logger.info(f"Found {total_files} audio files across all categories")

        for category, files in audio_files.items():
            self.logger.info(f"  {category}: {len(files)} files")

        if total_files == 0:
            raise ValueError("No audio files found. Please check the dataset path.")

        return audio_files

    def _extract_acoustic_features(self, audio_files: Dict[str, List[str]],
                                 parallel: bool = True) -> Dict[str, Any]:
        """Extract acoustic features from all audio files"""
        all_features = {}

        if parallel:
            all_features = self._extract_acoustic_features_parallel(audio_files)
        else:
            all_features = self._extract_acoustic_features_sequential(audio_files)

        # Save acoustic features
        features_path = os.path.join(self.config.output_path, "acoustic_features.pkl")
        with open(features_path, 'wb') as f:
            pickle.dump(all_features, f)

        self.logger.info(f"Acoustic features saved to {features_path}")
        return all_features

    def _extract_acoustic_features_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract acoustic features using parallel processing"""
        all_features = {}
        max_workers = min(self.config.get('processing.max_workers', mp.cpu_count()), mp.cpu_count())

        # Flatten all files with their categories
        file_tasks = []
        for category, files in audio_files.items():
            for file_path in files:
                file_tasks.append((file_path, category))

        self.logger.info(f"Processing {len(file_tasks)} files with {max_workers} workers")

        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_file = {
                executor.submit(self.acoustic_service.extract_features, file_path): (file_path, category)
                for file_path, category in file_tasks
            }

            # Collect results
            completed = 0
            for future in as_completed(future_to_file):
                file_path, category = future_to_file[future]
                filename = os.path.basename(file_path)

                try:
                    features = future.result()
                    if features is not None:
                        all_features[f"{category}_{filename}"] = {
                            'file_path': file_path,
                            'category': category,
                            'filename': filename,
                            'features': features
                        }
                    else:
                        self.logger.warning(f"Failed to extract features from {filename}")

                except Exception as e:
                    self.logger.error(f"Error processing {filename}: {str(e)}")

                completed += 1
                if completed % 10 == 0:
                    self.logger.info(f"Completed acoustic feature extraction for {completed}/{len(file_tasks)} files")

        return all_features

    def _extract_acoustic_features_sequential(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract acoustic features sequentially"""
        all_features = {}
        total_files = sum(len(files) for files in audio_files.values())
        processed = 0

        for category, files in audio_files.items():
            self.logger.info(f"Processing acoustic features for {category}...")

            for file_path in files:
                filename = os.path.basename(file_path)

                try:
                    features = self.acoustic_service.extract_features(file_path)
                    if features is not None:
                        all_features[f"{category}_{filename}"] = {
                            'file_path': file_path,
                            'category': category,
                            'filename': filename,
                            'features': features
                        }
                    else:
                        self.logger.warning(f"Failed to extract features from {filename}")

                except Exception as e:
                    self.logger.error(f"Error processing {filename}: {str(e)}")

                processed += 1
                if processed % 10 == 0:
                    self.logger.info(f"Completed {processed}/{total_files} files")

        return all_features

    def _extract_transcripts(self, audio_files: Dict[str, List[str]],
                           parallel: bool = True) -> Dict[str, Any]:
        """Extract transcripts from all audio files"""
        if parallel:
            transcripts = self._extract_transcripts_parallel(audio_files)
        else:
            transcripts = self._extract_transcripts_sequential(audio_files)

        # Save transcripts
        self._save_transcripts(transcripts)

        return transcripts

    def _extract_transcripts_parallel(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract transcripts using parallel processing"""
        transcripts = {}
        max_workers = min(self.config.get('processing.transcription_workers', 2), 4)  # Limit for memory

        # Flatten all files with their categories
        file_tasks = []
        for category, files in audio_files.items():
            for file_path in files:
                file_tasks.append((file_path, category))

        self.logger.info(f"Transcribing {len(file_tasks)} files with {max_workers} workers")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_file = {
                executor.submit(self.transcription_service.transcribe_audio, file_path): (file_path, category)
                for file_path, category in file_tasks
            }

            # Collect results
            completed = 0
            for future in as_completed(future_to_file):
                file_path, category = future_to_file[future]
                filename = os.path.basename(file_path)

                try:
                    transcript_data = future.result()
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        **transcript_data
                    }

                except Exception as e:
                    self.logger.error(f"Error transcribing {filename}: {str(e)}")
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        'transcript': '',
                        'error': str(e)
                    }

                completed += 1
                if completed % 5 == 0:
                    self.logger.info(f"Completed transcription for {completed}/{len(file_tasks)} files")

        return transcripts

    def _extract_transcripts_sequential(self, audio_files: Dict[str, List[str]]) -> Dict[str, Any]:
        """Extract transcripts sequentially"""
        transcripts = {}

        for category, files in audio_files.items():
            self.logger.info(f"Transcribing {category}...")

            for file_path in files:
                filename = os.path.basename(file_path)

                try:
                    transcript_data = self.transcription_service.transcribe_audio(file_path)
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        **transcript_data
                    }

                except Exception as e:
                    self.logger.error(f"Error transcribing {filename}: {str(e)}")
                    transcripts[f"{category}_{filename}"] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        'transcript': '',
                        'error': str(e)
                    }

        return transcripts

    def _extract_linguistic_features(self, transcripts: Dict[str, Any]) -> Dict[str, Any]:
        """Extract linguistic features from transcripts"""
        linguistic_features = self.linguistic_service.extract_features(transcripts)

        # Save linguistic features
        features_path = os.path.join(self.config.output_path, "linguistic_features.pkl")
        with open(features_path, 'wb') as f:
            pickle.dump(linguistic_features, f)

        self.logger.info(f"Linguistic features saved to {features_path}")
        return linguistic_features

    def _save_transcripts(self, transcripts: Dict[str, Any]):
        """Write transcripts as per-file text, one JSON document and one pickle.

        Everything goes into ``<output_path>/transcripts``. Text files are
        only written for entries that have non-empty transcript text; the JSON
        and pickle files contain the full dictionary.
        """
        transcripts_dir = os.path.join(self.config.output_path, "transcripts")
        ensure_directory(transcripts_dir)

        # Individual "<key>_transcript.txt" files
        for key, data in transcripts.items():
            if 'transcript' not in data or not data['transcript']:
                continue
            text_path = os.path.join(transcripts_dir, f"{key}_transcript.txt")
            with open(text_path, 'w', encoding='utf-8') as handle:
                handle.write(data['transcript'])

        # Consolidated JSON
        json_path = os.path.join(transcripts_dir, "all_transcripts.json")
        with open(json_path, 'w', encoding='utf-8') as handle:
            json.dump(transcripts, handle, indent=2, ensure_ascii=False)

        # Pickle copy
        pkl_path = os.path.join(transcripts_dir, "transcripts.pkl")
        with open(pkl_path, 'wb') as handle:
            pickle.dump(transcripts, handle)

        self.logger.info(f"Transcripts saved to {transcripts_dir}")

    def _combine_and_save_results(self, audio_files: Dict[str, List[str]],
                                acoustic_features: Dict[str, Any],
                                transcripts: Dict[str, Any],
                                linguistic_features: Dict[str, Any]) -> Dict[str, Any]:
        """Combine all results and save comprehensive dataset"""

        # Create comprehensive results dictionary
        final_results = {
            'pipeline_info': {
                'start_time': self.start_time.isoformat(),
                'end_time': self.end_time.isoformat() if self.end_time else None,
                'total_files': sum(len(files) for files in audio_files.values()),
                'categories': list(audio_files.keys()),
                'config': self.config.to_dict()
            },
            'audio_files': audio_files,
            'acoustic_features': acoustic_features,
            'transcripts': transcripts,
            'linguistic_features': linguistic_features
        }

        # Create summary DataFrame
        summary_data = []
        for key in set(acoustic_features.keys()) | set(transcripts.keys()):
            row = {'file_id': key}

            # Add acoustic info
            if key in acoustic_features:
                row.update({
                    'category': acoustic_features[key]['category'],
                    'filename': acoustic_features[key]['filename'],
                    'has_acoustic_features': True
                })

            # Add transcript info
            if key in transcripts:
                row.update({
                    'has_transcript': True,
                    'transcript_length': len(transcripts[key].get('transcript', '')),
                    'word_count': len(transcripts[key].get('transcript', '').split()),
                    'language': transcripts[key].get('language', 'unknown'),
                    'has_transcript_error': 'error' in transcripts[key]
                })
            else:
                row.update({
                    'has_transcript': False,
                    'transcript_length': 0,
                    'word_count': 0
                })

            # Add linguistic info
            if key in linguistic_features:
                row.update({
                    'has_linguistic_features': True,
                    'unique_words': linguistic_features[key].get('unique_words', 0),
                    'lexical_diversity': linguistic_features[key].get('lexical_diversity', 0)
                })
            else:
                row.update({
                    'has_linguistic_features': False,
                    'unique_words': 0,
                    'lexical_diversity': 0
                })

            summary_data.append(row)

        summary_df = pd.DataFrame(summary_data)

        # Save summary
        summary_path = os.path.join(self.config.output_path, "pipeline_summary.csv")
        summary_df.to_csv(summary_path, index=False)

        # Save complete results
        results_path = os.path.join(self.config.output_path, "complete_results.pkl")
        with open(results_path, 'wb') as f:
            pickle.dump(final_results, f)

        self.logger.info("="*50)
        self.logger.info("PIPELINE SUMMARY")
        self.logger.info("="*50)
        self.logger.info(f"Total files processed: {len(summary_data)}")
        self.logger.info(f"Files with acoustic features: {summary_df['has_acoustic_features'].sum()}")
        self.logger.info(f"Files with transcripts: {summary_df['has_transcript'].sum()}")
        self.logger.info(f"Files with linguistic features: {summary_df['has_linguistic_features'].sum()}")
        self.logger.info(f"Average words per transcript: {summary_df['word_count'].mean():.1f}")
        self.logger.info("="*50)
        self.logger.info("Output files:")
        self.logger.info(f"  - Complete results: {results_path}")
        self.logger.info(f"  - Pipeline summary: {summary_path}")
        self.logger.info(f"  - Acoustic features: {os.path.join(self.config.output_path, 'acoustic_features.pkl')}")
        self.logger.info(f"  - Transcripts: {os.path.join(self.config.output_path, 'transcripts/')}")
        self.logger.info(f"  - Linguistic features: {os.path.join(self.config.output_path, 'linguistic_features.pkl')}")

        return final_results

    def run_sample_analysis(self, max_files_per_category: int = 2):
        """Run pipeline on a small sample for testing"""
        self.logger.info(f"Running sample analysis with max {max_files_per_category} files per category")

        # Get limited audio files
        all_audio_files = self.data_manager.get_audio_files()
        sample_audio_files = {}

        for category, files in all_audio_files.items():
            sample_audio_files[category] = files[:max_files_per_category]

        # Run pipeline on sample
        return self.run_complete_pipeline(parallel=False)

    def get_pipeline_status(self) -> Dict[str, Any]:
        """Get current pipeline status"""
        return {
            'start_time': self.start_time.isoformat() if self.start_time else None,
            'end_time': self.end_time.isoformat() if self.end_time else None,
            'is_running': self.start_time is not None and self.end_time is None,
            'output_path': self.config.output_path,
            'results_available': bool(self.results)
        }


if __name__ == "__main__":
    # Example usage; runs only when this module is executed as a script.
    # The guard also keeps this code from running if the module is imported
    # again, e.g. by ProcessPoolExecutor worker processes on platforms that
    # start workers by re-importing the main module.
    # PipelineService() is built without a config_path.
    pipeline = PipelineService()

    # Run sample analysis first, asking for one file per category
    print("Running sample analysis...")
    sample_results = pipeline.run_sample_analysis(max_files_per_category=1)

    # Then run full pipeline with parallel processing enabled
    print("\nRunning full pipeline...")
    results = pipeline.run_complete_pipeline(parallel=True)

# main.py - Main Application Entry Point

In [None]:
"""
Main Application Entry Point for ADReSSo21 Speech Analysis
Command-line interface for running the complete analysis pipeline
"""

import argparse
import sys
import os
from pathlib import Path
import json
import traceback
from datetime import datetime

# Add current directory to Python path
current_dir = Path(__file__).parent
sys.path.append(str(current_dir))

from pipeline_service import PipelineService
from config import Config
from utils import setup_logging


def create_sample_config(config_path: str = "config.json"):
    """Create a sample configuration file for first-time setup.

    Args:
        config_path: Where to write the JSON file. Defaults to ``config.json``
            in the current working directory (the previous fixed location).

    Returns:
        The path the configuration was written to.
    """
    sample_config = {
        "dataset": {
            "base_path": "C:/Users/Administrator/Desktop/Speech/ADReSSo21",
            "diagnosis_train_path": "ADReSSo21-diagnosis-train/ADReSSo21/diagnosis/train",
            "progression_train_path": "ADReSSo21-progression-train/ADReSSo21/progression/train",
            "progression_test_path": "ADReSSo21-progression-test/ADReSSo21/progression/test-dist"
        },
        "output": {
            "base_path": "C:/Users/Administrator/Desktop/Speech/output",
            "create_timestamped_folders": True
        },
        "processing": {
            "max_workers": 8,
            "transcription_workers": 2,
            "batch_size": 10,
            "enable_parallel": True
        },
        "models": {
            "whisper_model": "base",
            "wav2vec_model": "facebook/wav2vec2-base-960h",
            "bert_model": "bert-base-uncased"
        },
        "features": {
            "acoustic": {
                "sample_rate": 16000,
                "n_mfcc": 13,
                "n_mels": 80,
                "extract_egemaps": True,
                "extract_prosodic": True
            },
            "linguistic": {
                "max_sequence_length": 512,
                "extract_basic_stats": True,
                "extract_bert_features": True
            }
        },
        "logging": {
            "level": "INFO",
            "file": "adresso_pipeline.log",
            "console": True
        }
    }

    # Explicit encoding keeps the file identical regardless of the platform's
    # default code page.
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(sample_config, f, indent=2)

    print(f"Sample configuration created: {config_path}")
    print(f"Please edit the paths in {config_path} to match your setup before running the pipeline.")
    return config_path


def validate_paths(config: "Config") -> bool:
    """Validate that the configured dataset locations exist.

    The dataset base path must exist, and at least one of the diagnosis-train,
    progression-train or progression-test sub-directories must exist beneath
    it. Problems are reported on stdout.

    Args:
        config: Configuration object queried through ``config.get(key)``.

    Returns:
        True when the base path and at least one dataset directory exist,
        False otherwise (including when a required setting is missing).
    """
    base_path = config.get('dataset.base_path')

    # A missing setting comes back as None; report it instead of letting
    # os.path.exists(None) raise TypeError.
    if not base_path or not os.path.exists(base_path):
        print(f"Error: Dataset base path does not exist: {base_path}")
        return False

    # Check for at least one of the dataset directories
    required_subdirs = [
        config.get('dataset.diagnosis_train_path'),
        config.get('dataset.progression_train_path'),
        config.get('dataset.progression_test_path')
    ]

    # Unset entries (None) cannot be joined onto the base path, so skip them.
    found_dirs = [
        subdir
        for subdir in required_subdirs
        if subdir is not None and os.path.exists(os.path.join(base_path, subdir))
    ]

    if not found_dirs:
        print("Error: No valid dataset directories found!")
        print(f"Checked paths under {base_path}:")
        for subdir in required_subdirs:
            print(f"  - {subdir}")
        return False

    print(f"Found dataset directories: {found_dirs}")
    return True


def run_pipeline_command(args):
    """Handle the ``run`` subcommand.

    Builds the pipeline from ``args.config``, validates the dataset paths and
    then runs either a sample analysis (``args.sample`` / ``args.sample_size``)
    or the complete pipeline (``args.parallel``).

    Returns:
        0 on success, 1 when validation fails or any exception is raised. With
        ``args.debug`` set, the traceback of the failure is printed as well.
    """
    banner = "=" * 50

    try:
        pipeline = PipelineService(args.config)

        # Refuse to start when the configured dataset cannot be found.
        paths_ok = validate_paths(pipeline.config)
        if not paths_ok:
            return 1

        if not args.sample:
            print("Running complete pipeline...")
            results = pipeline.run_complete_pipeline(parallel=args.parallel)
        else:
            print("Running sample analysis...")
            results = pipeline.run_sample_analysis(max_files_per_category=args.sample_size)

        print(f"\n{banner}")
        print("PIPELINE COMPLETED SUCCESSFULLY")
        print(banner)

        # Short summary of what was processed and where the output went.
        pipeline_info = results['pipeline_info']
        total_files = pipeline_info['total_files']
        print(f"Total files processed: {total_files}")
        print(f"Output directory: {pipeline.config.output_path}")

    except Exception as exc:
        print(f"\nPipeline failed with error: {exc}")
        if args.debug:
            print("\nFull traceback:")
            traceback.print_exc()
        return 1

    return 0


def run_status_command(args):
    """Handle the ``status`` subcommand.

    Builds a pipeline from ``args.config`` and prints the fields reported by
    its ``get_pipeline_status()``; start/end times are shown only when set.

    Returns:
        0 on success, 1 when any exception is raised.
    """
    try:
        status = PipelineService(args.config).get_pipeline_status()

        print("Pipeline Status:")
        print(f"  Output Path: {status['output_path']}")
        print(f"  Is Running: {status['is_running']}")
        print(f"  Results Available: {status['results_available']}")

        # Timestamps are optional: print only the ones that carry a value.
        for label, key in (("Last Start Time", 'start_time'),
                           ("Last End Time", 'end_time')):
            value = status[key]
            if value:
                print(f"  {label}: {value}")

    except Exception as exc:
        print(f"Error checking status: {exc}")
        return 1

    return 0


def run_demo_command(args):
    """Handle the ``demo`` subcommand.

    Builds a pipeline from ``args.config`` and runs its sample analysis with
    one file per category.

    Returns:
        0 on success, 1 when any exception is raised. With ``args.debug`` set,
        the traceback of the failure is printed as well.
    """
    try:
        demo_pipeline = PipelineService(args.config)

        print("Running demo analysis...")
        print("This will process 1 file from each available category")

        # Smallest possible run; the returned results are not needed here.
        demo_pipeline.run_sample_analysis(max_files_per_category=1)

        print("\nDemo completed successfully!")

    except Exception as exc:
        print(f"Demo failed: {exc}")
        if args.debug:
            traceback.print_exc()
        return 1

    return 0


def main(argv=None):
    """Main application entry point.

    Parses the command line and dispatches to the selected subcommand.

    ``--config`` and ``--debug`` are accepted both before the subcommand and
    after ``run``/``demo``/``status``, so the invocations shown in the help
    epilog (e.g. ``python main.py run --config my_config.json``) work.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            ``sys.argv[1:]`` is parsed, as before.

    Returns:
        The exit code of the selected command; 0 when no command was given and
        the help text was printed instead.
    """
    parser = argparse.ArgumentParser(
        description="ADReSSo21 Speech Analysis Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Create sample configuration
  python main.py init

  # Run demo analysis
  python main.py demo

  # Run complete pipeline
  python main.py run

  # Run with custom config
  python main.py run --config my_config.json

  # Run sample analysis only
  python main.py run --sample --sample-size 2

  # Run without parallel processing
  python main.py run --no-parallel

  # Check status
  python main.py status
        """
    )

    # Global arguments (given before the subcommand); these carry the defaults
    parser.add_argument('--config', '-c', default='config.json',
                       help='Configuration file path (default: config.json)')
    parser.add_argument('--debug', action='store_true',
                       help='Enable debug mode with full error traces')

    # The same options, accepted after a subcommand as well. Their defaults
    # are suppressed so that a subparser only writes these attributes when the
    # option is actually given and never overwrites a value parsed from the
    # global position.
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument('--config', '-c', default=argparse.SUPPRESS,
                        help='Configuration file path (default: config.json)')
    common.add_argument('--debug', action='store_true', default=argparse.SUPPRESS,
                        help='Enable debug mode with full error traces')

    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Init command
    subparsers.add_parser('init', help='Create sample configuration file')

    # Run command
    run_parser = subparsers.add_parser('run', parents=[common],
                                       help='Run the analysis pipeline')
    run_parser.add_argument('--sample', action='store_true',
                           help='Run on sample data only')
    run_parser.add_argument('--sample-size', type=int, default=2,
                           help='Number of files per category for sample run (default: 2)')
    run_parser.add_argument('--no-parallel', dest='parallel', action='store_false',
                           help='Disable parallel processing')

    # Demo command
    subparsers.add_parser('demo', parents=[common], help='Run demo analysis')

    # Status command
    subparsers.add_parser('status', parents=[common], help='Check pipeline status')

    # Parse arguments (argv=None makes argparse read sys.argv[1:])
    args = parser.parse_args(argv)

    # Handle commands
    if args.command == 'init':
        create_sample_config()
        return 0

    if args.command == 'run':
        return run_pipeline_command(args)

    if args.command == 'demo':
        return run_demo_command(args)

    if args.command == 'status':
        return run_status_command(args)

    # No command specified, show help
    parser.print_help()
    return 0


if __name__ == "__main__":
    sys.exit(main())


# requirements.txt - Project Dependencies


In [None]:
# Core dependencies for ADReSSo21 Speech Analysis Pipeline

# Audio processing
librosa>=0.10.0
soundfile>=0.12.1
opensmile>=2.4.2

# Speech recognition and transcription
openai-whisper>=20231117
transformers>=4.35.0
torch>=2.0.0
torchaudio>=2.0.0

# NLP and language models
tokenizers>=0.14.0
numpy>=1.24.0
scipy>=1.10.0

# Data handling and processing
pandas>=2.0.0
scikit-learn>=1.3.0

# Parallel processing
joblib>=1.3.0

# Configuration and utilities
pyyaml>=6.0
python-dotenv>=1.0.0

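# Optional GPU support (CUDA): a "+cu118" local version label is not valid in a
# ">=" requirement, so CUDA builds are selected through the PyTorch wheel index:
#   pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118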

# Development and testing (optional)
pytest>=7.4.0
jupyter>=1.0.0
matplotlib>=3.7.0
seaborn>=0.12.0




# setup.py - Project Setup Script



In [None]:
"""
Setup script for ADReSSo21 Speech Analysis Pipeline
Handles installation, environment setup, and model downloads
"""

import os
import sys
import subprocess
import platform
from pathlib import Path
import urllib.request
import zipfile
import json


class PipelineSetup:
    """Setup and installation handler for the pipeline.

    Every step method prints its progress and returns True on success or
    False on failure; ``run_setup`` chains them and stops at the first failure.
    """

    def __init__(self):
        # All files and directories are created relative to this script.
        self.project_root = Path(__file__).parent
        self.system_info = {
            'os': platform.system(),
            'python_version': sys.version,
            'architecture': platform.architecture()[0]
        }

    def check_system_requirements(self):
        """Check if the system meets the minimum requirements.

        Only an unsupported Python version (< 3.8) is fatal; low memory or low
        disk space merely produce warnings.

        Returns:
            False when the Python version is too old, True otherwise.
        """
        import shutil  # local import: only needed for the disk-space probe

        print("Checking system requirements...")

        # Check Python version
        if sys.version_info < (3, 8):
            print("❌ Python 3.8+ required. Current version:", sys.version)
            return False
        print("✅ Python version:", sys.version.split()[0])

        # Check available memory (approximate); psutil is optional
        try:
            import psutil
        except ImportError:
            print("⚠️  Cannot check memory (psutil not available)")
        else:
            memory_gb = psutil.virtual_memory().total / (1024**3)
            if memory_gb < 8:
                print(f"⚠️  Warning: Low memory detected ({memory_gb:.1f}GB). 16GB+ recommended.")
            else:
                print(f"✅ Memory: {memory_gb:.1f}GB")

        # Check disk space with the standard library so this check no longer
        # depends on the optional psutil import above having succeeded.
        try:
            disk_space = shutil.disk_usage('.').free / (1024**3)
        except OSError:
            print("⚠️  Cannot check disk space")
        else:
            if disk_space < 10:
                print(f"⚠️  Warning: Low disk space ({disk_space:.1f}GB). 20GB+ recommended.")
            else:
                print(f"✅ Disk space: {disk_space:.1f}GB available")

        return True

    def install_dependencies(self):
        """Install the Python dependencies listed in ``requirements.txt``.

        pip itself is upgraded first; both steps run with the current
        interpreter (``sys.executable -m pip``).

        Returns:
            True when both pip invocations succeed, False when
            ``requirements.txt`` is missing or a pip call fails.
        """
        print("\nInstalling Python dependencies...")

        requirements_file = self.project_root / "requirements.txt"

        if not requirements_file.exists():
            print("❌ requirements.txt not found!")
            return False

        try:
            # Upgrade pip first
            subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"],
                         check=True)

            # Install requirements
            subprocess.run([sys.executable, "-m", "pip", "install", "-r", str(requirements_file)],
                         check=True)

            print("✅ Dependencies installed successfully")
            return True

        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to install dependencies: {e}")
            return False

    def setup_directories(self):
        """Create the working directories under the project root.

        Returns:
            True (existing directories are left untouched).
        """
        print("\nSetting up directories...")

        directories = [
            "output",
            "logs",
            "models",
            "temp",
            "data"
        ]

        for directory in directories:
            dir_path = self.project_root / directory
            dir_path.mkdir(exist_ok=True)
            print(f"✅ Created/verified: {directory}/")

        return True

    def download_sample_data(self):
        """Create an empty sample dataset skeleton under ``data/sample``.

        Nothing is downloaded: only the expected directory layout and a README
        describing it are written.

        Returns:
            True.
        """
        print("\nSetting up sample data...")

        # Create a minimal sample structure for testing
        sample_dir = self.project_root / "data" / "sample"
        sample_dir.mkdir(parents=True, exist_ok=True)

        # Create sample directory structure
        sample_structure = [
            "diagnosis/train/audio/ad",
            "diagnosis/train/audio/cn",
            "diagnosis/train/segmentation/ad",
            "diagnosis/train/segmentation/cn",
            "progression/train/audio/decline",
            "progression/train/audio/no_decline",
            "progression/train/segmentation/decline",
            "progression/train/segmentation/no_decline",
            "progression/test-dist/audio",
            "progression/test-dist/segmentation"
        ]

        for structure in sample_structure:
            (sample_dir / structure).mkdir(parents=True, exist_ok=True)

        # Create a sample README
        readme_content = """
# Sample Data Directory Structure

This directory contains the expected structure for ADReSSo21 dataset.

## Directory Structure:
- diagnosis/train/audio/ad/          - Alzheimer's audio files
- diagnosis/train/audio/cn/          - Control audio files
- diagnosis/train/segmentation/      - Segmentation files
- progression/train/audio/           - Progression training audio
- progression/test-dist/audio/       - Progression test audio

## Usage:
Place your actual ADReSSo21 dataset files in this structure, or update
the paths in config.json to point to your dataset location.
"""

        with open(sample_dir / "README.md", "w", encoding="utf-8") as f:
            f.write(readme_content)

        print("✅ Sample directory structure created")
        return True

    def create_default_config(self):
        """Write a default ``config.json`` into the project root.

        The dataset paths point at the ``data/sample`` skeleton and the output
        path at ``output`` under the project root.

        Returns:
            True.
        """
        print("\nCreating default configuration...")

        # os.cpu_count() may return None when the count cannot be determined;
        # fall back to a single worker instead of failing in min().
        detected_cores = os.cpu_count() or 1

        config = {
            "dataset": {
                "base_path": str(self.project_root / "data" / "sample"),
                "diagnosis_train_path": "diagnosis/train",
                "progression_train_path": "progression/train",
                "progression_test_path": "progression/test-dist"
            },
            "output": {
                "base_path": str(self.project_root / "output"),
                "create_timestamped_folders": True
            },
            "processing": {
                "max_workers": min(detected_cores, 8),
                "transcription_workers": 2,
                "batch_size": 10,
                "enable_parallel": True
            },
            "models": {
                "whisper_model": "base",
                "wav2vec_model": "facebook/wav2vec2-base-960h",
                "bert_model": "bert-base-uncased"
            },
            "features": {
                "acoustic": {
                    "sample_rate": 16000,
                    "n_mfcc": 13,
                    "n_mels": 80,
                    "extract_egemaps": True,
                    "extract_prosodic": True
                },
                "linguistic": {
                    "max_sequence_length": 512,
                    "extract_basic_stats": True,
                    "extract_bert_features": True
                }
            },
            "logging": {
                "level": "INFO",
                "file": "adresso_pipeline.log",
                "console": True
            }
        }

        config_path = self.project_root / "config.json"
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

        print(f"✅ Default configuration created: {config_path}")
        return True

    def verify_installation(self):
        """Verify the installation by importing the key third-party modules.

        Returns:
            True when every module imports, False when at least one import
            raises ImportError (the failing modules are listed).
        """
        print("\nVerifying installation...")

        # Test imports
        test_imports = [
            'librosa',
            'whisper',
            'transformers',
            'torch',
            'opensmile',
            'pandas',
            'numpy'
        ]

        failed_imports = []
        for module in test_imports:
            try:
                __import__(module)
                print(f"✅ {module}")
            except ImportError as e:
                print(f"❌ {module}: {e}")
                failed_imports.append(module)

        if failed_imports:
            print(f"\n❌ Failed to import: {failed_imports}")
            print("Please check the installation and try running:")
            print("pip install -r requirements.txt")
            return False

        print("\n✅ All modules imported successfully!")
        return True

    def run_setup(self):
        """Run the complete setup process, stopping at the first failing step.

        Returns:
            True when every step succeeds, False otherwise.
        """
        print("="*60)
        print("ADReSSo21 Speech Analysis Pipeline Setup")
        print("="*60)

        steps = [
            ("System Requirements", self.check_system_requirements),
            ("Dependencies", self.install_dependencies),
            ("Directories", self.setup_directories),
            ("Sample Data", self.download_sample_data),
            ("Configuration", self.create_default_config),
            ("Verification", self.verify_installation)
        ]

        for step_name, step_func in steps:
            print(f"\n{'='*20} {step_name} {'='*20}")
            if not step_func():
                print(f"\n❌ Setup failed at step: {step_name}")
                return False

        print("\n" + "="*60)
        print("🎉 SETUP COMPLETED SUCCESSFULLY!")
        print("="*60)
        print("\nNext steps:")
        print("1. Update config.json with your dataset paths")
        print("2. Run: python main.py demo")
        print("3. Run: python main.py run")
        print("\nFor help: python main.py --help")

        return True


def main():
    """Main setup function.

    Reads the first command-line argument (if any) to choose between the
    complete setup, ``--verify-only``, ``--deps-only`` and ``--help``; any
    other argument falls through to the complete setup.

    Returns:
        0 when the selected action succeeded (or help was shown), 1 when it
        reported failure, so the outcome is visible in the exit status.
    """
    option = sys.argv[1] if len(sys.argv) > 1 else None

    if option == "--help":
        print("""
ADReSSo21 Pipeline Setup

Usage:
    python setup.py                 - Run complete setup
    python setup.py --help          - Show this help
    python setup.py --verify-only   - Only verify installation
    python setup.py --deps-only     - Only install dependencies
        """)
        return 0

    setup = PipelineSetup()

    if option == "--verify-only":
        succeeded = setup.verify_installation()
    elif option == "--deps-only":
        succeeded = setup.install_dependencies()
    else:
        succeeded = setup.run_setup()

    return 0 if succeeded else 1


if __name__ == "__main__":
    # Propagate the outcome so scripts and CI can detect a failed setup.
    sys.exit(main())

# README.md - Project Documentation

# ADReSSo21 Speech Analysis Pipeline

A modular, high-performance pipeline for analyzing speech data from the ADReSSo21 dataset (Alzheimer's Dementia Recognition through Spontaneous Speech). This pipeline extracts comprehensive acoustic, linguistic, and semantic features for dementia detection and progression analysis.

## Features

🎯 **Comprehensive Analysis**
- Acoustic feature extraction (eGeMAPS, MFCCs, Mel-spectrograms, Wav2Vec2)
- Speech-to-text transcription (Whisper)
- Linguistic feature analysis (BERT embeddings, lexical diversity)
- Prosodic analysis (F0, energy, spectral features)

⚡ **High Performance**
- Multi-core parallel processing
- Optimized for Windows 10 with 35GB RAM, 10 cores
- Memory-efficient batch processing
- Modular microservice architecture

🔧 **Easy to Use**
- Command-line interface
- Configurable via JSON
- Sample data support
- Comprehensive logging

## System Requirements

- **OS**: Windows 10/11, Linux, macOS
- **Python**: 3.8+
- **RAM**: 16GB+ recommended (35GB optimal)
- **CPU**: Multi-core processor (10 cores optimal)
- **Storage**: 20GB+ free space
- **GPU**: Optional (CUDA-compatible for faster processing)

## Installation

### Quick Setup

```bash
# Clone or download the project
git clone <repository-url>
cd adresso21-pipeline

# Run setup script
python setup.py
```

### Manual Installation

```bash
# Install dependencies
pip install -r requirements.txt

# Create configuration
python main.py init

# Setup directories
mkdir output logs models temp data
```

## Configuration

Edit `config.json` to match your setup:

```json
{
  "dataset": {
    "base_path": "C:/Users/Administrator/Desktop/Speech/ADReSSo21",
    "diagnosis_train_path": "ADReSSo21-diagnosis-train/ADReSSo21/diagnosis/train",
    "progression_train_path": "ADReSSo21-progression-train/ADReSSo21/progression/train",
    "progression_test_path": "ADReSSo21-progression-test/ADReSSo21/progression/test-dist"
  },
  "output": {
    "base_path": "C:/Users/Administrator/Desktop/Speech/output",
    "create_timestamped_folders": true
  },
  "processing": {
    "max_workers": 8,
    "transcription_workers": 2,
    "enable_parallel": true
  }
}
```

## Dataset Structure

Ensure your ADReSSo21 dataset follows this structure:

```
ADReSSo21/
├── diagnosis/train/
│   ├── audio/
│   │   ├── ad/*.wav          # Alzheimer's audio files
│   │   └── cn/*.wav          # Control audio files
│   └── segmentation/
│       ├── ad/*.csv          # Alzheimer's segmentation
│       └── cn/*.csv          # Control segmentation
├── progression/train/
│   ├── audio/
│   │   ├── decline/*.wav     # Decline audio files
│   │   └── no_decline/*.wav  # No decline audio files
│   └── segmentation/
│       ├── decline/*.csv     # Decline segmentation
│       └── no_decline/*.csv  # No decline segmentation
└── progression/test-dist/
    ├── audio/*.wav           # Test audio files
    └── segmentation/*.csv    # Test segmentation
```

## Usage

### Command Line Interface

```bash
# Initialize configuration
python main.py init

# Run demo analysis (1 file per category)
python main.py demo

# Run sample analysis (2 files per category)  
python main.py run --sample --sample-size 2

# Run complete pipeline
python main.py run

# Run without parallel processing
python main.py run --no-parallel

# Check pipeline status
python main.py status

# Custom configuration (--config is a global option, so it goes before the subcommand)
python main.py --config my_config.json run
```

### Python API

```python
from pipeline_service import PipelineService

# Initialize pipeline
pipeline = PipelineService('config.json')

# Run complete analysis
results = pipeline.run_complete_pipeline(parallel=True)

# Run sample analysis  
results = pipeline.run_sample_analysis(max_files_per_category=2)

# Check status
status = pipeline.get_pipeline_status()
```

## Architecture

The pipeline follows a modular microservice architecture:

```
main.py                     # Entry point and CLI
├── pipeline_service.py     # Main orchestrator
├── config.py              # Configuration management
├── utils.py               # Utilities and helpers
├── data_manager_service.py          # Dataset loading
├── acoustic_features_service.py     # Audio feature extraction
├── transcription_service.py         # Speech-to-text
└── linguistic_features_service.py   # Text analysis
```

### Key Components

1. **PipelineService**: Main orchestrator that coordinates all services
2. **DataManagerService**: Handles dataset loading and file management
3. **AcousticFeaturesService**: Extracts audio features (eGeMAPS, MFCCs, etc.)
4. **TranscriptionService**: Converts speech to text using Whisper
5. **LinguisticFeaturesService**: Analyzes text features and BERT embeddings

## Output

The pipeline generates comprehensive outputs:

```
output/
├── acoustic_features.pkl        # All acoustic features
├── transcripts/
│   ├── all_transcripts.json    # All transcriptions
│   ├── transcripts.pkl         # Pickle format
│   └── *_transcript.txt        # Individual transcripts
├── linguistic_features.pkl     # Text analysis results
├── pipeline_summary.csv        # Processing summary
├── complete_results.pkl        # Combined results
└── adresso_pipeline.log        # Processing logs
```

### Feature Types

**Acoustic Features:**
- eGeMAPS (88 features)
- MFCCs (13 coefficients + deltas)
- Mel-spectrograms (80 bands)
- Wav2Vec2 embeddings (768 dimensions)
- Prosodic features (F0, energy, spectral)

**Linguistic Features:**
- Basic statistics (word count, sentence count)
- Lexical diversity measures
- BERT embeddings (768 dimensions)
- Language detection
- Segmentation analysis

## Performance

Typical processing times on recommended hardware:

- **Demo** (5 files): ~2-3 minutes
- **Sample** (20 files): ~5-10 minutes  
- **Complete dataset** (500+ files): ~2-4 hours

Memory usage:
- Base: ~2-4 GB
- With parallel processing: ~8-12 GB
- Peak (large files): ~16-20 GB

## Troubleshooting

### Common Issues

**1. Import Errors**
```bash
# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

**2. Memory Issues**
- Reduce `max_workers` in config
- Disable parallel processing: `--no-parallel`
- Process in smaller batches

**3. Model Download Issues**
```bash
# Pre-download models
python -c "import whisper; whisper.load_model('base')"
python -c "from transformers import AutoModel; AutoModel.from_pretrained('facebook/wav2vec2-base-960h')"
```

**4. Path Issues**
- Use absolute paths in config.json
- Check file permissions
- Verify dataset structure

### Performance Optimization

**For Limited RAM:**
```json
{
  "processing": {
    "max_workers": 4,
    "transcription_workers": 1,
    "enable_parallel": false
  }
}
```

**For High Performance:**
```json
{
  "processing": {
    "max_workers": 10,
    "transcription_workers": 4,
    "batch_size": 20,
    "enable_parallel": true
  }
}
```

## Development

### Adding New Features

1. Create a new `*_service.py` module alongside the existing service modules (see Architecture)
2. Add configuration options
3. Update `pipeline_service.py`
4. Add tests and documentation

### Testing

```bash
# Run demo for testing
python main.py demo

# Run with debug output (--debug is a global option, so it goes before the subcommand)
python main.py --debug run

# Test specific components
python -c "from acoustic_features_service import AcousticFeaturesService; service = AcousticFeaturesService()"
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make changes with tests
4. Submit a pull request

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Citation

If you use this pipeline in your research, please cite:

```bibtex
@software{adresso21_pipeline,
  title={ADReSSo21 Speech Analysis Pipeline},
  author={Your Name},
  year={2024},
  url={https://github.com/your-repo/adresso21-pipeline}
}
```

## Acknowledgments

- ADReSSo21 Dataset creators
- OpenAI Whisper team
- Hugging Face Transformers
- OpenSMILE developers

## Support

For support and questions:
- Check the troubleshooting section
- Review logs in `output/adresso_pipeline.log`
- Open an issue on GitHub
- Contact: your.email@domain.com