<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Jul16_Speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install librosa soundfile opensmile speechbrain transformers torch openai-whisper
!pip install torch-geometric

Collecting opensmile
  Downloading opensmile-2.5.1-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.12-py3-none-any.whl.metadata (2.7 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.3.1-py3-none-any.whl.metadata (4.3 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-

In [5]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

# Audio processing
import librosa
import opensmile
import whisper

# Deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Transformers
from transformers import (
    Wav2Vec2Processor, Wav2Vec2Model,
    BertTokenizer, BertModel,
    ViTModel, ViTFeatureExtractor
)

# Graph networks
import torch_geometric
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GATConv, global_mean_pool

# ML utilities
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Visualization
import networkx as nx
from tqdm import tqdm

class ADReSSoAnalyzer:
    """Complete ADReSSo analysis pipeline with error handling and checkpoints"""

    def __init__(self, base_path="/content/drive/MyDrive/Voice/extracted/ADReSSo21"):
        self.base_path = base_path
        self.output_path = "/content/drive/MyDrive/ADReSSo_Results"
        self.checkpoint_path = f"{self.output_path}/checkpoints"

        # Create output directories
        os.makedirs(self.output_path, exist_ok=True)
        os.makedirs(self.checkpoint_path, exist_ok=True)
        os.makedirs(f"{self.output_path}/visualizations", exist_ok=True)

        # Initialize containers
        self.audio_files = {}
        self.features = {}
        self.transcripts = {}
        self.linguistic_features = {}

        # Initialize models
        self.initialize_models()

    def initialize_models(self):
        """Initialize all required models"""
        print("Initializing models...")

        try:
            # Initialize openSMILE
            self.smile = opensmile.Smile(
                feature_set=opensmile.FeatureSet.eGeMAPSv02,
                feature_level=opensmile.FeatureLevel.Functionals,
            )
            print("✓ OpenSMILE initialized")
        except Exception as e:
            print(f"⚠ OpenSMILE initialization failed: {e}")
            self.smile = None

        try:
            # Initialize Whisper
            self.whisper_model = whisper.load_model("base")
            print("✓ Whisper model loaded")
        except Exception as e:
            print(f"⚠ Whisper initialization failed: {e}")
            self.whisper_model = None

        try:
            # Initialize Wav2Vec2
            self.wav2vec_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
            self.wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
            print("✓ Wav2Vec2 models loaded")
        except Exception as e:
            print(f"⚠ Wav2Vec2 initialization failed: {e}")
            self.wav2vec_processor = None
            self.wav2vec_model = None

        try:
            # Initialize BERT
            self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.bert_model = BertModel.from_pretrained('bert-base-uncased')
            print("✓ BERT models loaded")
        except Exception as e:
            print(f"⚠ BERT initialization failed: {e}")
            self.bert_tokenizer = None
            self.bert_model = None

    def save_checkpoint(self, data: Any, filename: str, step: str):
        """Save checkpoint data"""
        filepath = f"{self.checkpoint_path}/{filename}"

        try:
            if filename.endswith('.pkl'):
                with open(filepath, 'wb') as f:
                    pickle.dump(data, f)
            elif filename.endswith('.json'):
                with open(filepath, 'w') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
            elif filename.endswith('.csv'):
                if isinstance(data, pd.DataFrame):
                    data.to_csv(filepath, index=False)
                else:
                    pd.DataFrame(data).to_csv(filepath, index=False)

            print(f"✓ Checkpoint saved: {filename}")
            return True
        except Exception as e:
            print(f"⚠ Failed to save checkpoint {filename}: {e}")
            return False

    def load_checkpoint(self, filename: str):
        """Load checkpoint data"""
        filepath = f"{self.checkpoint_path}/{filename}"

        if not os.path.exists(filepath):
            return None

        try:
            if filename.endswith('.pkl'):
                with open(filepath, 'rb') as f:
                    return pickle.load(f)
            elif filename.endswith('.json'):
                with open(filepath, 'r') as f:
                    return json.load(f)
            elif filename.endswith('.csv'):
                return pd.read_csv(filepath)
        except Exception as e:
            print(f"⚠ Failed to load checkpoint {filename}: {e}")
            return None

    def step_1_get_audio_files(self) -> Dict[str, List[str]]:
        """Step 1: Get all audio files from the dataset"""
        print("\n" + "="*60)
        print("STEP 1: GETTING AUDIO FILES")
        print("="*60)

        # Check if checkpoint exists
        checkpoint_file = "step1_audio_files.json"
        audio_files = self.load_checkpoint(checkpoint_file)

        if audio_files is not None:
            print("✓ Loaded audio files from checkpoint")
            self.audio_files = audio_files
            return audio_files

        audio_files = {
            'diagnosis_ad': [],
            'diagnosis_cn': [],
            'progression_decline': [],
            'progression_no_decline': [],
            'progression_test': []
        }

        # Define paths
        paths = {
            'diagnosis_ad': f"{self.base_path}/diagnosis/train/audio/ad",
            'diagnosis_cn': f"{self.base_path}/diagnosis/train/audio/cn",
            'progression_decline': f"{self.base_path}/progression/train/audio/decline",
            'progression_no_decline': f"{self.base_path}/progression/train/audio/no_decline",
            'progression_test': f"{self.base_path}/progression/test-dist/audio"
        }

        # Collect files
        for category, path in paths.items():
            if os.path.exists(path):
                files = [f"{path}/{f}" for f in os.listdir(path) if f.endswith('.wav')]
                audio_files[category] = files
                print(f"✓ Found {len(files)} files in {category}")
            else:
                print(f"⚠ Path not found: {path}")

        total_files = sum(len(files) for files in audio_files.values())
        print(f"\nTotal audio files found: {total_files}")

        # Save checkpoint
        self.save_checkpoint(audio_files, checkpoint_file, "step1")
        self.audio_files = audio_files

        # Visualize file distribution
        self.visualize_file_distribution(audio_files)

        return audio_files

    def visualize_file_distribution(self, audio_files: Dict[str, List[str]]):
        """Plot, save and show a bar chart of the file count per category.

        Args:
            audio_files: Mapping of category name to its list of file paths.
        """
        labels = list(audio_files)
        sizes = [len(paths) for paths in audio_files.values()]
        palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']

        plt.figure(figsize=(12, 6))
        rects = plt.bar(labels, sizes, color=palette)
        plt.title('Audio File Distribution by Category', fontsize=16, fontweight='bold')
        plt.xlabel('Category', fontsize=12)
        plt.ylabel('Number of Files', fontsize=12)
        plt.xticks(rotation=45, ha='right')

        # Write each count just above the top of its bar
        for rect, size in zip(rects, sizes):
            x_mid = rect.get_x() + rect.get_width()/2
            y_top = rect.get_height() + 0.5
            plt.text(x_mid, y_top, str(size), ha='center', va='bottom', fontweight='bold')

        target = f"{self.output_path}/visualizations/file_distribution.png"
        plt.tight_layout()
        plt.savefig(target, dpi=300, bbox_inches='tight')
        plt.show()

    def step_2_extract_acoustic_features(self, limit_per_category: int = None):
        """Step 2: Extract acoustic features from audio files"""
        print("\n" + "="*60)
        print("STEP 2: EXTRACTING ACOUSTIC FEATURES")
        print("="*60)

        # Check if checkpoint exists
        checkpoint_file = "step2_acoustic_features.pkl"
        features = self.load_checkpoint(checkpoint_file)

        if features is not None:
            print("✓ Loaded acoustic features from checkpoint")
            self.features = features
            return features

        features = {}

        for category, files in self.audio_files.items():
            if not files:
                continue

            print(f"\nProcessing {category}...")

            # Limit files if specified
            if limit_per_category:
                files = files[:limit_per_category]

            for file_path in tqdm(files, desc=f"Extracting features for {category}"):
                try:
                    filename = os.path.basename(file_path)
                    file_key = f"{category}_{filename}"

                    # Extract features
                    file_features = self.extract_acoustic_features_from_file(file_path)

                    if file_features is not None:
                        features[file_key] = {
                            'file_path': file_path,
                            'category': category,
                            'filename': filename,
                            **file_features
                        }

                except Exception as e:
                    print(f"⚠ Error processing {filename}: {e}")
                    continue

        print(f"\n✓ Extracted features from {len(features)} files")

        # Save checkpoint
        self.save_checkpoint(features, checkpoint_file, "step2")
        self.features = features

        # Visualize features
        self.visualize_acoustic_features(features)

        return features

    def extract_acoustic_features_from_file(self, audio_path: str) -> Dict[str, Any]:
        """Extract acoustic features from a single audio file"""
        features = {}

        try:
            # Load audio
            y, sr = librosa.load(audio_path, sr=16000)

            if len(y) == 0:
                return None

            # 1. eGeMAPS features
            if self.smile is not None:
                try:
                    egemaps = self.smile.process_file(audio_path).values.flatten()
                    features['egemaps'] = egemaps
                except Exception as e:
                    features['egemaps'] = np.zeros(88)
            else:
                features['egemaps'] = np.zeros(88)

            # 2. MFCC features
            try:
                mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                features['mfccs'] = {
                    'mean': np.mean(mfccs, axis=1),
                    'std': np.std(mfccs, axis=1),
                    'delta': np.mean(librosa.feature.delta(mfccs), axis=1),
                    'delta2': np.mean(librosa.feature.delta(mfccs, order=2), axis=1)
                }
            except Exception as e:
                features['mfccs'] = {
                    'mean': np.zeros(13), 'std': np.zeros(13),
                    'delta': np.zeros(13), 'delta2': np.zeros(13)
                }

            # 3. Mel spectrogram
            try:
                mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
                log_mel = librosa.power_to_db(mel_spec)
                features['log_mel'] = {
                    'mean': np.mean(log_mel, axis=1),
                    'std': np.std(log_mel, axis=1)
                }
            except Exception as e:
                features['log_mel'] = {
                    'mean': np.zeros(80), 'std': np.zeros(80)
                }

            # 4. Wav2Vec2 features
            if self.wav2vec_processor is not None and self.wav2vec_model is not None:
                try:
                    input_values = self.wav2vec_processor(
                        y, sampling_rate=16000, return_tensors="pt"
                    ).input_values

                    with torch.no_grad():
                        wav2vec_features = self.wav2vec_model(input_values).last_hidden_state
                    features['wav2vec2'] = torch.mean(wav2vec_features, dim=1).squeeze().numpy()
                except Exception as e:
                    features['wav2vec2'] = np.zeros(768)
            else:
                features['wav2vec2'] = np.zeros(768)

            # 5. Prosodic features
            try:
                f0 = librosa.yin(y, fmin=50, fmax=300, sr=sr)
                f0_clean = f0[f0 > 0]

                features['prosodic'] = {
                    'f0_mean': np.mean(f0_clean) if len(f0_clean) > 0 else 0.0,
                    'f0_std': np.std(f0_clean) if len(f0_clean) > 0 else 0.0,
                    'energy_mean': np.mean(librosa.feature.rms(y=y)),
                    'energy_std': np.std(librosa.feature.rms(y=y)),
                    'zero_crossing_rate': np.mean(librosa.feature.zero_crossing_rate(y)),
                    'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
                    'spectral_rolloff': np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)),
                    'duration': len(y) / sr
                }
            except Exception as e:
                features['prosodic'] = {
                    'f0_mean': 0.0, 'f0_std': 0.0, 'energy_mean': 0.0, 'energy_std': 0.0,
                    'zero_crossing_rate': 0.0, 'spectral_centroid': 0.0, 'spectral_rolloff': 0.0,
                    'duration': 0.0
                }

        except Exception as e:
            print(f"Error processing audio file: {e}")
            return None

        return features

    def visualize_acoustic_features(self, features: Dict[str, Any]):
        """Visualize acoustic features"""
        if not features:
            return

        # Sample file for visualization
        sample_key = list(features.keys())[0]
        sample_features = features[sample_key]

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(f'Acoustic Features Visualization - {sample_key}', fontsize=16, fontweight='bold')

        # eGeMAPS
        axes[0, 0].plot(sample_features['egemaps'][:20])
        axes[0, 0].set_title('eGeMAPS Features (first 20)')
        axes[0, 0].set_xlabel('Feature Index')
        axes[0, 0].set_ylabel('Value')

        # MFCC
        mfcc_mean = sample_features['mfccs']['mean']
        axes[0, 1].plot(mfcc_mean, marker='o')
        axes[0, 1].set_title('MFCC Mean')
        axes[0, 1].set_xlabel('MFCC Coefficient')
        axes[0, 1].set_ylabel('Value')

        # Mel spectrogram
        mel_mean = sample_features['log_mel']['mean']
        axes[0, 2].plot(mel_mean)
        axes[0, 2].set_title('Log-Mel Spectrogram Mean')
        axes[0, 2].set_xlabel('Mel Bin')
        axes[0, 2].set_ylabel('Value')

        # Wav2Vec2
        axes[1, 0].plot(sample_features['wav2vec2'][:50])
        axes[1, 0].set_title('Wav2Vec2 Features (first 50)')
        axes[1, 0].set_xlabel('Feature Index')
        axes[1, 0].set_ylabel('Value')

        # Prosodic features
        prosodic = sample_features['prosodic']
        prosodic_names = list(prosodic.keys())
        prosodic_values = list(prosodic.values())

        axes[1, 1].bar(prosodic_names, prosodic_values)
        axes[1, 1].set_title('Prosodic Features')
        axes[1, 1].set_ylabel('Value')
        axes[1, 1].tick_params(axis='x', rotation=45)

        # Feature distribution by category
        categories = {}
        for key, feature_data in features.items():
            category = feature_data['category']
            if category not in categories:
                categories[category] = []
            categories[category].append(feature_data['prosodic']['duration'])

        for category, durations in categories.items():
            axes[1, 2].hist(durations, alpha=0.7, label=category, bins=20)

        axes[1, 2].set_title('Duration Distribution by Category')
        axes[1, 2].set_xlabel('Duration (seconds)')
        axes[1, 2].set_ylabel('Frequency')
        axes[1, 2].legend()

        plt.tight_layout()
        plt.savefig(f"{self.output_path}/visualizations/acoustic_features.png", dpi=300, bbox_inches='tight')
        plt.show()

    def step_3_extract_transcripts(self, limit_per_category: int = None):
        """Step 3: Extract transcripts using Whisper"""
        print("\n" + "="*60)
        print("STEP 3: EXTRACTING TRANSCRIPTS")
        print("="*60)

        # Check if checkpoint exists
        checkpoint_file = "step3_transcripts.json"
        transcripts = self.load_checkpoint(checkpoint_file)

        if transcripts is not None:
            print("✓ Loaded transcripts from checkpoint")
            self.transcripts = transcripts
            return transcripts

        if self.whisper_model is None:
            print("⚠ Whisper model not available, skipping transcript extraction")
            return {}

        transcripts = {}

        for category, files in self.audio_files.items():
            if not files:
                continue

            print(f"\nProcessing {category}...")

            # Limit files if specified
            if limit_per_category:
                files = files[:limit_per_category]

            for file_path in tqdm(files, desc=f"Transcribing {category}"):
                try:
                    filename = os.path.basename(file_path)
                    file_key = f"{category}_{filename}"

                    # Transcribe
                    result = self.whisper_model.transcribe(file_path)

                    transcripts[file_key] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        'transcript': result["text"].strip(),
                        'language': result.get('language', 'en'),
                        'segments': len(result.get('segments', []))
                    }

                except Exception as e:
                    print(f"⚠ Error transcribing {filename}: {e}")
                    transcripts[file_key] = {
                        'file_path': file_path,
                        'category': category,
                        'filename': filename,
                        'transcript': "",
                        'error': str(e)
                    }

        print(f"\n✓ Extracted transcripts from {len(transcripts)} files")

        # Save checkpoint
        self.save_checkpoint(transcripts, checkpoint_file, "step3")
        self.transcripts = transcripts

        # Visualize transcripts
        self.visualize_transcripts(transcripts)

        return transcripts

    def visualize_transcripts(self, transcripts: Dict[str, Any]):
        """Visualize transcript statistics"""
        if not transcripts:
            return

        # Prepare data
        data = []
        for key, info in transcripts.items():
            transcript = info.get('transcript', '')
            data.append({
                'category': info['category'],
                'word_count': len(transcript.split()) if transcript else 0,
                'char_count': len(transcript),
                'has_error': 'error' in info
            })

        df = pd.DataFrame(data)

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Transcript Analysis', fontsize=16, fontweight='bold')

        # Word count distribution
        df.boxplot(column='word_count', by='category', ax=axes[0, 0])
        axes[0, 0].set_title('Word Count Distribution by Category')
        axes[0, 0].set_ylabel('Word Count')

        # Character count distribution
        df.boxplot(column='char_count', by='category', ax=axes[0, 1])
        axes[0, 1].set_title('Character Count Distribution by Category')
        axes[0, 1].set_ylabel('Character Count')

        # Error rate by category
        error_rate = df.groupby('category')['has_error'].mean()
        axes[1, 0].bar(error_rate.index, error_rate.values)
        axes[1, 0].set_title('Error Rate by Category')
        axes[1, 0].set_ylabel('Error Rate')
        axes[1, 0].tick_params(axis='x', rotation=45)

        # Average metrics by category
        avg_metrics = df.groupby('category')[['word_count', 'char_count']].mean()
        avg_metrics.plot(kind='bar', ax=axes[1, 1])
        axes[1, 1].set_title('Average Metrics by Category')
        axes[1, 1].set_ylabel('Count')
        axes[1, 1].tick_params(axis='x', rotation=45)
        axes[1, 1].legend()

        plt.tight_layout()
        plt.savefig(f"{self.output_path}/visualizations/transcript_analysis.png", dpi=300, bbox_inches='tight')
        plt.show()

    def step_4_extract_linguistic_features(self):
        """Step 4: Extract linguistic features for BERT"""
        print("\n" + "="*60)
        print("STEP 4: EXTRACTING LINGUISTIC FEATURES")
        print("="*60)

        # Check if checkpoint exists
        checkpoint_file = "step4_linguistic_features.pkl"
        linguistic_features = self.load_checkpoint(checkpoint_file)

        if linguistic_features is not None:
            print("✓ Loaded linguistic features from checkpoint")
            self.linguistic_features = linguistic_features
            return linguistic_features

        if self.bert_tokenizer is None:
            print("⚠ BERT tokenizer not available, skipping linguistic feature extraction")
            return {}

        linguistic_features = {}

        print("Processing transcripts for linguistic features...")

        for key, data in tqdm(self.transcripts.items(), desc="Extracting linguistic features"):
            transcript = data.get('transcript', '')

            if not transcript:
                linguistic_features[key] = self.create_empty_linguistic_features()
                continue

            try:
                # Basic linguistic features
                words = transcript.split()
                sentences = [s.strip() for s in transcript.split('.') if s.strip()]

                # BERT tokenization
                bert_encoding = self.bert_tokenizer(
                    transcript,
                    truncation=True,
                    padding='max_length',
                    max_length=512,
                    return_tensors='pt'
                )

                linguistic_features[key] = {
                    'raw_text': transcript,
                    'word_count': len(words),
                    'sentence_count': len(sentences),
                    'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
                    'unique_words': len(set(words)),
                    'lexical_diversity': len(set(words)) / len(words) if words else 0,
                    'bert_input_ids': bert_encoding['input_ids'].squeeze().tolist(),
                    'bert_attention_mask': bert_encoding['attention_mask'].squeeze().tolist(),
                    'category': data['category']
                }

            except Exception as e:
                print(f"⚠ Error processing {key}: {e}")
                linguistic_features[key] = self.create_empty_linguistic_features()

        print(f"\n✓ Extracted linguistic features from {len(linguistic_features)} files")

        # Save checkpoint
        self.save_checkpoint(linguistic_features, checkpoint_file, "step4")
        self.linguistic_features = linguistic_features

        # Visualize linguistic features
        self.visualize_linguistic_features(linguistic_features)

        return linguistic_features

    def create_empty_linguistic_features(self):
        """Return a fresh placeholder record for a transcript without features.

        All counts are zero, the text is empty, both BERT sequences are 512
        zeros and the category is ``'unknown'``.
        """
        sequence_length = 512

        placeholder = {'raw_text': ''}
        for counter in ('word_count', 'sentence_count', 'avg_word_length',
                        'unique_words', 'lexical_diversity'):
            placeholder[counter] = 0

        # Two separate lists so callers can mutate one without the other
        placeholder['bert_input_ids'] = [0] * sequence_length
        placeholder['bert_attention_mask'] = [0] * sequence_length
        placeholder['category'] = 'unknown'
        return placeholder

    def visualize_linguistic_features(self, linguistic_features: Dict[str, Any]):
        """Visualize linguistic features"""
        if not linguistic_features:
            return

        # Prepare data
        data = []
        for key, features in linguistic_features.items():
            data.append({
                'key': key,
                'category': features['category'],
                'word_count': features['word_count'],
                'sentence_count': features['sentence_count'],
                'avg_word_length': features['avg_word_length'],
                'unique_words': features['unique_words'],
                'lexical_diversity': features['lexical_diversity']
            })

        df = pd.DataFrame(data)

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Linguistic Features Analysis', fontsize=16, fontweight='bold')

        # Metrics by category
        metrics = ['word_count', 'sentence_count', 'avg_word_length', 'unique_words', 'lexical_diversity']

        for i, metric in enumerate(metrics):
            row = i // 3
            col = i % 3

            if row < 2 and col < 3:
                df.boxplot(column=metric, by='category', ax=axes[row, col])
                axes[row, col].set_title(f'{metric.replace("_", " ").title()} by Category')
                axes[row, col].set_ylabel(metric.replace("_", " ").title())

        # Correlation heatmap
        numeric_df = df.select_dtypes(include=[np.number])
        correlation_matrix = numeric_df.corr()

        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[1, 2])
        axes[1, 2].set_title('Feature Correlation Matrix')

        plt.tight_layout()
        plt.savefig(f"{self.output_path}/visualizations/linguistic_features.png", dpi=300, bbox_inches='tight')
        plt.show()

    def run_complete_pipeline(self, limit_per_category: int = None):
        """Run steps 1-4 in order, then produce the final summary.

        Args:
            limit_per_category: Forwarded to steps 2 and 3 to cap the number
                of files processed per category.

        Returns:
            Dict with the return value of each step under ``audio_files``,
            ``acoustic_features``, ``transcripts`` and
            ``linguistic_features``.
        """
        rule = "="*80
        print(rule)
        print("ADRESSO21 COMPLETE ANALYSIS PIPELINE")
        print(rule)

        # Dict values are evaluated top to bottom, so the steps run in order
        results = {
            'audio_files': self.step_1_get_audio_files(),
            'acoustic_features': self.step_2_extract_acoustic_features(limit_per_category),
            'transcripts': self.step_3_extract_transcripts(limit_per_category),
            'linguistic_features': self.step_4_extract_linguistic_features(),
        }

        # Generate final summary
        self.generate_final_summary(results)

        print("\n" + rule)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print(rule)
        print(f"Results saved to: {self.output_path}")
        print(f"Checkpoints saved to: {self.checkpoint_path}")
        print(f"Visualizations saved to: {self.output_path}/visualizations")

        return results
