<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Extractor_Jul10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Speech Dataset Analysis Pipeline for ADResSO21
# This pipeline processes audio files, generates transcripts, extracts acoustic features,
# and analyzes semantic connections between audio and text

import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import speech_recognition as sr
import opensmile
import warnings
warnings.filterwarnings('ignore')

# Attach Google Drive to the Colab VM; the dataset is read from MyDrive
from google.colab import drive
drive.mount('/content/drive')

# Dataset root and the two output locations used by the analysis below
BASE_PATH = '/content/drive/MyDrive/Speech'
FEATURES_SAVE_PATH = '/content/drive/MyDrive/Speech/extracted_features'
TRANSCRIPTS_SAVE_PATH = '/content/drive/MyDrive/Speech/transcripts'

# Both output directories must exist before anything is written into them
for _output_dir in (FEATURES_SAVE_PATH, TRANSCRIPTS_SAVE_PATH):
    os.makedirs(_output_dir, exist_ok=True)

class SpeechAnalyzer:
    def __init__(self, base_path):
        """Create the analyzer for a dataset rooted at ``base_path``.

        Builds the speech recognizer used by ``transcribe_audio`` and the
        openSMILE extractor (eGeMAPSv02, Functionals level) used by
        ``extract_egemaps_features``.
        """
        smile_extractor = opensmile.Smile(
            feature_level=opensmile.FeatureLevel.Functionals,
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
        )

        self.base_path = base_path
        self.recognizer = sr.Recognizer()
        self.smile = smile_extractor

    def load_dataset_structure(self):
        """Load and organize the dataset structure"""
        dataset_info = {
            'diagnosis_train': {
                'audio': {
                    'cn': f'{self.base_path}/extracted-diagnosis-train/audio/cn',
                    'ad': f'{self.base_path}/extracted-diagnosis-train/audio/ad'
                },
                'segmentation': {
                    'cn': f'{self.base_path}/extracted-diagnosis-train/segmentation/cn',
                    'ad': f'{self.base_path}/extracted-diagnosis-train/segmentation/ad'
                }
            },
            'progression_train': {
                'audio': {
                    'decline': f'{self.base_path}/extracted-progression-train/audio/decline',
                    'no-decline': f'{self.base_path}/extracted-progression-train/audio/no-decline'
                },
                'segmentation': {
                    'decline': f'{self.base_path}/extracted-progression-train/segmentation/decline',
                    'no-decline': f'{self.base_path}/extracted-progression-train/segmentation/no-decline'
                }
            },
            'progression_test': {
                'audio': f'{self.base_path}/extracted-progression-test/audio',
                'segmentation': f'{self.base_path}/extracted-progression-test/segmentation'
            }
        }
        return dataset_info

    def get_audio_files(self, directory):
        """Get all audio files from a directory"""
        if not os.path.exists(directory):
            print(f"Directory not found: {directory}")
            return []

        audio_extensions = ['.wav', '.mp3', '.m4a', '.flac', '.aac']
        audio_files = []

        for file in os.listdir(directory):
            if any(file.lower().endswith(ext) for ext in audio_extensions):
                audio_files.append(os.path.join(directory, file))

        return audio_files

    def transcribe_audio(self, audio_file_path):
        """Transcribe audio file to text using speech recognition"""
        try:
            with sr.AudioFile(audio_file_path) as source:
                # Adjust for ambient noise
                self.recognizer.adjust_for_ambient_noise(source)
                audio = self.recognizer.record(source)

            # Try different recognition engines
            try:
                # Use Google's free service
                transcript = self.recognizer.recognize_google(audio)
                return transcript
            except sr.UnknownValueError:
                # Try with alternative service
                try:
                    transcript = self.recognizer.recognize_sphinx(audio)
                    return transcript
                except:
                    return "Unable to transcribe"
            except sr.RequestError as e:
                return f"Error with recognition service: {e}"

        except Exception as e:
            return f"Error processing audio: {e}"

    def extract_egemaps_features(self, audio_file_path):
        """Extract eGeMAPS features using OpenSMILE"""
        try:
            features = self.smile.process_file(audio_file_path)
            return features.values.flatten()
        except Exception as e:
            print(f"Error extracting eGeMAPS features from {audio_file_path}: {e}")
            return None

    def extract_log_mel_spectrogram(self, audio_file_path, n_mels=128, n_fft=2048, hop_length=512):
        """Extract log-mel spectrogram features"""
        try:
            y, sr = librosa.load(audio_file_path, sr=None)

            # Extract mel spectrogram
            mel_spec = librosa.feature.melspectrogram(
                y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
            )

            # Convert to log scale
            log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

            # Compute statistics across time dimension
            features = {
                'mean': np.mean(log_mel_spec, axis=1),
                'std': np.std(log_mel_spec, axis=1),
                'max': np.max(log_mel_spec, axis=1),
                'min': np.min(log_mel_spec, axis=1)
            }

            # Flatten all features
            feature_vector = np.concatenate([features[key] for key in features.keys()])
            return feature_vector

        except Exception as e:
            print(f"Error extracting log-mel features from {audio_file_path}: {e}")
            return None

    def extract_spectral_features(self, audio_file_path):
        """Extract additional spectral features"""
        try:
            y, sr = librosa.load(audio_file_path, sr=None)

            # Extract various spectral features
            features = {}

            # Spectral centroid
            features['spectral_centroid'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

            # Spectral rolloff
            features['spectral_rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))

            # Spectral bandwidth
            features['spectral_bandwidth'] = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))

            # Zero crossing rate
            features['zero_crossing_rate'] = np.mean(librosa.feature.zero_crossing_rate(y))

            # MFCC features
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            for i in range(13):
                features[f'mfcc_{i}'] = np.mean(mfccs[i])

            # Chroma features
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            features['chroma_mean'] = np.mean(chroma)
            features['chroma_std'] = np.std(chroma)

            # Tempo
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            features['tempo'] = tempo

            return np.array(list(features.values()))

        except Exception as e:
            print(f"Error extracting spectral features from {audio_file_path}: {e}")
            return None

    def process_dataset(self):
        """Process the entire dataset"""
        dataset_info = self.load_dataset_structure()
        results = {
            'files': [],
            'labels': [],
            'categories': [],
            'transcripts': [],
            'egemaps_features': [],
            'logmel_features': [],
            'spectral_features': []
        }

        print("Processing dataset...")

        # Process diagnosis training data
        for label in ['cn', 'ad']:
            audio_dir = dataset_info['diagnosis_train']['audio'][label]
            audio_files = self.get_audio_files(audio_dir)

            print(f"Processing {label} diagnosis files: {len(audio_files)} files")

            for audio_file in audio_files:
                print(f"Processing: {os.path.basename(audio_file)}")

                # Transcribe audio
                transcript = self.transcribe_audio(audio_file)

                # Extract features
                egemaps_feat = self.extract_egemaps_features(audio_file)
                logmel_feat = self.extract_log_mel_spectrogram(audio_file)
                spectral_feat = self.extract_spectral_features(audio_file)

                # Store results
                results['files'].append(audio_file)
                results['labels'].append(label)
                results['categories'].append('diagnosis')
                results['transcripts'].append(transcript)
                results['egemaps_features'].append(egemaps_feat)
                results['logmel_features'].append(logmel_feat)
                results['spectral_features'].append(spectral_feat)

        # Process progression training data
        for label in ['decline', 'no-decline']:
            audio_dir = dataset_info['progression_train']['audio'][label]
            audio_files = self.get_audio_files(audio_dir)

            print(f"Processing {label} progression files: {len(audio_files)} files")

            for audio_file in audio_files:
                print(f"Processing: {os.path.basename(audio_file)}")

                # Transcribe audio
                transcript = self.transcribe_audio(audio_file)

                # Extract features
                egemaps_feat = self.extract_egemaps_features(audio_file)
                logmel_feat = self.extract_log_mel_spectrogram(audio_file)
                spectral_feat = self.extract_spectral_features(audio_file)

                # Store results
                results['files'].append(audio_file)
                results['labels'].append(label)
                results['categories'].append('progression')
                results['transcripts'].append(transcript)
                results['egemaps_features'].append(egemaps_feat)
                results['logmel_features'].append(logmel_feat)
                results['spectral_features'].append(spectral_feat)

        return results

    def save_results(self, results):
        """Save extracted features and transcripts"""
        # Save transcripts
        transcripts_df = pd.DataFrame({
            'file': results['files'],
            'label': results['labels'],
            'category': results['categories'],
            'transcript': results['transcripts']
        })
        transcripts_df.to_csv(f'{TRANSCRIPTS_SAVE_PATH}/transcripts.csv', index=False)

        # Save features
        # Filter out None values
        valid_indices = [i for i, feat in enumerate(results['egemaps_features']) if feat is not None]

        if valid_indices:
            # eGeMAPS features
            egemaps_array = np.array([results['egemaps_features'][i] for i in valid_indices])
            np.save(f'{FEATURES_SAVE_PATH}/egemaps_features.npy', egemaps_array)

            # Log-mel features
            logmel_valid = [i for i in valid_indices if results['logmel_features'][i] is not None]
            if logmel_valid:
                logmel_array = np.array([results['logmel_features'][i] for i in logmel_valid])
                np.save(f'{FEATURES_SAVE_PATH}/logmel_features.npy', logmel_array)

            # Spectral features
            spectral_valid = [i for i in valid_indices if results['spectral_features'][i] is not None]
            if spectral_valid:
                spectral_array = np.array([results['spectral_features'][i] for i in spectral_valid])
                np.save(f'{FEATURES_SAVE_PATH}/spectral_features.npy', spectral_array)

            # Save metadata
            metadata_df = pd.DataFrame({
                'file': [results['files'][i] for i in valid_indices],
                'label': [results['labels'][i] for i in valid_indices],
                'category': [results['categories'][i] for i in valid_indices]
            })
            metadata_df.to_csv(f'{FEATURES_SAVE_PATH}/metadata.csv', index=False)

        print(f"Results saved to {FEATURES_SAVE_PATH} and {TRANSCRIPTS_SAVE_PATH}")

    def visualize_semantic_connections(self, results):
        """Plot a 2x3 overview relating transcripts to acoustic features.

        Panels: (1) transcript length by label, (2) correlation heatmap of the
        spectral features, (3) PCA of the spectral features coloured by label,
        (4) ten most common transcript words, (5) word-overlap (Jaccard)
        similarity between label groups, (6) first spectral feature against
        word count. The figure is saved as ``semantic_analysis.png`` under
        ``FEATURES_SAVE_PATH`` and shown.

        Args:
            results: The dict of parallel lists returned by ``process_dataset``.

        Returns:
            The matplotlib figure.
        """
        from collections import Counter, defaultdict

        all_labels = results['labels']
        all_transcripts = results['transcripts']

        # Create visualizations
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # 1. Transcript length distribution by label
        transcript_lengths = [len(t.split()) if isinstance(t, str) else 0 for t in all_transcripts]
        transcript_df = pd.DataFrame({
            'length': transcript_lengths,
            'label': all_labels,
            'category': results['categories']
        })

        sns.boxplot(data=transcript_df, x='label', y='length', ax=axes[0, 0])
        axes[0, 0].set_title('Transcript Length Distribution by Label')
        axes[0, 0].set_ylabel('Number of Words')

        # Keep every usable spectral vector paired with its label so the two
        # can never drift apart in the panels below
        spectral_rows = [
            (label, feat)
            for label, feat in zip(all_labels, results['spectral_features'])
            if feat is not None
        ]

        # 2. Feature correlation heatmap (using spectral features)
        if spectral_rows:
            spectral_array = np.array([feat for _, feat in spectral_rows])
            correlation_matrix = np.corrcoef(spectral_array.T)

            sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', ax=axes[0, 1])
            axes[0, 1].set_title('Spectral Features Correlation')

        # 3. PCA visualization of acoustic features
        # A 2-component PCA needs at least two samples; with fewer the panel
        # is left empty instead of raising.
        if len(spectral_rows) >= 2:
            spectral_scaled = StandardScaler().fit_transform(spectral_array)

            pca = PCA(n_components=2)
            pca_result = pca.fit_transform(spectral_scaled)

            # Colour by a small stable integer per label (first-seen order).
            # hash(label) is randomised per interpreter run for strings, so it
            # gave different colours on every run.
            code_for = {}
            colour_codes = [code_for.setdefault(str(label), len(code_for)) for label, _ in spectral_rows]

            axes[0, 2].scatter(pca_result[:, 0], pca_result[:, 1],
                               c=colour_codes,
                               alpha=0.6)
            axes[0, 2].set_title('PCA of Acoustic Features')
            axes[0, 2].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
            axes[0, 2].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')

        # 4. Text-based analysis
        # Word frequency analysis over every string transcript
        word_freq = Counter(
            word
            for transcript in all_transcripts
            if isinstance(transcript, str)
            for word in transcript.lower().split()
        )
        common_words = word_freq.most_common(10)

        if common_words:
            words, counts = zip(*common_words)
            axes[1, 0].bar(words, counts)
            axes[1, 0].set_title('Most Common Words in Transcripts')
            axes[1, 0].tick_params(axis='x', rotation=45)

        # 5. Semantic similarity analysis
        # Tokenise each transcript once, grouped by label, instead of
        # rebuilding the word sets for every pair of transcripts.
        token_sets_by_label = defaultdict(list)
        for label, transcript in zip(all_labels, all_transcripts):
            bucket = token_sets_by_label[label]  # registers the label in first-seen order
            if isinstance(transcript, str):
                bucket.append(set(transcript.lower().split()))

        def word_overlap(words1, words2):
            # Simple word overlap (Jaccard) similarity; 0 when both sets are empty
            union = words1 | words2
            return len(words1 & words2) / len(union) if union else 0

        # Compare every unordered pair of groups, including a group with itself
        group_names = list(token_sets_by_label)
        similarity_data = []
        for i, label1 in enumerate(group_names):
            for label2 in group_names[i:]:
                similarity_data.extend(
                    (label1, label2, word_overlap(words1, words2))
                    for words1 in token_sets_by_label[label1]
                    for words2 in token_sets_by_label[label2]
                )

        if similarity_data:
            sim_df = pd.DataFrame(similarity_data, columns=['Group1', 'Group2', 'Similarity'])
            sim_df['Pair'] = sim_df['Group1'] + ' vs ' + sim_df['Group2']

            sns.boxplot(data=sim_df, x='Pair', y='Similarity', ax=axes[1, 1])
            axes[1, 1].set_title('Text Similarity Between Groups')
            axes[1, 1].tick_params(axis='x', rotation=45)

        # 6. Audio-Text relationship
        # Pair simple text statistics with the first spectral feature.
        # NOTE(review): with vectors from extract_spectral_features, element 0
        # is the mean spectral centroid, used here as a stand-in for "energy".
        audio_text_correlations = []

        for i, (transcript, spectral_feat) in enumerate(zip(all_transcripts, results['spectral_features'])):
            if isinstance(transcript, str) and spectral_feat is not None:
                audio_text_correlations.append({
                    'word_count': len(transcript.split()),
                    'char_count': len(transcript),
                    'audio_energy': spectral_feat[0] if len(spectral_feat) > 0 else 0,
                    'label': all_labels[i]
                })

        if audio_text_correlations:
            corr_df = pd.DataFrame(audio_text_correlations)

            # Plot correlation between word count and audio energy
            sns.scatterplot(data=corr_df, x='word_count', y='audio_energy',
                          hue='label', ax=axes[1, 2])
            axes[1, 2].set_title('Audio Energy vs Word Count')
            axes[1, 2].set_xlabel('Word Count')
            axes[1, 2].set_ylabel('Audio Energy')

        plt.tight_layout()
        plt.savefig(f'{FEATURES_SAVE_PATH}/semantic_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        return fig

# Driver for the notebook cell: build the analyzer, then run the three pipeline
# stages in order. `analyzer` and `results` stay bound at module level.
print("Starting Speech Dataset Analysis...")
analyzer = SpeechAnalyzer(BASE_PATH)

# Transcribe every labelled training file and extract its acoustic features
results = analyzer.process_dataset()

# Write transcripts.csv, the feature .npy arrays and metadata.csv
analyzer.save_results(results)

# Draw, save and show the 2x3 semantic analysis figure
analyzer.visualize_semantic_connections(results)

print("Analysis complete! Check the saved files in your Drive.")

ValueError: mount failed

In [2]:
# Install required packages for speech analysis
# Run this cell first before running the main analysis
# (the lines starting with "!" are notebook shell commands, not Python)

# Python packages imported by the analysis code
!pip install librosa
!pip install SpeechRecognition
!pip install opensmile
!pip install pydub
!pip install textblob
!pip install wordcloud
!pip install plotly
!pip install scikit-learn
!pip install seaborn
!pip install matplotlib

# System packages for audio processing
# NOTE(review): portaudio19-dev is presumably needed to build pyaudio below,
# and flac presumably backs SpeechRecognition's audio conversion - confirm.
!apt-get update
!apt-get install -y portaudio19-dev
!apt-get install -y flac

# Additional speech processing libraries (pocketsphinx backs recognize_sphinx)
!pip install pyaudio
!pip install pocketsphinx

# NOTE: these messages are printed unconditionally; they do not verify that
# the install commands above actually succeeded.
print("All required packages installed successfully!")
print("You can now run the main analysis script.")

# Additional utility functions for enhanced analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

class EnhancedSpeechAnalyzer:
    """Enhanced analyzer with additional visualization and analysis capabilities"""

    def __init__(self, features_path, transcripts_path):
        self.features_path = features_path
        self.transcripts_path = transcripts_path

    def load_saved_data(self):
        """Load previously saved features and transcripts"""
        try:
            # Load transcripts
            transcripts_df = pd.read_csv(f'{self.transcripts_path}/transcripts.csv')

            # Load features
            egemaps_features = np.load(f'{self.features_path}/egemaps_features.npy')
            metadata_df = pd.read_csv(f'{self.features_path}/metadata.csv')

            try:
                logmel_features = np.load(f'{self.features_path}/logmel_features.npy')
            except:
                logmel_features = None

            try:
                spectral_features = np.load(f'{self.features_path}/spectral_features.npy')
            except:
                spectral_features = None

            return {
                'transcripts': transcripts_df,
                'egemaps_features': egemaps_features,
                'logmel_features': logmel_features,
                'spectral_features': spectral_features,
                'metadata': metadata_df
            }
        except Exception as e:
            print(f"Error loading saved data: {e}")
            return None

    def analyze_text_sentiment(self, transcripts_df):
        """Analyze sentiment of transcripts"""
        sentiments = []

        for transcript in transcripts_df['transcript']:
            if isinstance(transcript, str) and transcript != "Unable to transcribe":
                blob = TextBlob(transcript)
                sentiments.append({
                    'polarity': blob.sentiment.polarity,
                    'subjectivity': blob.sentiment.subjectivity
                })
            else:
                sentiments.append({
                    'polarity': 0,
                    'subjectivity': 0
                })

        sentiment_df = pd.DataFrame(sentiments)
        sentiment_df['label'] = transcripts_df['label']
        sentiment_df['category'] = transcripts_df['category']

        return sentiment_df

    def create_word_clouds(self, transcripts_df):
        """Draw one word cloud per label on a 2x2 grid, then save and show it.

        Only the first four distinct labels are drawn. A label whose usable
        transcripts (strings other than "Unable to transcribe") join to an
        empty text is left undrawn. The figure is written to
        ``word_clouds.png`` under ``self.features_path``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # The grid has four cells, so any further labels are ignored
        drawn_labels = transcripts_df['label'].unique()[:4]

        for slot, label in enumerate(drawn_labels):
            # Fill the grid row by row: slot 0..3 -> (0,0), (0,1), (1,0), (1,1)
            ax = axes[divmod(slot, 2)]

            # Join every usable transcript belonging to this label
            belongs_to_label = transcripts_df['label'] == label
            label_texts = transcripts_df.loc[belongs_to_label, 'transcript']
            combined_text = ' '.join(
                text for text in label_texts
                if isinstance(text, str) and text != "Unable to transcribe"
            )

            if not combined_text:
                continue

            cloud_builder = WordCloud(
                width=800,
                height=400,
                background_color='white',
                max_words=50,
                colormap='viridis',
            )
            wordcloud = cloud_builder.generate(combined_text)

            ax.imshow(wordcloud, interpolation='bilinear')
            ax.set_title(f'Word Cloud - {label}')
            ax.axis('off')

        plt.tight_layout()
        plt.savefig(f'{self.features_path}/word_clouds.png', dpi=300, bbox_inches='tight')
        plt.show()

    def create_interactive_dashboard(self, data):
        """Create an interactive 2x2 Plotly dashboard, save it as HTML and show it.

        Panels: (1) box plots of the first five eGeMAPS features, (2) sentiment
        polarity vs subjectivity, (3) PCA of the spectral features, (4)
        transcript length vs the first spectral feature. The dashboard is
        written to ``interactive_dashboard.html`` under ``self.features_path``.

        Args:
            data: The dict returned by ``load_saved_data``.

        Returns:
            The Plotly figure.
        """
        def stable_codes(labels):
            # Small integer per distinct label, in first-seen order. hash(label)
            # is randomised per interpreter run for strings, which made the
            # marker colours change from run to run.
            code_for = {}
            return [code_for.setdefault(str(label), len(code_for)) for label in labels]

        # Create subplots
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Feature Distribution', 'Sentiment Analysis',
                          'Audio Features PCA', 'Text Length vs Audio Energy'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"secondary_y": False}, {"secondary_y": False}]]
        )

        metadata = data['metadata']
        transcripts = data['transcripts']
        spectral = data['spectral_features']

        # Labels from metadata can only describe spectral rows if both have
        # the same number of rows
        rows_match = spectral is not None and len(spectral) == len(metadata)
        if spectral is not None and not rows_match:
            print("Warning: spectral features and metadata have different row counts; "
                  "label colouring and the text-vs-audio panel are skipped")

        # 1. Feature distribution (using first few eGeMAPS features)
        if data['egemaps_features'] is not None:
            features_sample = data['egemaps_features'][:, :5]  # First 5 features

            for i in range(features_sample.shape[1]):
                fig.add_trace(
                    go.Box(y=features_sample[:, i], name=f'Feature {i+1}'),
                    row=1, col=1
                )

        # 2. Sentiment analysis
        sentiment_df = self.analyze_text_sentiment(transcripts)

        fig.add_trace(
            go.Scatter(
                x=sentiment_df['polarity'],
                y=sentiment_df['subjectivity'],
                mode='markers',
                marker=dict(
                    color=stable_codes(sentiment_df['label']),
                    size=10,
                    opacity=0.7
                ),
                text=sentiment_df['label'],
                name='Sentiment'
            ),
            row=1, col=2
        )

        # 3. PCA of audio features (a 2-component PCA needs at least two rows)
        if spectral is not None and len(spectral) >= 2:
            from sklearn.decomposition import PCA
            from sklearn.preprocessing import StandardScaler

            features_scaled = StandardScaler().fit_transform(spectral)
            pca_result = PCA(n_components=2).fit_transform(features_scaled)

            pca_marker = dict(size=8, opacity=0.7)
            if rows_match:
                pca_marker['color'] = stable_codes(metadata['label'])

            fig.add_trace(
                go.Scatter(
                    x=pca_result[:, 0],
                    y=pca_result[:, 1],
                    mode='markers',
                    marker=pca_marker,
                    text=metadata['label'] if rows_match else None,
                    name='PCA'
                ),
                row=2, col=1
            )

        # 4. Text length vs audio energy correlation
        if rows_match:
            all_lengths = [len(t.split()) if isinstance(t, str) else 0 for t in transcripts['transcript']]

            if 'file' in transcripts.columns and 'file' in metadata.columns:
                # transcripts.csv lists every file while the feature arrays only
                # cover files with features, so look lengths up by file name
                # instead of relying on row position.
                length_by_file = dict(zip(transcripts['file'], all_lengths))
                text_lengths = [length_by_file.get(name, 0) for name in metadata['file']]
            else:
                # No shared key available: fall back to positional pairing
                text_lengths = all_lengths

            audio_energy = spectral[:, 0]  # First spectral feature as proxy for energy

            fig.add_trace(
                go.Scatter(
                    x=text_lengths,
                    y=audio_energy,
                    mode='markers',
                    marker=dict(
                        color=stable_codes(metadata['label']),
                        size=8,
                        opacity=0.7
                    ),
                    text=metadata['label'],
                    name='Text vs Audio'
                ),
                row=2, col=2
            )

        # Update layout
        fig.update_layout(
            title_text="Speech Analysis Dashboard",
            showlegend=True,
            height=800
        )

        # Save as HTML
        fig.write_html(f'{self.features_path}/interactive_dashboard.html')
        fig.show()

        return fig

    def clustering_analysis(self, data):
        """Perform clustering analysis on the features"""

        if data['egemaps_features'] is None:
            print("No eGeMAPS features available for clustering")
            return

        # Prepare features for clustering
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(data['egemaps_features'])

        # Determine optimal number of clusters
        silhouette_scores = []
        k_range = range(2, min(10, len(features_scaled)))

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            cluster_labels = kmeans.fit_predict(features_scaled)
            silhouette_avg = silhouette_score(features_scaled, cluster_labels)
            silhouette_scores.append(silhouette_avg)

        # Find optimal k
        optimal_k = k_range[np.argmax(silhouette_scores)]

        # Perform clustering with optimal k
        kmeans = KMeans(n_clusters=optimal_k, random_state=42)
        cluster_labels = kmeans.fit_predict(features_scaled)

        # Visualize clustering results
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Plot silhouette scores
        axes[0].plot(k_range, silhouette_scores, 'bo-')
        axes[0].set_xlabel('Number of Clusters')
        axes[0].set_ylabel('Silhouette Score')
        axes[0].set_title('Optimal Number of Clusters')
        axes[0].axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k={optimal_k}')
        axes[0].legend()

        # Plot clustering results using PCA
        from sklearn.decomposition import PCA
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(features_scaled)

        scatter = axes[1].scatter(pca_result[:, 0], pca_result[:, 1],
                                 c=cluster_labels, cmap='viridis', alpha=0.6)
        axes[1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
        axes[1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
        axes[1].set_title('Clustering Results (PCA visualization)')
        plt.colorbar(scatter, ax=axes[1])

        plt.tight_layout()
        plt.savefig(f'{self.features_path}/clustering_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Create cluster analysis report
        cluster_report = pd.DataFrame({
            'file': data['metadata']['file'],
            'label': data['metadata']['label'],
            'category': data['metadata']['category'],
            'cluster': cluster_labels
        })

        cluster_report.to_csv(f'{self.features_path}/cluster_analysis.csv', index=False)

        # Print cluster statistics
        print("\nCluster Analysis Results:")
        print(f"Optimal number of clusters: {optimal_k}")
        print(f"Silhouette score: {silhouette_scores[optimal_k-2]:.3f}")

        print("\nCluster distribution by label:")
        cluster_label_crosstab = pd.crosstab(cluster_report['cluster'], cluster_report['label'])
        print(cluster_label_crosstab)

        return cluster_report

    def comprehensive_feature_analysis(self, data):
        """Plot a six-panel overview of the extracted audio and text features.

        Panels (3x2 grid):
            1. Top-20 eGeMAPS features by absolute correlation with the labels.
            2. Transcript length (in words) per category, split by label.
            3. Histogram of the per-feature coefficient of variation.
            4. TF-IDF similarity heatmap of the usable transcripts.
            5. Correlation of the first (up to five) spectral features with
               simple text statistics.
            6. Per-fold accuracy of a random forest on the eGeMAPS features.

        Panels 1, 3 and 6 are skipped when ``data['egemaps_features']`` is
        ``None``, panel 5 when ``data['spectral_features']`` is ``None``, and
        panel 4 when fewer than two usable transcripts exist.

        Args:
            data: Mapping with ``'metadata'`` and ``'transcripts'`` tables and
                the ``'egemaps_features'`` / ``'spectral_features'`` entries
                (indexed as 2-D arrays, or ``None``).  Rows are assumed to be
                aligned across all entries -- TODO confirm against the loader.

        Returns:
            The matplotlib figure.  It is also saved to
            ``<self.features_path>/comprehensive_analysis.png`` and shown.
        """
        failed_marker = "Unable to transcribe"

        def is_transcribed(text):
            return isinstance(text, str) and text != failed_marker

        fig, axes = plt.subplots(3, 2, figsize=(16, 18))

        egemaps = data['egemaps_features']

        # 1. Feature importance analysis (using correlation with labels)
        if egemaps is not None:
            # Convert labels to numeric codes (order of first appearance)
            labels = data['metadata']['label']
            label_mapping = {label: i for i, label in enumerate(labels.unique())}
            numeric_labels = [label_mapping[label] for label in labels]

            correlations = np.array([
                np.corrcoef(egemaps[:, i], numeric_labels)[0, 1]
                for i in range(egemaps.shape[1])
            ])
            # corrcoef yields NaN for zero-variance columns (or a single label).
            # argsort orders NaN after every number, so without this the
            # "top 20" slice would be filled with NaN entries and blank bars.
            correlations = np.nan_to_num(np.abs(correlations), nan=0.0)

            # Plot top 20 most correlated features (ascending, strongest last)
            top_features = np.argsort(correlations)[-20:]
            axes[0, 0].barh(range(len(top_features)), correlations[top_features])
            axes[0, 0].set_yticks(range(len(top_features)))
            axes[0, 0].set_yticklabels([f'Feature {i}' for i in top_features])
            axes[0, 0].set_xlabel('Absolute Correlation with Labels')
            axes[0, 0].set_title('Top 20 Most Discriminative Features')

        # 2. Distribution of transcript lengths by category
        transcripts = data['transcripts']
        # Failed or non-string transcripts count as zero words.
        text_lengths = [len(t.split()) if is_transcribed(t) else 0
                        for t in transcripts['transcript']]

        length_df = pd.DataFrame({
            'length': text_lengths,
            'label': transcripts['label'],
            'category': transcripts['category']
        })

        sns.violinplot(data=length_df, x='category', y='length', hue='label', ax=axes[0, 1])
        axes[0, 1].set_title('Transcript Length Distribution')
        axes[0, 1].set_ylabel('Number of Words')

        # 3. Feature stability analysis (coefficient of variation)
        if egemaps is not None:
            cv_values = []
            for i in range(egemaps.shape[1]):
                feature_values = egemaps[:, i]
                mean_value = np.mean(feature_values)
                # Zero-mean features get CV 0 to avoid dividing by zero.
                # NOTE: a negative mean produces a negative CV here.
                cv_values.append(np.std(feature_values) / mean_value if mean_value != 0 else 0)

            mean_cv = np.mean(cv_values)
            axes[1, 0].hist(cv_values, bins=30, alpha=0.7, color='skyblue')
            axes[1, 0].set_xlabel('Coefficient of Variation')
            axes[1, 0].set_ylabel('Number of Features')
            axes[1, 0].set_title('Feature Stability Distribution')
            axes[1, 0].axvline(x=mean_cv, color='red', linestyle='--', label=f'Mean CV: {mean_cv:.2f}')
            axes[1, 0].legend()

        # 4. Semantic similarity heatmap
        # Create TF-IDF vectors for the non-empty, successfully transcribed texts
        valid_transcripts = [t for t in transcripts['transcript']
                             if is_transcribed(t) and t.strip()]

        if len(valid_transcripts) > 1:
            vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
            tfidf_matrix = vectorizer.fit_transform(valid_transcripts)

            # Pairwise similarity as a matrix product; `@` is a matrix product
            # for both scipy sparse matrices and sparse arrays, unlike `*`.
            similarity_matrix = (tfidf_matrix @ tfidf_matrix.T).toarray()

            # Plot heatmap (random 50-document sample if too large)
            if similarity_matrix.shape[0] > 50:
                indices = np.random.choice(similarity_matrix.shape[0], 50, replace=False)
                similarity_matrix = similarity_matrix[np.ix_(indices, indices)]

            sns.heatmap(similarity_matrix, cmap='coolwarm', center=0, ax=axes[1, 1])
            axes[1, 1].set_title('Transcript Semantic Similarity Matrix')

        # 5. Audio-text feature correlation
        spectral = data['spectral_features']
        if spectral is not None:
            # Calculate text-based features; failed transcripts get all zeros
            text_features = []
            for transcript in transcripts['transcript']:
                if is_transcribed(transcript):
                    blob = TextBlob(transcript)
                    text_features.append({
                        'word_count': len(transcript.split()),
                        'char_count': len(transcript),
                        'sentence_count': len(blob.sentences),
                        'polarity': blob.sentiment.polarity,
                        'subjectivity': blob.sentiment.subjectivity
                    })
                else:
                    text_features.append({
                        'word_count': 0,
                        'char_count': 0,
                        'sentence_count': 0,
                        'polarity': 0,
                        'subjectivity': 0
                    })

            text_df = pd.DataFrame(text_features)

            # Calculate correlations between audio and text features
            audio_text_corr = []
            for i in range(min(5, spectral.shape[1])):  # First 5 audio features
                for text_col in text_df.columns:
                    corr = np.corrcoef(spectral[:, i], text_df[text_col])[0, 1]
                    audio_text_corr.append({
                        'audio_feature': f'Audio_{i}',
                        'text_feature': text_col,
                        'correlation': corr
                    })

            corr_df = pd.DataFrame(audio_text_corr)
            corr_pivot = corr_df.pivot(index='audio_feature', columns='text_feature', values='correlation')

            sns.heatmap(corr_pivot, annot=True, cmap='coolwarm', center=0, ax=axes[2, 0])
            axes[2, 0].set_title('Audio-Text Feature Correlations')

        # 6. Classification performance simulation
        if egemaps is not None:
            from sklearn.model_selection import cross_val_score
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.preprocessing import LabelEncoder

            # Encode labels
            le = LabelEncoder()
            y_encoded = le.fit_transform(data['metadata']['label'])

            # Perform cross-validation
            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            cv_scores = cross_val_score(rf, egemaps, y_encoded, cv=5, scoring='accuracy')
            mean_score = np.mean(cv_scores)

            # Plot cross-validation scores; fold axis follows the scores returned
            axes[2, 1].bar(range(1, len(cv_scores) + 1), cv_scores, alpha=0.7, color='lightcoral')
            axes[2, 1].axhline(y=mean_score, color='navy', linestyle='--',
                               label=f'Mean CV Score: {mean_score:.3f}')
            axes[2, 1].set_xlabel('Fold')
            axes[2, 1].set_ylabel('Accuracy')
            axes[2, 1].set_title('Cross-Validation Performance')
            axes[2, 1].legend()
            axes[2, 1].set_ylim(0, 1)

        plt.tight_layout()
        plt.savefig(f'{self.features_path}/comprehensive_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        return fig

    def generate_analysis_report(self, data):
        """Assemble a text report about the dataset, save it, and echo it.

        The report covers the metadata overview, label counts, transcription
        success and average length, feature dimensionalities, sentiment
        averages (obtained from ``self.analyze_text_sentiment``) and a fixed
        list of recommendations.

        Args:
            data: Mapping holding the ``'metadata'`` and ``'transcripts'``
                tables plus the ``'egemaps_features'``, ``'spectral_features'``
                and ``'logmel_features'`` entries, each of which may be
                ``None``.

        Returns:
            The report as a list of newline-terminated strings.  The same text
            is written to ``<self.features_path>/analysis_report.txt`` and
            printed.
        """
        failure_marker = "Unable to transcribe"

        def was_transcribed(text):
            return isinstance(text, str) and text != failure_marker

        # Header and dataset overview
        metadata = data['metadata']
        lines = [
            "# Speech Dataset Analysis Report\n",
            f"Generated on: {pd.Timestamp.now()}\n",
            "=" * 50 + "\n",
            "## Dataset Overview\n",
            f"- Total files processed: {len(metadata)}\n",
            f"- Categories: {metadata['category'].unique()}\n",
            f"- Labels: {metadata['label'].unique()}\n",
            "\n### Label Distribution:\n",
        ]
        lines.extend(
            f"- {label}: {count} files\n"
            for label, count in metadata['label'].value_counts().items()
        )

        # Transcript statistics; failed transcriptions count as zero words
        transcripts = data['transcripts']
        texts = transcripts['transcript']
        n_transcribed = sum(map(was_transcribed, texts))
        word_counts = [len(text.split()) if was_transcribed(text) else 0 for text in texts]
        lines += [
            "\n## Transcript Analysis\n",
            f"- Successful transcriptions: {n_transcribed}/{len(transcripts)}\n",
            f"- Average transcript length: {np.mean(word_counts):.1f} words\n",
        ]

        # Feature dimensionalities, one entry per feature set that is present
        lines.append("\n## Feature Analysis\n")
        egemaps = data['egemaps_features']
        if egemaps is not None:
            lines.append(f"- eGeMAPS features: {egemaps.shape[1]} dimensions\n")
            lines.append(f"- Feature matrix shape: {egemaps.shape}\n")
        for key, title in (('spectral_features', 'Spectral'), ('logmel_features', 'Log-mel')):
            features = data[key]
            if features is not None:
                lines.append(f"- {title} features: {features.shape[1]} dimensions\n")

        # Sentiment: overall averages first, then one line per label
        sentiment_df = self.analyze_text_sentiment(transcripts)
        lines += [
            "\n## Sentiment Analysis Summary\n",
            f"- Average polarity: {sentiment_df['polarity'].mean():.3f}\n",
            f"- Average subjectivity: {sentiment_df['subjectivity'].mean():.3f}\n",
        ]
        for label in sentiment_df['label'].unique():
            rows = sentiment_df[sentiment_df['label'] == label]
            polarity = rows['polarity'].mean()
            subjectivity = rows['subjectivity'].mean()
            lines.append(f"- {label} - Polarity: {polarity:.3f}, Subjectivity: {subjectivity:.3f}\n")

        # Fixed recommendations
        lines.extend((
            "\n## Recommendations\n",
            "1. **Feature Selection**: Consider using feature selection techniques to identify the most discriminative features.\n",
            "2. **Data Augmentation**: Given the limited dataset size, consider audio augmentation techniques.\n",
            "3. **Deep Learning**: Explore deep learning approaches for better feature representation.\n",
            "4. **Multimodal Learning**: Combine audio and text features for improved classification.\n",
            "5. **Cross-validation**: Use proper cross-validation techniques for robust model evaluation.\n",
        ))

        # Persist, then echo the exact same text
        report_text = "".join(lines)
        with open(f'{self.features_path}/analysis_report.txt', 'w') as f:
            f.write(report_text)

        print("Analysis report generated and saved!")
        print(report_text)

        return lines

# Usage example: announce that the analyzer is defined and print a snippet
# showing the intended call sequence (the snippet is text only, not executed).
print(
    "Enhanced Speech Analyzer ready!",
    "To use the enhanced analyzer:",
    "1. First run the main analysis script",
    "2. Then use the following code:",
    """
# Initialize enhanced analyzer
enhanced_analyzer = EnhancedSpeechAnalyzer(FEATURES_SAVE_PATH, TRANSCRIPTS_SAVE_PATH)

# Load saved data
data = enhanced_analyzer.load_saved_data()

if data is not None:
    # Create word clouds
    enhanced_analyzer.create_word_clouds(data['transcripts'])

    # Create interactive dashboard
    enhanced_analyzer.create_interactive_dashboard(data)

    # Perform clustering analysis
    cluster_report = enhanced_analyzer.clustering_analysis(data)

    # Comprehensive feature analysis
    enhanced_analyzer.comprehensive_feature_analysis(data)

    # Generate analysis report
    enhanced_analyzer.generate_analysis_report(data)
else:
    print("Could not load saved data. Please run the main analysis first.")
""",
    sep="\n",
)

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3
Collecting opensmile
  Downloading opensmile-2.5.1-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.12-py3-none-any.whl.metadata (2.7 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.3.1-py3-none-any.whl.metadata (4.3 kB)
Collecting audeer>=2.1.1 (from audinterface>=0.7.0->opensmile)
  Downloading audeer-2.2.2-py3-none-any.whl.metadata (4.1 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.3.2-py3-none-any.whl.metadata (4.7 kB)
Collecting audiofil