<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Extractor_Jul10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages for speech analysis
# Run this cell first before running the main analysis

# Python packages: audio features, transcription, NLP, plotting, and ML
!pip install librosa
!pip install SpeechRecognition
!pip install opensmile
!pip install pydub
!pip install textblob
!pip install wordcloud
!pip install plotly
!pip install scikit-learn
!pip install seaborn
!pip install matplotlib

# Additional setup for audio processing
!apt-get update
!apt-get install -y portaudio19-dev
!apt-get install -y flac

# Install additional speech processing libraries
!pip install pyaudio
!pip install pocketsphinx

print("All required packages installed successfully!")
print("You can now run the main analysis script.")

# Enhanced analysis: imports and the EnhancedSpeechAnalyzer class
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

class EnhancedSpeechAnalyzer:
    """Enhanced analyzer with additional visualization and analysis capabilities"""

    def __init__(self, features_path, transcripts_path):
        self.features_path = features_path
        self.transcripts_path = transcripts_path

    def load_saved_data(self):
        """Load previously saved features and transcripts"""
        try:
            # Load transcripts
            transcripts_df = pd.read_csv(f'{self.transcripts_path}/transcripts.csv')

            # Load features
            egemaps_features = np.load(f'{self.features_path}/egemaps_features.npy')
            metadata_df = pd.read_csv(f'{self.features_path}/metadata.csv')

            try:
                logmel_features = np.load(f'{self.features_path}/logmel_features.npy')
            except:
                logmel_features = None

            try:
                spectral_features = np.load(f'{self.features_path}/spectral_features.npy')
            except:
                spectral_features = None

            return {
                'transcripts': transcripts_df,
                'egemaps_features': egemaps_features,
                'logmel_features': logmel_features,
                'spectral_features': spectral_features,
                'metadata': metadata_df
            }
        except Exception as e:
            print(f"Error loading saved data: {e}")
            return None

    def analyze_text_sentiment(self, transcripts_df):
        """Score the sentiment of every transcript with TextBlob.

        Args:
            transcripts_df: DataFrame with ``'transcript'``, ``'label'`` and
                ``'category'`` columns.

        Returns:
            pandas.DataFrame: One row per input row, in input order, with a
            fresh ``0..n-1`` index and the columns ``'polarity'``,
            ``'subjectivity'``, ``'label'`` and ``'category'``. Rows whose
            transcript is not a string, or is the placeholder
            ``"Unable to transcribe"``, get a polarity and subjectivity of 0.
        """
        sentiments = []

        for transcript in transcripts_df['transcript']:
            if isinstance(transcript, str) and transcript != "Unable to transcribe":
                sentiment = TextBlob(transcript).sentiment
                sentiments.append({
                    'polarity': sentiment.polarity,
                    'subjectivity': sentiment.subjectivity
                })
            else:
                sentiments.append({
                    'polarity': 0,
                    'subjectivity': 0
                })

        # Naming the columns keeps them present even when there are no rows.
        sentiment_df = pd.DataFrame(sentiments, columns=['polarity', 'subjectivity'])

        # Copy by position, not by index: assigning the Series directly would
        # align on index labels and yield NaN (or wrong rows) whenever the
        # input frame does not carry a default 0..n-1 index.
        sentiment_df['label'] = transcripts_df['label'].to_numpy()
        sentiment_df['category'] = transcripts_df['category'].to_numpy()

        return sentiment_df

    def create_word_clouds(self, transcripts_df):
        """Draw one word cloud per label on a 2x2 grid, then save and show it.

        Transcripts that are not strings or equal the placeholder
        ``"Unable to transcribe"`` are ignored. Only the first four distinct
        labels (in order of appearance) are drawn; a notice is printed when
        more exist. The figure is saved to
        ``{features_path}/word_clouds.png``.

        Args:
            transcripts_df: DataFrame with ``'transcript'`` and ``'label'``
                columns.
        """
        max_panels = 4  # the grid is fixed at 2x2
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # Hide every panel up front so that panels left without a word cloud
        # do not show up as empty framed axes.
        for ax in axes.flat:
            ax.axis('off')

        labels = transcripts_df['label'].unique()
        if len(labels) > max_panels:
            print(f"Showing word clouds for the first {max_panels} of {len(labels)} labels")

        # axes.flat walks the grid row by row; zip stops after four panels.
        for ax, label in zip(axes.flat, labels):
            label_transcripts = transcripts_df[transcripts_df['label'] == label]['transcript']

            combined_text = ' '.join(
                t for t in label_transcripts
                if isinstance(t, str) and t != "Unable to transcribe"
            )
            if not combined_text:
                continue

            try:
                wordcloud = WordCloud(width=800, height=400,
                                      background_color='white',
                                      max_words=50,
                                      colormap='viridis').generate(combined_text)
            except ValueError:
                # Raised by WordCloud when no word survives stop-word
                # filtering; leave this panel blank instead of aborting.
                continue

            ax.imshow(wordcloud, interpolation='bilinear')
            ax.set_title(f'Word Cloud - {label}')

        plt.tight_layout()
        plt.savefig(f'{self.features_path}/word_clouds.png', dpi=300, bbox_inches='tight')
        plt.show()

    def create_interactive_dashboard(self, data):
        """Build, save and show a 2x2 Plotly dashboard of the loaded data.

        Panels:
            1. Box plots of the first five eGeMAPS feature columns.
            2. Scatter of transcript polarity against subjectivity.
            3. Two-component PCA of the standardized spectral features.
            4. Transcript word count against the first spectral feature
               column (used here as a proxy for energy).

        Panels 1, 3 and 4 are skipped when the array they need is ``None``.
        Markers are coloured by label.

        Args:
            data: Dict with the keys ``'metadata'``, ``'transcripts'``,
                ``'egemaps_features'`` and ``'spectral_features'``, as
                returned by ``load_saved_data``.

        Returns:
            The Plotly figure, which is also written to
            ``{features_path}/interactive_dashboard.html``.
        """
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Feature Distribution', 'Sentiment Analysis',
                          'Audio Features PCA', 'Text Length vs Audio Energy'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"secondary_y": False}, {"secondary_y": False}]]
        )

        metadata = data['metadata']
        transcripts = data['transcripts']

        # Colour codes come from one shared, ordered list of labels. hash() of
        # a string is randomised per interpreter run, so using it as a colour
        # made the dashboard look different on every run.
        known_labels = pd.unique(
            pd.concat([transcripts['label'], metadata['label']], ignore_index=True).dropna()
        )

        def label_codes(labels):
            """Map labels to small stable integers (-1 for missing labels)."""
            return pd.Categorical(labels, categories=known_labels).codes

        # 1. Feature distribution (first few eGeMAPS features)
        if data['egemaps_features'] is not None:
            features_sample = data['egemaps_features'][:, :5]

            for i in range(features_sample.shape[1]):
                fig.add_trace(
                    go.Box(y=features_sample[:, i], name=f'Feature {i+1}'),
                    row=1, col=1
                )

        # 2. Sentiment analysis
        sentiment_df = self.analyze_text_sentiment(transcripts)

        fig.add_trace(
            go.Scatter(
                x=sentiment_df['polarity'],
                y=sentiment_df['subjectivity'],
                mode='markers',
                marker=dict(
                    color=label_codes(sentiment_df['label']),
                    size=10,
                    opacity=0.7
                ),
                text=sentiment_df['label'],
                name='Sentiment'
            ),
            row=1, col=2
        )

        metadata_colors = label_codes(metadata['label'])

        # 3. PCA of audio features
        if data['spectral_features'] is not None:
            from sklearn.decomposition import PCA
            from sklearn.preprocessing import StandardScaler

            scaler = StandardScaler()
            features_scaled = scaler.fit_transform(data['spectral_features'])

            pca = PCA(n_components=2)
            pca_result = pca.fit_transform(features_scaled)

            fig.add_trace(
                go.Scatter(
                    x=pca_result[:, 0],
                    y=pca_result[:, 1],
                    mode='markers',
                    marker=dict(
                        color=metadata_colors,
                        size=8,
                        opacity=0.7
                    ),
                    text=metadata['label'],
                    name='PCA'
                ),
                row=2, col=1
            )

        # 4. Text length vs audio energy correlation.
        # The failure placeholder counts as zero words, matching the other
        # analyses in this class (it used to be counted as three words here).
        text_lengths = [
            len(t.split()) if isinstance(t, str) and t != "Unable to transcribe" else 0
            for t in transcripts['transcript']
        ]

        if data['spectral_features'] is not None:
            audio_energy = data['spectral_features'][:, 0]  # first column as energy proxy

            fig.add_trace(
                go.Scatter(
                    x=text_lengths,
                    y=audio_energy,
                    mode='markers',
                    marker=dict(
                        color=metadata_colors,
                        size=8,
                        opacity=0.7
                    ),
                    text=metadata['label'],
                    name='Text vs Audio'
                ),
                row=2, col=2
            )

        fig.update_layout(
            title_text="Speech Analysis Dashboard",
            showlegend=True,
            height=800
        )

        # Save as HTML
        fig.write_html(f'{self.features_path}/interactive_dashboard.html')
        fig.show()

        return fig

    def clustering_analysis(self, data):
        """Perform clustering analysis on the features"""

        if data['egemaps_features'] is None:
            print("No eGeMAPS features available for clustering")
            return

        # Prepare features for clustering
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(data['egemaps_features'])

        # Determine optimal number of clusters
        silhouette_scores = []
        k_range = range(2, min(10, len(features_scaled)))

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            cluster_labels = kmeans.fit_predict(features_scaled)
            silhouette_avg = silhouette_score(features_scaled, cluster_labels)
            silhouette_scores.append(silhouette_avg)

        # Find optimal k
        optimal_k = k_range[np.argmax(silhouette_scores)]

        # Perform clustering with optimal k
        kmeans = KMeans(n_clusters=optimal_k, random_state=42)
        cluster_labels = kmeans.fit_predict(features_scaled)

        # Visualize clustering results
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Plot silhouette scores
        axes[0].plot(k_range, silhouette_scores, 'bo-')
        axes[0].set_xlabel('Number of Clusters')
        axes[0].set_ylabel('Silhouette Score')
        axes[0].set_title('Optimal Number of Clusters')
        axes[0].axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k={optimal_k}')
        axes[0].legend()

        # Plot clustering results using PCA
        from sklearn.decomposition import PCA
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(features_scaled)

        scatter = axes[1].scatter(pca_result[:, 0], pca_result[:, 1],
                                 c=cluster_labels, cmap='viridis', alpha=0.6)
        axes[1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
        axes[1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
        axes[1].set_title('Clustering Results (PCA visualization)')
        plt.colorbar(scatter, ax=axes[1])

        plt.tight_layout()
        plt.savefig(f'{self.features_path}/clustering_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Create cluster analysis report
        cluster_report = pd.DataFrame({
            'file': data['metadata']['file'],
            'label': data['metadata']['label'],
            'category': data['metadata']['category'],
            'cluster': cluster_labels
        })

        cluster_report.to_csv(f'{self.features_path}/cluster_analysis.csv', index=False)

        # Print cluster statistics
        print("\nCluster Analysis Results:")
        print(f"Optimal number of clusters: {optimal_k}")
        print(f"Silhouette score: {silhouette_scores[optimal_k-2]:.3f}")

        print("\nCluster distribution by label:")
        cluster_label_crosstab = pd.crosstab(cluster_report['cluster'], cluster_report['label'])
        print(cluster_label_crosstab)

        return cluster_report

    def comprehensive_feature_analysis(self, data):
        """Draw a 3x2 grid of diagnostic plots for the extracted features.

        Panels, row by row:
            1. eGeMAPS features with the highest absolute correlation to the
               integer-coded label (at most 20).
            2. Transcript word-count distribution per category and label.
            3. Histogram of each eGeMAPS feature's coefficient of variation.
            4. TF-IDF dot-product similarity between valid transcripts
               (a random sample of 50 when there are more).
            5. Correlation between the first five spectral feature columns
               and simple text statistics.
            6. Cross-validated accuracy of a random forest on the eGeMAPS
               features.

        Panels whose input array is ``None`` are left empty. The figure is
        saved to ``{features_path}/comprehensive_analysis.png``.

        Args:
            data: Dict with the keys ``'metadata'``, ``'transcripts'``,
                ``'egemaps_features'`` and ``'spectral_features'``.

        Returns:
            The matplotlib figure.
        """
        fig, axes = plt.subplots(3, 2, figsize=(16, 18))

        egemaps = data['egemaps_features']
        spectral = data['spectral_features']
        transcripts = data['transcripts']

        def is_valid(text):
            """True for real transcripts, False for NaN or the placeholder."""
            return isinstance(text, str) and text != "Unable to transcribe"

        # 1. Feature importance analysis (using correlation with labels)
        if egemaps is not None:
            # Convert labels to numeric for correlation
            label_mapping = {label: i for i, label in enumerate(data['metadata']['label'].unique())}
            numeric_labels = [label_mapping[label] for label in data['metadata']['label']]

            # A constant column has zero variance, so its correlation is NaN.
            with np.errstate(invalid='ignore', divide='ignore'):
                correlations = np.array([
                    abs(np.corrcoef(egemaps[:, i], numeric_labels)[0, 1])
                    for i in range(egemaps.shape[1])
                ])
            # argsort places NaN last, which would rank those columns as the
            # "most discriminative"; treat an undefined correlation as 0.
            correlations = np.nan_to_num(correlations, nan=0.0)

            top_features = np.argsort(correlations)[-20:]
            axes[0, 0].barh(range(len(top_features)), correlations[top_features])
            axes[0, 0].set_yticks(range(len(top_features)))
            axes[0, 0].set_yticklabels([f'Feature {i}' for i in top_features])
            axes[0, 0].set_xlabel('Absolute Correlation with Labels')
            axes[0, 0].set_title(f'Top {len(top_features)} Most Discriminative Features')

        # 2. Distribution of transcript lengths by category
        text_lengths = [len(t.split()) if is_valid(t) else 0
                        for t in transcripts['transcript']]

        length_df = pd.DataFrame({
            'length': text_lengths,
            'label': transcripts['label'],
            'category': transcripts['category']
        })

        sns.violinplot(data=length_df, x='category', y='length', hue='label', ax=axes[0, 1])
        axes[0, 1].set_title('Transcript Length Distribution')
        axes[0, 1].set_ylabel('Number of Words')

        # 3. Feature stability analysis (coefficient of variation)
        if egemaps is not None:
            cv_values = []
            for i in range(egemaps.shape[1]):
                feature_values = egemaps[:, i]
                feature_mean = np.mean(feature_values)
                # A zero mean would divide by zero; report 0 for that column.
                cv_values.append(np.std(feature_values) / feature_mean if feature_mean != 0 else 0)

            mean_cv = np.mean(cv_values)
            axes[1, 0].hist(cv_values, bins=30, alpha=0.7, color='skyblue')
            axes[1, 0].set_xlabel('Coefficient of Variation')
            axes[1, 0].set_ylabel('Number of Features')
            axes[1, 0].set_title('Feature Stability Distribution')
            axes[1, 0].axvline(x=mean_cv, color='red', linestyle='--', label=f'Mean CV: {mean_cv:.2f}')
            axes[1, 0].legend()

        # 4. Semantic similarity heatmap
        valid_transcripts = [t for t in transcripts['transcript']
                             if is_valid(t) and len(t.strip()) > 0]

        if len(valid_transcripts) > 1:
            vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
            try:
                tfidf_matrix = vectorizer.fit_transform(valid_transcripts)
            except ValueError:
                # Empty vocabulary, e.g. every transcript is only stop words.
                tfidf_matrix = None

            if tfidf_matrix is not None:
                # Calculate similarity matrix
                similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()

                # Plot heatmap (sample if too large)
                if similarity_matrix.shape[0] > 50:
                    indices = np.random.choice(similarity_matrix.shape[0], 50, replace=False)
                    similarity_matrix = similarity_matrix[np.ix_(indices, indices)]

                sns.heatmap(similarity_matrix, cmap='coolwarm', center=0, ax=axes[1, 1])
                axes[1, 1].set_title('Transcript Semantic Similarity Matrix')

        # 5. Audio-text feature correlation
        if spectral is not None:
            # Calculate text-based features
            text_features = []
            for transcript in transcripts['transcript']:
                if is_valid(transcript):
                    blob = TextBlob(transcript)
                    text_features.append({
                        'word_count': len(transcript.split()),
                        'char_count': len(transcript),
                        'sentence_count': len(blob.sentences),
                        'polarity': blob.sentiment.polarity,
                        'subjectivity': blob.sentiment.subjectivity
                    })
                else:
                    text_features.append({
                        'word_count': 0,
                        'char_count': 0,
                        'sentence_count': 0,
                        'polarity': 0,
                        'subjectivity': 0
                    })

            text_df = pd.DataFrame(text_features)

            # Calculate correlations between audio and text features
            audio_text_corr = []
            for i in range(min(5, spectral.shape[1])):  # First 5 audio features
                for text_col in text_df.columns:
                    corr = np.corrcoef(spectral[:, i], text_df[text_col])[0, 1]
                    audio_text_corr.append({
                        'audio_feature': f'Audio_{i}',
                        'text_feature': text_col,
                        'correlation': corr
                    })

            corr_df = pd.DataFrame(audio_text_corr)
            corr_pivot = corr_df.pivot(index='audio_feature', columns='text_feature', values='correlation')

            sns.heatmap(corr_pivot, annot=True, cmap='coolwarm', center=0, ax=axes[2, 0])
            axes[2, 0].set_title('Audio-Text Feature Correlations')

        # 6. Classification performance simulation
        if egemaps is not None:
            from sklearn.model_selection import cross_val_score
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.preprocessing import LabelEncoder

            # Encode labels
            le = LabelEncoder()
            y_encoded = le.fit_transform(data['metadata']['label'])

            # Stratified K-fold rejects a fold count larger than the size of
            # every class, so cap the usual 5 folds at the largest class.
            _, class_counts = np.unique(y_encoded, return_counts=True)
            n_splits = min(5, int(class_counts.max())) if class_counts.size else 0

            if n_splits >= 2:
                rf = RandomForestClassifier(n_estimators=100, random_state=42)
                cv_scores = cross_val_score(rf, egemaps, y_encoded, cv=n_splits, scoring='accuracy')

                # Plot cross-validation scores
                axes[2, 1].bar(range(1, len(cv_scores) + 1), cv_scores, alpha=0.7, color='lightcoral')
                axes[2, 1].axhline(y=np.mean(cv_scores), color='navy', linestyle='--',
                                  label=f'Mean CV Score: {np.mean(cv_scores):.3f}')
                axes[2, 1].set_xlabel('Fold')
                axes[2, 1].set_ylabel('Accuracy')
                axes[2, 1].set_title('Cross-Validation Performance')
                axes[2, 1].legend()
                axes[2, 1].set_ylim(0, 1)
            else:
                print("Not enough samples per class for cross-validation; skipping")

        plt.tight_layout()
        plt.savefig(f'{self.features_path}/comprehensive_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        return fig

    def generate_analysis_report(self, data):
        """Generate a comprehensive analysis report"""

        report = []
        report.append("# Speech Dataset Analysis Report\n")
        report.append(f"Generated on: {pd.Timestamp.now()}\n")
        report.append("=" * 50 + "\n")

        # Dataset overview
        report.append("## Dataset Overview\n")
        report.append(f"- Total files processed: {len(data['metadata'])}\n")
        report.append(f"- Categories: {data['metadata']['category'].unique()}\n")
        report.append(f"- Labels: {data['metadata']['label'].unique()}\n")

        label_counts = data['metadata']['label'].value_counts()
        report.append("\n### Label Distribution:\n")
        for label, count in label_counts.items():
            report.append(f"- {label}: {count} files\n")

        # Transcript analysis
        report.append("\n## Transcript Analysis\n")
        transcripts = data['transcripts']

        # Count successful transcriptions
        successful_transcripts = sum(1 for t in transcripts['transcript']
                                   if isinstance(t, str) and t != "Unable to transcribe")
        report.append(f"- Successful transcriptions: {successful_transcripts}/{len(transcripts)}\n")

        # Average transcript length
        text_lengths = [len(t.split()) if isinstance(t, str) and t != "Unable to transcribe" else 0
                       for t in transcripts['transcript']]
        report.append(f"- Average transcript length: {np.mean(text_lengths):.1f} words\n")

        # Feature analysis
        report.append("\n## Feature Analysis\n")
        if data['egemaps_features'] is not None:
            report.append(f"- eGeMAPS features: {data['egemaps_features'].shape[1]} dimensions\n")
            report.append(f"- Feature matrix shape: {data['egemaps_features'].shape}\n")

        if data['spectral_features'] is not None:
            report.append(f"- Spectral features: {data['spectral_features'].shape[1]} dimensions\n")

        if data['logmel_features'] is not None:
            report.append(f"- Log-mel features: {data['logmel_features'].shape[1]} dimensions\n")

        # Sentiment analysis summary
        sentiment_df = self.analyze_text_sentiment(transcripts)
        report.append("\n## Sentiment Analysis Summary\n")
        report.append(f"- Average polarity: {sentiment_df['polarity'].mean():.3f}\n")
        report.append(f"- Average subjectivity: {sentiment_df['subjectivity'].mean():.3f}\n")

        # Group-wise sentiment analysis
        for label in sentiment_df['label'].unique():
            label_sentiment = sentiment_df[sentiment_df['label'] == label]
            report.append(f"- {label} - Polarity: {label_sentiment['polarity'].mean():.3f}, "
                         f"Subjectivity: {label_sentiment['subjectivity'].mean():.3f}\n")

        # Recommendations
        report.append("\n## Recommendations\n")
        report.append("1. **Feature Selection**: Consider using feature selection techniques to identify the most discriminative features.\n")
        report.append("2. **Data Augmentation**: Given the limited dataset size, consider audio augmentation techniques.\n")
        report.append("3. **Deep Learning**: Explore deep learning approaches for better feature representation.\n")
        report.append("4. **Multimodal Learning**: Combine audio and text features for improved classification.\n")
        report.append("5. **Cross-validation**: Use proper cross-validation techniques for robust model evaluation.\n")

        # Save report
        with open(f'{self.features_path}/analysis_report.txt', 'w') as f:
            f.writelines(report)

        print("Analysis report generated and saved!")
        print("".join(report))

        return report

# Usage example: announce that the class is defined, then print a driver
# snippet the user can paste into the next cell.
print(
    "Enhanced Speech Analyzer ready!",
    "To use the enhanced analyzer:",
    "1. First run the main analysis script",
    "2. Then use the following code:",
    """
# Initialize enhanced analyzer
enhanced_analyzer = EnhancedSpeechAnalyzer(FEATURES_SAVE_PATH, TRANSCRIPTS_SAVE_PATH)

# Load saved data
data = enhanced_analyzer.load_saved_data()

if data is not None:
    # Create word clouds
    enhanced_analyzer.create_word_clouds(data['transcripts'])

    # Create interactive dashboard
    enhanced_analyzer.create_interactive_dashboard(data)

    # Perform clustering analysis
    cluster_report = enhanced_analyzer.clustering_analysis(data)

    # Comprehensive feature analysis
    enhanced_analyzer.comprehensive_feature_analysis(data)

    # Generate analysis report
    enhanced_analyzer.generate_analysis_report(data)
else:
    print("Could not load saved data. Please run the main analysis first.")
""",
    sep="\n",
)