<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Jul16_Speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install librosa soundfile opensmile speechbrain transformers torch openai-whisper
!pip install torch-geometric

Collecting opensmile
  Downloading opensmile-2.5.1-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.12-py3-none-any.whl.metadata (2.7 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.3.1-py3-none-any.whl.metadata (4.3 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-

In [13]:
# Mount Google Drive so that the dataset under /content/drive is reachable.
# The import lives in this cell so the cell does not depend on kernel state
# left behind by another (possibly deleted) cell.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# List the contents of the extracted ADReSSo21 folder on the mounted Drive
# as a quick check that the dataset is where the EDA below expects it.
!ls -la /content/drive/MyDrive/Voice/extracted/ADReSSo21

total 12
drwx------ 2 root root 4096 Jun  7 05:38 checkpoints
drwx------ 2 root root 4096 May 31 04:40 diagnosis
drwx------ 2 root root 4096 May 31 04:39 progression


# EDA

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
from scipy import stats
import warnings
# Ignore all warnings from here on (no category or module filter is given).
# NOTE(review): this also hides deprecation notices from the plotting and
# audio libraries - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set up the plotting style
# Use matplotlib's 'default' style sheet and seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

class ADReSSoEDA:
    """Exploratory data analysis helper for an ADReSSo21 dataset folder.

    The analyser looks for ``diagnosis`` and ``progression`` sub-folders below
    ``base_path``. Folders that do not exist are skipped instead of raising.
    """

    # Sub-folder names of the two diagnosis classes.
    _LABELS = ('ad', 'cn')
    # (sub-folder, file extension) pairs counted by scan_dataset_structure.
    _FILE_KINDS = (('audio', '.wav'), ('segmentation', '.csv'))

    def __init__(self, base_path):
        """Store the dataset locations and initialise empty result holders.

        Args:
            base_path: Root folder that contains ``diagnosis`` and ``progression``.
        """
        self.base_path = base_path
        self.diagnosis_path = os.path.join(base_path, 'diagnosis')
        self.progression_path = os.path.join(base_path, 'progression')
        # Nested dict of file counts, filled by scan_dataset_structure().
        self.dataset_info = {}
        # Per-file audio statistics, filled by analyze_audio_properties().
        # Starts as an empty DataFrame (not a dict) so that the `.empty` checks
        # in the other methods work even before any analysis has been run.
        self.audio_features = pd.DataFrame()

    @staticmethod
    def _list_files(directory, extension):
        """Return the sorted names in ``directory`` that end with ``extension``."""
        # Sorting makes the "first N files" samples reproducible;
        # os.listdir() returns entries in arbitrary order.
        return sorted(f for f in os.listdir(directory) if f.endswith(extension))

    def _boxplot_by_label(self, ax, frame, column, title, ylabel):
        """Draw one AD-vs-CN box plot of ``frame[column]`` on ``ax``."""
        groups = [frame.loc[frame['label'] == label, column] for label in self._LABELS]
        ax.boxplot(groups)
        # Tick labels are set after plotting rather than through the boxplot
        # `labels=` keyword, which is deprecated in recent matplotlib releases.
        ax.set_xticklabels([label.upper() for label in self._LABELS])
        ax.set_title(title)
        ax.set_ylabel(ylabel)

    @staticmethod
    def _describe_audio_file(filepath, file, label):
        """Load one audio file and return its summary statistics as a dict."""
        # sr=None keeps the file's native sampling rate.
        y, sr = librosa.load(filepath, sr=None)

        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)

        return {
            'file': file,
            'label': label,
            'duration': len(y) / sr,
            'sample_rate': sr,
            'rms_energy': np.sqrt(np.mean(y**2)),
            'zero_crossing_rate': np.mean(librosa.feature.zero_crossing_rate(y=y)),
            'spectral_centroid_mean': np.mean(spectral_centroids),
            'spectral_rolloff_mean': np.mean(spectral_rolloff),
            'file_size_mb': os.path.getsize(filepath) / (1024*1024)
        }

    def scan_dataset_structure(self):
        """Count audio (.wav) and segmentation (.csv) files per task and subset.

        The diagnosis task is scanned for the ``train`` and ``test-dist``
        subsets with one count per label folder; the progression task is
        scanned for ``test-dist`` with one count per kind. Keys are only
        created for folders that exist.

        Returns:
            The nested dict of counts, which is also stored in
            ``self.dataset_info`` and printed via ``print_structure``.
        """
        print("=== DATASET STRUCTURE ANALYSIS ===\n")

        structure = {}

        # Diagnosis task: <subset>/<kind>/<label>/<files>
        if os.path.isdir(self.diagnosis_path):
            structure['diagnosis'] = {}
            for subset in ['train', 'test-dist']:
                subset_path = os.path.join(self.diagnosis_path, subset)
                if not os.path.isdir(subset_path):
                    continue
                subset_info = structure['diagnosis'][subset] = {}

                for kind, extension in self._FILE_KINDS:
                    kind_path = os.path.join(subset_path, kind)
                    if not os.path.isdir(kind_path):
                        continue
                    subset_info[kind] = {
                        label: len(self._list_files(os.path.join(kind_path, label), extension))
                        for label in self._LABELS
                        if os.path.isdir(os.path.join(kind_path, label))
                    }

        # Progression task: <subset>/<kind>/<files> (no label folders)
        if os.path.isdir(self.progression_path):
            structure['progression'] = {}
            for subset in ['test-dist']:
                subset_path = os.path.join(self.progression_path, subset)
                if not os.path.isdir(subset_path):
                    continue
                subset_info = structure['progression'][subset] = {}

                for kind, extension in self._FILE_KINDS:
                    kind_path = os.path.join(subset_path, kind)
                    if os.path.isdir(kind_path):
                        subset_info[kind] = len(self._list_files(kind_path, extension))

        self.dataset_info = structure
        self.print_structure()
        return structure

    def print_structure(self):
        """Print ``self.dataset_info`` in a readable, indented format."""
        print("Dataset Structure:")
        for task, task_data in self.dataset_info.items():
            print(f"\n{task.upper()} TASK:")
            for subset, subset_data in task_data.items():
                print(f"  {subset}:")
                for key, title in (('audio', 'Audio files'),
                                   ('segmentation', 'Segmentation files')):
                    if key not in subset_data:
                        continue
                    value = subset_data[key]
                    if isinstance(value, dict):
                        # Per-label counts (diagnosis task)
                        for label, count in value.items():
                            print(f"    {title} ({label}): {count}")
                    else:
                        # Single count (progression task)
                        print(f"    {title}: {value}")

    def analyze_audio_properties(self, sample_size=10):
        """Compute basic audio statistics for a sample of diagnosis training files.

        Args:
            sample_size: Maximum number of files analysed per label, taken from
                the start of the sorted file list. ``None`` analyses every file.

        Returns:
            A DataFrame with one row per successfully processed file (empty if
            none were processed); it is also stored in ``self.audio_features``.
            Files that fail to load are reported and skipped.
        """
        print("\n=== AUDIO PROPERTIES ANALYSIS ===\n")

        audio_stats = []

        for label in self._LABELS:
            audio_dir = os.path.join(self.diagnosis_path, 'train', 'audio', label)
            if not os.path.isdir(audio_dir):
                continue

            for file in self._list_files(audio_dir, '.wav')[:sample_size]:
                filepath = os.path.join(audio_dir, file)
                try:
                    audio_stats.append(self._describe_audio_file(filepath, file, label))
                except Exception as e:
                    # Best effort: one unreadable file must not abort the EDA.
                    print(f"Error processing {file}: {e}")

        self.audio_features = pd.DataFrame(audio_stats)

        if not self.audio_features.empty:
            # Print summary statistics
            print("Audio Properties Summary:")
            print(self.audio_features.groupby('label').agg({
                'duration': ['mean', 'std', 'min', 'max'],
                'sample_rate': ['mean', 'std'],
                'rms_energy': ['mean', 'std'],
                'zero_crossing_rate': ['mean', 'std'],
                'spectral_centroid_mean': ['mean', 'std'],
                'file_size_mb': ['mean', 'std']
            }).round(4))

            # Create visualizations
            self.plot_audio_properties()

        return self.audio_features

    def plot_audio_properties(self):
        """Plot a 2x3 grid comparing the audio statistics of AD and CN files.

        Does nothing when ``self.audio_features`` is empty.
        """
        if self.audio_features.empty:
            return

        frame = self.audio_features
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        fig.suptitle('Audio Properties Analysis by Label', fontsize=16)

        # Duration distribution (histogram, one series per label)
        durations = [frame.loc[frame['label'] == label, 'duration'] for label in self._LABELS]
        axes[0, 0].hist(durations, bins=10, alpha=0.7, label=['AD', 'CN'])
        axes[0, 0].set_title('Duration Distribution')
        axes[0, 0].set_xlabel('Duration (seconds)')
        axes[0, 0].set_ylabel('Count')
        axes[0, 0].legend()

        # Remaining panels are AD-vs-CN box plots of one column each.
        panels = [
            (axes[0, 1], 'rms_energy', 'RMS Energy Distribution', 'RMS Energy'),
            (axes[0, 2], 'zero_crossing_rate', 'Zero Crossing Rate Distribution', 'Zero Crossing Rate'),
            (axes[1, 0], 'spectral_centroid_mean', 'Spectral Centroid Distribution', 'Spectral Centroid (Hz)'),
            (axes[1, 1], 'spectral_rolloff_mean', 'Spectral Rolloff Distribution', 'Spectral Rolloff (Hz)'),
            (axes[1, 2], 'file_size_mb', 'File Size Distribution', 'File Size (MB)'),
        ]
        for ax, column, title, ylabel in panels:
            self._boxplot_by_label(ax, frame, column, title, ylabel)

        plt.tight_layout()
        plt.show()

    def analyze_segmentation_files(self, sample_size=5):
        """Summarise a sample of the diagnosis training segmentation CSV files.

        Args:
            sample_size: Maximum number of CSV files read per label, taken from
                the start of the sorted file list. ``None`` reads every file.

        Returns:
            A list with one dict per successfully read file (empty if none).
            Files that fail to parse are reported and skipped.
        """
        print("\n=== SEGMENTATION FILES ANALYSIS ===\n")

        segmentation_stats = []

        for label in self._LABELS:
            seg_dir = os.path.join(self.diagnosis_path, 'train', 'segmentation', label)
            if not os.path.isdir(seg_dir):
                continue

            sample_files = self._list_files(seg_dir, '.csv')[:sample_size]

            for position, file in enumerate(sample_files):
                filepath = os.path.join(seg_dir, file)
                try:
                    df = pd.read_csv(filepath)

                    # NOTE(review): both statistics below are derived from the
                    # SECOND column by position - its maximum and the mean
                    # difference between consecutive rows. Whether that column
                    # is the right basis for "total duration" and "segment
                    # length" depends on the CSV schema - TODO confirm.
                    has_second_column = len(df.columns) > 1
                    second_column = df.iloc[:, 1] if has_second_column else None

                    segmentation_stats.append({
                        'file': file,
                        'label': label,
                        'num_segments': len(df),
                        'columns': list(df.columns),
                        'total_duration': second_column.max() if has_second_column else 0,
                        'avg_segment_length': second_column.diff().mean() if has_second_column else 0
                    })

                    # Show the first sampled file of each label as an example
                    if position == 0:
                        print(f"Sample segmentation file ({label}): {file}")
                        print(df.head())
                        print(f"Shape: {df.shape}")
                        print(f"Columns: {df.columns.tolist()}")
                        print("-" * 50)

                except Exception as e:
                    # Best effort: one malformed CSV must not abort the EDA.
                    print(f"Error processing {file}: {e}")

        if segmentation_stats:
            seg_df = pd.DataFrame(segmentation_stats)

            print("Segmentation Statistics:")
            print(seg_df.groupby('label').agg({
                'num_segments': ['mean', 'std', 'min', 'max'],
                'total_duration': ['mean', 'std'],
                'avg_segment_length': ['mean', 'std']
            }).round(4))

            # Plot segmentation statistics
            fig, axes = plt.subplots(1, 2, figsize=(12, 5))

            self._boxplot_by_label(axes[0], seg_df, 'num_segments',
                                   'Number of Segments per File', 'Number of Segments')
            self._boxplot_by_label(axes[1], seg_df, 'avg_segment_length',
                                   'Average Segment Length', 'Average Length (seconds)')

            plt.tight_layout()
            plt.show()

        return segmentation_stats

    def statistical_analysis(self):
        """Print t-test and Mann-Whitney U results comparing AD and CN files.

        Uses the rows of ``self.audio_features``. Returns ``None`` in all cases;
        nothing is tested when no features are available or when either group
        has no rows.
        """
        if self.audio_features.empty:
            return

        print("\n=== STATISTICAL ANALYSIS ===\n")

        ad_data = self.audio_features[self.audio_features['label'] == 'ad']
        cn_data = self.audio_features[self.audio_features['label'] == 'cn']

        # A two-sample test needs observations in both groups.
        if ad_data.empty or cn_data.empty:
            print("Skipping statistical tests: both AD and CN samples are required.")
            return

        features_to_test = ['duration', 'rms_energy', 'zero_crossing_rate',
                          'spectral_centroid_mean', 'spectral_rolloff_mean']

        print("Statistical Tests (AD vs CN):")
        print("-" * 60)

        for feature in features_to_test:
            if feature not in self.audio_features.columns:
                continue

            try:
                # T-test
                t_stat, p_value = stats.ttest_ind(ad_data[feature], cn_data[feature])

                # Mann-Whitney U test (non-parametric); the alternative is
                # spelled out so the p-value does not depend on library defaults.
                u_stat, u_p_value = stats.mannwhitneyu(ad_data[feature], cn_data[feature],
                                                       alternative='two-sided')
            except ValueError as e:
                # Degenerate input for one of the tests; report and move on.
                print(f"{feature}: statistical tests skipped ({e})")
                print()
                continue

            print(f"{feature}:")
            print(f"  T-test: t={t_stat:.4f}, p={p_value:.4f}")
            print(f"  Mann-Whitney U: U={u_stat:.4f}, p={u_p_value:.4f}")
            print(f"  AD mean: {ad_data[feature].mean():.4f} ± {ad_data[feature].std():.4f}")
            print(f"  CN mean: {cn_data[feature].mean():.4f} ± {cn_data[feature].std():.4f}")
            print()

    def run_complete_eda(self):
        """Run all analysis stages in order and print a short summary.

        Returns:
            The audio feature DataFrame produced by ``analyze_audio_properties``.
        """
        print("Starting Complete EDA for ADReSSo21 Dataset")
        print("=" * 50)

        # 1. Dataset structure analysis
        self.scan_dataset_structure()

        # 2. Audio properties analysis
        self.analyze_audio_properties()

        # 3. Segmentation analysis
        self.analyze_segmentation_files()

        # 4. Statistical analysis
        self.statistical_analysis()

        print("\n=== EDA COMPLETE ===")
        print("Summary of findings:")
        print(f"- Total diagnosis training files analyzed: {len(self.audio_features)}")
        print(f"- Labels: {self.audio_features['label'].value_counts().to_dict() if not self.audio_features.empty else 'None'}")

        return self.audio_features

# Usage example:
if __name__ == "__main__":
    # Root folder of the extracted dataset on the mounted Google Drive
    dataset_path = "/content/drive/MyDrive/Voice/extracted/ADReSSo21"
    eda = ADReSSoEDA(dataset_path)

    # Run every analysis stage; the return value is the per-file audio feature table
    results = eda.run_complete_eda()

    # Optional: save the feature table (skipped when no file was analysed).
    # The file name is defined once so the saved path and the message cannot drift apart.
    if not results.empty:
        output_csv = "adresso_audio_features.csv"
        results.to_csv(output_csv, index=False)
        print(f"\nResults saved to: {output_csv}")

Starting Complete EDA for ADReSSo21 Dataset
=== DATASET STRUCTURE ANALYSIS ===

Dataset Structure:

DIAGNOSIS TASK:
  train:
    Audio files (ad): 87
    Audio files (cn): 79
    Segmentation files (ad): 87
    Segmentation files (cn): 79

PROGRESSION TASK:
  test-dist:
    Audio files: 32
    Segmentation files: 15

=== AUDIO PROPERTIES ANALYSIS ===

