<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Feature_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Installation script for Google Colab
# Run this cell first to install all required dependencies

!pip install librosa
!pip install opensmile
!pip install transformers
!pip install torch torchaudio
!pip install SpeechRecognition
!pip install pydub
!pip install scipy
!pip install scikit-learn

# For audio processing
!apt-get update -qq
!apt-get install -qq ffmpeg

print("All dependencies installed successfully!")

# Additional setup for speech recognition
import speech_recognition as sr
print(f"SpeechRecognition version: {sr.__version__}")

# Test imports
try:
    import librosa
    import opensmile
    import torch
    import torchaudio
    from transformers import Wav2Vec2Processor, Wav2Vec2Model
    import pandas as pd
    import numpy as np
    print("✅ All imports successful!")
except ImportError as e:
    print(f"❌ Import error: {e}")

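# Source code of the standalone processing script, held as a regular (non-raw)
# triple-quoted string. Backslashes inside it are doubled on purpose so that
# sequences such as "\\n" reach the saved file as "\n" escapes.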
processing_script = '''
# ADReSSo21 Dataset Processing - Simplified Version for Colab

import os
import pandas as pd
import numpy as np
import librosa
import warnings
warnings.filterwarnings('ignore')

class SimpleADReSSo21Processor:
    def __init__(self, base_path="/content/drive/MyDrive/Voice/extracted/ADReSSo21"):
        self.base_path = base_path

    def show_acoustic_features_info(self):
        """Step 1: Show each acoustic feature description"""
        features_info = {
            'eGeMAPS': {
                'description': 'Extended Geneva Minimalistic Acoustic Parameter Set',
                'features': 88,
                'includes': ['Frequency features', 'Energy features', 'Spectral features', 'Temporal features']
            },
            'TRILL': {
                'description': 'Triplet Loss Network for Universal Speech Representations',
                'features': 512,
                'includes': ['Self-supervised learned representations', 'Language-agnostic features']
            },
            'Allosaurus': {
                'description': 'Universal phonetic recognition features',
                'features': 'Variable',
                'includes': ['Phonetic transcriptions', 'Cross-lingual phonetic features']
            },
            'Wav2Vec2': {
                'description': 'Self-supervised speech representations',
                'features': 768,
                'includes': ['Contextualized speech representations', 'Transformer-based features']
            },
            'MFCCs': {
                'description': 'Mel-Frequency Cepstral Coefficients',
                'features': 52,
                'includes': ['13 MFCC coefficients × 4 statistics (mean, std, max, min)']
            },
            'Log-Mel': {
                'description': 'Log-Mel Spectrogram Features',
                'features': 320,
                'includes': ['80 mel-scale frequency bins × 4 statistics']
            },
            'Delta': {
                'description': 'Delta and Delta-Delta Features',
                'features': 52,
                'includes': ['First derivatives (velocity)', 'Second derivatives (acceleration)']
            }
        }

        print("🎵 ACOUSTIC FEATURES OVERVIEW 🎵\\n")
        print("="*60)

        for name, info in features_info.items():
            print(f"📊 {name}")
            print(f"   Description: {info['description']}")
            print(f"   Features: {info['features']}")
            print(f"   Includes: {', '.join(info['includes'])}")
            print("-"*40)

        return features_info

    def extract_basic_acoustic_features(self, audio_path):
        """Extract basic acoustic features that work reliably in Colab"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=16000)

            # Basic spectral features
            spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(audio)

            # MFCC features
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

            # Mel-spectrogram
            mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=40)
            log_mel = librosa.power_to_db(mel_spec, ref=np.max)

            # Delta features
            delta_mfccs = librosa.feature.delta(mfccs)
            delta2_mfccs = librosa.feature.delta(mfccs, order=2)

            # Compile features
            features = {}

            # Basic spectral statistics
            features.update({
                'spectral_centroid_mean': np.mean(spectral_centroids),
                'spectral_centroid_std': np.std(spectral_centroids),
                'spectral_rolloff_mean': np.mean(spectral_rolloff),
                'spectral_rolloff_std': np.std(spectral_rolloff),
                'spectral_bandwidth_mean': np.mean(spectral_bandwidth),
                'spectral_bandwidth_std': np.std(spectral_bandwidth),
                'zero_crossing_rate_mean': np.mean(zero_crossing_rate),
                'zero_crossing_rate_std': np.std(zero_crossing_rate),
            })

            # MFCC statistics
            for i in range(13):
                features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
                features[f'mfcc_{i}_std'] = np.std(mfccs[i])

            # Log-Mel statistics (first 20 bands)
            for i in range(20):
                features[f'logmel_{i}_mean'] = np.mean(log_mel[i])
                features[f'logmel_{i}_std'] = np.std(log_mel[i])

            # Delta MFCC statistics
            for i in range(13):
                features[f'delta_mfcc_{i}_mean'] = np.mean(delta_mfccs[i])
                features[f'delta2_mfcc_{i}_mean'] = np.mean(delta2_mfccs[i])

            return features

        except Exception as e:
            print(f"Error processing {audio_path}: {e}")
            return {}

    def simple_transcribe(self, audio_path):
        """Simple transcription placeholder - replace with actual ASR"""
        # This is a placeholder - in practice, you would use:
        # - Google Speech-to-Text API
        # - Whisper model
        # - Other ASR services

        filename = os.path.basename(audio_path)
        return f"[Transcription placeholder for {filename}]"

    def process_sample_files(self):
        """Process the sample files provided"""
        sample_files = [
            "/content/drive/MyDrive/Voice/extracted/ADReSSo21/diagnosis/train/audio/ad/adrso024.wav",
            "/content/drive/MyDrive/Voice/extracted/ADReSSo21/diagnosis/train/audio/cn/adrso002.wav",
            "/content/drive/MyDrive/Voice/extracted/ADReSSo21/progression/train/audio/decline/adrsp003.wav",
            "/content/drive/MyDrive/Voice/extracted/ADReSSo21/progression/train/audio/no_decline/adrsp001.wav",
            "/content/drive/MyDrive/Voice/extracted/ADReSSo21/progression/test-dist/audio/adrspt1.wav"
        ]

        results = []
        transcripts = []

        print("🎤 PROCESSING AUDIO FILES 🎤\\n")

        for audio_path in sample_files:
            if os.path.exists(audio_path):
                print(f"Processing: {os.path.basename(audio_path)}")

                # Extract features
                features = self.extract_basic_acoustic_features(audio_path)

                # Transcribe
                transcript = self.simple_transcribe(audio_path)

                # Determine task and label from path
                path_parts = audio_path.split('/')
                task = 'diagnosis' if 'diagnosis' in path_parts else 'progression'

                if 'ad' in path_parts:
                    label = 'ad'
                elif 'cn' in path_parts:
                    label = 'cn'
                elif 'decline' in path_parts:
                    label = 'decline'
                elif 'no_decline' in path_parts:
                    label = 'no_decline'
                else:
                    label = 'test'

                file_id = os.path.basename(audio_path).replace('.wav', '')

                # Compile result
                result = {
                    'file_id': file_id,
                    'task': task,
                    'label': label,
                    'audio_path': audio_path,
                    'transcript': transcript,
                    **features
                }

                results.append(result)

                # Save transcript info
                transcripts.append({
                    'file_id': file_id,
                    'task': task,
                    'label': label,
                    'transcript': transcript,
                    'audio_path': audio_path
                })

                print(f"✅ Completed: {file_id}")
                print(f"   Features extracted: {len(features)}")
                print(f"   Transcript: {transcript[:50]}...")
                print()
            else:
                print(f"❌ File not found: {audio_path}")

        return pd.DataFrame(results), pd.DataFrame(transcripts)

    def save_transcripts_to_files(self, transcripts_df, output_dir='/content'):
        """Step 3: Save transcripts to individual files"""
        os.makedirs(output_dir, exist_ok=True)

        saved_files = []

        print("💾 SAVING TRANSCRIPT FILES 💾\\n")

        for idx, row in transcripts_df.iterrows():
            filename = f"{row['file_id']}_transcript.txt"
            filepath = os.path.join(output_dir, filename)

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(f"File ID: {row['file_id']}\\n")
                f.write(f"Task: {row['task']}\\n")
                f.write(f"Label: {row['label']}\\n")
                f.write(f"Audio Path: {row['audio_path']}\\n")
                f.write(f"Transcript: {row['transcript']}\\n")

            saved_files.append(filepath)
            print(f"✅ Saved: {filename}")

        print(f"\\n📁 Total files saved: {len(saved_files)}")
        return saved_files

    def extract_linguistic_features(self, text):
        """Step 5: Extract linguistic features for BERT preparation"""
        # Basic text statistics
        words = text.split()
        sentences = [s.strip() for s in text.split('.') if s.strip()]

        features = {
            'text': text,  # Original text for BERT
            'word_count': len(words),
            'char_count': len(text),
            'sentence_count': len(sentences),
            'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
            'avg_sentence_length': np.mean([len(s.split()) for s in sentences]) if sentences else 0,
        }

        # Lexical diversity
        if words:
            unique_words = set(word.lower() for word in words)
            features['lexical_diversity'] = len(unique_words) / len(words)
            features['unique_words'] = len(unique_words)
        else:
            features['lexical_diversity'] = 0
            features['unique_words'] = 0

        # Simple complexity measures
        features['complexity_score'] = features['unique_words'] / features['sentence_count'] if features['sentence_count'] > 0 else 0

        # BERT preparation features
        features['bert_input'] = text  # Clean text for BERT tokenization
        features['bert_length'] = len(text.split())  # For sequence length planning

        return features

def run_complete_pipeline():
    """Run the complete ADReSSo21 processing pipeline"""

    print("🚀 ADReSSo21 DATASET PROCESSING PIPELINE 🚀\\n")
    print("="*60)

    # Initialize processor
    processor = SimpleADReSSo21Processor()

    # Step 0 & 1: Show acoustic features
    print("\\n📋 STEP 0-1: ACOUSTIC FEATURES OVERVIEW")
    features_info = processor.show_acoustic_features_info()

    # Step 2: Process files and extract transcripts
    print("\\n🎵 STEP 2: EXTRACTING FEATURES AND TRANSCRIPTS")
    results_df, transcripts_df = processor.process_sample_files()

    # Step 3: Save transcript files
    print("\\n💾 STEP 3: SAVING TRANSCRIPT FILES")
    transcript_files = processor.save_transcripts_to_files(transcripts_df)

    # Step 4: Show transcripts table
    print("\\n📊 STEP 4: TRANSCRIPTS TABLE")
    print("="*60)
    print(transcripts_df.to_string(index=False))

    # Step 5: Extract linguistic features for BERT
    print("\\n🤖 STEP 5: LINGUISTIC FEATURES FOR BERT")
    print("="*60)

    linguistic_features = []
    for idx, row in transcripts_df.iterrows():
        ling_feats = processor.extract_linguistic_features(row['transcript'])
        ling_feats['file_id'] = row['file_id']
        ling_feats['task'] = row['task']
        ling_feats['label'] = row['label']
        linguistic_features.append(ling_feats)

    linguistic_df = pd.DataFrame(linguistic_features)

    print("Linguistic Features Summary:")
    print("-" * 40)
    for col in ['word_count', 'sentence_count', 'lexical_diversity', 'complexity_score']:
        if col in linguistic_df.columns:
            print(f"{col}: mean={linguistic_df[col].mean():.2f}, std={linguistic_df[col].std():.2f}")

    print("\\nDetailed Linguistic Features:")
    print(linguistic_df[['file_id', 'task', 'label', 'word_count', 'sentence_count', 'lexical_diversity']].to_string(index=False))

    # Save all results
    print("\\n💾 SAVING RESULTS")
    print("="*30)

    # Save complete features
    results_df.to_csv('/content/complete_acoustic_features.csv', index=False)
    print("✅ Saved: /content/complete_acoustic_features.csv")

    # Save transcripts table
    transcripts_df.to_csv('/content/transcripts_table.csv', index=False)
    print("✅ Saved: /content/transcripts_table.csv")

    # Save linguistic features
    linguistic_df.to_csv('/content/linguistic_features_for_bert.csv', index=False)
    print("✅ Saved: /content/linguistic_features_for_bert.csv")

    # Create BERT-ready dataset
    bert_ready_df = linguistic_df[['file_id', 'task', 'label', 'bert_input', 'bert_length']].copy()
    bert_ready_df.to_csv('/content/bert_ready_dataset.csv', index=False)
    print("✅ Saved: /content/bert_ready_dataset.csv")

    print("\\n🎉 PIPELINE COMPLETED SUCCESSFULLY! 🎉")
    print("="*60)
    print("\\nSummary:")
    print(f"- Processed {len(results_df)} audio files")
    print(f"- Extracted {len([col for col in results_df.columns if col not in ['file_id', 'task', 'label', 'audio_path', 'transcript']])} acoustic features per file")
    print(f"- Created {len(transcript_files)} transcript files")
    print(f"- Generated linguistic features for BERT processing")
    print("\\nOutput files in /content/:")
    print("- complete_acoustic_features.csv")
    print("- transcripts_table.csv")
    print("- linguistic_features_for_bert.csv")
    print("- bert_ready_dataset.csv")
    print("- Individual transcript .txt files")

    return results_df, transcripts_df, linguistic_df

# Additional utility functions for BERT preparation
def prepare_bert_inputs(linguistic_df, max_length=512):
    """Prepare inputs specifically for BERT model"""

    bert_inputs = []

    for idx, row in linguistic_df.iterrows():
        # Clean and prepare text
        text = row['bert_input'].strip()

        # Truncate if too long (BERT has max sequence length)
        words = text.split()
        if len(words) > max_length - 2:  # Account for [CLS] and [SEP] tokens
            text = ' '.join(words[:max_length-2])

        bert_input = {
            'file_id': row['file_id'],
            'task': row['task'],
            'label': row['label'],
            'text': text,
            'length': len(text.split()),
            'ready_for_tokenization': True
        }

        bert_inputs.append(bert_input)

    return pd.DataFrame(bert_inputs)

def show_feature_statistics(results_df):
    """Show statistics for extracted acoustic features"""

    print("\\n📈 ACOUSTIC FEATURES STATISTICS")
    print("="*50)

    # Get feature columns (exclude metadata)
    feature_cols = [col for col in results_df.columns
                   if col not in ['file_id', 'task', 'label', 'audio_path', 'transcript']]

    print(f"Total acoustic features extracted: {len(feature_cols)}")
    print("\\nFeature categories:")

    categories = {
        'Spectral': [col for col in feature_cols if 'spectral' in col],
        'MFCC': [col for col in feature_cols if 'mfcc' in col and 'delta' not in col],
        'Log-Mel': [col for col in feature_cols if 'logmel' in col],
        'Delta': [col for col in feature_cols if 'delta' in col],
        'Other': [col for col in feature_cols if not any(cat in col for cat in ['spectral', 'mfcc', 'logmel', 'delta'])]
    }

    for category, cols in categories.items():
        if cols:
            print(f"- {category}: {len(cols)} features")

    # Show sample statistics
    print("\\nSample feature statistics (first 5 features):")
    sample_features = feature_cols[:5]
    for feat in sample_features:
        values = results_df[feat].dropna()
        if len(values) > 0:
            print(f"{feat}: mean={values.mean():.4f}, std={values.std():.4f}")

print("✅ Processing script created successfully!")
print("\\n🚀 To run the complete pipeline, execute:")
print("results_df, transcripts_df, linguistic_df = run_complete_pipeline()")
'''

# Save the processing script.
# The script text contains non-ASCII characters (emoji), so the encoding is
# stated explicitly instead of relying on the platform default.
with open('/content/adresso21_processor.py', 'w', encoding='utf-8') as f:
    f.write(processing_script)

# These prints are outside the script string, so a single backslash is needed
# for a real newline ("\\n" here would print a literal backslash followed by n).
print("\n📄 Main processing script saved as: /content/adresso21_processor.py")
print("\n🔧 SETUP COMPLETE! Ready to process ADReSSo21 dataset.")

Collecting opensmile
  Downloading opensmile-2.5.1-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.2.3-py3-none-any.whl.metadata (4.2 kB)
Collecting audeer>=2.1.1 (from audinterface>=0.7.0->opensmile)
  Downloading audeer-2.2.1-py3-none-any.whl.metadata (4.1 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.3.2-py3-none-any.whl.metadata (4.7 kB)
Collecting audiofile>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audiofile-1.5.1-py3-none-any.whl.metadata (4.9 kB)
Collecting audmath>=1.4.1 (from audinterface>=0.7.0->opensmile)
  Downloading audmath-1.4.1-py3-none-any.whl.metadata (3.6 kB)
Collecting audresample<2.0.0,>=1.1.0 (from audinterface>=0.7.0->opensmile)
  Downloading audresample-1.3.3-py3-none-manylinux_2_17_x86_64.whl.metadat

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
All dependencies installed successfully!
SpeechRecognition version: 3.14.3
✅ All imports successful!
\n📄 Main processing script saved as: /content/adresso21_processor.py
\n🔧 SETUP COMPLETE! Ready to process ADReSS

In [None]:
import os
import pandas as pd
import numpy as np
import librosa
import scipy.stats
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import opensmile
import speech_recognition as sr
from pydub import AudioSegment
import warnings
warnings.filterwarnings('ignore')

class ADReSSo21Processor:
    def __init__(self, base_path):
        """Store the dataset root and load the feature-extraction models.

        Args:
            base_path: Root directory of the ADReSSo21 dataset.
        """
        checkpoint = "facebook/wav2vec2-base-960h"

        self.base_path = base_path
        # Result containers; nothing in this constructor fills them.
        self.transcripts, self.acoustic_features, self.linguistic_features = {}, {}, {}

        # Wav2Vec2 input processor and encoder, both from the same checkpoint
        self.wav2vec2_processor = Wav2Vec2Processor.from_pretrained(checkpoint)
        self.wav2vec2_model = Wav2Vec2Model.from_pretrained(checkpoint)

        # openSMILE extractor configured for eGeMAPSv02 functionals
        smile_options = dict(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
        self.smile = opensmile.Smile(**smile_options)

    def get_file_paths(self):
        """Get all audio and segmentation file paths"""
        paths = {
            'diagnosis': {
                'train': {
                    'audio': {'ad': [], 'cn': []},
                    'segmentation': {'ad': [], 'cn': []}
                }
            },
            'progression': {
                'train': {
                    'audio': {'decline': [], 'no_decline': []},
                    'segmentation': {'decline': [], 'no_decline': []}
                },
                'test-dist': {
                    'audio': [],
                    'segmentation': []
                }
            }
        }

        # Populate paths based on directory structure
        for task in ['diagnosis', 'progression']:
            task_path = os.path.join(self.base_path, task)
            if task == 'diagnosis':
                for split in ['train']:
                    for data_type in ['audio', 'segmentation']:
                        for label in ['ad', 'cn']:
                            dir_path = os.path.join(task_path, split, data_type, label)
                            if os.path.exists(dir_path):
                                files = [f for f in os.listdir(dir_path) if f.endswith('.wav' if data_type == 'audio' else '.csv')]
                                paths[task][split][data_type][label] = [os.path.join(dir_path, f) for f in files]
            else:  # progression
                for split in ['train', 'test-dist']:
                    if split == 'train':
                        for data_type in ['audio', 'segmentation']:
                            for label in ['decline', 'no_decline']:
                                dir_path = os.path.join(task_path, split, data_type, label)
                                if os.path.exists(dir_path):
                                    files = [f for f in os.listdir(dir_path) if f.endswith('.wav' if data_type == 'audio' else '.csv')]
                                    paths[task][split][data_type][label] = [os.path.join(dir_path, f) for f in files]
                    else:  # test-dist
                        for data_type in ['audio', 'segmentation']:
                            dir_path = os.path.join(task_path, split, data_type)
                            if os.path.exists(dir_path):
                                files = [f for f in os.listdir(dir_path) if f.endswith('.wav' if data_type == 'audio' else '.csv')]
                                paths[task][split][data_type] = [os.path.join(dir_path, f) for f in files]

        return paths

    def extract_egmaps_features(self, audio_path):
        """Extract eGeMAPS features using openSMILE"""
        try:
            features = self.smile.process_file(audio_path)
            return features.values.flatten()
        except Exception as e:
            print(f"Error extracting eGeMAPS from {audio_path}: {e}")
            return np.zeros(88)  # eGeMAPS has 88 features

    def extract_trill_features(self, audio_path):
        """Extract TRILL features (placeholder - requires TensorFlow Hub)"""
        # Note: This would require tensorflow_hub and the TRILL model
        # For now, returning placeholder features
        try:
            audio, sr = librosa.load(audio_path, sr=16000)
            # Placeholder: extract spectral features as proxy
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
            return np.mean(mfccs, axis=1)
        except Exception as e:
            print(f"Error extracting TRILL from {audio_path}: {e}")
            return np.zeros(512)  # TRILL typically has 512 dimensions

    def extract_allsaurus_features(self, audio_path):
        """Extract Allosaurus features (placeholder - requires allosaurus library)"""
        try:
            audio, sr = librosa.load(audio_path, sr=16000)
            # Placeholder: extract phonetic-related features
            spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(audio)

            features = np.concatenate([
                np.mean(spectral_centroids),
                np.std(spectral_centroids),
                np.mean(spectral_rolloff),
                np.std(spectral_rolloff),
                np.mean(zero_crossing_rate),
                np.std(zero_crossing_rate)
            ])
            return features
        except Exception as e:
            print(f"Error extracting Allosaurus from {audio_path}: {e}")
            return np.zeros(6)

    def extract_wav2vec2_features(self, audio_path):
        """Extract Wav2Vec2 features"""
        try:
            audio, sr = torchaudio.load(audio_path)
            if sr != 16000:
                resampler = torchaudio.transforms.Resample(sr, 16000)
                audio = resampler(audio)

            # Process with Wav2Vec2
            inputs = self.wav2vec2_processor(audio.squeeze().numpy(),
                                           sampling_rate=16000,
                                           return_tensors="pt")

            with torch.no_grad():
                outputs = self.wav2vec2_model(**inputs)
                features = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

            return features
        except Exception as e:
            print(f"Error extracting Wav2Vec2 from {audio_path}: {e}")
            return np.zeros(768)  # Wav2Vec2 base has 768 dimensions

    def extract_mfcc_features(self, audio_path, n_mfcc=13):
        """Extract MFCC features"""
        try:
            audio, sr = librosa.load(audio_path, sr=16000)
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

            # Statistical features
            features = []
            features.extend(np.mean(mfccs, axis=1))
            features.extend(np.std(mfccs, axis=1))
            features.extend(np.max(mfccs, axis=1))
            features.extend(np.min(mfccs, axis=1))

            return np.array(features)
        except Exception as e:
            print(f"Error extracting MFCC from {audio_path}: {e}")
            return np.zeros(n_mfcc * 4)

    def extract_log_mel_features(self, audio_path, n_mels=80):
        """Extract Log-Mel spectrogram features"""
        try:
            audio, sr = librosa.load(audio_path, sr=16000)
            mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
            log_mel = librosa.power_to_db(mel_spec, ref=np.max)

            # Statistical features
            features = []
            features.extend(np.mean(log_mel, axis=1))
            features.extend(np.std(log_mel, axis=1))
            features.extend(np.max(log_mel, axis=1))
            features.extend(np.min(log_mel, axis=1))

            return np.array(features)
        except Exception as e:
            print(f"Error extracting Log-Mel from {audio_path}: {e}")
            return np.zeros(n_mels * 4)

    def extract_delta_features(self, audio_path):
        """Extract Delta and Delta-Delta MFCC features"""
        try:
            audio, sr = librosa.load(audio_path, sr=16000)
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

            # Delta features
            delta_mfccs = librosa.feature.delta(mfccs)
            delta2_mfccs = librosa.feature.delta(mfccs, order=2)

            # Statistical features for each
            features = []
            for feat in [delta_mfccs, delta2_mfccs]:
                features.extend(np.mean(feat, axis=1))
                features.extend(np.std(feat, axis=1))

            return np.array(features)
        except Exception as e:
            print(f"Error extracting Delta features from {audio_path}: {e}")
            return np.zeros(13 * 4)  # 13 deltas + 13 delta-deltas, mean+std each

    def extract_all_acoustic_features(self, audio_path):
        """Extract all acoustic features"""
        features = {}

        print(f"Processing: {os.path.basename(audio_path)}")

        features['eGeMAPS'] = self.extract_egmaps_features(audio_path)
        features['TRILL'] = self.extract_trill_features(audio_path)
        features['Allosaurus'] = self.extract_allsaurus_features(audio_path)
        features['Wav2Vec2'] = self.extract_wav2vec2_features(audio_path)
        features['MFCCs'] = self.extract_mfcc_features(audio_path)
        features['Log-Mel'] = self.extract_log_mel_features(audio_path)
        features['Delta'] = self.extract_delta_features(audio_path)

        return features

    def transcribe_audio(self, audio_path):
        """Transcribe audio using speech recognition"""
        try:
            # Convert to WAV if needed
            recognizer = sr.Recognizer()

            with sr.AudioFile(audio_path) as source:
                audio_data = recognizer.record(source)

            # Try Google Speech Recognition (free tier)
            try:
                transcript = recognizer.recognize_google(audio_data)
                return transcript
            except sr.UnknownValueError:
                return "Could not understand audio"
            except sr.RequestError as e:
                return f"Error with speech recognition service: {e}"

        except Exception as e:
            print(f"Error transcribing {audio_path}: {e}")
            return "Transcription failed"

    def process_dataset(self):
        """Process the entire dataset"""
        file_paths = self.get_file_paths()
        results = []

        # Process diagnosis task
        for split in ['train']:
            for label in ['ad', 'cn']:
                audio_files = file_paths['diagnosis'][split]['audio'][label]
                for audio_path in audio_files:
                    file_id = os.path.basename(audio_path).replace('.wav', '')

                    # Extract acoustic features
                    acoustic_feats = self.extract_all_acoustic_features(audio_path)

                    # Transcribe
                    transcript = self.transcribe_audio(audio_path)

                    results.append({
                        'file_id': file_id,
                        'task': 'diagnosis',
                        'split': split,
                        'label': label,
                        'audio_path': audio_path,
                        'transcript': transcript,
                        **{f'acoustic_{k}': v for k, v in acoustic_feats.items()}
                    })

        # Process progression task
        for split in ['train']:
            for label in ['decline', 'no_decline']:
                audio_files = file_paths['progression'][split]['audio'][label]
                for audio_path in audio_files:
                    file_id = os.path.basename(audio_path).replace('.wav', '')

                    # Extract acoustic features
                    acoustic_feats = self.extract_all_acoustic_features(audio_path)

                    # Transcribe
                    transcript = self.transcribe_audio(audio_path)

                    results.append({
                        'file_id': file_id,
                        'task': 'progression',
                        'split': split,
                        'label': label,
                        'audio_path': audio_path,
                        'transcript': transcript,
                        **{f'acoustic_{k}': v for k, v in acoustic_feats.items()}
                    })

        # Process test-dist
        audio_files = file_paths['progression']['test-dist']['audio']
        for audio_path in audio_files:
            file_id = os.path.basename(audio_path).replace('.wav', '')

            # Extract acoustic features
            acoustic_feats = self.extract_all_acoustic_features(audio_path)

            # Transcribe
            transcript = self.transcribe_audio(audio_path)

            results.append({
                'file_id': file_id,
                'task': 'progression',
                'split': 'test-dist',
                'label': 'unknown',
                'audio_path': audio_path,
                'transcript': transcript,
                **{f'acoustic_{k}': v for k, v in acoustic_feats.items()}
            })

        return pd.DataFrame(results)

    def save_transcripts(self, df, output_dir='/content'):
        """Save transcripts to files"""
        os.makedirs(output_dir, exist_ok=True)

        transcript_files = []
        for idx, row in df.iterrows():
            filename = f"{row['file_id']}_transcript.txt"
            filepath = os.path.join(output_dir, filename)

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(f"File ID: {row['file_id']}\n")
                f.write(f"Task: {row['task']}\n")
                f.write(f"Label: {row['label']}\n")
                f.write(f"Transcript: {row['transcript']}\n")

            transcript_files.append(filepath)

        return transcript_files

    def extract_linguistic_features(self, text):
        """Extract linguistic features for BERT preparation"""
        features = {}

        # Basic text statistics
        features['word_count'] = len(text.split())
        features['char_count'] = len(text)
        features['sentence_count'] = len([s for s in text.split('.') if s.strip()])
        features['avg_word_length'] = np.mean([len(word) for word in text.split()])

        # Lexical diversity
        words = text.lower().split()
        unique_words = set(words)
        features['lexical_diversity'] = len(unique_words) / len(words) if words else 0

        # Part-of-speech complexity (simplified)
        # This would typically require NLTK or spaCy
        features['complexity_score'] = len(unique_words) / features['sentence_count'] if features['sentence_count'] > 0 else 0

        return features

# Usage example
def main():
    """Run the ADReSSo21 pipeline end to end and persist its outputs.

    Prints an overview of the acoustic feature sets, processes the dataset,
    writes one transcript file per row, derives linguistic features from
    each transcript and saves three CSV files under ``/content``.

    Returns:
        Tuple ``(df, linguistic_df, transcript_df)``: the frame returned by
        ``process_dataset()``, the per-file linguistic features (including
        the raw text), and the transcript-only view of ``df``.
    """
    processor = ADReSSo21Processor("/content/drive/MyDrive/Voice/extracted/ADReSSo21")

    print("=== Step 0: Acoustic Features Overview ===")
    print("Available acoustic features:")
    feature_catalogue = {
        'eGeMAPS': 'Extended Geneva Minimalistic Acoustic Parameter Set (88 features)',
        'TRILL': 'Triplet Loss Network for speech representations (512 features)',
        'Allosaurus': 'Universal phonetic features (variable dimensions)',
        'Wav2Vec2': 'Self-supervised speech representations (768 features)',
        'MFCCs': 'Mel-Frequency Cepstral Coefficients (52 features: 13*4 statistics)',
        'Log-Mel': 'Log-Mel spectrogram features (320 features: 80*4 statistics)',
        'Delta': 'Delta and Delta-Delta MFCC features (52 features)',
    }
    for name, desc in feature_catalogue.items():
        print(f"- {name}: {desc}")

    print("\n=== Step 1: Processing Dataset ===")
    df = processor.process_dataset()

    # process_dataset() already produced the transcripts; this step only
    # announces them.
    print("\n=== Step 2: Extracting Transcripts ===")

    print("\n=== Step 3: Saving Transcript Files ===")
    transcript_files = processor.save_transcripts(df)
    print(f"Saved {len(transcript_files)} transcript files to /content/")

    print("\n=== Step 4: Transcripts Table ===")
    transcript_df = df[['file_id', 'task', 'label', 'transcript']].copy()
    print(transcript_df.head(10))

    print("\n=== Step 5: Linguistic Features for BERT ===")
    # One record per row: the computed statistics plus the identifiers and
    # the raw text that serves as BERT input.
    linguistic_rows = [
        {
            **processor.extract_linguistic_features(row['transcript']),
            'file_id': row['file_id'],
            'text': row['transcript'],
        }
        for _, row in df.iterrows()
    ]
    linguistic_df = pd.DataFrame(linguistic_rows)
    print("Linguistic features extracted:")
    print(linguistic_df.head())

    # Persist every result table without the index column.
    exports = (
        (df, '/content/complete_features.csv'),
        (linguistic_df, '/content/linguistic_features.csv'),
        (transcript_df, '/content/transcripts_table.csv'),
    )
    for frame, destination in exports:
        frame.to_csv(destination, index=False)

    print("\n=== Processing Complete ===")
    print("Files saved:")
    for summary_line in (
        "- /content/complete_features.csv (all features)",
        "- /content/linguistic_features.csv (for BERT)",
        "- /content/transcripts_table.csv (transcripts table)",
    ):
        print(summary_line)

    return df, linguistic_df, transcript_df

# Entry point: run the pipeline when this module is executed directly and
# bind the three objects returned by main() to module-level names.
if __name__ == "__main__":
    df, linguistic_df, transcript_df = main()

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]