<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/ADReSSo21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install SpeechRecognition pydub

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub, SpeechRecognition
Successfully installed SpeechRecognition-3.14.3 pydub-0.25.1


In [2]:
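# Step-by-Step Audio Transcript Extractor for ADReSSo21 Dataset
# This script will:
# 1. Mount Google Drive
# 2. Install the required packages
# 3. Set up paths and configuration
# 4. Extract dataset files
# 5. Find all WAV files
# 6. Define the audio preprocessing helpers
# 7. Define the speech-recognition function that extracts a transcript from audio
# 8. Process every audio file
# 9. Save organized transcripts
# 10. Print summary statistics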

import os
import tarfile
import pandas as pd
import numpy as np
from pathlib import Path
import librosa
import speech_recognition as sr
import soundfile as sf
from pydub import AudioSegment
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("ADReSSo21 AUDIO TRANSCRIPT EXTRACTOR")
print("="*60)

# STEP 1: MOUNT GOOGLE DRIVE
# Best-effort: outside Colab the import fails and the notebook carries on.
print("\nSTEP 1: Mounting Google Drive...")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("✓ Google Drive mounted successfully!")
except Exception as mount_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt /
    # SystemExit still stop the cell, and the real cause is shown to the user.
    print("⚠ Not running in Colab or Drive already mounted")
    print(f"  ({type(mount_error).__name__}: {mount_error})")

# STEP 2: INSTALL REQUIRED PACKAGES
print("\nSTEP 2: Installing required packages...")
print("Installing speech recognition and audio processing libraries...")

# Install packages (run once)
!pip install SpeechRecognition
!pip install pydub
!pip install librosa
!pip install soundfile
!apt-get install -y ffmpeg

print("✓ Packages ready (make sure to install them first)")

# STEP 3: SET UP PATHS AND CONFIGURATION
print("\nSTEP 3: Setting up paths and configuration...")

# All inputs and outputs live under a single Drive folder:
#   BASE_PATH    - where the downloaded .tgz archives are expected
#   EXTRACT_PATH - where the archives get unpacked
#   OUTPUT_PATH  - where the transcript CSV files are written
BASE_PATH = "/content/drive/MyDrive/Voice/"
EXTRACT_PATH = BASE_PATH + "extracted/"
OUTPUT_PATH = BASE_PATH + "transcripts/"

# Make sure both working directories exist before anything writes to them.
for _required_dir in (EXTRACT_PATH, OUTPUT_PATH):
    os.makedirs(_required_dir, exist_ok=True)

# Dataset key -> archive file name (looked up relative to BASE_PATH).
datasets = dict(
    progression_train='ADReSSo21-progression-train.tgz',
    progression_test='ADReSSo21-progression-test.tgz',
    diagnosis_train='ADReSSo21-diagnosis-train.tgz',
)

for _caption, _location in (
    ("Base path", BASE_PATH),
    ("Extract path", EXTRACT_PATH),
    ("Output path", OUTPUT_PATH),
):
    print(f"✓ {_caption}: {_location}")

# STEP 4: EXTRACT DATASET FILES
print("\nSTEP 4: Extracting dataset files...")

def extract_datasets():
    """Unpack every archive listed in ``datasets`` into ``EXTRACT_PATH``.

    Archive names are resolved relative to ``BASE_PATH``. A missing archive or
    a failed extraction is reported on stdout and the loop moves on to the
    next archive; nothing is raised and nothing is returned.
    """
    # Where the running Python supports extraction filters, use the 'data'
    # filter so an archive member cannot be written outside EXTRACT_PATH
    # (absolute paths, '..' components, links pointing outside, device files).
    # Older interpreters have no `filter` argument, so fall back to the plain call.
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    for filename in datasets.values():
        file_path = os.path.join(BASE_PATH, filename)

        if not os.path.exists(file_path):
            print(f"  ⚠ {filename} not found at {file_path}")
            continue

        print(f"  Extracting {filename}...")
        try:
            with tarfile.open(file_path, 'r:gz') as tar:
                tar.extractall(path=EXTRACT_PATH, **extract_kwargs)
            print(f"  ✓ {filename} extracted successfully")
        except Exception as e:
            # Best-effort by design: one broken archive must not stop the others.
            print(f"  ⚠ Error extracting {filename}: {e}")

# Runs the extraction immediately, as a module-level side effect of this cell.
extract_datasets()

# STEP 5: FIND ALL WAV FILES
print("\nSTEP 5: Finding all WAV files...")

def _list_wav_files(folder, description):
    """Return the sorted full paths of the ``.wav`` files directly inside ``folder``.

    Prints a one-line count using ``description``. A folder that does not
    exist (or is not a directory) yields an empty list and prints nothing.
    """
    if not os.path.isdir(folder):
        return []

    # sorted(): os.listdir() order is arbitrary, and a stable order keeps the
    # resulting CSV rows reproducible between runs.
    # Skip '*_temp.wav' scratch copies that an interrupted transcription run
    # may have left beside the recordings, so they are not indexed as data.
    names = sorted(
        f for f in os.listdir(folder)
        if f.endswith('.wav') and not f.endswith('_temp.wav')
    )
    print(f"  Found {len(names)} {description} WAV files")
    return [os.path.join(folder, f) for f in names]


def find_wav_files():
    """Find all WAV files and organize by dataset and label.

    Returns:
        dict with three keys:
          * ``'progression_train'`` -> ``{'decline': [...], 'no_decline': [...]}``
          * ``'progression_test'``  -> flat list (this split has no label folders)
          * ``'diagnosis_train'``   -> ``{'ad': [...], 'cn': [...]}``
        Every list holds full paths; a folder that is absent under
        ``EXTRACT_PATH`` contributes an empty list.
    """
    prog_train_base = os.path.join(EXTRACT_PATH, "ADReSSo21/progression/train/audio/")
    prog_test_path = os.path.join(EXTRACT_PATH, "ADReSSo21/progression/test-dist/audio/")
    diag_train_base = os.path.join(EXTRACT_PATH, "ADReSSo21/diagnosis/train/audio/")

    # Listing order below fixes the order of the "Found ..." log lines.
    return {
        'progression_train': {
            'decline': _list_wav_files(os.path.join(prog_train_base, "decline/"), "decline"),
            'no_decline': _list_wav_files(os.path.join(prog_train_base, "no_decline/"), "no_decline"),
        },
        'progression_test': _list_wav_files(prog_test_path, "test"),
        'diagnosis_train': {
            'ad': _list_wav_files(os.path.join(diag_train_base, "ad/"), "AD"),
            'cn': _list_wav_files(os.path.join(diag_train_base, "cn/"), "CN"),
        },
    }

# Module-level index of the discovered audio paths; passed to process_audio_files() later in this cell.
wav_files = find_wav_files()

# STEP 6: AUDIO PREPROCESSING FUNCTIONS
print("\nSTEP 6: Setting up audio preprocessing...")

def preprocess_audio(audio_path, target_sr=16000):
    """Load an audio file and clean it up ahead of speech recognition.

    The file is loaded through librosa at ``target_sr`` Hz, normalized with
    ``librosa.util.normalize`` and passed through ``librosa.effects.trim``
    using a 20 dB threshold.

    Args:
        audio_path: path of the audio file to load.
        target_sr: sample rate requested from librosa (default 16000).

    Returns:
        ``(samples, target_sr)`` on success, or ``(None, None)`` if any step
        raised; the failure is printed rather than propagated.
    """
    try:
        waveform, _ = librosa.load(audio_path, sr=target_sr)
        trimmed, _ = librosa.effects.trim(librosa.util.normalize(waveform), top_db=20)
    except Exception as err:
        print(f"    Error preprocessing {audio_path}: {err}")
        return None, None

    return trimmed, target_sr

def convert_to_wav_if_needed(audio_path):
    """Convert audio to WAV format if needed.

    Args:
        audio_path: path of the input audio file.

    Returns:
        ``audio_path`` unchanged when it already has a ``.wav`` extension
        (compared case-insensitively, so ``.WAV`` is not re-encoded), otherwise
        the path of a newly exported ``<stem>_converted.wav`` written next to
        the input. If anything fails, the error is printed and the original
        ``audio_path`` is returned so the caller can still attempt to use it.
    """
    try:
        if audio_path.lower().endswith('.wav'):
            return audio_path

        # Convert using pydub
        audio = AudioSegment.from_file(audio_path)
        # splitext only strips an extension from the final path component, so a
        # dot in a directory name (".../take.v2/clip") cannot truncate the path.
        wav_path = os.path.splitext(audio_path)[0] + '_converted.wav'
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"    Error converting {audio_path}: {e}")
        return audio_path

# STEP 7: SPEECH RECOGNITION FUNCTION
print("\nSTEP 7: Setting up speech recognition...")

def extract_transcript_from_audio(audio_path, method='google'):
    """Extract transcript from audio file using speech recognition.

    Pipeline: convert to WAV if needed -> preprocess -> write the cleaned
    samples to a scratch WAV -> run the recognizer(s) on the whole file.

    Args:
        audio_path: path of the recording to transcribe.
        method: ``'google'`` (default) tries Google first and falls back to
            Sphinx; any other value goes straight to Sphinx.

    Returns:
        ``(transcript, engine)`` on success, where ``engine`` is the name of
        the recognizer that produced the text (``method`` or ``'sphinx'``);
        ``(None, error_message)`` otherwise. Never raises for per-file
        failures - they are folded into ``error_message``.
    """
    import tempfile  # function-scope import: the file's import block does not bring it in

    recognizer = sr.Recognizer()
    temp_wav = None

    try:
        # Convert to WAV if needed
        wav_path = convert_to_wav_if_needed(audio_path)

        # Preprocess audio
        audio_data, sr_rate = preprocess_audio(wav_path, target_sr=16000)

        if audio_data is None:
            return None, "Preprocessing failed"

        # Save preprocessed audio to a scratch file in the system temp dir.
        # Deriving the name with audio_path.replace('.wav', ...) is unsafe: for a
        # path without '.wav' it yields audio_path itself, so the source
        # recording would be overwritten and then deleted during cleanup.
        fd, temp_wav = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        sf.write(temp_wav, audio_data, sr_rate)

        # record() reads the file to the end. listen() returns after the first
        # detected phrase, which truncates recordings that contain pauses, and
        # an ambient-noise calibration pass would consume the first 0.5 s of
        # (already silence-trimmed) speech.
        # NOTE(review): very long recordings may exceed what the Google web
        # endpoint accepts in one request - confirm on the longest files.
        with sr.AudioFile(temp_wav) as source:
            audio = recognizer.record(source)

        # Try different recognition methods
        transcript = None
        errors = []

        if method == 'google':
            try:
                transcript = recognizer.recognize_google(audio)
            except sr.UnknownValueError:
                errors.append("Google Speech Recognition could not understand audio")
            except sr.RequestError as e:
                errors.append(f"Google Speech Recognition error: {e}")

        # Fallback to other methods if Google fails
        if transcript is None:
            try:
                transcript = recognizer.recognize_sphinx(audio)
                method = 'sphinx'
            except sr.UnknownValueError:
                errors.append("Sphinx could not understand audio")
            except sr.RequestError as e:
                errors.append(f"Sphinx error: {e}")

        # Strip before testing so a whitespace-only result counts as a failure.
        cleaned = transcript.strip() if transcript else ""
        if cleaned:
            return cleaned, method
        return None, "; ".join(errors) or "Recognizer returned an empty transcript"

    except Exception as e:
        return None, f"Error processing audio: {str(e)}"
    finally:
        # Clean up the scratch file on every exit path, including exceptions.
        if temp_wav and os.path.exists(temp_wav):
            try:
                os.remove(temp_wav)
            except OSError:
                pass

# STEP 8: PROCESS ALL AUDIO FILES AND EXTRACT TRANSCRIPTS
print("\nSTEP 8: Processing audio files and extracting transcripts...")
print("This may take a while depending on the number and length of audio files...")

def _transcribe_group(files, dataset, label):
    """Transcribe one list of audio paths and return one result record per file."""
    records = []
    total = len(files)
    print(f"    Processing {total} {label} files...")

    for position, audio_path in enumerate(files, start=1):
        print(f"      Processing {position}/{total}: {os.path.basename(audio_path)}")

        transcript, method_or_error = extract_transcript_from_audio(audio_path)

        # A single flag drives every outcome field, so 'success',
        # 'recognition_method' and 'error' cannot disagree with one another.
        succeeded = bool(transcript)

        records.append({
            'file_id': os.path.splitext(os.path.basename(audio_path))[0],
            'file_path': audio_path,
            'dataset': dataset,
            'label': label,
            'transcript': transcript if succeeded else None,
            'recognition_method': method_or_error if succeeded else None,
            'error': None if succeeded else method_or_error,
            'success': succeeded
        })

    return records


def process_audio_files(wav_files):
    """Process all audio files and extract transcripts.

    Args:
        wav_files: mapping with the keys ``'progression_train'``
            (``{'decline': [...], 'no_decline': [...]}``), ``'progression_test'``
            (flat list) and ``'diagnosis_train'`` (``{'ad': [...], 'cn': [...]}``),
            each list holding audio file paths.

    Returns:
        A list of dicts, one per audio file, in the order progression train
        (decline, no_decline), progression test, diagnosis train (ad, cn).
        Keys: file_id, file_path, dataset, label, transcript,
        recognition_method, error, success. Test files carry the label 'test'.
    """
    all_transcripts = []

    # Process progression training data
    print("\n  Processing progression training data...")
    for label in ['decline', 'no_decline']:
        all_transcripts.extend(
            _transcribe_group(wav_files['progression_train'][label], 'progression_train', label)
        )

    # Process progression test data
    print("\n  Processing progression test data...")
    all_transcripts.extend(
        _transcribe_group(wav_files['progression_test'], 'progression_test', 'test')
    )

    # Process diagnosis training data
    print("\n  Processing diagnosis training data...")
    for label in ['ad', 'cn']:
        all_transcripts.extend(
            _transcribe_group(wav_files['diagnosis_train'][label], 'diagnosis_train', label)
        )

    return all_transcripts

# Process all files
transcripts = process_audio_files(wav_files)

# STEP 9: SAVE RESULTS
print("\nSTEP 9: Saving transcription results...")

# Convert to DataFrame.
# The columns are spelled out (in the same order as the record keys) so that an
# empty result list still yields a frame with a 'success' / 'dataset' column;
# pd.DataFrame([]) has no columns and the filters below would raise KeyError.
TRANSCRIPT_COLUMNS = [
    'file_id', 'file_path', 'dataset', 'label',
    'transcript', 'recognition_method', 'error', 'success',
]
df = pd.DataFrame(transcripts, columns=TRANSCRIPT_COLUMNS)

# Save complete results
complete_output = os.path.join(OUTPUT_PATH, "all_transcripts.csv")
df.to_csv(complete_output, index=False)
print(f"✓ Saved complete results to: {complete_output}")

# Save successful transcripts only
successful_df = df[df['success'] == True].copy()
success_output = os.path.join(OUTPUT_PATH, "successful_transcripts.csv")
successful_df.to_csv(success_output, index=False)
print(f"✓ Saved successful transcripts to: {success_output}")

# Save by dataset: one "<dataset>_transcripts.csv" per distinct dataset value
datasets_to_save = df['dataset'].unique()
for dataset in datasets_to_save:
    dataset_df = df[df['dataset'] == dataset].copy()
    dataset_output = os.path.join(OUTPUT_PATH, f"{dataset}_transcripts.csv")
    dataset_df.to_csv(dataset_output, index=False)
    print(f"✓ Saved {dataset} transcripts to: {dataset_output}")

# STEP 10: DISPLAY SUMMARY STATISTICS
print("\nSTEP 10: Summary Statistics")
print("="*50)

def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0

total_files = len(df)
successful = len(successful_df)
failed = total_files - successful

# _percent() guards the case where no audio file was found at all, which would
# otherwise abort the summary with ZeroDivisionError.
print(f"Total audio files processed: {total_files}")
print(f"Successful transcriptions: {successful} ({_percent(successful, total_files):.1f}%)")
print(f"Failed transcriptions: {failed} ({_percent(failed, total_files):.1f}%)")

print(f"\nDataset breakdown:")
for dataset in df['dataset'].unique():
    dataset_total = len(df[df['dataset'] == dataset])
    dataset_success = len(df[(df['dataset'] == dataset) & (df['success'] == True)])
    print(f"  {dataset}: {dataset_success}/{dataset_total} successful ({_percent(dataset_success, dataset_total):.1f}%)")

print(f"\nLabel distribution (successful transcripts only):")
if not successful_df.empty:
    print(successful_df['label'].value_counts())

print(f"\nRecognition methods used:")
if not successful_df.empty:
    print(successful_df['recognition_method'].value_counts())

# Show sample transcripts (first 200 characters of up to three of them)
print(f"\nSample successful transcripts:")
sample_transcripts = successful_df['transcript'].dropna().head(3)
for i, transcript in enumerate(sample_transcripts):
    print(f"  Sample {i+1}: {transcript[:200]}...")

# Show common errors (five most frequent messages)
print(f"\nMost common errors:")
error_df = df[df['success'] == False]
if not error_df.empty:
    error_counts = error_df['error'].value_counts().head(5)
    for error, count in error_counts.items():
        print(f"  {error}: {count} files")

print("\n" + "="*60)
print("TRANSCRIPT EXTRACTION COMPLETE!")
print(f"All results saved in: {OUTPUT_PATH}")
print("="*60)

ADReSSo21 AUDIO TRANSCRIPT EXTRACTOR

STEP 1: Mounting Google Drive...
Mounted at /content/drive
✓ Google Drive mounted successfully!

STEP 2: Installing required packages...
Installing speech recognition and audio processing libraries...
✓ Packages ready (make sure to install them first)

STEP 3: Setting up paths and configuration...
✓ Base path: /content/drive/MyDrive/Voice/
✓ Extract path: /content/drive/MyDrive/Voice/extracted/
✓ Output path: /content/drive/MyDrive/Voice/transcripts/

STEP 4: Extracting dataset files...
  Extracting ADReSSo21-progression-train.tgz...
  ✓ ADReSSo21-progression-train.tgz extracted successfully
  Extracting ADReSSo21-progression-test.tgz...
  ✓ ADReSSo21-progression-test.tgz extracted successfully
  Extracting ADReSSo21-diagnosis-train.tgz...
  ✓ ADReSSo21-diagnosis-train.tgz extracted successfully

STEP 5: Finding all WAV files...
  Found 15 decline WAV files
  Found 58 no_decline WAV files
  Found 32 test WAV files
  Found 87 AD WAV files
  Found 79