<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/ADReSSo21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tarfile
import pandas as pd
import numpy as np
from pathlib import Path
import shutil

class ADReSSo21TranscriptExtractor:
    """Unpack the ADReSSo21 archives and gather their segmentation CSV files.

    The archives are looked up in ``base_path`` and unpacked into
    ``<base_path>/extracted``. Every CSV found below the known segmentation
    folders is loaded into a DataFrame, tagged with ``file_id``, ``file_path``
    and ``label`` columns, and the combined result is written back to
    ``base_path`` as ``<dataset_name>_transcripts.csv``.
    """

    # Order matters: latin-1 maps every byte to a character and therefore
    # never raises UnicodeDecodeError, so it has to be the last resort.
    _ENCODINGS = ('utf-8', 'cp1252', 'latin-1')

    def __init__(self, base_path="/drive/MyDrive/Voice/"):
        """Remember where the archives live and where they get unpacked.

        Args:
            base_path: Directory that holds the ``.tgz`` archives and
                receives the combined CSV output.
        """
        self.base_path = base_path
        self.datasets = {
            'progression_train': 'ADReSSo21-progression-train.tgz',
            'progression_test': 'ADReSSo21-progression-test.tgz',
            'diagnosis_train': 'ADReSSo21-diagnosis-train.tgz'
        }
        self.extracted_path = os.path.join(base_path, "extracted")

    def _extract_all(self, tar):
        """Unpack an open archive into ``self.extracted_path``.

        Uses the ``'data'`` extraction filter when the running Python
        provides it, so members cannot be written outside the destination.
        """
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=self.extracted_path, filter='data')
        else:
            tar.extractall(path=self.extracted_path)

    def extract_datasets(self):
        """Extract all tgz files to the extracted directory.

        Missing or unreadable archives are reported and skipped so that the
        remaining archives are still processed.
        """
        print("Extracting datasets...")

        os.makedirs(self.extracted_path, exist_ok=True)

        for filename in self.datasets.values():
            file_path = os.path.join(self.base_path, filename)

            if not os.path.exists(file_path):
                print(f"⚠ Warning: {filename} not found at {file_path}")
                continue

            print(f"Extracting {filename}...")
            try:
                with tarfile.open(file_path, 'r:gz') as tar:
                    self._extract_all(tar)
            except (tarfile.TarError, OSError, EOFError) as e:
                print(f"⚠ Error extracting {filename}: {e}")
                continue
            print(f"✓ {filename} extracted successfully")

    def find_csv_files(self, directory):
        """Recursively find all CSV files in a directory.

        Args:
            directory: Root folder to walk.

        Returns:
            Sorted list of paths whose name ends in ``.csv`` (any case).
            Sorting keeps the result independent of filesystem order.
        """
        return sorted(
            os.path.join(root, name)
            for root, _dirs, names in os.walk(directory)
            for name in names
            if name.lower().endswith('.csv')
        )

    def read_transcript_csv(self, csv_path):
        """Read a single CSV transcript file and tag it with its origin.

        Args:
            csv_path: Path of the CSV file.

        Returns:
            A DataFrame with added ``file_id`` (file name without extension)
            and ``file_path`` columns, or ``None`` when the file cannot be
            read; the reason is printed.
        """
        try:
            for encoding in self._ENCODINGS:
                try:
                    df = pd.read_csv(csv_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                print(f"⚠ Could not read {csv_path} with any encoding")
                return None

            df['file_id'] = os.path.splitext(os.path.basename(csv_path))[0]
            df['file_path'] = csv_path
            return df

        except Exception as e:
            # Best effort: one malformed file must not stop the whole run.
            print(f"Error reading {csv_path}: {str(e)}")
            return None

    def _collect_labelled(self, directory, label, description):
        """Load every CSV below ``directory`` and tag each frame with ``label``.

        Args:
            directory: Folder to search; a missing folder yields ``[]``.
            label: Value stored in the ``label`` column of each frame.
            description: Folder name used in the progress message.

        Returns:
            List of DataFrames, one per readable CSV file.
        """
        frames = []
        if not os.path.exists(directory):
            return frames

        csv_files = self.find_csv_files(directory)
        print(f"Found {len(csv_files)} CSV files in {description} directory")

        for csv_file in csv_files:
            df = self.read_transcript_csv(csv_file)
            if df is not None:
                df['label'] = label
                frames.append(df)
        return frames

    def extract_progression_transcripts(self):
        """Extract transcripts from the progression datasets.

        Returns:
            ``{'train': {'decline': [...], 'no_decline': [...]}, 'test': [...]}``
            where every list holds one DataFrame per CSV file.
        """
        train_path = os.path.join(self.extracted_path, "ADReSSo21/progression/train/segmentation")
        test_path = os.path.join(self.extracted_path, "ADReSSo21/progression/test-dist/segmentation")

        return {
            'train': {
                label: self._collect_labelled(os.path.join(train_path, label), label, label)
                for label in ('decline', 'no_decline')
            },
            'test': self._collect_labelled(test_path, 'test', 'test'),
        }

    def extract_diagnosis_transcripts(self):
        """Extract transcripts from the diagnosis dataset.

        Returns:
            ``{'ad': [...], 'cn': [...]}`` where every list holds one
            DataFrame per CSV file.
        """
        base_path = os.path.join(self.extracted_path, "ADReSSo21/diagnosis/train/segmentation")

        return {
            label: self._collect_labelled(os.path.join(base_path, label), label, label.upper())
            for label in ('ad', 'cn')
        }

    def combine_and_save_transcripts(self, transcripts, dataset_name):
        """Combine transcript dataframes and save them to CSV.

        Args:
            transcripts: Result of ``extract_progression_transcripts`` or
                ``extract_diagnosis_transcripts``.
            dataset_name: ``'progression'`` or ``'diagnosis'``; selects the
                expected layout of ``transcripts`` and the output file name.

        Returns:
            The combined DataFrame, or ``None`` when there is nothing to
            combine (including an unrecognised ``dataset_name``).
        """
        if dataset_name == 'progression':
            groups = [
                transcripts['train']['decline'],
                transcripts['train']['no_decline'],
                transcripts['test'],
            ]
        elif dataset_name == 'diagnosis':
            groups = [transcripts['ad'], transcripts['cn']]
        else:
            groups = []

        frames = [frame for group in groups for frame in group]
        if not frames:
            return None

        final_df = pd.concat(frames, ignore_index=True)

        output_path = os.path.join(self.base_path, f"{dataset_name}_transcripts.csv")
        final_df.to_csv(output_path, index=False)
        print(f"✓ Saved {len(final_df)} transcript records to {output_path}")

        return final_df

    def display_sample_data(self, df, dataset_name):
        """Print record count, label distribution, columns and sample rows.

        Args:
            df: Combined transcript DataFrame.
            dataset_name: Name shown in the summary header.
        """
        print(f"\n=== {dataset_name.upper()} DATASET SUMMARY ===")
        print(f"Total records: {len(df)}")

        if 'label' in df.columns:
            print("\nLabel distribution:")
            print(df['label'].value_counts())

        print(f"\nColumns: {list(df.columns)}")

        print(f"\nSample data:")
        print(df.head())

        # Preview the first column whose name suggests it holds text.
        text_columns = [
            col for col in df.columns
            if any(key in col.lower() for key in ('text', 'transcript', 'word'))
        ]
        if text_columns:
            print(f"\nSample transcript content from column '{text_columns[0]}':")
            for i, text in enumerate(df[text_columns[0]].dropna().head(3)):
                print(f"Sample {i+1}: {str(text)[:200]}...")

    @staticmethod
    def _banner(title):
        """Print ``title`` framed by two separator lines."""
        print("\n" + "="*50)
        print(title)
        print("="*50)

    def run_extraction(self):
        """Run the complete extraction process.

        Returns:
            Tuple ``(progression_df, diagnosis_df)``; either element is
            ``None`` when no transcripts were found for that dataset.
        """
        print("Starting ADReSSo21 transcript extraction...")

        self.extract_datasets()

        self._banner("EXTRACTING PROGRESSION TRANSCRIPTS")
        progression_transcripts = self.extract_progression_transcripts()
        progression_df = self.combine_and_save_transcripts(progression_transcripts, 'progression')
        if progression_df is not None:
            self.display_sample_data(progression_df, 'progression')

        self._banner("EXTRACTING DIAGNOSIS TRANSCRIPTS")
        diagnosis_transcripts = self.extract_diagnosis_transcripts()
        diagnosis_df = self.combine_and_save_transcripts(diagnosis_transcripts, 'diagnosis')
        if diagnosis_df is not None:
            self.display_sample_data(diagnosis_df, 'diagnosis')

        self._banner("EXTRACTION COMPLETE!")

        return progression_df, diagnosis_df

# Usage: run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    # Uses the default base path declared in ADReSSo21TranscriptExtractor.__init__
    extractor = ADReSSo21TranscriptExtractor()

    # Unpacks the archives, collects the CSVs and writes the combined files;
    # either returned frame is None when nothing was found for that dataset.
    progression_df, diagnosis_df = extractor.run_extraction()

    # Optional: the individual stages can also be called on their own, e.g.
    # extractor.extract_datasets()
    # progression_transcripts = extractor.extract_progression_transcripts()
    # diagnosis_transcripts = extractor.extract_diagnosis_transcripts()

Starting ADReSSo21 transcript extraction...
Extracting datasets...

EXTRACTING PROGRESSION TRANSCRIPTS

EXTRACTING DIAGNOSIS TRANSCRIPTS

EXTRACTION COMPLETE!


In [2]:
# Simple step-by-step transcript extraction for ADReSSo21 dataset
import os
import tarfile
import pandas as pd

# Folder holding the .tgz archives; archives are unpacked into its "extracted/" subfolder
BASE_PATH = "/drive/MyDrive/Voice/"
EXTRACT_PATH = os.path.join(BASE_PATH, "extracted/")

# Step 1: Extract the datasets
def extract_all_datasets(base_path=None, extract_path=None):
    """Extract the three ADReSSo21 tgz archives.

    Args:
        base_path: Directory containing the archives. Defaults to the
            module-level ``BASE_PATH``.
        extract_path: Destination directory, created when missing. Defaults
            to the module-level ``EXTRACT_PATH``.

    A missing or unreadable archive is reported and skipped so the
    remaining archives are still extracted.
    """
    base_path = BASE_PATH if base_path is None else base_path
    extract_path = EXTRACT_PATH if extract_path is None else extract_path

    datasets = [
        'ADReSSo21-progression-train.tgz',
        'ADReSSo21-progression-test.tgz',
        'ADReSSo21-diagnosis-train.tgz'
    ]

    os.makedirs(extract_path, exist_ok=True)

    for dataset in datasets:
        file_path = os.path.join(base_path, dataset)
        if not os.path.exists(file_path):
            print(f"⚠ {dataset} not found")
            continue

        print(f"Extracting {dataset}...")
        try:
            with tarfile.open(file_path, 'r:gz') as tar:
                # The 'data' filter (when this Python has it) refuses members
                # that would be written outside the destination directory.
                if hasattr(tarfile, 'data_filter'):
                    tar.extractall(path=extract_path, filter='data')
                else:
                    tar.extractall(path=extract_path)
        except (tarfile.TarError, OSError, EOFError) as e:
            print(f"⚠ Error extracting {dataset}: {e}")
            continue
        print(f"✓ Done")

# Step 2: Read all CSV files from a directory
def read_all_csvs_from_directory(directory_path, label=None):
    """Read every CSV file below a directory and combine them.

    Args:
        directory_path: Folder that is walked recursively.
        label: Optional value stored in a ``label`` column of every row.

    Returns:
        One DataFrame with the rows of all readable files plus ``file_id``
        (file name without extension) and ``file_path`` columns. An empty
        DataFrame is returned when the folder is missing or nothing could
        be read.
    """
    if not os.path.exists(directory_path):
        print(f"Directory not found: {directory_path}")
        return pd.DataFrame()

    # Sorted so the row order does not depend on filesystem listing order.
    csv_files = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory_path)
        for name in names
        if name.endswith('.csv')
    )

    print(f"Found {len(csv_files)} CSV files in {directory_path}")

    all_data = []
    for csv_file in csv_files:
        try:
            # latin-1 accepts any byte sequence, so it must come last;
            # otherwise the cp1252 attempt can never be reached.
            for encoding in ('utf-8', 'cp1252', 'latin-1'):
                try:
                    df = pd.read_csv(csv_file, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                # Never fall through with a frame left over from another file.
                print(f"Error reading {csv_file}: no supported encoding")
                continue

            df['file_id'] = os.path.splitext(os.path.basename(csv_file))[0]
            df['file_path'] = csv_file
            if label is not None:
                df['label'] = label

            all_data.append(df)

        except Exception as e:
            # Best effort: a malformed file is reported and skipped.
            print(f"Error reading {csv_file}: {e}")

    if not all_data:
        return pd.DataFrame()
    return pd.concat(all_data, ignore_index=True)

# Step 3: Extract transcripts step by step


def _merge_label_frames(first, second):
    """Stack two per-label frames, skipping whichever one is empty."""
    populated = [frame for frame in (first, second) if not frame.empty]
    if len(populated) == 2:
        return pd.concat(populated, ignore_index=True)
    if populated:
        return populated[0]
    return pd.DataFrame()


print("Step 1: Extracting datasets...")
extract_all_datasets()

print("\nStep 2: Reading progression training data...")
# Progression training: one folder per outcome label
decline_path = os.path.join(EXTRACT_PATH, "ADReSSo21/progression/train/segmentation/decline/")
decline_df = read_all_csvs_from_directory(decline_path, label='decline')

no_decline_path = os.path.join(EXTRACT_PATH, "ADReSSo21/progression/train/segmentation/no_decline/")
no_decline_df = read_all_csvs_from_directory(no_decline_path, label='no_decline')

progression_train_df = _merge_label_frames(decline_df, no_decline_df)
print(f"Progression training data: {len(progression_train_df)} records")

print("\nStep 3: Reading progression test data...")
test_path = os.path.join(EXTRACT_PATH, "ADReSSo21/progression/test-dist/segmentation/")
progression_test_df = read_all_csvs_from_directory(test_path, label='test')
print(f"Progression test data: {len(progression_test_df)} records")

print("\nStep 4: Reading diagnosis data...")
# Diagnosis training: "ad" and "cn" folders
ad_path = os.path.join(EXTRACT_PATH, "ADReSSo21/diagnosis/train/segmentation/ad/")
ad_df = read_all_csvs_from_directory(ad_path, label='ad')

cn_path = os.path.join(EXTRACT_PATH, "ADReSSo21/diagnosis/train/segmentation/cn/")
cn_df = read_all_csvs_from_directory(cn_path, label='cn')

diagnosis_df = _merge_label_frames(ad_df, cn_df)
print(f"Diagnosis data: {len(diagnosis_df)} records")

print("\nStep 5: Saving results...")

# Only non-empty frames are written, in the same order as they were built
for _frame, _filename in (
    (progression_train_df, "progression_train_transcripts.csv"),
    (progression_test_df, "progression_test_transcripts.csv"),
    (diagnosis_df, "diagnosis_transcripts.csv"),
):
    if _frame.empty:
        continue
    _frame.to_csv(os.path.join(BASE_PATH, _filename), index=False)
    print(f"✓ Saved {_filename}")

print("\nStep 6: Summary")
print("="*50)

# The test split carries a single constant label, so its distribution is not shown
for _title, _frame, _show_labels in (
    ("Progression Training Data", progression_train_df, True),
    ("Progression Test Data", progression_test_df, False),
    ("Diagnosis Data", diagnosis_df, True),
):
    if _frame.empty:
        continue
    print(f"\n{_title}:")
    print(f"Total records: {len(_frame)}")
    print(f"Columns: {list(_frame.columns)}")
    if _show_labels and 'label' in _frame.columns:
        print("Label distribution:")
        print(_frame['label'].value_counts())
    print("\nSample data:")
    print(_frame.head(3))

print("\n" + "="*50)
print("EXTRACTION COMPLETE!")
print("Your transcript files are saved in:", BASE_PATH)
print("="*50)

Step 1: Extracting datasets...
⚠ ADReSSo21-progression-train.tgz not found
⚠ ADReSSo21-progression-test.tgz not found
⚠ ADReSSo21-diagnosis-train.tgz not found

Step 2: Reading progression training data...
Directory not found: /drive/MyDrive/Voice/extracted/ADReSSo21/progression/train/segmentation/decline/
Directory not found: /drive/MyDrive/Voice/extracted/ADReSSo21/progression/train/segmentation/no_decline/
Progression training data: 0 records

Step 3: Reading progression test data...
Directory not found: /drive/MyDrive/Voice/extracted/ADReSSo21/progression/test-dist/segmentation/
Progression test data: 0 records

Step 4: Reading diagnosis data...
Directory not found: /drive/MyDrive/Voice/extracted/ADReSSo21/diagnosis/train/segmentation/ad/
Directory not found: /drive/MyDrive/Voice/extracted/ADReSSo21/diagnosis/train/segmentation/cn/
Diagnosis data: 0 records

Step 5: Saving results...

Step 6: Summary

EXTRACTION COMPLETE!
Your transcript files are saved in: /drive/MyDrive/Voice/


In [6]:
!pip install SpeechRecognition pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
# Step-by-Step Audio Transcript Extractor for ADReSSo21 Dataset
# This script will:
# 1. Mount Google Drive
# 2. Extract dataset files
# 3. Find all WAV files
# 4. Extract transcripts from audio using speech recognition
# 5. Save organized transcripts

import os
import tarfile
import pandas as pd
import numpy as np
from pathlib import Path
import librosa
import speech_recognition as sr
import soundfile as sf
from pydub import AudioSegment
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("ADReSSo21 AUDIO TRANSCRIPT EXTRACTOR")
print("="*60)

# STEP 1: MOUNT GOOGLE DRIVE
print("\nSTEP 1: Mounting Google Drive...")
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable here, so there is nothing to mount.
    print("⚠ Not running in Colab or Drive already mounted")
else:
    try:
        drive.mount('/content/drive')
        print("✓ Google Drive mounted successfully!")
    except Exception as exc:
        # Report the real reason instead of hiding it behind a bare except,
        # which would also have swallowed KeyboardInterrupt.
        print(f"⚠ Could not mount Google Drive: {exc}")

# STEP 2: INSTALL REQUIRED PACKAGES
# Nothing is installed here; the messages only remind the user. To install,
# run these once in a notebook cell:
# !pip install SpeechRecognition
# !pip install pydub
# !pip install librosa
# !pip install soundfile
# !apt-get install -y ffmpeg
for _message in (
    "\nSTEP 2: Installing required packages...",
    "Installing speech recognition and audio processing libraries...",
    "✓ Packages ready (make sure to install them first)",
):
    print(_message)

# STEP 3: SET UP PATHS AND CONFIGURATION
print("\nSTEP 3: Setting up paths and configuration...")

# Archives live in BASE_PATH; the two working folders sit directly below it
BASE_PATH = "/content/drive/MyDrive/Voice/"
EXTRACT_PATH = os.path.join(BASE_PATH, "extracted/")
OUTPUT_PATH = os.path.join(BASE_PATH, "transcripts/")

for _folder in (EXTRACT_PATH, OUTPUT_PATH):
    os.makedirs(_folder, exist_ok=True)

# Archive file name for each dataset split
datasets = dict(
    progression_train='ADReSSo21-progression-train.tgz',
    progression_test='ADReSSo21-progression-test.tgz',
    diagnosis_train='ADReSSo21-diagnosis-train.tgz',
)

for _caption, _location in (
    ("Base path", BASE_PATH),
    ("Extract path", EXTRACT_PATH),
    ("Output path", OUTPUT_PATH),
):
    print(f"✓ {_caption}: {_location}")

# STEP 4: EXTRACT DATASET FILES
print("\nSTEP 4: Extracting dataset files...")

def extract_datasets(base_path=None, extract_path=None, archives=None):
    """Extract all tgz files.

    Args:
        base_path: Directory containing the archives. Defaults to the
            module-level ``BASE_PATH``.
        extract_path: Destination directory, created when missing. Defaults
            to the module-level ``EXTRACT_PATH``.
        archives: Mapping of dataset name to archive file name. Defaults to
            the module-level ``datasets``.

    Missing archives and extraction errors are reported and skipped.
    """
    base_path = BASE_PATH if base_path is None else base_path
    extract_path = EXTRACT_PATH if extract_path is None else extract_path
    archives = datasets if archives is None else archives

    os.makedirs(extract_path, exist_ok=True)

    for filename in archives.values():
        file_path = os.path.join(base_path, filename)

        if not os.path.exists(file_path):
            print(f"  ⚠ {filename} not found at {file_path}")
            continue

        print(f"  Extracting {filename}...")
        try:
            with tarfile.open(file_path, 'r:gz') as tar:
                # The 'data' filter (when this Python has it) refuses members
                # that would be written outside the destination directory.
                if hasattr(tarfile, 'data_filter'):
                    tar.extractall(path=extract_path, filter='data')
                else:
                    tar.extractall(path=extract_path)
            print(f"  ✓ {filename} extracted successfully")
        except Exception as e:
            # Best effort: keep going with the remaining archives.
            print(f"  ⚠ Error extracting {filename}: {e}")

extract_datasets()

# STEP 5: FIND ALL WAV FILES
print("\nSTEP 5: Finding all WAV files...")

def find_wav_files(extract_path=None):
    """Find all WAV files and organize them by dataset and label.

    Args:
        extract_path: Root of the extracted archives. Defaults to the
            module-level ``EXTRACT_PATH``.

    Returns:
        ``{'progression_train': {'decline': [...], 'no_decline': [...]},
        'progression_test': [...],
        'diagnosis_train': {'ad': [...], 'cn': [...]}}`` where every list
        holds sorted file paths; a missing folder yields an empty list.
    """
    root = EXTRACT_PATH if extract_path is None else extract_path

    def list_wavs(directory, description):
        """Return the sorted WAV paths directly inside ``directory``."""
        if not os.path.exists(directory):
            return []
        # Sorted for a reproducible processing order; the extension match
        # ignores case so ".WAV" files are not silently dropped.
        paths = sorted(
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if name.lower().endswith('.wav')
        )
        print(f"  Found {len(paths)} {description} WAV files")
        return paths

    prog_train_base = os.path.join(root, "ADReSSo21/progression/train/audio/")
    prog_test_path = os.path.join(root, "ADReSSo21/progression/test-dist/audio/")
    diag_train_base = os.path.join(root, "ADReSSo21/diagnosis/train/audio/")

    return {
        'progression_train': {
            'decline': list_wavs(os.path.join(prog_train_base, "decline/"), "decline"),
            'no_decline': list_wavs(os.path.join(prog_train_base, "no_decline/"), "no_decline"),
        },
        'progression_test': list_wavs(prog_test_path, "test"),
        'diagnosis_train': {
            'ad': list_wavs(os.path.join(diag_train_base, "ad/"), "AD"),
            'cn': list_wavs(os.path.join(diag_train_base, "cn/"), "CN"),
        },
    }

wav_files = find_wav_files()

# STEP 6: AUDIO PREPROCESSING FUNCTIONS
print("\nSTEP 6: Setting up audio preprocessing...")

def preprocess_audio(audio_path, target_sr=16000):
    """Load an audio file, peak-normalize it and trim its silent edges.

    Args:
        audio_path: Path of the audio file to load.
        target_sr: Sample rate requested from ``librosa.load``.

    Returns:
        ``(samples, target_sr)`` on success, ``(None, None)`` when any step
        fails; the failure is printed.
    """
    try:
        waveform, _loaded_rate = librosa.load(audio_path, sr=target_sr)
        waveform = librosa.util.normalize(waveform)
        # top_db=20 is the threshold handed to librosa's trim for what counts as silence
        trimmed, _bounds = librosa.effects.trim(waveform, top_db=20)
    except Exception as e:
        print(f"    Error preprocessing {audio_path}: {e}")
        return None, None
    return trimmed, target_sr

def convert_to_wav_if_needed(audio_path):
    """Return a WAV path for ``audio_path``, converting the file if necessary.

    Paths already ending in ``.wav`` are returned untouched. Anything else is
    exported with pydub next to the source as ``<name>_converted.wav``. When
    the conversion fails the error is printed and the original path is
    returned.
    """
    try:
        if audio_path.endswith('.wav'):
            return audio_path

        segment = AudioSegment.from_file(audio_path)
        stem = audio_path.rsplit('.', 1)[0]
        wav_path = stem + '_converted.wav'
        segment.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"    Error converting {audio_path}: {e}")
        return audio_path

# STEP 7: SPEECH RECOGNITION FUNCTION
print("\nSTEP 7: Setting up speech recognition...")

def extract_transcript_from_audio(audio_path, method='google'):
    """Extract a transcript from an audio file using speech recognition.

    The file is converted to WAV if needed, preprocessed, written to a
    scratch WAV file and passed to the recognizer. Google recognition is
    tried when ``method == 'google'``; Sphinx is used as fallback whenever
    no transcript has been obtained.

    Args:
        audio_path: Path of the audio file.
        method: Preferred recognition backend; only ``'google'`` triggers
            the Google attempt.

    Returns:
        ``(transcript, backend_name)`` on success, otherwise
        ``(None, error_message)``.
    """
    import tempfile  # local import: only this function needs it

    recognizer = sr.Recognizer()
    temp_wav = None

    try:
        # Convert to WAV if needed
        wav_path = convert_to_wav_if_needed(audio_path)

        # Preprocess audio
        audio_data, sr_rate = preprocess_audio(wav_path, target_sr=16000)

        if audio_data is None:
            return None, "Preprocessing failed"

        # Write the preprocessed audio to a scratch file in the system temp
        # directory. Deriving the name from audio_path with str.replace would
        # yield the source path itself for non-".wav" inputs, so the source
        # would be overwritten and then deleted during cleanup.
        handle, temp_wav = tempfile.mkstemp(suffix='.wav')
        os.close(handle)
        sf.write(temp_wav, audio_data, sr_rate)

        # Use speech recognition
        with sr.AudioFile(temp_wav) as source:
            # Adjust for ambient noise
            recognizer.adjust_for_ambient_noise(source, duration=0.5)
            # NOTE(review): listen() may stop at the first pause and return
            # only the first phrase; record() reads a whole file. Confirm
            # which behaviour is wanted before changing it.
            audio = recognizer.listen(source)

        transcript = None
        error_msg = ""

        if method == 'google':
            try:
                transcript = recognizer.recognize_google(audio)
            except sr.UnknownValueError:
                error_msg = "Google Speech Recognition could not understand audio"
            except sr.RequestError as e:
                error_msg = f"Google Speech Recognition error: {e}"

        # Fallback to Sphinx if there is still no transcript
        if transcript is None:
            try:
                transcript = recognizer.recognize_sphinx(audio)
                method = 'sphinx'
            except sr.UnknownValueError:
                error_msg += "; Sphinx could not understand audio"
            except sr.RequestError as e:
                error_msg += f"; Sphinx error: {e}"

        if transcript:
            return transcript.strip(), method
        return None, error_msg

    except Exception as e:
        return None, f"Error processing audio: {str(e)}"
    finally:
        # Runs on every exit path, so the scratch file never leaks.
        if temp_wav is not None and os.path.exists(temp_wav):
            os.remove(temp_wav)

# STEP 8: PROCESS ALL AUDIO FILES AND EXTRACT TRANSCRIPTS
print("\nSTEP 8: Processing audio files and extracting transcripts...")
print("This may take a while depending on the number and length of audio files...")

def process_audio_files(wav_files):
    """Transcribe every audio file and return one result record per file.

    Args:
        wav_files: Mapping with the keys ``'progression_train'``
            (``'decline'`` / ``'no_decline'`` lists), ``'progression_test'``
            (list) and ``'diagnosis_train'`` (``'ad'`` / ``'cn'`` lists).

    Returns:
        List of dicts with the keys ``file_id``, ``file_path``, ``dataset``,
        ``label``, ``transcript``, ``recognition_method``, ``error`` and
        ``success``, in processing order.
    """
    records = []

    def transcribe_group(paths, dataset, label):
        """Transcribe one batch of files and append a record for each."""
        total = len(paths)
        print(f"    Processing {total} {label} files...")

        for position, audio_path in enumerate(paths, start=1):
            file_name = os.path.basename(audio_path)
            print(f"      Processing {position}/{total}: {file_name}")

            transcript, method_or_error = extract_transcript_from_audio(audio_path)

            # The second tuple element is the backend name on success and
            # the error text on failure.
            if transcript:
                used_method, failure = method_or_error, None
            else:
                used_method, failure = None, method_or_error

            records.append({
                'file_id': os.path.splitext(file_name)[0],
                'file_path': audio_path,
                'dataset': dataset,
                'label': label,
                'transcript': transcript,
                'recognition_method': used_method,
                'error': failure,
                'success': transcript is not None
            })

    print("\n  Processing progression training data...")
    for outcome in ('decline', 'no_decline'):
        transcribe_group(wav_files['progression_train'][outcome], 'progression_train', outcome)

    print("\n  Processing progression test data...")
    transcribe_group(wav_files['progression_test'], 'progression_test', 'test')

    print("\n  Processing diagnosis training data...")
    for diagnosis in ('ad', 'cn'):
        transcribe_group(wav_files['diagnosis_train'][diagnosis], 'diagnosis_train', diagnosis)

    return records

# Process all files
transcripts = process_audio_files(wav_files)

# STEP 9: SAVE RESULTS
print("\nSTEP 9: Saving transcription results...")

# Explicit columns: with no audio files the record list is empty and a plain
# pd.DataFrame([]) would have no 'success'/'dataset' columns to index.
RESULT_COLUMNS = [
    'file_id', 'file_path', 'dataset', 'label',
    'transcript', 'recognition_method', 'error', 'success',
]
df = pd.DataFrame(transcripts, columns=RESULT_COLUMNS)
succeeded = df['success'].astype(bool)

# Save complete results
complete_output = os.path.join(OUTPUT_PATH, "all_transcripts.csv")
df.to_csv(complete_output, index=False)
print(f"✓ Saved complete results to: {complete_output}")

# Save successful transcripts only
successful_df = df[succeeded].copy()
success_output = os.path.join(OUTPUT_PATH, "successful_transcripts.csv")
successful_df.to_csv(success_output, index=False)
print(f"✓ Saved successful transcripts to: {success_output}")

# Save by dataset
datasets_to_save = df['dataset'].unique()
for dataset in datasets_to_save:
    dataset_df = df[df['dataset'] == dataset].copy()
    dataset_output = os.path.join(OUTPUT_PATH, f"{dataset}_transcripts.csv")
    dataset_df.to_csv(dataset_output, index=False)
    print(f"✓ Saved {dataset} transcripts to: {dataset_output}")

# STEP 10: DISPLAY SUMMARY STATISTICS
print("\nSTEP 10: Summary Statistics")
print("="*50)


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


total_files = len(df)
successful = len(successful_df)
failed = total_files - successful

print(f"Total audio files processed: {total_files}")
print(f"Successful transcriptions: {successful} ({_percent(successful, total_files):.1f}%)")
print(f"Failed transcriptions: {failed} ({_percent(failed, total_files):.1f}%)")

print(f"\nDataset breakdown:")
for dataset in datasets_to_save:
    in_dataset = df['dataset'] == dataset
    dataset_total = int(in_dataset.sum())
    dataset_success = int((in_dataset & succeeded).sum())
    print(f"  {dataset}: {dataset_success}/{dataset_total} successful ({_percent(dataset_success, dataset_total):.1f}%)")

print(f"\nLabel distribution (successful transcripts only):")
if not successful_df.empty:
    print(successful_df['label'].value_counts())

print(f"\nRecognition methods used:")
if not successful_df.empty:
    print(successful_df['recognition_method'].value_counts())

# Show sample transcripts
print(f"\nSample successful transcripts:")
sample_transcripts = successful_df['transcript'].dropna().head(3)
for i, transcript in enumerate(sample_transcripts):
    print(f"  Sample {i+1}: {transcript[:200]}...")

# Show common errors
print(f"\nMost common errors:")
error_df = df[~succeeded]
if not error_df.empty:
    error_counts = error_df['error'].value_counts().head(5)
    for error, count in error_counts.items():
        print(f"  {error}: {count} files")

print("\n" + "="*60)
print("TRANSCRIPT EXTRACTION COMPLETE!")
print(f"All results saved in: {OUTPUT_PATH}")
print("="*60)

ADReSSo21 AUDIO TRANSCRIPT EXTRACTOR

STEP 1: Mounting Google Drive...
Mounted at /content/drive
✓ Google Drive mounted successfully!

STEP 2: Installing required packages...
Installing speech recognition and audio processing libraries...
✓ Packages ready (make sure to install them first)

STEP 3: Setting up paths and configuration...
✓ Base path: /content/drive/MyDrive/Voice/
✓ Extract path: /content/drive/MyDrive/Voice/extracted/
✓ Output path: /content/drive/MyDrive/Voice/transcripts/

STEP 4: Extracting dataset files...
  Extracting ADReSSo21-progression-train.tgz...
  ✓ ADReSSo21-progression-train.tgz extracted successfully
  Extracting ADReSSo21-progression-test.tgz...
