<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Detecting_dementia_from_speech_and_transcripts_using_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Set Up Google Colab Environment

In [None]:
import os
from google.colab import drive
import tarfile

# Mount Google Drive
drive.mount('/content/drive')

# Install required libraries
!pip install torch torchvision torchaudio
!pip install transformers
!pip install librosa
!pip install numpy pandas scikit-learn
!pip install matplotlib

# Extract datasets
data_dir = '/content/drive/MyDrive/Voice/'
extract_dir = '/content/ADReSSo21/'

os.makedirs(extract_dir, exist_ok=True)

datasets = [
    'ADReSSo21-diagnosis-train.tgz',
    'ADReSSo21-progression-test.tgz',
    'ADReSSo21-progression-train.tgz'
]

for dataset in datasets:
    tar_path = os.path.join(data_dir, dataset)
    with tarfile.open(tar_path, 'r:gz') as tar:
        tar.extractall(extract_dir)
    print(f"Extracted {dataset}")

# Verify GPU availability
import torch
print("GPU Available:", torch.cuda.is_available())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracted ADReSSo21-diagnosis-train.tgz
Extracted ADReSSo21-progression-test.tgz
Extracted ADReSSo21-progression-train.tgz
GPU Available: True


# Step 2: Prepare the Dataset

In [13]:
import librosa
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import glob

# Define base path
base_dir = '/content/ADReSSo21/ADReSSo21/'
train_base_dir = os.path.join(base_dir, 'diagnosis/train')
test_base_dir = os.path.join(base_dir, 'progression/test-dist')

# Function to extract log-Mel spectrogram and MFCCs with delta and delta-delta
def extract_audio_features(audio_path, sr=16000, n_mels=128, n_mfcc=13):
    # Load audio
    y, sr = librosa.load(audio_path, sr=sr)

    # Log-Mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # MFCCs
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Delta and delta-delta
    delta_mfcc = librosa.feature.delta(mfcc)
    delta_delta_mfcc = librosa.feature.delta(mfcc, order=2)

    # Stack features as 3-channel image
    log_mel_image = np.stack([log_mel_spec, librosa.feature.delta(log_mel_spec), librosa.feature.delta(log_mel_spec, order=2)], axis=-1)
    mfcc_image = np.stack([mfcc, delta_mfcc, delta_delta_mfcc], axis=-1)

    return log_mel_image, mfcc_image

# Load dataset
def load_dataset(train_base_dir, test_base_dir=None):
    data = []
    train_audio_dir = os.path.join(train_base_dir, 'audio')
    train_transcript_dir = os.path.join(train_base_dir, 'segmentation')

    # Debug: List directory contents
    print("Checking train base directory:", train_base_dir)
    if os.path.exists(train_base_dir):
        print("Files in train base:", os.listdir(train_base_dir))
    else:
        print("Train base directory does not exist:", train_base_dir)

    # Check for metadata file
    metadata_file = None
    for fname in ['diagnosis.csv', 'metadata.csv', 'labels.csv', 'adresso-train-mmse-scores.csv']:
        if os.path.exists(os.path.join(train_base_dir, fname)):
            metadata_file = os.path.join(train_base_dir, fname)
            break

    if metadata_file:
        print("Found metadata file:", metadata_file)
        metadata = pd.read_csv(metadata_file)
        print("Metadata columns:", metadata.columns.tolist())
        for _, row in metadata.iterrows():
            # Try different column names for audio ID
            audio_id = str(row.get('id', row.get('audio_file', row.get('filename', row.get('subject_id', '')))))
            if not audio_id:
                continue
            audio_file = os.path.join(train_audio_dir, f"{audio_id}.wav")
            transcript_file = os.path.join(train_transcript_dir, f"{audio_id}.cha")
            if not os.path.exists(transcript_file):
                transcript_file = os.path.join(train_transcript_dir, f"{audio_id}.txt")
            if not os.path.exists(transcript_file):
                transcript_file = os.path.join(train_audio_dir, f"{audio_id}.cha")
            if not os.path.exists(transcript_file):
                transcript_file = os.path.join(train_audio_dir, f"{audio_id}.txt")
            if os.path.exists(audio_file) and os.path.exists(transcript_file):
                # Try different column names for label
                diagnosis = row.get('label', row.get('diagnosis', row.get('mmse', '')))
                label = 1 if str(diagnosis).lower() in ['ad', 'dementia', 'alzheimer'] or (isinstance(diagnosis, (int, float)) and diagnosis < 27) else 0
                data.append({
                    'audio_path': audio_file,
                    'transcript_path': transcript_file,
                    'label': label
                })
            else:
                print(f"Missing pair for ID {audio_id}: Audio exists={os.path.exists(audio_file)}, Transcript exists={os.path.exists(transcript_file)}")
        print(f"Loaded {len(data)} samples from metadata")

    # Fallback: Pair audio and transcript files
    else:
        print("No metadata file found, pairing audio and transcript files")
        audio_files = glob.glob(os.path.join(train_base_dir, '**/*.wav'), recursive=True)
        print(f"Found {len(audio_files)} audio files in {train_base_dir}")
        print("Sample audio files:", audio_files[:5])
        for audio_file in audio_files:
            audio_id = os.path.basename(audio_file).replace('.wav', '')
            # Try multiple transcript locations
            transcript_files = [
                os.path.join(train_transcript_dir, f"{audio_id}.cha"),
                os.path.join(train_transcript_dir, f"{audio_id}.txt"),
                os.path.join(train_audio_dir, f"{audio_id}.cha"),
                os.path.join(train_audio_dir, f"{audio_id}.txt"),
                os.path.join(train_transcript_dir, f"S{audio_id[-3:]}.cha"),  # Try S001 format
                os.path.join(train_audio_dir, f"S{audio_id[-3:]}.cha")
            ]
            transcript_file = None
            for tf in transcript_files:
                if os.path.exists(tf):
                    transcript_file = tf
                    break
            if transcript_file:
                label = 1 if 'ad' in audio_id.lower() or 'dementia' in audio_id.lower() or 'cd' in audio_id.lower() else 0
                data.append({
                    'audio_path': audio_file,
                    'transcript_path': transcript_file,
                    'label': label
                })
            else:
                print(f"No transcript found for {audio_id}. Checked: {transcript_files}")
        print(f"Loaded {len(data)} samples from file pairing")

    train_df = pd.DataFrame(data)
    print(f"Total samples loaded: {len(train_df)}")

    # Debug: Show sample data
    if not train_df.empty:
        print("Sample data:", train_df.head().to_dict())

    # Check if train_df is empty
    if train_df.empty:
        raise ValueError(f"No valid audio-transcript pairs found. Check directories:\n- Audio: {train_audio_dir}\n- Transcripts: {train_transcript_dir}\nRun '!ls -R /content/ADReSSo21/ADReSSo21/diagnosis/train/' to inspect.\nSample audio files: {glob.glob(os.path.join(train_audio_dir, '*.wav'))[:5]}")

    # Split train and validation (65%-35%)
    train_df, val_df = train_test_split(train_df, test_size=0.35, random_state=42)

    # Load test data
    test_df = pd.DataFrame()
    if test_base_dir and os.path.exists(test_base_dir):
        data = []
        test_audio_dir = os.path.join(test_base_dir, 'audio')
        test_transcript_dir = os.path.join(test_base_dir, 'segmentation')
        audio_files = glob.glob(os.path.join(test_base_dir, '**/*.wav'), recursive=True)
        print(f"Found {len(audio_files)} test audio files in {test_base_dir}")
        for audio_file in audio_files:
            audio_id = os.path.basename(audio_file).replace('.wav', '')
            transcript_file = os.path.join(test_transcript_dir, f"{audio_id}.cha")
            if not os.path.exists(transcript_file):
                transcript_file = os.path.join(test_transcript_dir, f"{audio_id}.txt")
            if not os.path.exists(transcript_file):
                transcript_file = os.path.join(test_audio_dir, f"{audio_id}.cha")
            if not os.path.exists(transcript_file):
                transcript_file = os.path.join(test_audio_dir, f"{audio_id}.txt")
            if os.path.exists(transcript_file):
                label = 0  # Placeholder: Modify based on test metadata
                data.append({
                    'audio_path': audio_file,
                    'transcript_path': transcript_file,
                    'label': label
                })
        test_df = pd.DataFrame(data)
        print(f"Test samples loaded: {len(test_df)}")

    return train_df, val_df, test_df

# Preprocess dataset
try:
    train_df, val_df, test_df = load_dataset(train_base_dir, test_base_dir)
    print("Training samples:", len(train_df))
    print("Validation samples:", len(val_df))
    print("Test samples:", len(test_df))
except ValueError as e:
    print("Error:", e)

Checking train base directory: /content/ADReSSo21/ADReSSo21/diagnosis/train
Files in train base: ['segmentation', 'audio', 'adresso-train-mmse-scores.csv']
Found metadata file: /content/ADReSSo21/ADReSSo21/diagnosis/train/adresso-train-mmse-scores.csv
Metadata columns: ['Unnamed: 0', 'adressfname', 'mmse', 'dx']
Loaded 0 samples from metadata
Total samples loaded: 0
Error: No valid audio-transcript pairs found. Check directories:
- Audio: /content/ADReSSo21/ADReSSo21/diagnosis/train/audio
- Transcripts: /content/ADReSSo21/ADReSSo21/diagnosis/train/segmentation
Run '!ls -R /content/ADReSSo21/ADReSSo21/diagnosis/train/' to inspect.
Sample audio files: []
