<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Detecting_dementia_from_speech_and_transcripts_using_transformers_May24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Step 1: Environment Setup and Dependencies
# Run this cell first to install all required packages

!pip install transformers torch torchvision torchaudio
!pip install librosa pandas numpy scikit-learn
!pip install datasets accelerate

# Import all necessary libraries
import os
import pandas as pd
import numpy as np
import tarfile
import csv
from pathlib import Path
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertModel,
    ViTFeatureExtractor, ViTModel,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Mount Google Drive when running inside Google Colab.
# The google.colab package only exists in the Colab runtime, so the import is
# guarded: on any other Jupyter environment the mount is skipped instead of
# aborting the whole setup cell with an ImportError.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount")
else:
    drive.mount('/content/drive')

Using device: cuda
Mounted at /content/drive


In [4]:
# Step 2: Data Extraction and Organization

def extract_tgz_archive(archive_path, extract_to):
    """
    Extract a gzip-compressed tar archive (.tgz / .tar.gz) into a directory.

    Archive members are validated before anything is written: entries that
    would land outside ``extract_to`` (absolute paths, ``..`` components, or
    links pointing outside the destination) cause the extraction to be
    rejected instead of silently writing to arbitrary locations.

    Args:
        archive_path: Path to the .tgz archive to read.
        extract_to: Destination directory for the extracted files.

    Returns:
        None. The outcome is reported on stdout; errors are caught and
        printed (best-effort behaviour) rather than raised.
    """
    try:
        with tarfile.open(archive_path, 'r:gz') as tar:
            if hasattr(tarfile, 'data_filter'):
                # Interpreters with extraction filters: let the stdlib reject
                # unsafe members (path traversal, device files, bad links).
                tar.extractall(path=extract_to, filter='data')
            else:
                # Older interpreters: check every member (and link target)
                # resolves inside the destination before extracting anything.
                destination = os.path.realpath(extract_to)

                def _is_inside(candidate):
                    resolved = os.path.realpath(candidate)
                    return os.path.commonpath([destination, resolved]) == destination

                members = tar.getmembers()
                for member in members:
                    member_path = os.path.join(destination, member.name)
                    if not _is_inside(member_path):
                        raise ValueError(f"Unsafe path in archive: {member.name}")
                    if member.issym():
                        # Symlink targets are relative to the link's own directory.
                        link_target = os.path.join(os.path.dirname(member_path), member.linkname)
                    elif member.islnk():
                        # Hard-link targets are relative to the archive root.
                        link_target = os.path.join(destination, member.linkname)
                    else:
                        link_target = None
                    if link_target is not None and not _is_inside(link_target):
                        raise ValueError(f"Unsafe link in archive: {member.name}")
                tar.extractall(path=extract_to, members=members)
        print(f"Successfully extracted {archive_path} to {extract_to}")
    except Exception as e:
        print(f"Error extracting {archive_path}: {str(e)}")

def _first_existing_path(candidates):
    """Return the first path in ``candidates`` that exists on disk, or None."""
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


def _find_transcript(transcript_dir, participant_id):
    """Return the participant's transcript path (.csv preferred, then .txt, .tsv), or None."""
    return _first_existing_path(
        os.path.join(transcript_dir, f"{participant_id}{ext}")
        for ext in ('.csv', '.txt', '.tsv')
    )


def _pair_recordings(audio_dir, transcript_dir, label):
    """Build one record per .wav file in ``audio_dir`` that has a matching transcript."""
    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded split performed downstream) reproducible.
    for audio_file in sorted(os.listdir(audio_dir)):
        if not audio_file.endswith('.wav'):
            continue

        # Participant ID is the part of the file name before the first dot
        # (e.g. 'S001.wav' -> 'S001').
        participant_id = audio_file.split('.')[0]
        audio_path = os.path.join(audio_dir, audio_file)
        transcript_path = _find_transcript(transcript_dir, participant_id)

        # Keep only samples for which both modalities are present.
        if transcript_path is None or not os.path.exists(audio_path):
            continue

        records.append({
            'audio_path': audio_path,
            'transcript_path': transcript_path,
            'label': label,
            'participant_id': participant_id
        })
    return records


def collect_adresso_data(base_path, task_type, split_type):
    """
    Collect ADReSSo data and create DataFrame with file paths and labels

    The data root is searched under several candidate directory layouts. Audio
    files (.wav) are paired with a transcript of the same base name (.csv,
    falling back to .txt or .tsv). When the audio directory contains label
    subdirectories, the subdirectory name is used as the label; otherwise all
    files are collected with the placeholder label 'unknown'.

    Args:
        base_path: Path to extracted ADReSSo data
        task_type: 'diagnosis' or 'progression'
        split_type: 'train' or 'test'

    Returns:
        DataFrame with columns: audio_path, transcript_path, label, participant_id.
        Rows are ordered by label directory, then by audio file name. When
        nothing is found the DataFrame is empty but still has these columns.
    """
    columns = ['audio_path', 'transcript_path', 'label', 'participant_id']

    # Define paths - try multiple possible directory structures
    possible_paths = [
        os.path.join(base_path, f"ADReSSo21-{task_type}-{split_type}"),
        os.path.join(base_path, f"ADReSSo21/{task_type}/{split_type}"),
        os.path.join(base_path, f"{task_type}-{split_type}"),
        os.path.join(base_path, task_type, split_type)
    ]

    data_root = _first_existing_path(possible_paths)
    if data_root is None:
        print(f"Could not find data directory for {task_type}-{split_type}")
        print(f"Searched in: {possible_paths}")
        print(f"Available directories in {base_path}:")
        # isdir (not exists) so a base_path that is a regular file cannot
        # make the diagnostic listing itself raise.
        if os.path.isdir(base_path):
            for item in sorted(os.listdir(base_path)):
                if os.path.isdir(os.path.join(base_path, item)):
                    print(f"  - {item}")
        return pd.DataFrame(columns=columns)

    print(f"Found data root: {data_root}")

    # Look for audio and transcript directories, trying alternative names
    # only when the default one is missing.
    audio_base = os.path.join(data_root, "audio")
    if not os.path.exists(audio_base):
        audio_base = _first_existing_path(
            os.path.join(data_root, alt_name) for alt_name in ["Audio", "wav", "sound"]
        ) or audio_base

    transcript_base = os.path.join(data_root, "segmentation")
    if not os.path.exists(transcript_base):
        transcript_base = _first_existing_path(
            os.path.join(data_root, alt_name)
            for alt_name in ["transcripts", "text", "csv", "Transcripts"]
        ) or transcript_base

    print(f"Audio directory: {audio_base} (exists: {os.path.exists(audio_base)})")
    print(f"Transcript directory: {transcript_base} (exists: {os.path.exists(transcript_base)})")

    if not os.path.exists(audio_base) or not os.path.exists(transcript_base):
        print("Could not find both audio and transcript directories")
        return pd.DataFrame(columns=columns)

    # Define label mapping based on task type
    if task_type == 'diagnosis':
        label_dirs = ['ad', 'cn']  # Alzheimer's Disease, Cognitively Normal
    else:  # progression
        label_dirs = ['decline', 'no_decline']

    # Check if label subdirectories exist, if not, process all files in the base directory
    has_label_subdirs = any(
        os.path.exists(os.path.join(audio_base, label_dir)) for label_dir in label_dirs
    )

    data_list = []
    if has_label_subdirs:
        print("Found label subdirectories")
        for label_dir in label_dirs:
            audio_dir = os.path.join(audio_base, label_dir)
            transcript_dir = os.path.join(transcript_base, label_dir)
            if os.path.exists(audio_dir) and os.path.exists(transcript_dir):
                data_list.extend(_pair_recordings(audio_dir, transcript_dir, label_dir))
    else:
        print("No label subdirectories found, processing all files")
        # Default label assignment - labels must be determined separately
        # (e.g. from metadata or filenames) for this layout.
        data_list.extend(_pair_recordings(audio_base, transcript_base, 'unknown'))

    # Passing `columns` keeps the schema stable even when data_list is empty.
    df = pd.DataFrame(data_list, columns=columns)
    print(f"Collected {len(df)} samples for {task_type}-{split_type}")

    if len(df) > 0:
        print(f"Label distribution:\n{df['label'].value_counts()}")
    else:
        print("No data collected. Please check your directory structure.")

    return df

# Root directory that holds the extracted ADReSSo archives.
# Adjust this to match your own Drive layout.
BASE_PATH = "/content/drive/MyDrive/ADReSSo_extracted"  # Update this path

# One-off extraction step (uncomment and point at your .tgz files if the
# archives have not been unpacked yet).
# extract_tgz_archive("/path/to/ADReSSo21-diagnosis-train.tgz", BASE_PATH)
# extract_tgz_archive("/path/to/ADReSSo21-diagnosis-test.tgz", BASE_PATH)

# Gather the diagnosis training samples first; everything else depends on them.
train_df = collect_adresso_data(BASE_PATH, 'diagnosis', 'train')

if len(train_df) == 0:
    # Nothing to split: explain what to check and stop here.
    for message in (
        "\nNo training data found. Please check your dataset path and structure.",
        "Make sure you have:",
        "1. Extracted the ADReSSo dataset archives",
        "2. Updated the BASE_PATH variable to point to the correct directory",
        "3. The directory structure matches the expected format",
    ):
        print(message)
else:
    test_df = collect_adresso_data(BASE_PATH, 'diagnosis', 'test')

    # Hold out 35% of the training data for validation. The split is
    # stratified by label only when more than one label is present;
    # with a single label a plain random split is used (stratify=None).
    train_split, val_split = train_test_split(
        train_df,
        test_size=0.35,
        stratify=train_df['label'] if train_df['label'].nunique() > 1 else None,
        random_state=42
    )

    print(f"\nDataset splits:")
    print(f"Training: {len(train_split)} samples")
    print(f"Validation: {len(val_split)} samples")
    print(f"Test: {len(test_df)} samples")

Could not find data directory for diagnosis-train
Searched in: ['/content/drive/MyDrive/ADReSSo_extracted/ADReSSo21-diagnosis-train', '/content/drive/MyDrive/ADReSSo_extracted/ADReSSo21/diagnosis/train', '/content/drive/MyDrive/ADReSSo_extracted/diagnosis-train', '/content/drive/MyDrive/ADReSSo_extracted/diagnosis/train']
Available directories in /content/drive/MyDrive/ADReSSo_extracted:

No training data found. Please check your dataset path and structure.
Make sure you have:
1. Extracted the ADReSSo dataset archives
2. Updated the BASE_PATH variable to point to the correct directory
3. The directory structure matches the expected format
