<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Detecting_dementia_from_speech_and_transcripts_using_transformers_May24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 1: Environment Setup and Dependencies
# Run this cell first to install all required packages

!pip install transformers torch torchvision torchaudio
!pip install librosa pandas numpy scikit-learn
!pip install datasets accelerate

# Import all necessary libraries
import os
import pandas as pd
import numpy as np
import tarfile
import csv
from pathlib import Path
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertModel,
    ViTFeatureExtractor, ViTModel,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Mount Google Drive (uncomment if using Google Drive)
# from google.colab import drive
# drive.mount('/content/drive')

Using device: cuda


In [2]:
# Step 2: Data Extraction and Organization

def extract_tgz_archive(archive_path, extract_to):
    """
    Extract .tgz archive to specified directory
    """
    try:
        with tarfile.open(archive_path, 'r:gz') as tar:
            tar.extractall(path=extract_to)
        print(f"Successfully extracted {archive_path} to {extract_to}")
    except Exception as e:
        print(f"Error extracting {archive_path}: {str(e)}")

def collect_adresso_data(base_path, task_type, split_type):
    """
    Collect ADReSSo data and create DataFrame with file paths and labels

    Args:
        base_path: Path to extracted ADReSSo data
        task_type: 'diagnosis' or 'progression'
        split_type: 'train' or 'test'

    Returns:
        DataFrame with columns: audio_path, transcript_path, label, participant_id
    """
    data_list = []

    # Define paths
    audio_base = os.path.join(base_path, f"ADReSSo21-{task_type}-{split_type}", "audio")
    transcript_base = os.path.join(base_path, f"ADReSSo21-{task_type}-{split_type}", "segmentation")

    # Define label mapping based on task type
    if task_type == 'diagnosis':
        label_dirs = ['ad', 'cn']  # Alzheimer's Disease, Cognitively Normal
    else:  # progression
        label_dirs = ['decline', 'no_decline']

    for label_dir in label_dirs:
        audio_dir = os.path.join(audio_base, label_dir)
        transcript_dir = os.path.join(transcript_base, label_dir)

        if os.path.exists(audio_dir) and os.path.exists(transcript_dir):
            # Get all audio files
            audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]

            for audio_file in audio_files:
                # Extract participant ID (assuming format like 'S001.wav')
                participant_id = audio_file.split('.')[0]

                # Find corresponding transcript file
                transcript_file = f"{participant_id}.csv"

                audio_path = os.path.join(audio_dir, audio_file)
                transcript_path = os.path.join(transcript_dir, transcript_file)

                # Check if both files exist
                if os.path.exists(audio_path) and os.path.exists(transcript_path):
                    data_list.append({
                        'audio_path': audio_path,
                        'transcript_path': transcript_path,
                        'label': label_dir,
                        'participant_id': participant_id
                    })

    df = pd.DataFrame(data_list)
    print(f"Collected {len(df)} samples for {task_type}-{split_type}")
    print(f"Label distribution:\n{df['label'].value_counts()}")

    return df

# Example usage - update paths according to your Drive structure
BASE_PATH = "/content/drive/MyDrive/ADReSSo_extracted"  # Update this path

# Extract archives if needed (uncomment and update paths)
# extract_tgz_archive("/path/to/ADReSSo21-diagnosis-train.tgz", BASE_PATH)
# extract_tgz_archive("/path/to/ADReSSo21-diagnosis-test.tgz", BASE_PATH)

# Collect training data
train_df = collect_adresso_data(BASE_PATH, 'diagnosis', 'train')
test_df = collect_adresso_data(BASE_PATH, 'diagnosis', 'test')

# Split training data into train and validation
train_split, val_split = train_test_split(
    train_df,
    test_size=0.35,
    stratify=train_df['label'],
    random_state=42
)

print(f"\nDataset splits:")
print(f"Training: {len(train_split)} samples")
print(f"Validation: {len(val_split)} samples")
print(f"Test: {len(test_df)} samples")


Collected 0 samples for diagnosis-train


KeyError: 'label'