<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/July16_Speech_CompleteV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Initial Setup and Data Loading
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import soundfile as sf
from pathlib import Path
import warnings
# Suppress every Python warning from here on (the 'ignore' action is installed
# with no category or module restriction, so it applies globally).
warnings.filterwarnings('ignore')

# Mount Google Drive at /content/drive; every dataset path used by this
# notebook is rooted under that mount point. `google.colab` is a
# Colab-specific import.
from google.colab import drive
drive.mount('/content/drive')

# Install required libraries. `!` is the IPython shell escape, so these lines
# only work when run as a notebook cell; `-q` keeps pip output quiet.
# NOTE(review): librosa and soundfile are imported at the top of this cell,
# i.e. before this install runs -- on a runtime that does not already ship
# them the cell fails at the import. Confirm the intended order.
!pip install -q librosa soundfile speechbrain whisper-jax transformers datasets torch torchaudio
!pip install -q scikit-learn networkx plotly umap-learn

# Memory optimization imports (placed after the install step above).
import psutil
import torch
# Drop PyTorch's cached CUDA memory and run a full garbage-collection pass.
# NOTE(review): nothing in this cell has allocated tensors yet, so this
# presumably only has an effect when the cell is re-run in a live session.
torch.cuda.empty_cache()
gc.collect()

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    The value is a float: the RSS byte count reported by psutil divided by
    1024 * 1024.
    """
    bytes_per_mb = 1024 * 1024
    current_process = psutil.Process(os.getpid())
    resident_bytes = current_process.memory_info().rss
    return resident_bytes / bytes_per_mb

print(f"Initial memory usage: {get_memory_usage():.2f} MB")

# Define paths
# Everything lives under one Drive folder. PATHS maps a short key to an
# absolute directory: the first six keys are dataset *inputs* (audio and
# segmentation files), the last four are *outputs* written by this notebook.
BASE_PATH = "/content/drive/MyDrive/Voice/extracted/ADReSSo21"
PATHS = {
    'diagnosis_train_audio_ad': f"{BASE_PATH}/diagnosis/train/audio/ad",
    'diagnosis_train_audio_cn': f"{BASE_PATH}/diagnosis/train/audio/cn",
    'diagnosis_train_seg_ad': f"{BASE_PATH}/diagnosis/train/segmentation/ad",
    'diagnosis_train_seg_cn': f"{BASE_PATH}/diagnosis/train/segmentation/cn",
    'progression_test_audio': f"{BASE_PATH}/progression/test-dist/audio",
    'progression_test_seg': f"{BASE_PATH}/progression/test-dist/segmentation",
    'features': f"{BASE_PATH}/features",
    'transcripts': f"{BASE_PATH}/transcripts",
    'models': f"{BASE_PATH}/models",
    'visualizations': f"{BASE_PATH}/visualizations"
}

# Keys of PATHS that hold generated artifacts rather than dataset inputs.
OUTPUT_PATH_KEYS = ('features', 'transcripts', 'models', 'visualizations')

# Create output directories only. Creating the input directories as well
# would leave empty folders in Drive whenever the dataset is missing or
# BASE_PATH is wrong, and the dataset scan would then quietly report zero
# files instead of exposing the problem.
for path in [PATHS[key] for key in OUTPUT_PATH_KEYS]:
    os.makedirs(path, exist_ok=True)

# Create the working subdirectories. Despite its name, `transcript_dirs`
# also lists the feature and model subfolders, not only transcript ones.
_SUBDIR_LAYOUT = (
    ('transcripts', ('train/ad', 'train/cn', 'test-dist')),
    ('features', ('audio', 'text')),
    ('models', ('bert', 'vit', 'two_branch')),
)

# Expand (root key, leaf) pairs into full paths, in the order listed above.
transcript_dirs = [
    f"{PATHS[root_key]}/{leaf}"
    for root_key, leaves in _SUBDIR_LAYOUT
    for leaf in leaves
]

# exist_ok=True makes re-running the cell harmless.
for dir_path in transcript_dirs:
    os.makedirs(dir_path, exist_ok=True)

def _list_files_with_suffix(directory, suffix):
    """Return the sorted entry names in *directory* that end with *suffix*.

    An empty list is returned when *directory* does not exist or is not a
    directory, so a missing dataset folder never raises here.
    """
    if not os.path.isdir(directory):
        return []
    # os.listdir returns entries in arbitrary order; sorting keeps every
    # downstream table reproducible between runs.
    return sorted(name for name in os.listdir(directory) if name.endswith(suffix))


def scan_dataset(paths=None):
    """Scan the dataset folders and collect audio / segmentation file names.

    Args:
        paths: Optional mapping with the keys 'diagnosis_train_audio_ad',
            'diagnosis_train_audio_cn', 'diagnosis_train_seg_ad',
            'diagnosis_train_seg_cn', 'progression_test_audio' and
            'progression_test_seg', each pointing at a directory. Defaults
            to the module-level PATHS.

    Returns:
        dict with the keys 'train_ad_audio', 'train_cn_audio',
        'train_ad_seg', 'train_cn_seg', 'test_audio' and 'test_seg'. Each
        value is a sorted list of bare file names ('.wav' for audio, '.csv'
        for segmentation); it is empty when the folder is absent.
    """
    paths = PATHS if paths is None else paths

    # (result key, key into `paths`, required file suffix)
    layout = (
        ('train_ad_audio', 'diagnosis_train_audio_ad', '.wav'),
        ('train_cn_audio', 'diagnosis_train_audio_cn', '.wav'),
        ('train_ad_seg', 'diagnosis_train_seg_ad', '.csv'),
        ('train_cn_seg', 'diagnosis_train_seg_cn', '.csv'),
        ('test_audio', 'progression_test_audio', '.wav'),
        ('test_seg', 'progression_test_seg', '.csv'),
    )

    return {
        result_key: _list_files_with_suffix(paths[path_key], suffix)
        for result_key, path_key, suffix in layout
    }

# Collect the file listing once; later steps read it through `dataset_info`.
dataset_info = scan_dataset()

# Report how many files were found in each category.
print("Dataset Summary:")
_summary_rows = (
    ("Train AD audio files", 'train_ad_audio'),
    ("Train CN audio files", 'train_cn_audio'),
    ("Train AD segmentation files", 'train_ad_seg'),
    ("Train CN segmentation files", 'train_cn_seg'),
    ("Test audio files", 'test_audio'),
    ("Test segmentation files", 'test_seg'),
)
for _caption, _info_key in _summary_rows:
    print(f"{_caption}: {len(dataset_info[_info_key])}")

# Create metadata DataFrame
_METADATA_COLUMNS = ['file_id', 'audio_path', 'seg_path', 'label', 'split']
_AUDIO_SUFFIX = '.wav'


def _metadata_rows(files, audio_dir, seg_dir, label, split):
    """Build one metadata record per audio file name in *files*."""
    rows = []
    for file in files:
        # Strip only the trailing extension. str.replace would also rewrite
        # a '.wav' that appears in the middle of the name.
        stem = file[:-len(_AUDIO_SUFFIX)] if file.endswith(_AUDIO_SUFFIX) else file
        rows.append({
            'file_id': stem,
            'audio_path': f"{audio_dir}/{file}",
            'seg_path': f"{seg_dir}/{stem}.csv",
            'label': label,
            'split': split
        })
    return rows


def create_metadata_df(info=None, paths=None):
    """Create the metadata DataFrame describing every audio file.

    Args:
        info: Optional mapping with the keys 'train_ad_audio',
            'train_cn_audio' and 'test_audio', each a list of '.wav' file
            names. Defaults to the module-level dataset_info.
        paths: Optional mapping from path key to directory (same keys as
            the module-level PATHS, which is the default).

    Returns:
        pandas.DataFrame with the columns file_id, audio_path, seg_path,
        label and split. Rows are ordered AD training files, CN training
        files, then test files. Training rows carry the label 'ad' or 'cn'
        and split 'train'; test rows carry label 'unknown' and split 'test'.
        The columns are present even when no files were found, so column
        lookups on the result never raise KeyError for an empty dataset.
    """
    info = dataset_info if info is None else info
    paths = PATHS if paths is None else paths

    # (files key, audio dir key, segmentation dir key, label, split)
    groups = (
        ('train_ad_audio', 'diagnosis_train_audio_ad', 'diagnosis_train_seg_ad', 'ad', 'train'),
        ('train_cn_audio', 'diagnosis_train_audio_cn', 'diagnosis_train_seg_cn', 'cn', 'train'),
        ('test_audio', 'progression_test_audio', 'progression_test_seg', 'unknown', 'test'),
    )

    metadata = []
    for files_key, audio_key, seg_key, label, split in groups:
        metadata.extend(
            _metadata_rows(info[files_key], paths[audio_key], paths[seg_key], label, split)
        )

    return pd.DataFrame(metadata, columns=_METADATA_COLUMNS)

# Build the metadata table and report its size and the current memory use.
metadata_df = create_metadata_df()
_entry_count = len(metadata_df)
print(f"\nMetadata DataFrame created with {_entry_count} entries")
print(f"Memory usage after setup: {get_memory_usage():.2f} MB")

# Persist the table next to the dataset (no index column in the CSV).
_metadata_csv = f"{BASE_PATH}/metadata.csv"
metadata_df.to_csv(_metadata_csv, index=False)
print("Setup completed successfully!")

# Preview the first rows.
print("\nMetadata sample:")
print(metadata_df.head())

# Count labels over the training rows only.
print("\nClass distribution in training set:")
_is_train = metadata_df['split'] == 'train'
print(metadata_df.loc[_is_train, 'label'].value_counts())