<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Mel_specto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Extraction and labeling

In [3]:
import zipfile
import os
import pandas as pd
from glob import glob

# Step 1: Extract the zip file
zip_path = '/content/drive/MyDrive/AD/Mel_Spectrograms.zip'
extract_path = '/content'

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction completed!")
print("Contents of /content:")
print(os.listdir('/content'))

# Check if Mel_Spectrograms folder exists
if 'Mel_Spectrograms' in os.listdir('/content'):
    print("\nMel_Spectrograms folder found!")
    mel_path = '/content/Mel_Spectrograms'
    print("Contents of Mel_Spectrograms:")
    print(os.listdir(mel_path))

    # Check subdirectories
    subdirs = os.listdir(mel_path)
    print(f"\nSubdirectories: {subdirs}")

    for subdir in subdirs:
        subdir_path = os.path.join(mel_path, subdir)
        if os.path.isdir(subdir_path):
            files = os.listdir(subdir_path)
            print(f"Files in {subdir}: {len(files)} files")
            if len(files) > 0:
                print(f"Sample files: {files[:5]}")
else:
    print("Mel_Spectrograms folder NOT found!")
    # List all contents to see what was actually extracted
    for root, dirs, files in os.walk('/content'):
        level = root.replace('/content', '').count(os.sep)
        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 2 * (level + 1)
        for file in files:
            print(f'{subindent}{file}')

# Now try to find the correct paths
base_path = '/content/Mel_Spectrograms'
if not os.path.exists(base_path):
    # Try to find where the folder actually is
    for root, dirs, files in os.walk('/content'):
        if 'Mel_Spectrograms' in dirs:
            base_path = os.path.join(root, 'Mel_Spectrograms')
            break

print(f"\nUsing base path: {base_path}")

# Check if AD and CN directories exist
ad_path = os.path.join(base_path, 'AD')
cn_path = os.path.join(base_path, 'CN')

print(f"AD path exists: {os.path.exists(ad_path)}")
print(f"CN path exists: {os.path.exists(cn_path)}")

if os.path.exists(ad_path):
    print(f"AD directory contents: {os.listdir(ad_path)[:10]}")  # Show first 10 files
if os.path.exists(cn_path):
    print(f"CN directory contents: {os.listdir(cn_path)[:10]}")  # Show first 10 files

# Try different file extensions
extensions = ['*.png', '*.jpg', '*.jpeg', '*.npy', '*.npz']
ad_files = []
cn_files = []

for ext in extensions:
    if os.path.exists(ad_path):
        ad_files = glob(os.path.join(ad_path, ext))
        if ad_files:
            print(f"Found {len(ad_files)} AD files with extension {ext}")
            break

for ext in extensions:
    if os.path.exists(cn_path):
        cn_files = glob(os.path.join(cn_path, ext))
        if cn_files:
            print(f"Found {len(cn_files)} CN files with extension {ext}")
            break

# Create labels
labels = [1] * len(ad_files) + [0] * len(cn_files)
file_paths = ad_files + cn_files

print(f"\nTotal AD files: {len(ad_files)}")
print(f"Total CN files: {len(cn_files)}")

# Create DataFrame
df = pd.DataFrame({
    'file_path': file_paths,
    'label': labels
})

print(f"\nDataset shape: {df.shape}")
print("\nFirst few entries:")
print(df.head())

# Save to CSV
if not df.empty:
    df.to_csv('/content/alzheimers_dataset.csv', index=False)
    print("\nDataset saved to /content/alzheimers_dataset.csv")
else:
    print("\nNo files found. Please check the zip file contents.")

Extraction completed!
Contents of /content:
['.config', 'alzheimers_dataset.csv', 'drive', 'Mel_Spectrograms', 'sample_data']

Mel_Spectrograms folder found!
Contents of Mel_Spectrograms:
['AD', 'CN']

Subdirectories: ['AD', 'CN']
Files in AD: 87 files
Sample files: ['adrso063.npy', 'adrso192.npy', 'adrso144.npy', 'adrso110.npy', 'adrso122.npy']
Files in CN: 79 files
Sample files: ['adrso316.npy', 'adrso012.npy', 'adrso310.npy', 'adrso160.npy', 'adrso309.npy']

Using base path: /content/Mel_Spectrograms
AD path exists: True
CN path exists: True
AD directory contents: ['adrso063.npy', 'adrso192.npy', 'adrso144.npy', 'adrso110.npy', 'adrso122.npy', 'adrso060.npy', 'adrso039.npy', 'adrso106.npy', 'adrso112.npy', 'adrso250.npy']
CN directory contents: ['adrso316.npy', 'adrso012.npy', 'adrso310.npy', 'adrso160.npy', 'adrso309.npy', 'adrso292.npy', 'adrso014.npy', 'adrso315.npy', 'adrso157.npy', 'adrso022.npy']
Found 87 AD files with extension *.npy
Found 79 CN files with extension *.npy

To

In [4]:
import numpy as np
import pandas as pd
from scipy.ndimage import convolve1d
import os

# Load the dataset
df = pd.read_csv('/content/alzheimers_dataset.csv')
print(f"Dataset loaded with {len(df)} samples")

# Function to compute delta features
def compute_delta(features, N=2):
    """
    Compute delta features using gradient approximation
    features: 2D array (time, features)
    N: window size for delta computation
    """
    # Create delta coefficients
    weights = np.arange(-N, N+1)
    weights = weights/np.sum(weights**2)  # Normalize

    # Apply convolution along time axis
    delta = convolve1d(features, weights, axis=0, mode='nearest')
    return delta

# Function to compute log-mel spectrogram
def compute_log_mel(mel_spectrogram):
    """
    Compute log mel spectrogram
    """
    # Add small constant to avoid log(0)
    log_mel = np.log(mel_spectrogram + 1e-8)
    return log_mel

# Function to process all features for one spectrogram
def process_spectrogram(file_path):
    """
    Load spectrogram and compute all features
    Returns: log_mel, delta, delta_delta
    """
    # Load the mel spectrogram
    mel_spectrogram = np.load(file_path)

    # Ensure it's 2D (time, features)
    if mel_spectrogram.ndim == 3:
        mel_spectrogram = mel_spectrogram.squeeze()

    # Compute log mel
    log_mel = compute_log_mel(mel_spectrogram)

    # Compute delta
    delta = compute_delta(mel_spectrogram)

    # Compute delta-delta
    delta_delta = compute_delta(delta)

    return log_mel, delta, delta_delta

# Create directories for saving processed features
processed_dir = '/content/Processed_Features'
os.makedirs(processed_dir, exist_ok=True)
os.makedirs(os.path.join(processed_dir, 'log_mel'), exist_ok=True)
os.makedirs(os.path.join(processed_dir, 'delta'), exist_ok=True)
os.makedirs(os.path.join(processed_dir, 'delta_delta'), exist_ok=True)

# Process all files
print("Processing spectrograms...")

# Add new columns for processed file paths
df['log_mel_path'] = ''
df['delta_path'] = ''
df['delta_delta_path'] = ''

for idx, row in df.iterrows():
    file_path = row['file_path']

    try:
        # Process the spectrogram
        log_mel, delta, delta_delta = process_spectrogram(file_path)

        # Create new file paths
        base_name = os.path.basename(file_path).replace('.npy', '')
        log_mel_path = os.path.join(processed_dir, 'log_mel', f'{base_name}_log_mel.npy')
        delta_path = os.path.join(processed_dir, 'delta', f'{base_name}_delta.npy')
        delta_delta_path = os.path.join(processed_dir, 'delta_delta', f'{base_name}_delta_delta.npy')

        # Save processed features
        np.save(log_mel_path, log_mel)
        np.save(delta_path, delta)
        np.save(delta_delta_path, delta_delta)

        # Update DataFrame
        df.at[idx, 'log_mel_path'] = log_mel_path
        df.at[idx, 'delta_path'] = delta_path
        df.at[idx, 'delta_delta_path'] = delta_delta_path

        if idx % 20 == 0:
            print(f"Processed {idx}/{len(df)} files")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

print("Feature extraction completed!")

# Save updated dataset
df.to_csv('/content/alzheimers_dataset_processed.csv', index=False)
print("Updated dataset saved to /content/alzheimers_dataset_processed.csv")

# Display dataset info
print(f"\nDataset summary:")
print(f"Total samples: {len(df)}")
print(f"AD samples: {df['label'].sum()}")
print(f"CN samples: {len(df) - df['label'].sum()}")

print("\nFirst few entries:")
print(df.head())

Dataset loaded with 166 samples
Processing spectrograms...
Processed 0/166 files


  log_mel = np.log(mel_spectrogram + 1e-8)


Processed 20/166 files
Processed 40/166 files
Processed 60/166 files
Processed 80/166 files
Processed 100/166 files
Processed 120/166 files
Processed 140/166 files
Processed 160/166 files
Feature extraction completed!
Updated dataset saved to /content/alzheimers_dataset_processed.csv

Dataset summary:
Total samples: 166
AD samples: 87
CN samples: 79

First few entries:
                                   file_path  label  \
0  /content/Mel_Spectrograms/AD/adrso063.npy      1   
1  /content/Mel_Spectrograms/AD/adrso192.npy      1   
2  /content/Mel_Spectrograms/AD/adrso144.npy      1   
3  /content/Mel_Spectrograms/AD/adrso110.npy      1   
4  /content/Mel_Spectrograms/AD/adrso122.npy      1   

                                        log_mel_path  \
0  /content/Processed_Features/log_mel/adrso063_l...   
1  /content/Processed_Features/log_mel/adrso192_l...   
2  /content/Processed_Features/log_mel/adrso144_l...   
3  /content/Processed_Features/log_mel/adrso110_l...   
4  /content/Proce

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import cv2
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
import os
import warnings
warnings.filterwarnings('ignore')

# Load the processed dataset
df = pd.read_csv('/content/alzheimers_dataset_processed.csv')
print(f"Dataset loaded with {len(df)} samples")

# Function to load and combine the three channels
def load_three_channel_data(log_mel_path, delta_path, delta_delta_path):
    """
    Load and combine log-mel, delta, and delta-delta features into 3-channel data
    """
    try:
        # Check if all files exist
        if not all(os.path.exists(path) for path in [log_mel_path, delta_path, delta_delta_path]):
            return None

        # Load each feature
        log_mel = np.load(log_mel_path)
        delta = np.load(delta_path)
        delta_delta = np.load(delta_delta_path)

        # Handle different dimensions
        if log_mel.ndim == 1:
            log_mel = log_mel.reshape(-1, 1)
        if delta.ndim == 1:
            delta = delta.reshape(-1, 1)
        if delta_delta.ndim == 1:
            delta_delta = delta_delta.reshape(-1, 1)

        # Ensure all have the same shape (handle NaN/Inf values)
        # Get valid data (remove NaN/Inf)
        valid_idx = ~(np.isnan(log_mel).any(axis=1) | np.isinf(log_mel).any(axis=1) |
                      np.isnan(delta).any(axis=1) | np.isinf(delta).any(axis=1) |
                      np.isnan(delta_delta).any(axis=1) | np.isinf(delta_delta).any(axis=1))

        if np.sum(valid_idx) == 0:
            return None

        log_mel = log_mel[valid_idx]
        delta = delta[valid_idx]
        delta_delta = delta_delta[valid_idx]

        # Ensure all have the same shape
        min_time = min(log_mel.shape[0], delta.shape[0], delta_delta.shape[0])
        if min_time == 0:
            return None

        # Trim to same time dimension
        log_mel = log_mel[:min_time]
        delta = delta[:min_time]
        delta_delta = delta_delta[:min_time]

        # Handle feature dimension
        if log_mel.ndim == 1:
            log_mel = log_mel.reshape(-1, 1)
        if delta.ndim == 1:
            delta = delta.reshape(-1, 1)
        if delta_delta.ndim == 1:
            delta_delta = delta_delta.reshape(-1, 1)

        min_features = min(log_mel.shape[1], delta.shape[1], delta_delta.shape[1])
        if min_features == 0:
            return None

        # Trim to same feature dimension
        log_mel = log_mel[:, :min_features]
        delta = delta[:, :min_features]
        delta_delta = delta_delta[:, :min_features]

        # Stack along the channel dimension
        three_channel = np.stack([log_mel, delta, delta_delta], axis=-1)

        return three_channel
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Function to resize data to fixed size
def resize_data(data, target_shape=(128, 128)):
    """
    Resize 3D data (time, features, channels) to fixed size
    """
    try:
        # Handle case where data might be too small
        if data.shape[0] == 0 or data.shape[1] == 0:
            return None

        # Transpose to (channels, time, features) for easier processing
        data = np.transpose(data, (2, 0, 1))

        resized_channels = []
        for channel in data:
            # Handle very small dimensions
            if channel.shape[0] == 0 or channel.shape[1] == 0:
                return None

            # Resize using OpenCV
            resized = cv2.resize(channel.astype(np.float32),
                               (target_shape[1], target_shape[0]),
                               interpolation=cv2.INTER_LINEAR)
            resized_channels.append(resized)

        # Stack and transpose back to (time, features, channels)
        resized_data = np.stack(resized_channels, axis=-1)
        return resized_data
    except Exception as e:
        print(f"Error resizing data: {e}")
        return None

# Function to standardize the data (more robust)
def standardize_data(data):
    """
    Standardize data to have zero mean and unit variance
    """
    try:
        # Handle NaN/Inf values
        data = np.nan_to_num(data, nan=0.0, posinf=0.0, neginf=0.0)

        original_shape = data.shape
        data_flat = data.reshape(-1, data.shape[-1])  # (time*features, channels)

        # Standardize each channel independently
        standardized_data = np.zeros_like(data_flat)
        for i in range(data_flat.shape[1]):
            channel_data = data_flat[:, i]
            mean = np.mean(channel_data)
            std = np.std(channel_data)
            if std > 1e-8:  # Avoid division by zero
                standardized_data[:, i] = (channel_data - mean) / std
            else:
                standardized_data[:, i] = channel_data - mean

        # Reshape back
        standardized_data = standardized_data.reshape(original_shape)
        return standardized_data
    except Exception as e:
        print(f"Error standardizing data: {e}")
        return None

# Load and preprocess all data
print("\n=== DATA PREPROCESSING ===")
X = []
y = []
valid_indices = []

target_shape = (128, 128)  # You can adjust this based on your data

print("Loading and preprocessing data...")
successful_loads = 0

for idx, row in df.iterrows():
    try:
        # Load three-channel data
        data = load_three_channel_data(row['log_mel_path'], row['delta_path'], row['delta_delta_path'])

        if data is not None and data.shape[0] > 0 and data.shape[1] > 0:
            # Resize to fixed size
            data_resized = resize_data(data, target_shape)

            if data_resized is not None:
                # Standardize the data
                data_standardized = standardize_data(data_resized)

                if data_standardized is not None:
                    X.append(data_standardized)
                    y.append(row['label'])
                    valid_indices.append(idx)
                    successful_loads += 1

                    if successful_loads % 20 == 0:
                        print(f"Processed {successful_loads} samples successfully")

    except Exception as e:
        print(f"Error processing sample {idx}: {e}")
        continue

if len(X) == 0:
    raise ValueError("No data could be loaded successfully. Please check your data files.")

X = np.array(X)
y = np.array(y)

print(f"\nFinal dataset shapes:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Check for any remaining shape inconsistencies
if len(X) > 0:
    unique_shapes = [x.shape for x in X]
    print(f"Unique shapes in dataset: {pd.Series(unique_shapes).value_counts()}")

# Split the data (80% train, 20% validation) - test set will be separate
print("\n=== DATASET SPLITTING ===")
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")

# Check class distribution in splits
print(f"\nTraining class distribution: {np.bincount(y_train)}")
print(f"Validation class distribution: {np.bincount(y_val)}")

# Calculate class weights for imbalanced dataset
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))
print(f"\nClass weights: {class_weight_dict}")

# Visualize some samples (if we have data)
if len(X_train) > 0:
    print("\n=== SAMPLE VISUALIZATION ===")
    try:
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))

        # Plot samples from both classes
        for i in range(min(2, len(np.where(y_train == 1)[0]))):
            ad_idx = np.where(y_train == 1)[0][i]

            axes[i, 0].imshow(X_train[ad_idx, :, :, 0], aspect='auto', cmap='viridis')
            axes[i, 0].set_title(f'AD - Log-Mel (Sample {ad_idx})')
            axes[i, 0].axis('off')

            axes[i, 1].imshow(X_train[ad_idx, :, :, 1], aspect='auto', cmap='viridis')
            axes[i, 1].set_title(f'AD - Delta')
            axes[i, 1].axis('off')

            axes[i, 2].imshow(X_train[ad_idx, :, :, 2], aspect='auto', cmap='viridis')
            axes[i, 2].set_title(f'AD - Delta-Delta')
            axes[i, 2].axis('off')

        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Could not create visualization: {e}")

# Save preprocessed data
print("\n=== SAVING PROCESSED DATA ===")
try:
    np.save('/content/X_train.npy', X_train)
    np.save('/content/X_val.npy', X_val)
    np.save('/content/y_train.npy', y_train)
    np.save('/content/y_val.npy', y_val)

    print("Preprocessed data saved!")
    print("Files created:")
    print("- /content/X_train.npy")
    print("- /content/X_val.npy")
    print("- /content/y_train.npy")
    print("- /content/y_val.npy")

    # Create a summary dictionary
    data_summary = {
        'X_train_shape': X_train.shape,
        'X_val_shape': X_val.shape,
        'y_train_shape': y_train.shape,
        'y_val_shape': y_val.shape,
        'class_weights': class_weight_dict,
        'target_shape': target_shape,
        'successful_samples': len(X)
    }

    import pickle
    with open('/content/data_summary.pkl', 'wb') as f:
        pickle.dump(data_summary, f)

    print("\nData summary saved to /content/data_summary.pkl")

except Exception as e:
    print(f"Error saving data: {e}")

print("\n=== PREPROCESSING COMPLETE ===")
print(f"Successfully processed {len(X)} samples out of {len(df)} total samples")
print("Training and validation data is ready for model training!")

# Function to prepare test data (when you provide the test directory)
def prepare_test_data(test_directory):
    """
    Prepare test data from a separate directory
    Expected structure:
    test_directory/
        AD/
        CN/
    """
    print(f"Preparing test data from: {test_directory}")

    # Implement test data loading logic here when you provide the test directory structure
    # This would be similar to the training data loading but without labels or with different label handling

    pass

print("\nTo prepare your separate test directory, call prepare_test_data('path/to/test/directory')")

Dataset loaded with 166 samples

=== DATA PREPROCESSING ===
Loading and preprocessing data...


ValueError: No data could be loaded successfully. Please check your data files.