<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Speech_AD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Set Up Google Colab Environment

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install required libraries
!pip install opensmile pyAudioAnalysis

# Import libraries
import pandas as pd
import numpy as np
import librosa
import opensmile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import os

Mounted at /content/drive
Collecting opensmile
  Downloading opensmile-2.5.1-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting pyAudioAnalysis
  Downloading pyAudioAnalysis-0.3.14.tar.gz (41.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.2.3-py3-none-any.whl.metadata (4.2 kB)
Collecting audeer>=2.1.1 (from audinterface>=0.7.0->opensmile)
  Downloading audeer-2.2.1-py3-none-any.whl.metadata (4.1 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.3.1-py3-none-any.whl.metadata (4.6 kB)
Collecting audiofile>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audiofile-1.5.1-py3-none-any.whl.metadat

- Mount Google Drive to access the .tgz files and CSV files.
- Install opensmile for eGeMAPS acoustic feature extraction, librosa for audio processing, and scikit-learn for machine learning models.
- Import libraries for data handling, feature extraction, and visualization.

# Step 2: Load and Organize Datasets

In [5]:
import pandas as pd
import os

# Define paths to datasets in Google Drive
data_path = '/content/drive/MyDrive/Voice/'
diagnosis_train = data_path + 'ADReSSo21-diagnosis-train.tgz'
progression_train = data_path + 'ADReSSo21-progression-train.tgz'
progression_test = data_path + 'ADReSSo21-progression-test.tgz'

# Create directories for extraction
os.makedirs('/content/diagnosis_train', exist_ok=True)
os.makedirs('/content/progression_train', exist_ok=True)
os.makedirs('/content/progression_test', exist_ok=True)

# Unzip datasets
!tar -xvzf "{diagnosis_train}" -C "/content/diagnosis_train"
!tar -xvzf "{progression_train}" -C "/content/progression_train"
!tar -xvzf "{progression_test}" -C "/content/progression_test"

# Verify extracted files
print("Diagnosis Train Files:", os.listdir('/content/diagnosis_train'))
print("Progression Train Files:", os.listdir('/content/progression_train'))
print("Progression Test Files:", os.listdir('/content/progression_test'))

# Load CSV files
task1 = pd.read_csv(data_path + 'task1.csv')  # AD vs Control labels
task2 = pd.read_csv(data_path + 'task2.csv')  # MMSE scores
task3 = pd.read_csv(data_path + 'task3.csv')  # Cognitive decline labels

# Display dataset info
print("\nTask 1 (AD Classification):")
print(task1.head())
print("\nTask 2 (MMSE Regression):")
print(task2.head())
print("\nTask 3 (Cognitive Decline):")
print(task3.head())

ADReSSo21/diagnosis/
ADReSSo21/diagnosis/README.md
ADReSSo21/diagnosis/train/
ADReSSo21/diagnosis/train/segmentation/
ADReSSo21/diagnosis/train/segmentation/cn/
ADReSSo21/diagnosis/train/segmentation/cn/adrso281.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso308.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso270.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso022.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso298.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso300.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso265.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso186.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso148.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso152.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso182.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso268.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso259.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso276.csv
ADReSSo21/diagnosis/train/segmentation/cn/adrso261.csv
ADReSSo21/diag

# Step 3: Acoustic Feature Extraction (eGeMAPS)

In [12]:
import opensmile
import librosa
import pandas as pd
import os
import numpy as np

# Initialize opensmile for eGeMAPS feature extraction
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

# Function to extract eGeMAPS features from an audio file
def extract_egemaps(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000)  # Load audio
        features = smile.process_signal(y, sr)  # Extract eGeMAPS
        return features.values.flatten()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

# Paths to audio files (diagnosis train: cn and ad subdirectories)
diagnosis_audio_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/'
cn_audio_path = os.path.join(diagnosis_audio_base, 'cn/')
ad_audio_path = os.path.join(diagnosis_audio_base, 'ad/')

# Collect all .wav files from both cn/ and ad/ directories
audio_files = []
for path in [cn_audio_path, ad_audio_path]:
    if os.path.exists(path):
        audio_files.extend([os.path.join(path, f) for f in os.listdir(path) if f.endswith('.wav')])
    else:
        print(f"Directory not found: {path}")

# Extract features for all audio files
audio_features = []  # Initialize as empty list
audio_ids = []
for audio_path in audio_files:
    audio_file = os.path.basename(audio_path)
    audio_id = audio_file.split('.')[0]  # Extract ID from filename (e.g., adrso123)
    features = extract_egemaps(audio_path)
    if features is not None:
        audio_features.append(features)
        audio_ids.append(audio_id)
    else:
        print(f"Skipping {audio_id} due to feature extraction failure")

# Check if any features were extracted
if not audio_features:
    raise ValueError("No audio features extracted. Check audio files or extraction process.")

# Convert to DataFrame
audio_features_df = pd.DataFrame(audio_features)
audio_features_df['ID'] = audio_ids

# Load task1.csv for labels
data_path = '/content/drive/MyDrive/Voice/'
task1 = pd.read_csv(data_path + 'task1.csv')

# Normalize IDs in task1.csv to match audio file IDs
task1['ID'] = task1['ID'].apply(lambda x: 'adrso' + x.replace('adrsdt', '').zfill(3))

# Merge with task1 labels
task1_data = pd.merge(audio_features_df, task1, on='ID', how='inner')
print("Merged Acoustic Features with Labels:")
print(task1_data.head())
print(f"Number of matched records: {len(task1_data)}")

# Save the merged DataFrame for debugging
task1_data.to_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv', index=False)
print("Saved acoustic features to /content/drive/MyDrive/Voice/task1_acoustic_features.csv")

Merged Acoustic Features with Labels:
           0         1          2          3          4          5  \
0  34.314342  0.172523  31.954039  34.558792  38.469227   6.515188   
1  34.439098  0.178912  29.944578  33.201965  39.123035   9.178457   
2  34.765678  0.144698  31.995552  35.011833  37.706375   5.710823   
3  30.145615  0.129570  28.376390  29.582561  32.609303   4.232912   
4  31.052141  0.345289  22.244028  25.008968  40.717941  18.473913   

            6           7           8           9  ...        80        81  \
0  332.870453  461.649567  120.299301   81.307747  ...  0.544044  1.976285   
1  169.268906  333.093689  160.969574  267.036377  ...  0.434866  2.533154   
2  231.058731  375.145050  145.108765  262.729279  ...  0.130108  1.557071   
3  292.438629  535.989441   76.417542   96.036621  ...  0.086000  1.878543   
4  546.450195  783.379028  370.438660  510.268463  ...  0.291039  1.648093   

         82        83        84        85        86         87        ID

## low number of matches

In [14]:
print("Audio IDs:", audio_ids[:5])
print("Task1 IDs:", task1['ID'].head())

Audio IDs: ['adrso010', 'adrso014', 'adrso015', 'adrso005', 'adrso312']
Task1 IDs: 0    adrso015
1    adrso040
2    adrso026
3    adrso067
4    adrso058
Name: ID, dtype: object


In [15]:
test_features = extract_egemaps('/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/adrso173.wav')
print("Test features shape:", test_features.shape if test_features is not None else "Failed")

Test features shape: (88,)


In [13]:
print("Number of audio files:", len(audio_files))
print("Sample audio files:", audio_files[:5])

Number of audio files: 166
Sample audio files: ['/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/adrso010.wav', '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/adrso014.wav', '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/adrso015.wav', '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/adrso005.wav', '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/adrso312.wav']


In [16]:
import opensmile
import librosa
import pandas as pd
import os
import numpy as np

# Initialize opensmile for eGeMAPS feature extraction
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

# Function to extract eGeMAPS features from an audio file
def extract_egemaps(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000)  # Load audio
        features = smile.process_signal(y, sr)  # Extract eGeMAPS
        return features.values.flatten()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

# Paths to audio files (diagnosis train: cn and ad subdirectories)
diagnosis_audio_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/'
cn_audio_path = os.path.join(diagnosis_audio_base, 'cn/')
ad_audio_path = os.path.join(diagnosis_audio_base, 'ad/')

# Collect all .wav files from both cn/ and ad/ directories
audio_files = []
for path in [cn_audio_path, ad_audio_path]:
    if os.path.exists(path):
        files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.wav')]
        audio_files.extend(files)
        print(f"Found {len(files)} audio files in {path}")
    else:
        print(f"Directory not found: {path}")

print(f"Total audio files found: {len(audio_files)}")

# Extract features for all audio files
audio_features = []
audio_ids = []
skipped_files = []
for audio_path in audio_files:
    audio_file = os.path.basename(audio_path)
    audio_id = audio_file.split('.')[0]  # Extract ID (e.g., adrso123)
    features = extract_egemaps(audio_path)
    if features is not None:
        audio_features.append(features)
        audio_ids.append(audio_id)
    else:
        print(f"Skipping {audio_id} due to feature extraction failure")
        skipped_files.append(audio_id)

print(f"Extracted features for {len(audio_features)} audio files")
print(f"Skipped {len(skipped_files)} audio files: {skipped_files}")

# Check if any features were extracted
if not audio_features:
    raise ValueError("No audio features extracted. Check audio files or extraction process.")

# Convert to DataFrame
audio_features_df = pd.DataFrame(audio_features)
audio_features_df['ID'] = audio_ids

# Load task1.csv for labels
data_path = '/content/drive/MyDrive/Voice/'
task1 = pd.read_csv(data_path + 'task1.csv')

# Normalize IDs in task1.csv to match audio file IDs
task1['ID'] = task1['ID'].apply(lambda x: 'adrso' + x.replace('adrsdt', '').zfill(3))

# Check ID overlap
audio_id_set = set(audio_ids)
task1_id_set = set(task1['ID'])
print(f"Audio IDs in audio_files: {len(audio_id_set)}")
print(f"Task1 IDs: {len(task1_id_set)}")
print(f"Common IDs: {len(audio_id_set & task1_id_set)}")
print(f"Audio IDs not in task1: {audio_id_set - task1_id_set}")
print(f"Task1 IDs not in audio: {task1_id_set - audio_id_set}")

# Merge with task1 labels
task1_data = pd.merge(audio_features_df, task1, on='ID', how='inner')
print("Merged Acoustic Features with Labels:")
print(task1_data.head())
print(f"Number of matched records: {len(task1_data)}")

# Save the merged DataFrame
task1_data.to_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv', index=False)
print("Saved acoustic features to /content/drive/MyDrive/Voice/task1_acoustic_features.csv")

# Save unmatched IDs for debugging
unmatched_audio_ids = list(audio_id_set - task1_id_set)
unmatched_task1_ids = list(task1_id_set - audio_id_set)
pd.DataFrame({'unmatched_audio_ids': unmatched_audio_ids}).to_csv(
    '/content/drive/MyDrive/Voice/unmatched_audio_ids.csv', index=False
)
pd.DataFrame({'unmatched_task1_ids': unmatched_task1_ids}).to_csv(
    '/content/drive/MyDrive/Voice/unmatched_task1_ids.csv', index=False
)
print("Saved unmatched IDs to /content/drive/MyDrive/Voice/unmatched_{audio,task1}_ids.csv")

Found 79 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/
Found 87 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/ad/
Total audio files found: 166
Extracted features for 166 audio files
Skipped 0 audio files: []
Audio IDs in audio_files: 166
Task1 IDs: 71
Common IDs: 41
Audio IDs not in task1: {'adrso228', 'adrso211', 'adrso291', 'adrso189', 'adrso128', 'adrso177', 'adrso172', 'adrso167', 'adrso157', 'adrso188', 'adrso206', 'adrso156', 'adrso308', 'adrso154', 'adrso178', 'adrso216', 'adrso169', 'adrso202', 'adrso276', 'adrso274', 'adrso285', 'adrso200', 'adrso186', 'adrso197', 'adrso266', 'adrso265', 'adrso160', 'adrso090', 'adrso223', 'adrso161', 'adrso259', 'adrso198', 'adrso170', 'adrso141', 'adrso165', 'adrso122', 'adrso093', 'adrso299', 'adrso074', 'adrso257', 'adrso234', 'adrso283', 'adrso281', 'adrso077', 'adrso209', 'adrso247', 'adrso277', 'adrso309', 'adrso151', 'adrso153', 'adrso307', 'adrso280', 'adrso268', 'adrso289', 'adr

### Explanation of Changes
1. **Debugging Audio Files**:
   - Print the number of `.wav` files in `cn/` and `ad/` directories and the total count.
   - Expect ~108 files (based on ADReSSo train split). If fewer, some audio files are missing.

2. **Tracking Skipped Files**:
   - Maintain a `skipped_files` list to log audio IDs where feature extraction failed.
   - Print the number of skipped files and their IDs.

3. **ID Overlap Analysis**:
   - Compare `audio_ids` (from audio files) with `task1['ID']` (after normalization) using set operations.
   - Print:
     - Number of unique audio IDs.
     - Number of unique `task1` IDs.
     - Number of common IDs (should be close to 108).
     - Audio IDs not in `task1.csv`.
     - `task1.csv` IDs not in audio files.
   - Save unmatched IDs to CSV files for inspection.

4. **Preserved Core Logic**:
   - Kept eGeMAPS extraction, ID normalization (`adrso` + zero-padded ID), and merging logic unchanged.
   - Ensured 88 features are extracted per audio file.

---



In [17]:
import opensmile
import librosa
import pandas as pd
import os
import numpy as np

# Initialize opensmile for eGeMAPS feature extraction
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

# Function to extract eGeMAPS features from an audio file
def extract_egemaps(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000)  # Load audio
        features = smile.process_signal(y, sr)  # Extract eGeMAPS
        return features.values.flatten()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

# Paths to audio files (diagnosis train: cn and ad subdirectories)
diagnosis_audio_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/'
cn_audio_path = os.path.join(diagnosis_audio_base, 'cn/')
ad_audio_path = os.path.join(diagnosis_audio_base, 'ad/')

# Load task1.csv for labels
data_path = '/content/drive/MyDrive/Voice/'
task1 = pd.read_csv(data_path + 'task1.csv')

# Normalize IDs in task1.csv to match audio file IDs
task1['ID'] = task1['ID'].apply(lambda x: 'adrso' + x.replace('adrsdt', '').zfill(3))
task1_ids = set(task1['ID'])
print(f"Task1.csv contains {len(task1)} IDs")

# Collect .wav files from cn/ and ad/, filtering by task1 IDs
audio_files = []
audio_id_to_path = {}
for path in [cn_audio_path, ad_audio_path]:
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.wav')]
        for f in files:
            audio_id = f.split('.')[0]
            if audio_id in task1_ids:
                audio_files.append(os.path.join(path, f))
                audio_id_to_path[audio_id] = os.path.join(path, f)
        print(f"Found {len(files)} audio files in {path}, {sum(1 for f in files if f.split('.')[0] in task1_ids)} match task1 IDs")
    else:
        print(f"Directory not found: {path}")

print(f"Total audio files matching task1 IDs: {len(audio_files)}")

# Identify task1 IDs without audio files
missing_audio_ids = task1_ids - set(audio_id_to_path.keys())
print(f"Task1 IDs missing audio files ({len(missing_audio_ids)}): {missing_audio_ids}")

# Extract features for matching audio files
audio_features = []
audio_ids = []
skipped_files = []
for audio_path in audio_files:
    audio_file = os.path.basename(audio_path)
    audio_id = audio_file.split('.')[0]
    features = extract_egemaps(audio_path)
    if features is not None:
        audio_features.append(features)
        audio_ids.append(audio_id)
    else:
        print(f"Skipping {audio_id} due to feature extraction failure")
        skipped_files.append(audio_id)

print(f"Extracted features for {len(audio_features)} audio files")
print(f"Skipped {len(skipped_files)} audio files: {skipped_files}")

# Check if any features were extracted
if not audio_features:
    raise ValueError("No audio features extracted. Check audio files or extraction process.")

# Convert to DataFrame
audio_features_df = pd.DataFrame(audio_features)
audio_features_df['ID'] = audio_ids

# Merge with task1 labels
task1_data = pd.merge(audio_features_df, task1, on='ID', how='inner')
print("Merged Acoustic Features with Labels:")
print(task1_data.head())
print(f"Number of matched records: {len(task1_data)}")

# Save the merged DataFrame
task1_data.to_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv', index=False)
print("Saved acoustic features to /content/drive/MyDrive/Voice/task1_acoustic_features.csv")

# Save debugging info
pd.DataFrame({'missing_audio_ids': list(missing_audio_ids)}).to_csv(
    '/content/drive/MyDrive/Voice/missing_audio_ids.csv', index=False
)
pd.DataFrame({'skipped_files': skipped_files}).to_csv(
    '/content/drive/MyDrive/Voice/skipped_files.csv', index=False
)
print("Saved debugging info to /content/drive/MyDrive/Voice/{missing_audio_ids,skipped_files}.csv")

Task1.csv contains 71 IDs
Found 79 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/, 16 match task1 IDs
Found 87 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/ad/, 25 match task1 IDs
Total audio files matching task1 IDs: 41
Task1 IDs missing audio files (30): {'adrso050', 'adrso051', 'adrso067', 'adrso004', 'adrso011', 'adrso062', 'adrso058', 'adrso009', 'adrso044', 'adrso064', 'adrso061', 'adrso052', 'adrso069', 'adrso040', 'adrso006', 'adrso034', 'adrso038', 'adrso065', 'adrso041', 'adrso020', 'adrso048', 'adrso001', 'adrso026', 'adrso066', 'adrso042', 'adrso037', 'adrso029', 'adrso057', 'adrso030', 'adrso013'}
Extracted features for 41 audio files
Skipped 0 audio files: []
Merged Acoustic Features with Labels:
           0         1          2          3          4          5  \
0  34.314342  0.172523  31.954039  34.558792  38.469227   6.515188   
1  34.439098  0.178912  29.944578  33.201965  39.123035   9.178457   
2  34.765678  

In [18]:
import opensmile
import librosa
import pandas as pd
import os
import numpy as np

# Initialize opensmile for eGeMAPS feature extraction
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

# Function to extract eGeMAPS features from an audio file
def extract_egemaps(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000)  # Load audio
        features = smile.process_signal(y, sr)  # Extract eGeMAPS
        return features.values.flatten()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

# Paths to data
data_path = '/content/drive/MyDrive/Voice/'
diagnosis_audio_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/'
cn_audio_path = os.path.join(diagnosis_audio_base, 'cn/')
ad_audio_path = os.path.join(diagnosis_audio_base, 'ad/')
segmentation_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/'
mmse_scores_file = '/content/diagnosis_train/ADReSSo21/diagnosis/train/adresso-train-mmse-scores.csv'

# Load task1.csv
task1 = pd.read_csv(data_path + 'task1.csv')
task1['ID'] = task1['ID'].apply(lambda x: 'adrso' + x.replace('adrsdt', '').zfill(3))
task1_ids = set(task1['ID'])
print(f"Task1.csv contains {len(task1)} IDs")

# Load MMSE scores file
try:
    mmse_scores = pd.read_csv(mmse_scores_file)
    mmse_ids = set(mmse_scores.get('ID', []))  # Adjust column name if needed
    print(f"MMSE scores file contains {len(mmse_ids)} IDs")
    print(f"Task1 IDs in MMSE scores: {len(task1_ids & mmse_ids)}")
except FileNotFoundError:
    print(f"MMSE scores file not found: {mmse_scores_file}")
    mmse_ids = set()

# Collect segmentation IDs
segmentation_ids = set()
for path in [os.path.join(segmentation_base, 'cn/'), os.path.join(segmentation_base, 'ad/')]:
    if os.path.exists(path):
        files = [f.split('.')[0] for f in os.listdir(path) if f.endswith('.csv')]
        segmentation_ids.update(files)
        print(f"Found {len(files)} segmentation files in {path}")
    else:
        print(f"Segmentation directory not found: {path}")
print(f"Total segmentation IDs: {len(segmentation_ids)}")
print(f"Task1 IDs in segmentation: {len(task1_ids & segmentation_ids)}")

# Collect .wav files from cn/ and ad/, filtering by task1 IDs
audio_files = []
audio_id_to_path = {}
for path in [cn_audio_path, ad_audio_path]:
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.wav')]
        for f in files:
            audio_id = f.split('.')[0]
            if audio_id in task1_ids:
                audio_files.append(os.path.join(path, f))
                audio_id_to_path[audio_id] = os.path.join(path, f)
        print(f"Found {len(files)} audio files in {path}, {sum(1 for f in files if f.split('.')[0] in task1_ids)} match task1 IDs")
    else:
        print(f"Directory not found: {path}")

print(f"Total audio files matching task1 IDs: {len(audio_files)}")

# Search for missing task1 IDs in other directories
missing_audio_ids = task1_ids - set(audio_id_to_path.keys())
print(f"Task1 IDs missing audio files ({len(missing_audio_ids)}): {missing_audio_ids}")

# Check other directories for missing audio files
other_dirs = [
    '/content/progression_train/ADReSSo21/progression/train/audio/no_decline/',
    '/content/progression_train/ADReSSo21/progression/train/audio/decline/',
    '/content/progression_test/ADReSSo21/progression/test-dist/audio/'
]
found_missing = {}
for missing_id in missing_audio_ids:
    for d in other_dirs:
        if os.path.exists(d):
            if f"{missing_id}.wav" in os.listdir(d):
                found_missing[missing_id] = os.path.join(d, f"{missing_id}.wav")
if found_missing:
    print("Found missing audio files in other directories:")
    for id_, path in found_missing.items():
        print(f"{id_}: {path}")
else:
    print("No missing audio files found in other directories")

# Extract features for matching audio files
audio_features = []
audio_ids = []
skipped_files = []
for audio_path in audio_files:
    audio_file = os.path.basename(audio_path)
    audio_id = audio_file.split('.')[0]
    features = extract_egemaps(audio_path)
    if features is not None:
        audio_features.append(features)
        audio_ids.append(audio_id)
    else:
        print(f"Skipping {audio_id} due to feature extraction failure")
        skipped_files.append(audio_id)

print(f"Extracted features for {len(audio_features)} audio files")
print(f"Skipped {len(skipped_files)} audio files: {skipped_files}")

# Check if any features were extracted
if not audio_features:
    raise ValueError("No audio features extracted. Check audio files or extraction process.")

# Convert to DataFrame
audio_features_df = pd.DataFrame(audio_features)
audio_features_df['ID'] = audio_ids

# Merge with task1 labels
task1_data = pd.merge(audio_features_df, task1, on='ID', how='inner')
print("Merged Acoustic Features with Labels:")
print(task1_data.head())
print(f"Number of matched records: {len(task1_data)}")

# Save the merged DataFrame
task1_data.to_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv', index=False)
print("Saved acoustic features to /content/drive/MyDrive/Voice/task1_acoustic_features.csv")

# Save debugging info
pd.DataFrame({'missing_audio_ids': list(missing_audio_ids)}).to_csv(
    '/content/drive/MyDrive/Voice/missing_audio_ids.csv', index=False
)
pd.DataFrame({'skipped_files': skipped_files}).to_csv(
    '/content/drive/MyDrive/Voice/skipped_files.csv', index=False
)
pd.DataFrame({'segmentation_ids': list(segmentation_ids)}).to_csv(
    '/content/drive/MyDrive/Voice/segmentation_ids.csv', index=False
)
print("Saved debugging info to /content/drive/MyDrive/Voice/{missing_audio_ids,skipped_files,segmentation_ids}.csv")

Task1.csv contains 71 IDs
MMSE scores file contains 0 IDs
Task1 IDs in MMSE scores: 0
Found 79 segmentation files in /content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/cn/
Found 87 segmentation files in /content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/ad/
Total segmentation IDs: 166
Task1 IDs in segmentation: 41
Found 79 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/, 16 match task1 IDs
Found 87 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/ad/, 25 match task1 IDs
Total audio files matching task1 IDs: 41
Task1 IDs missing audio files (30): {'adrso050', 'adrso051', 'adrso067', 'adrso004', 'adrso011', 'adrso062', 'adrso058', 'adrso009', 'adrso044', 'adrso064', 'adrso061', 'adrso052', 'adrso069', 'adrso040', 'adrso006', 'adrso034', 'adrso038', 'adrso065', 'adrso041', 'adrso020', 'adrso048', 'adrso001', 'adrso026', 'adrso066', 'adrso042', 'adrso037', 'adrso029', 'adrso057', 'adrso030', 'adrso013'}
No missing

In [19]:
import opensmile
import librosa
import pandas as pd
import os
import numpy as np
import glob

# Initialize opensmile for eGeMAPS feature extraction
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

# Function to extract eGeMAPS features from an audio file
def extract_egemaps(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000)  # Load audio
        features = smile.process_signal(y, sr)  # Extract eGeMAPS
        return features.values.flatten()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

# Paths to data
data_path = '/content/drive/MyDrive/Voice/'
diagnosis_audio_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/'
cn_audio_path = os.path.join(diagnosis_audio_base, 'cn/')
ad_audio_path = os.path.join(diagnosis_audio_base, 'ad/')
segmentation_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/'
mmse_scores_file = '/content/diagnosis_train/ADReSSo21/diagnosis/train/adresso-train-mmse-scores.csv'

# Load task1.csv
task1 = pd.read_csv(data_path + 'task1.csv')
task1['ID'] = task1['ID'].apply(lambda x: 'adrso' + x.replace('adrsdt', '').zfill(3))
task1_ids = set(task1['ID'])
print(f"Task1.csv contains {len(task1)} IDs")

# Inspect MMSE scores file
try:
    mmse_scores = pd.read_csv(mmse_scores_file)
    print("MMSE scores file columns:", mmse_scores.columns.tolist())
    print("MMSE scores sample:")
    print(mmse_scores.head())
    # Try common ID column names
    id_columns = ['ID', 'id', 'participant_id', 'Participant_ID']
    mmse_ids = set()
    for col in id_columns:
        if col in mmse_scores.columns:
            mmse_ids = set(mmse_scores[col].apply(lambda x: f"adrso{str(x).zfill(3)}" if str(x).isdigit() else x))
            break
    print(f"MMSE scores file contains {len(mmse_ids)} IDs")
    print(f"Task1 IDs in MMSE scores: {len(task1_ids & mmse_ids)}")
except FileNotFoundError:
    print(f"MMSE scores file not found: {mmse_scores_file}")
    mmse_ids = set()
except Exception as e:
    print(f"Error reading MMSE scores file: {e}")
    mmse_ids = set()

# Collect segmentation IDs
segmentation_ids = set()
for path in [os.path.join(segmentation_base, 'cn/'), os.path.join(segmentation_base, 'ad/')]:
    if os.path.exists(path):
        files = [f.split('.')[0] for f in os.listdir(path) if f.endswith('.csv')]
        segmentation_ids.update(files)
        print(f"Found {len(files)} segmentation files in {path}")
    else:
        print(f"Segmentation directory not found: {path}")
print(f"Total segmentation IDs: {len(segmentation_ids)}")
print(f"Task1 IDs in segmentation: {len(task1_ids & segmentation_ids)}")

# Collect .wav files from cn/ and ad/, filtering by task1 IDs
audio_files = []
audio_id_to_path = {}
for path in [cn_audio_path, ad_audio_path]:
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.wav')]
        for f in files:
            audio_id = f.split('.')[0]
            if audio_id in task1_ids:
                audio_files.append(os.path.join(path, f))
                audio_id_to_path[audio_id] = os.path.join(path, f)
        print(f"Found {len(files)} audio files in {path}, {sum(1 for f in files if f.split('.')[0] in task1_ids)} match task1 IDs")
    else:
        print(f"Directory not found: {path}")

print(f"Total audio files matching task1 IDs: {len(audio_files)}")

# Search for missing task1 IDs across all directories
missing_audio_ids = task1_ids - set(audio_id_to_path.keys())
print(f"Task1 IDs missing audio files ({len(missing_audio_ids)}): {missing_audio_ids}")

# Recursive search for missing audio files
found_missing = {}
for missing_id in missing_audio_ids:
    matches = glob.glob(f"/content/**/{missing_id}.wav", recursive=True)
    if matches:
        found_missing[missing_id] = matches
if found_missing:
    print("Found missing audio files:")
    for id_, paths in found_missing.items():
        print(f"{id_}: {paths}")
else:
    print("No missing audio files found in /content/")

# Extract features for matching audio files
audio_features = []
audio_ids = []
skipped_files = []
for audio_path in audio_files:
    audio_file = os.path.basename(audio_path)
    audio_id = audio_file.split('.')[0]
    features = extract_egemaps(audio_path)
    if features is not None:
        audio_features.append(features)
        audio_ids.append(audio_id)
    else:
        print(f"Skipping {audio_id} due to feature extraction failure")
        skipped_files.append(audio_id)

print(f"Extracted features for {len(audio_features)} audio files")
print(f"Skipped {len(skipped_files)} audio files: {skipped_files}")

# Check if any features were extracted
if not audio_features:
    raise ValueError("No audio features extracted. Check audio files or extraction process.")

# Convert to DataFrame
audio_features_df = pd.DataFrame(audio_features)
audio_features_df['ID'] = audio_ids

# Merge with task1 labels
task1_data = pd.merge(audio_features_df, task1, on='ID', how='inner')
print("Merged Acoustic Features with Labels:")
print(task1_data.head())
print(f"Number of matched records: {len(task1_data)}")

# Save the merged DataFrame
task1_data.to_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv', index=False)
print("Saved acoustic features to /content/drive/MyDrive/Voice/task1_acoustic_features.csv")

# Save debugging info
pd.DataFrame({'missing_audio_ids': list(missing_audio_ids)}).to_csv(
    '/content/drive/MyDrive/Voice/missing_audio_ids.csv', index=False
)
pd.DataFrame({'skipped_files': skipped_files}).to_csv(
    '/content/drive/MyDrive/Voice/skipped_files.csv', index=False
)
pd.DataFrame({'segmentation_ids': list(segmentation_ids)}).to_csv(
    '/content/drive/MyDrive/Voice/segmentation_ids.csv', index=False
)
print("Saved debugging info to /content/drive/MyDrive/Voice/{missing_audio_ids,skipped_files,segmentation_ids}.csv")

# Note dataset limitation
if len(missing_audio_ids) > 0:
    print(f"Warning: {len(missing_audio_ids)} task1.csv IDs lack audio files. Verify dataset completeness or check for additional .tgz files.")

Task1.csv contains 71 IDs
MMSE scores file columns: ['Unnamed: 0', 'adressfname', 'mmse', 'dx']
MMSE scores sample:
   Unnamed: 0 adressfname  mmse  dx
0          23    adrso024    20  ad
1          24    adrso025    11  ad
2          25    adrso027    18  ad
3          26    adrso028    18  ad
4          28    adrso031    26  ad
MMSE scores file contains 0 IDs
Task1 IDs in MMSE scores: 0
Found 79 segmentation files in /content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/cn/
Found 87 segmentation files in /content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/ad/
Total segmentation IDs: 166
Task1 IDs in segmentation: 41
Found 79 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/, 16 match task1 IDs
Found 87 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/ad/, 25 match task1 IDs
Total audio files matching task1 IDs: 41
Task1 IDs missing audio files (30): {'adrso050', 'adrso051', 'adrso067', 'adrso004', 'adrso011', 'adr

In [20]:
import opensmile
import librosa
import pandas as pd
import os
import numpy as np
import glob

# Initialize opensmile for eGeMAPS feature extraction
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

# Function to extract eGeMAPS features from an audio file
def extract_egemaps(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000)  # Load audio
        features = smile.process_signal(y, sr)  # Extract eGeMAPS
        return features.values.flatten()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

# Paths to data
data_path = '/content/drive/MyDrive/Voice/'
diagnosis_audio_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/audio/'
cn_audio_path = os.path.join(diagnosis_audio_base, 'cn/')
ad_audio_path = os.path.join(diagnosis_audio_base, 'ad/')
segmentation_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/'
mmse_scores_file = '/content/diagnosis_train/ADReSSo21/diagnosis/train/adresso-train-mmse-scores.csv'

# Load task1.csv
task1 = pd.read_csv(data_path + 'task1.csv')
task1['ID'] = task1['ID'].apply(lambda x: 'adrso' + x.replace('adrsdt', '').zfill(3))
task1_ids = set(task1['ID'])
print(f"Task1.csv contains {len(task1)} IDs")

# Inspect MMSE scores file
try:
    mmse_scores = pd.read_csv(mmse_scores_file)
    print("MMSE scores file columns:", mmse_scores.columns.tolist())
    print("MMSE scores sample:")
    print(mmse_scores.head())
    if 'adressfname' in mmse_scores.columns:
        mmse_ids = set(mmse_scores['adressfname'])
    else:
        mmse_ids = set()
        print("No 'adressfname' column in MMSE scores file")
    print(f"MMSE scores file contains {len(mmse_ids)} IDs")
    print(f"Task1 IDs in MMSE scores: {len(task1_ids & mmse_ids)}")
    print(f"Missing task1 IDs in MMSE scores: {len(missing_audio_ids & mmse_ids)}")
except FileNotFoundError:
    print(f"MMSE scores file not found: {mmse_scores_file}")
    mmse_ids = set()
except Exception as e:
    print(f"Error reading MMSE scores file: {e}")
    mmse_ids = set()

# Collect segmentation IDs
segmentation_ids = set()
for path in [os.path.join(segmentation_base, 'cn/'), os.path.join(segmentation_base, 'ad/')]:
    if os.path.exists(path):
        files = [f.split('.')[0] for f in os.listdir(path) if f.endswith('.csv')]
        segmentation_ids.update(files)
        print(f"Found {len(files)} segmentation files in {path}")
    else:
        print(f"Segmentation directory not found: {path}")
print(f"Total segmentation IDs: {len(segmentation_ids)}")
print(f"Task1 IDs in segmentation: {len(task1_ids & segmentation_ids)}")

# Collect .wav files from cn/ and ad/, filtering by task1 IDs
audio_files = []
audio_id_to_path = {}
for path in [cn_audio_path, ad_audio_path]:
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.wav')]
        for f in files:
            audio_id = f.split('.')[0]
            if audio_id in task1_ids:
                audio_files.append(os.path.join(path, f))
                audio_id_to_path[audio_id] = os.path.join(path, f)
        print(f"Found {len(files)} audio files in {path}, {sum(1 for f in files if f.split('.')[0] in task1_ids)} match task1 IDs")
    else:
        print(f"Directory not found: {path}")

print(f"Total audio files matching task1 IDs: {len(audio_files)}")

# Search for missing task1 IDs across all directories
missing_audio_ids = task1_ids - set(audio_id_to_path.keys())
print(f"Task1 IDs missing audio files ({len(missing_audio_ids)}): {missing_audio_ids}")

# Recursive search for missing audio files
found_missing = {}
for missing_id in missing_audio_ids:
    matches = glob.glob(f"/content/**/{missing_id}.wav", recursive=True)
    if matches:
        found_missing[missing_id] = matches
if found_missing:
    print("Found missing audio files:")
    for id_, paths in found_missing.items():
        print(f"{id_}: {paths}")
else:
    print("No missing audio files found in /content/")

# Extract features for matching audio files
audio_features = []
audio_ids = []
skipped_files = []
for audio_path in audio_files:
    audio_file = os.path.basename(audio_path)
    audio_id = audio_file.split('.')[0]
    features = extract_egemaps(audio_path)
    if features is not None:
        audio_features.append(features)
        audio_ids.append(audio_id)
    else:
        print(f"Skipping {audio_id} due to feature extraction failure")
        skipped_files.append(audio_id)

print(f"Extracted features for {len(audio_features)} audio files")
print(f"Skipped {len(skipped_files)} audio files: {skipped_files}")

# Check if any features were extracted
if not audio_features:
    raise ValueError("No audio features extracted. Check audio files or extraction process.")

# Convert to DataFrame
audio_features_df = pd.DataFrame(audio_features)
audio_features_df['ID'] = audio_ids

# Merge with task1 labels
task1_data = pd.merge(audio_features_df, task1, on='ID', how='inner')
print("Merged Acoustic Features with Labels:")
print(task1_data.head())
print(f"Number of matched records: {len(task1_data)}")

# Save the merged DataFrame
task1_data.to_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv', index=False)
print("Saved acoustic features to /content/drive/MyDrive/Voice/task1_acoustic_features.csv")

# Save debugging info
pd.DataFrame({'missing_audio_ids': list(missing_audio_ids)}).to_csv(
    '/content/drive/MyDrive/Voice/missing_audio_ids.csv', index=False
)
pd.DataFrame({'skipped_files': skipped_files}).to_csv(
    '/content/drive/MyDrive/Voice/skipped_files.csv', index=False
)
pd.DataFrame({'segmentation_ids': list(segmentation_ids)}).to_csv(
    '/content/drive/MyDrive/Voice/segmentation_ids.csv', index=False
)
print("Saved debugging info to /content/drive/MyDrive/Voice/{missing_audio_ids,skipped_files,segmentation_ids}.csv")

# Note dataset limitation
if len(missing_audio_ids) > 0:
    print(f"Warning: {len(missing_audio_ids)} task1.csv IDs lack audio files. Verify dataset completeness or check for additional .tgz files.")

Task1.csv contains 71 IDs
MMSE scores file columns: ['Unnamed: 0', 'adressfname', 'mmse', 'dx']
MMSE scores sample:
   Unnamed: 0 adressfname  mmse  dx
0          23    adrso024    20  ad
1          24    adrso025    11  ad
2          25    adrso027    18  ad
3          26    adrso028    18  ad
4          28    adrso031    26  ad
MMSE scores file contains 166 IDs
Task1 IDs in MMSE scores: 41
Missing task1 IDs in MMSE scores: 0
Found 79 segmentation files in /content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/cn/
Found 87 segmentation files in /content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/ad/
Total segmentation IDs: 166
Task1 IDs in segmentation: 41
Found 79 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/cn/, 16 match task1 IDs
Found 87 audio files in /content/diagnosis_train/ADReSSo21/diagnosis/train/audio/ad/, 25 match task1 IDs
Total audio files matching task1 IDs: 41
Task1 IDs missing audio files (30): {'adrso050', 'adrso051', '

In [21]:
# Step 4: Linguistic Feature Extraction (Simulated)
import pandas as pd
import numpy as np

# Load task1_data from Step 3
task1_data = pd.read_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv')

# Simulate linguistic features from eval-summary.docx (DIAGNOSIS-TRAIN-AD/CN)
linguistic_features = {
    'ID': task1_data['ID'],
    'MLU': np.random.normal(12.625, 11.835, len(task1_data)),  # Mean of AD (8.68) and CN (17.57)
    'TTR': np.random.normal(0.665, 0.13, len(task1_data)),     # Mean of AD (0.66) and CN (0.67)
    '%_nouns': np.random.normal(22.745, 7.3, len(task1_data)), # Mean of AD (20.62) and CN (24.87)
    '%_verbs': np.random.normal(19.015, 4.55, len(task1_data)) # Mean of AD (19.12) and CN (18.91)
}

linguistic_features_df = pd.DataFrame(linguistic_features)

# Merge with task1_data labels
task1_linguistic_data = pd.merge(linguistic_features_df, task1_data[['ID', 'Dx']], on='ID')
print("Merged Linguistic Features with Labels:")
print(task1_linguistic_data.head())

# Save for next steps
task1_linguistic_data.to_csv('/content/drive/MyDrive/Voice/task1_linguistic_features.csv', index=False)
print("Saved linguistic features to /content/drive/MyDrive/Voice/task1_linguistic_features.csv")

Merged Linguistic Features with Labels:
         ID        MLU       TTR    %_nouns    %_verbs          Dx
0  adrso010   6.189669  0.826973  23.874254  19.282911     Control
1  adrso014   8.670293  0.560556  28.436979  22.503496  ProbableAD
2  adrso015  18.837356  0.811275  36.690959  23.657292     Control
3  adrso005   4.086280  0.636485  26.283843  21.318973     Control
4  adrso018  15.007849  0.469754  10.600154  19.260291  ProbableAD
Saved linguistic features to /content/drive/MyDrive/Voice/task1_linguistic_features.csv


In [24]:
! pip install pylangacq

Collecting pylangacq
  Downloading pylangacq-0.19.1-py3-none-any.whl.metadata (5.9 kB)
Downloading pylangacq-0.19.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pylangacq
Successfully installed pylangacq-0.19.1


In [25]:
import pandas as pd
import os
import numpy as np
import nltk
from pylangacq import Reader
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Paths
cha_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/'
task1_data = pd.read_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv')
task1_ids = set(task1_data['ID'])

# Collect .cha files
cha_files = []
for root, _, files in os.walk(cha_base):
    cha_files.extend([os.path.join(root, f) for f in files if f.endswith('.cha')])
print(f"Found {len(cha_files)} .cha files")

# Extract linguistic features
linguistic_features = {'ID': [], 'MLU': [], 'TTR': [], '%_nouns': [], '%_verbs': []}
for cha_file in cha_files:
    cha_id = os.path.basename(cha_file).split('.')[0]
    if cha_id in task1_ids:
        try:
            # Read .cha file
            chat = Reader.from_files([cha_file])

            # Get utterances (participant 'PAR' for participant speech)
            utterances = chat.utterances(participant='PAR')
            words = []
            for utt in utterances:
                words.extend(word_tokenize(utt.tier.lower()))

            # POS tagging
            pos_tags = pos_tag(words)

            # MLU: Mean length of utterances
            mlu = len(words) / len(utterances) if utterances else 0

            # TTR: Type-token ratio
            types = len(set(words))
            tokens = len(words)
            ttr = types / tokens if tokens > 0 else 0

            # % Nouns and Verbs
            noun_count = sum(1 for _, tag in pos_tags if tag.startswith('NN'))
            verb_count = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
            total_pos = len(pos_tags)
            percent_nouns = (noun_count / total_pos * 100) if total_pos > 0 else 0
            percent_verbs = (verb_count / total_pos * 100) if total_pos > 0 else 0

            linguistic_features['ID'].append(cha_id)
            linguistic_features['MLU'].append(mlu)
            linguistic_features['TTR'].append(ttr)
            linguistic_features['%_nouns'].append(percent_nouns)
            linguistic_features['%_verbs'].append(percent_verbs)
        except Exception as e:
            print(f"Error processing {cha_file}: {e}")

# Convert to DataFrame
linguistic_features_df = pd.DataFrame(linguistic_features)

# Merge with task1_data labels
task1_linguistic_data = pd.merge(linguistic_features_df, task1_data[['ID', 'Dx']], on='ID')
print("Merged Linguistic Features with Labels:")
print(task1_linguistic_data.head())

# Save
task1_linguistic_data.to_csv('/content/drive/MyDrive/Voice/task1_linguistic_features.csv', index=False)
print("Saved linguistic features to /content/drive/MyDrive/Voice/task1_linguistic_features.csv")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Found 0 .cha files
Merged Linguistic Features with Labels:
Empty DataFrame
Columns: [ID, MLU, TTR, %_nouns, %_verbs, Dx]
Index: []
Saved linguistic features to /content/drive/MyDrive/Voice/task1_linguistic_features.csv


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [26]:
sample_csv = '/content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/cn/adrso010.csv'
print(pd.read_csv(sample_csv).head())

   Unnamed: 0 speaker  begin    end
0           1     PAR      0    503
1           2     PAR    503   3613
2           3     PAR   3613   6131
3           4     PAR   6131   9415
4           5     PAR   9415  17404


In [27]:
import pandas as pd
import os
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Paths
segmentation_base = '/content/diagnosis_train/ADReSSo21/diagnosis/train/segmentation/'
task1_data = pd.read_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv')
task1_ids = set(task1_data['ID'])

# Inspect a segmentation CSV
sample_csv = os.path.join(segmentation_base, 'cn/adrso010.csv')
print("Sample segmentation CSV:")
print(pd.read_csv(sample_csv).head())

# Collect segmentation CSVs
linguistic_features = {'ID': [], 'MLU': [], 'TTR': [], '%_nouns': [], '%_verbs': []}
for path in [os.path.join(segmentation_base, 'cn/'), os.path.join(segmentation_base, 'ad/')]:
    if os.path.exists(path):
        for f in os.listdir(path):
            if f.endswith('.csv'):
                seg_id = f.split('.')[0]
                if seg_id in task1_ids:
                    try:
                        seg_df = pd.read_csv(os.path.join(path, f))
                        # Adjust 'text' to the actual column name (e.g., 'utterance', 'transcription')
                        if 'text' not in seg_df.columns:
                            print(f"No text column in {f}, skipping")
                            continue
                        utterances = seg_df['text'].dropna().tolist()
                        words = []
                        for utt in utterances:
                            words.extend(word_tokenize(str(utt).lower()))
                        pos_tags = pos_tag(words)

                        # MLU
                        mlu = len(words) / len(utterances) if utterances else 0

                        # TTR
                        types = len(set(words))
                        tokens = len(words)
                        ttr = types / tokens if tokens > 0 else 0

                        # % Nouns and Verbs
                        noun_count = sum(1 for _, tag in pos_tags if tag.startswith('NN'))
                        verb_count = sum(1 for _, tag in pos_tags if tag.startswith('VB'))
                        total_pos = len(pos_tags)
                        percent_nouns = (noun_count / total_pos * 100) if total_pos > 0 else 0
                        percent_verbs = (verb_count / total_pos * 100) if total_pos > 0 else 0

                        linguistic_features['ID'].append(seg_id)
                        linguistic_features['MLU'].append(mlu)
                        linguistic_features['TTR'].append(ttr)
                        linguistic_features['%_nouns'].append(percent_nouns)
                        linguistic_features['%_verbs'].append(percent_verbs)
                    except Exception as e:
                        print(f"Error processing {f}: {e}")

# Convert to DataFrame
linguistic_features_df = pd.DataFrame(linguistic_features)

# Merge with task1_data labels
task1_linguistic_data = pd.merge(linguistic_features_df, task1_data[['ID', 'Dx']], on='ID')
print("Merged Linguistic Features with Labels:")
print(task1_linguistic_data.head())

# Save
task1_linguistic_data.to_csv('/content/drive/MyDrive/Voice/task1_linguistic_features.csv', index=False)
print("Saved linguistic features to /content/drive/MyDrive/Voice/task1_linguistic_features.csv")

Sample segmentation CSV:
   Unnamed: 0 speaker  begin    end
0           1     PAR      0    503
1           2     PAR    503   3613
2           3     PAR   3613   6131
3           4     PAR   6131   9415
4           5     PAR   9415  17404
No text column in adrso018.csv, skipping
No text column in adrso017.csv, skipping
No text column in adrso012.csv, skipping
No text column in adrso023.csv, skipping
No text column in adrso007.csv, skipping
No text column in adrso014.csv, skipping
No text column in adrso022.csv, skipping
No text column in adrso019.csv, skipping
No text column in adrso021.csv, skipping
No text column in adrso003.csv, skipping
No text column in adrso005.csv, skipping
No text column in adrso002.csv, skipping
No text column in adrso015.csv, skipping
No text column in adrso016.csv, skipping
No text column in adrso010.csv, skipping
No text column in adrso008.csv, skipping
No text column in adrso039.csv, skipping
No text column in adrso032.csv, skipping
No text column in adr

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Restored Step 4: Simulated Linguistic Features

In [28]:
import pandas as pd
import numpy as np

# Load task1_data from Step 3
task1_data = pd.read_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv')

# Simulate linguistic features from eval-summary.docx (DIAGNOSIS-TRAIN-AD/CN)
linguistic_features = {
    'ID': task1_data['ID'],
    'MLU': np.random.normal(12.625, 11.835, len(task1_data)),  # Mean of AD (8.68) and CN (17.57)
    'TTR': np.random.normal(0.665, 0.13, len(task1_data)),     # Mean of AD (0.66) and CN (0.67)
    '%_nouns': np.random.normal(22.745, 7.3, len(task1_data)), # Mean of AD (20.62) and CN (24.87)
    '%_verbs': np.random.normal(19.015, 4.55, len(task1_data)) # Mean of AD (19.12) and CN (18.91)
}

linguistic_features_df = pd.DataFrame(linguistic_features)

# Merge with task1_data labels
task1_linguistic_data = pd.merge(linguistic_features_df, task1_data[['ID', 'Dx']], on='ID')
print("Merged Linguistic Features with Labels:")
print(task1_linguistic_data.head())

# Save for next steps
task1_linguistic_data.to_csv('/content/drive/MyDrive/Voice/task1_linguistic_features.csv', index=False)
print("Saved linguistic features to /content/drive/MyDrive/Voice/task1_linguistic_features.csv")

Merged Linguistic Features with Labels:
         ID        MLU       TTR    %_nouns    %_verbs          Dx
0  adrso010  27.396049  0.441229  29.927782  20.649705     Control
1  adrso014   8.033289  0.882510  10.582943  25.184818  ProbableAD
2  adrso015  23.863713  0.854305  28.208900  20.508639     Control
3  adrso005   9.619413  0.529333  28.138312  18.727957     Control
4  adrso018  34.630692  0.533147  22.200092  25.231971  ProbableAD
Saved linguistic features to /content/drive/MyDrive/Voice/task1_linguistic_features.csv


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Load data
acoustic_df = pd.read_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv')
linguistic_df = pd.read_csv('/content/drive/MyDrive/Voice/task1_linguistic_features.csv')

# Merge features
merged_df = pd.merge(acoustic_df.drop(columns=['Dx']), linguistic_df, on='ID')
print("Merged features shape:", merged_df.shape)
print("Dx distribution:", merged_df['Dx'].value_counts())

# Prepare features and labels
X = merged_df.drop(columns=['ID', 'Dx'])
y = merged_df['Dx'].map({'Control': 0, 'ProbableAD': 1})

# Split data (20% test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM
svm = SVC(kernel='linear', random_state=42, class_weight='balanced')
svm.fit(X_train_scaled, y_train)

# Evaluate SVM
y_pred_svm = svm.predict(X_test_scaled)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm, target_names=['Control', 'ProbableAD']))

# Cross-validation
svm_cv_scores = cross_val_score(svm, scaler.transform(X), y, cv=5, scoring='accuracy')
print("SVM Cross-Validation Accuracy: Mean =", svm_cv_scores.mean(), "Std =", svm_cv_scores.std())

# Try Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Control', 'ProbableAD']))

Merged features shape: (41, 94)
Dx distribution: Dx
ProbableAD    21
Control       20
Name: count, dtype: int64
SVM Accuracy: 0.4444444444444444
SVM Classification Report:
              precision    recall  f1-score   support

     Control       0.40      0.50      0.44         4
  ProbableAD       0.50      0.40      0.44         5

    accuracy                           0.44         9
   macro avg       0.45      0.45      0.44         9
weighted avg       0.46      0.44      0.44         9

SVM Cross-Validation Accuracy: Mean = 0.538888888888889 Std = 0.13362237189784104
Random Forest Accuracy: 0.4444444444444444
Random Forest Classification Report:
              precision    recall  f1-score   support

     Control       0.40      0.50      0.44         4
  ProbableAD       0.50      0.40      0.44         5

    accuracy                           0.44         9
   macro avg       0.45      0.45      0.44         9
weighted avg       0.46      0.44      0.44         9



In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

# Load data
acoustic_df = pd.read_csv('/content/drive/MyDrive/Voice/task1_acoustic_features.csv')
linguistic_df = pd.read_csv('/content/drive/MyDrive/Voice/task1_linguistic_features.csv')

# Merge features
merged_df = pd.merge(acoustic_df.drop(columns=['Dx']), linguistic_df, on='ID')
print("Merged features shape:", merged_df.shape)
print("Dx distribution:", merged_df['Dx'].value_counts())

# Prepare features and labels
X = merged_df.drop(columns=['ID', 'Dx'])
y = merged_df['Dx'].map({'Control': 0, 'ProbableAD': 1})

# Split data (20% test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature selection with Random Forest
scaler = StandardScaler()
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
selector = SelectFromModel(rf_selector, max_features=20)  # Select top 20 features
pipeline = Pipeline([
    ('scaler', scaler),
    ('selector', selector)
])
X_train_selected = pipeline.fit_transform(X_train, y_train)
X_test_selected = pipeline.transform(X_test)
print("Selected features shape:", X_train_selected.shape)

# SVM with Grid Search
svm = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_param_grid = {
    'C': [0.01, 0.1, 1, 10],
}
svm_grid = GridSearchCV(svm, svm_param_grid, cv=5, scoring='accuracy')
svm_grid.fit(X_train_selected, y_train)
print("Best SVM Parameters:", svm_grid.best_params_)
print("Best SVM CV Score:", svm_grid.best_score_)

# Evaluate SVM
y_pred_svm = svm_grid.predict(X_test_selected)
print("SVM Test Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm, target_names=['Control', 'ProbableAD']))

# Cross-validation for SVM
svm_best = svm_grid.best_estimator_
svm_cv_scores = cross_val_score(svm_best, pipeline.transform(X), y, cv=5, scoring='accuracy')
print("SVM Cross-Validation Accuracy: Mean =", svm_cv_scores.mean(), "Std =", svm_cv_scores.std())

# Random Forest with Grid Search
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}
rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='accuracy')
rf_grid.fit(X_train_selected, y_train)
print("Best RF Parameters:", rf_grid.best_params_)
print("Best RF CV Score:", rf_grid.best_score_)

# Evaluate Random Forest
y_pred_rf = rf_grid.predict(X_test_selected)
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Control', 'ProbableAD']))

# Try Logistic Regression
lr = LogisticRegression(class_weight='balanced', random_state=42)
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
}
lr_grid = GridSearchCV(lr, lr_param_grid, cv=5, scoring='accuracy')
lr_grid.fit(X_train_selected, y_train)
print("Best LR Parameters:", lr_grid.best_params_)
print("Best LR CV Score:", lr_grid.best_score_)

# Evaluate Logistic Regression
y_pred_lr = lr_grid.predict(X_test_selected)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr, target_names=['Control', 'ProbableAD']))

Merged features shape: (41, 94)
Dx distribution: Dx
ProbableAD    21
Control       20
Name: count, dtype: int64
Selected features shape: (32, 20)
Best SVM Parameters: {'C': 0.1}
Best SVM CV Score: 0.8428571428571429
SVM Test Accuracy: 0.2222222222222222
SVM Classification Report:
              precision    recall  f1-score   support

     Control       0.29      0.50      0.36         4
  ProbableAD       0.00      0.00      0.00         5

    accuracy                           0.22         9
   macro avg       0.14      0.25      0.18         9
weighted avg       0.13      0.22      0.16         9

SVM Cross-Validation Accuracy: Mean = 0.6361111111111111 Std = 0.12832010513834552
Best RF Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best RF CV Score: 0.7523809523809524
Random Forest Test Accuracy: 0.4444444444444444
Random Forest Classification Report:
              precision    recall  f1-score   support

     Control       0.40      0.50      0.44     