<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Speech_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Setting Up the Environment in Google Colab

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Install required libraries
!pip install opensmile speechbrain librosa



In [4]:
# Extract .tgz files
import tarfile
import os

data_path = '/content/drive/MyDrive/Voice/'  # Adjust to your dataset path
tgz_files = [
    'ADReSSo21-progression-train.tgz',
    'ADReSSo21-progression-test.tgz',
    'ADReSSo21-diagnosis-train.tgz'
]

for tgz in tgz_files:
    tar = tarfile.open(os.path.join(data_path, tgz), 'r:gz')
    tar.extractall('/content/data')
    tar.close()

# Step 2: Loading and Preprocessing Datasets

## Load Label Files

In [5]:
import pandas as pd

# Load label files
task1 = pd.read_csv(os.path.join(data_path, 'task1.csv'))  # AD classification labels
task2 = pd.read_csv(os.path.join(data_path, 'task2.csv'))  # MMSE regression labels
task3 = pd.read_csv(os.path.join(data_path, 'task3.csv'))  # Cognitive decline labels

In [6]:
task1.head()

Unnamed: 0,ID,Dx
0,adrsdt15,Control
1,adrsdt40,Control
2,adrsdt26,Control
3,adrsdt67,Control
4,adrsdt58,Control


- task1.csv: ID and AD/Control labels for diagnosis (AD classification).
- task2.csv: ID and MMSE scores for regression.
- task3.csv: ID and Decline/Non-Decline labels for cognitive decline prediction.

## Map Audio Files to Labels

In [13]:
# Function to map audio files to labels
def load_audio_label_pairs(audio_dir, label_df, label_col, id_col='ID'):
    audio_files, labels = [], []
    for _, row in label_df.iterrows():
        # Check both 'ad' and 'cn' subdirectories for diagnosis
        for subdir in ['ad', 'cn', 'decline', 'no_decline']:
            audio_path = os.path.join(audio_dir, subdir, f"{row[id_col]}.wav")
            if os.path.exists(audio_path):
                audio_files.append(audio_path)
                labels.append(row[label_col])
                break
    return audio_files, labels

# AD Classification (Cookie Theft)
ad_audio_dir = '/content/data/ADReSSo21/diagnosis/train/audio'
ad_audio_files, ad_labels = load_audio_label_pairs(ad_audio_dir, task1, 'Dx')

# MMSE Regression (same audio as AD classification)
mmse_audio_files, mmse_labels = load_audio_label_pairs(ad_audio_dir, task2, 'MMSE')

# Cognitive Decline (Category Fluency)
prog_audio_dir = '/content/data/ADReSSo21/progression/train/audio'
decline_audio_files, decline_labels = load_audio_label_pairs(prog_audio_dir, task3, 'Decline')

- For AD classification, use ADReSSo21-diagnosis-train.tgz audio files and task1.csv.
- For MMSE regression, use the same audio files as AD classification, paired with task2.csv.
- For cognitive decline, use ADReSSo21-progression-train.tgz audio files and task3.csv
- Ensure audio files match the IDs in the CSV files. If some IDs are missing audio, you may need to filter them out.
- The segmentation directories contain CSV files with transcriptions or timings, which can be used for linguistic feature extraction.

In [14]:
print(f"AD Classification: {len(ad_audio_files)} audio files, {len(ad_labels)} labels")
print(f"MMSE Regression: {len(mmse_audio_files)} audio files, {len(mmse_labels)} labels")
print(f"Cognitive Decline: {len(decline_audio_files)} audio files, {len(decline_labels)} labels")

AD Classification: 0 audio files, 0 labels
MMSE Regression: 0 audio files, 0 labels
Cognitive Decline: 0 audio files, 0 labels


In [15]:
import os

# Define audio directories
ad_audio_dir = '/content/data/ADReSSo21/diagnosis/train/audio'
prog_audio_dir = '/content/data/ADReSSo21/progression/train/audio'

# List contents of diagnosis audio directories
print("Diagnosis audio directories:")
for subdir in ['ad', 'cn']:
    path = os.path.join(ad_audio_dir, subdir)
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.wav')]
        print(f"{path}: {len(files)} .wav files")
        print("Sample files:", files[:5])  # Print first 5 files
    else:
        print(f"{path} does not exist")

# List contents of progression audio directories
print("\nProgression audio directories:")
for subdir in ['decline', 'no_decline']:
    path = os.path.join(prog_audio_dir, subdir)
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.wav')]
        print(f"{path}: {len(files)} .wav files")
        print("Sample files:", files[:5])
    else:
        print(f"{path} does not exist")

Diagnosis audio directories:
/content/data/ADReSSo21/diagnosis/train/audio/ad: 87 .wav files
Sample files: ['adrso071.wav', 'adrso116.wav', 'adrso032.wav', 'adrso053.wav', 'adrso027.wav']
/content/data/ADReSSo21/diagnosis/train/audio/cn: 79 .wav files
Sample files: ['adrso178.wav', 'adrso022.wav', 'adrso018.wav', 'adrso270.wav', 'adrso015.wav']

Progression audio directories:
/content/data/ADReSSo21/progression/train/audio/decline: 15 .wav files
Sample files: ['adrsp318.wav', 'adrsp179.wav', 'adrsp101.wav', 'adrsp209.wav', 'adrsp300.wav']
/content/data/ADReSSo21/progression/train/audio/no_decline: 58 .wav files
Sample files: ['adrsp251.wav', 'adrsp200.wav', 'adrsp363.wav', 'adrsp157.wav', 'adrsp192.wav']


In [17]:
import pandas as pd

data_path = '/content/drive/MyDrive/Voice/'  # Adjust to your Drive path
task1 = pd.read_csv(os.path.join(data_path, 'task1.csv'))
task2 = pd.read_csv(os.path.join(data_path, 'task2.csv'))
task3 = pd.read_csv(os.path.join(data_path, 'task3.csv'))

print("Task 1 IDs (AD Classification):", task1['ID'].tolist()[:5])
print("Task 2 IDs (MMSE Regression):", task2['ID'].tolist()[:5])
print("Task 3 IDs (Cognitive Decline):", task3['ID'].tolist()[:5])

Task 1 IDs (AD Classification): ['adrsdt15', 'adrsdt40', 'adrsdt26', 'adrsdt67', 'adrsdt58']
Task 2 IDs (MMSE Regression): ['adrsdt15', 'adrsdt40', 'adrsdt26', 'adrsdt67', 'adrsdt58']
Task 3 IDs (Cognitive Decline): ['adrspt2', 'adrspt18', 'adrspt9', 'adrspt21', 'adrspt29']


In [18]:
# List audio file names (without .wav extension)
def list_audio_ids(audio_dir, subdirs):
    ids = []
    for subdir in subdirs:
        path = os.path.join(audio_dir, subdir)
        if os.path.exists(path):
            ids.extend([f.replace('.wav', '') for f in os.listdir(path) if f.endswith('.wav')])
    return ids

ad_audio_ids = list_audio_ids(ad_audio_dir, ['ad', 'cn'])
prog_audio_ids = list_audio_ids(prog_audio_dir, ['decline', 'no_decline'])

print("Diagnosis audio IDs:", ad_audio_ids[:5])
print("Progression audio IDs:", prog_audio_ids[:5])

Diagnosis audio IDs: ['adrso071', 'adrso116', 'adrso032', 'adrso053', 'adrso027']
Progression audio IDs: ['adrsp318', 'adrsp179', 'adrsp101', 'adrsp209', 'adrsp300']


In [19]:
# Modified function to debug non-matching IDs
def load_audio_label_pairs(audio_dir, label_df, label_col, id_col='ID'):
    audio_files, labels = [], []
    missing_ids = []
    for _, row in label_df.iterrows():
        found = False
        for subdir in ['ad', 'cn', 'decline', 'no_decline']:
            audio_path = os.path.join(audio_dir, subdir, f"{row[id_col]}.wav")
            if os.path.exists(audio_path):
                audio_files.append(audio_path)
                labels.append(row[label_col])
                found = True
                break
        if not found:
            missing_ids.append(row[id_col])
    print(f"Missing IDs: {missing_ids[:10]}")  # Print first 10 missing IDs
    return audio_files, labels

# Re-run for all tasks
ad_audio_files, ad_labels = load_audio_label_pairs(ad_audio_dir, task1, 'Dx')
mmse_audio_files, mmse_labels = load_audio_label_pairs(ad_audio_dir, task2, 'MMSE')
decline_audio_files, decline_labels = load_audio_label_pairs(prog_audio_dir, task3, 'Decline')

print(f"AD Classification: {len(ad_audio_files)} audio files, {len(ad_labels)} labels")
print(f"MMSE Regression: {len(mmse_audio_files)} audio files, {len(mmse_labels)} labels")
print(f"Cognitive Decline: {len(decline_audio_files)} audio files, {len(decline_labels)} labels")

Missing IDs: ['adrsdt15', 'adrsdt40', 'adrsdt26', 'adrsdt67', 'adrsdt58', 'adrsdt49', 'adrsdt46', 'adrsdt8', 'adrsdt64', 'adrsdt29']
Missing IDs: ['adrsdt15', 'adrsdt40', 'adrsdt26', 'adrsdt67', 'adrsdt58', 'adrsdt49', 'adrsdt46', 'adrsdt8', 'adrsdt64', 'adrsdt29']
Missing IDs: ['adrspt2', 'adrspt18', 'adrspt9', 'adrspt21', 'adrspt29', 'adrspt5', 'adrspt12', 'adrspt32', 'adrspt3', 'adrspt25']
AD Classification: 0 audio files, 0 labels
MMSE Regression: 0 audio files, 0 labels
Cognitive Decline: 0 audio files, 0 labels


In [20]:
import os

# Search for metadata files
def find_metadata_files(root_dir):
    metadata_files = []
    for root, _, files in os.walk(root_dir):
        for f in files:
            if f.endswith('.csv') or f.endswith('.txt') or 'meta' in f.lower():
                metadata_files.append(os.path.join(root, f))
    return metadata_files

metadata_files = find_metadata_files('/content/data')
print("Found metadata files:")
for f in metadata_files:
    print(f)

Found metadata files:
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp300.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp055.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp127.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp209.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp003.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp101.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp179.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp266.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp051.csv
/content/data/ADReSSo21/progression/train/segmentation/decline/adrsp313.csv
/content/data/ADReSSo21/progression/train/segmentation/no_decline/adrsp030.csv
/content/data/ADReSSo21/progression/train/segmentation/no_decline/adrsp157.csv
/content/data/ADReSSo21/progression/train/segmentation/no_de

In [23]:
import pandas as pd

# Load the MMSE scores file
mmse_scores_file = '/content/data/ADReSSo21/diagnosis/train/adresso-train-mmse-scores.csv'
mmse_df = pd.read_csv(mmse_scores_file)  # Adjust separator if needed (e.g., sep=';')
print("Columns:", mmse_df.columns.tolist())
print("First 5 rows:\n", mmse_df.head())

Columns: ['Unnamed: 0', 'adressfname', 'mmse', 'dx']
First 5 rows:
    Unnamed: 0 adressfname  mmse  dx
0          23    adrso024    20  ad
1          24    adrso025    11  ad
2          25    adrso027    18  ad
3          26    adrso028    18  ad
4          28    adrso031    26  ad


In [24]:
# Load a sample segmentation file
seg_file = '/content/data/ADReSSo21/diagnosis/train/segmentation/ad/adrso249.csv'
seg_df = pd.read_csv(seg_file)  # Adjust separator if needed
print("Columns:", seg_df.columns.tolist())
print("First 5 rows:\n", seg_df.head())

Columns: ['Unnamed: 0', 'speaker', 'begin', 'end']
First 5 rows:
    Unnamed: 0 speaker  begin    end
0           1     INV      0   1494
1           2     PAR   1494   5370
2           3     PAR   5370   9171
3           4     INV   9171  10150
4           5     PAR  10150  15175


In [25]:
# Load test results file
test_results_file = '/content/data/ADReSSo21/progression/test-dist/test_results_task3.csv'
test_results_df = pd.read_csv(test_results_file)
print("Columns:", test_results_df.columns.tolist())
print("First 5 rows:\n", test_results_df.head())

Columns: ['ID', 'Prediction']
First 5 rows:
          ID  Prediction
0   adrspt2         NaN
1  adrspt18         NaN
2   adrspt9         NaN
3  adrspt21         NaN
4  adrspt29         NaN


In [27]:
test_results_df = pd.read_csv('/content/data/ADReSSo21/progression/test-dist/test_results_task3.csv')
print(test_results_df.head())

         ID  Prediction
0   adrspt2         NaN
1  adrspt18         NaN
2   adrspt9         NaN
3  adrspt21         NaN
4  adrspt29         NaN


In [28]:
import pandas as pd

# Load MMSE scores file
mmse_scores_file = '/content/data/ADReSSo21/diagnosis/train/adresso-train-mmse-scores.csv'
mmse_df = pd.read_csv(mmse_scores_file)  # Adjust separator if needed (e.g., sep=';')
print("Columns:", mmse_df.columns.tolist())
print("First 5 rows:\n", mmse_df.head())

Columns: ['Unnamed: 0', 'adressfname', 'mmse', 'dx']
First 5 rows:
    Unnamed: 0 adressfname  mmse  dx
0          23    adrso024    20  ad
1          24    adrso025    11  ad
2          25    adrso027    18  ad
3          26    adrso028    18  ad
4          28    adrso031    26  ad


In [29]:
import os

# Function to load audio files from metadata
def load_audio_from_metadata(audio_dir, meta_df, id_col, label_col, subdirs=['ad', 'cn']):
    audio_files, labels = [], []
    missing_ids = []
    for _, row in meta_df.iterrows():
        audio_id = row[id_col].replace('.wav', '') if '.wav' in str(row[id_col]) else str(row[id_col])
        found = False
        for subdir in subdirs:
            audio_path = os.path.join(audio_dir, subdir, f"{audio_id}.wav")
            if os.path.exists(audio_path):
                audio_files.append(audio_path)
                labels.append(row[label_col])
                found = True
                break
        if not found:
            missing_ids.append(audio_id)
    print(f"Missing IDs for {label_col}: {missing_ids[:10]}")
    return audio_files, labels

# Load audio files for diagnosis tasks
ad_audio_dir = '/content/data/ADReSSo21/diagnosis/train/audio'
ad_audio_files, ad_labels = load_audio_from_metadata(
    ad_audio_dir, mmse_df, 'ID', 'Dx', subdirs=['ad', 'cn']
)
mmse_audio_files, mmse_labels = load_audio_from_metadata(
    ad_audio_dir, mmse_df, 'ID', 'MMSE', subdirs=['ad', 'cn']
)

print(f"AD Classification: {len(ad_audio_files)} audio files, {len(ad_labels)} labels")
print(f"MMSE Regression: {len(mmse_audio_files)} audio files, {len(mmse_labels)} labels")

KeyError: 'ID'

# Step 3: Feature Extraction

You need to extract acoustic and linguistic features as specified.

Acoustic Features
- eGeMAPS: Use opensmile to extract eGeMAPS features.
- Active Data Representation (ADR): This may require a custom implementation or pre-trained model.

In [8]:
import opensmile
import librosa
import numpy as np

# Initialize opensmile for eGeMAPS
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

# Function to extract eGeMAPS features from audio
def extract_egemaps(audio_files):
    features = []
    for audio in audio_files:
        y, sr = librosa.load(audio, sr=16000)  # Load audio
        egemaps = smile.process_signal(y, sr)
        features.append(egemaps.values.flatten())
    return np.array(features)

# Extract eGeMAPS for each task
ad_egemaps = extract_egemaps(ad_audio_files)
mmse_egemaps = extract_egemaps(mmse_audio_files)
decline_egemaps = extract_egemaps(decline_audio_files)

- ADR Suggestion:

ADR typically involves learning representations from raw audio using unsupervised or self-supervised methods. You could use a pre-trained model like wav2vec2 from speechbrain or train a custom model on a larger dataset. For now, we'll proceed with eGeMAPS, but you can explore speechbrain’s Wav2Vec2 for ADR-like features.

- Linguistic Features:
<br>
Use Automatic Speech Recognition (ASR) to transcribe audio, then process transcripts in CHAT format for CLAN analysis (MOR, EVAL, FREQ).
We'll use speechbrain’s pre-trained ASR model for transcription.

In [9]:
from speechbrain.pretrained import EncoderDecoderASR

# Load pre-trained ASR model
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-crdnn-rnnlm-librispeech",
    savedir="pretrained_models/asr-crdnn-rnnlm-librispeech"
)

# Function to transcribe audio
def transcribe_audio(audio_files):
    transcripts = []
    for audio in audio_files:
        transcription = asr_model.transcribe_file(audio)
        transcripts.append(transcription)
    return transcripts

# Transcribe audio for each task
ad_transcripts = transcribe_audio(ad_audio_files)
mmse_transcripts = transcribe_audio(mmse_audio_files)
decline_transcripts = transcribe_audio(decline_audio_files)

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
  from speechbrain.pretrained import EncoderDecoderASR
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/asr-crdnn-rnnlm-librispeech' if not cached


hyperparams.yaml:   0%|          | 0.00/4.83k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--asr-crdnn-rnnlm-librispeech/snapshots/979a53a7a3f6c9291c02c040fd8ebfb2471cf8a3/hyperparams.yaml' -> '/content/pretrained_models/asr-crdnn-rnnlm-librispeech/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/asr-crdnn-rnnlm-librispeech' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/asr-crdnn-rnnlm-librispeech.
INFO:speechbrain.utils.fetching:Fetch normalizer.ckpt: Fetching from HuggingFace Hub 'speechbrain/asr-crdnn-rnnlm-li

normalizer.ckpt:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--asr-crdnn-rnnlm-librispeech/snapshots/979a53a7a3f6c9291c02c040fd8ebfb2471cf8a3/normalizer.ckpt' -> '/content/pretrained_models/asr-crdnn-rnnlm-librispeech/normalizer.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["normalizer"] = /content/pretrained_models/asr-crdnn-rnnlm-librispeech/normalizer.ckpt
INFO:speechbrain.utils.fetching:Fetch asr.ckpt: Fetching from HuggingFace Hub 'speechbrain/asr-crdnn-rnnlm-librispeech' if not cached


asr.ckpt:   0%|          | 0.00/480M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--asr-crdnn-rnnlm-librispeech/snapshots/979a53a7a3f6c9291c02c040fd8ebfb2471cf8a3/asr.ckpt' -> '/content/pretrained_models/asr-crdnn-rnnlm-librispeech/asr.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["asr"] = /content/pretrained_models/asr-crdnn-rnnlm-librispeech/asr.ckpt
INFO:speechbrain.utils.fetching:Fetch lm.ckpt: Fetching from HuggingFace Hub 'speechbrain/asr-crdnn-rnnlm-librispeech' if not cached


lm.ckpt:   0%|          | 0.00/212M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--asr-crdnn-rnnlm-librispeech/snapshots/979a53a7a3f6c9291c02c040fd8ebfb2471cf8a3/lm.ckpt' -> '/content/pretrained_models/asr-crdnn-rnnlm-librispeech/lm.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["lm"] = /content/pretrained_models/asr-crdnn-rnnlm-librispeech/lm.ckpt
INFO:speechbrain.utils.fetching:Fetch tokenizer.ckpt: Fetching from HuggingFace Hub 'speechbrain/asr-crdnn-rnnlm-librispeech' if not cached


tokenizer.ckpt:   0%|          | 0.00/253k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--asr-crdnn-rnnlm-librispeech/snapshots/979a53a7a3f6c9291c02c040fd8ebfb2471cf8a3/tokenizer.ckpt' -> '/content/pretrained_models/asr-crdnn-rnnlm-librispeech/tokenizer.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["tokenizer"] = /content/pretrained_models/asr-crdnn-rnnlm-librispeech/tokenizer.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: normalizer, asr, lm, tokenizer
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): normalizer -> /content/pretrained_models/asr-crdnn-rnnlm-librispeech/normalizer.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): asr -> /content/pretrained_models/asr-crdnn-rnnlm-librispeech/asr.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): lm -> /content/pretrained_models/asr-crdnn

## Convert to CHAT Format

- Convert to CHAT Format:
<br> CHAT is a specific format for linguistic analysis. You’ll need to structure the transcripts with metadata (e.g., speaker ID, timestamps). Below is a basic example of saving transcripts in CHAT-like format.

In [10]:
def save_to_chat(transcripts, audio_files, output_dir, task_name):
    os.makedirs(output_dir, exist_ok=True)
    for i, (transcript, audio_file) in enumerate(zip(transcripts, audio_files)):
        chat_file = os.path.join(output_dir, f"{task_name}_{i}.cha")
        with open(chat_file, 'w') as f:
            f.write(f"@Begin\n")
            f.write(f"@Participants: PAR Participant\n")
            f.write(f"@ID: language|corpus|PAR|||||Participant||\n")
            f.write(f"*PAR:\t{transcript}\n")
            f.write(f"@End\n")

# Save transcripts to CHAT format
save_to_chat(ad_transcripts, ad_audio_files, '/content/chat/ad', 'ad')
save_to_chat(mmse_transcripts, mmse_audio_files, '/content/chat/mmse', 'mmse')
save_to_chat(decline_transcripts, decline_audio_files, '/content/chat/decline', 'decline')

- CLAN Analysis:
<br>CLAN (Computerized Language Analysis) is typically run locally to extract MOR (morphological), EVAL (evaluation), and FREQ (frequency) features. Since Colab doesn’t support CLAN directly, you can:
1. Download the .cha files and run CLAN locally.
2. Alternatively, extract basic linguistic features (e.g., word count, type-token ratio) in Python.

In [11]:
from collections import Counter
import re

# Basic linguistic feature extraction
def extract_linguistic_features(transcripts):
    features = []
    for transcript in transcripts:
        words = re.findall(r'\w+', transcript.lower())
        word_count = len(words)
        unique_words = len(set(words))
        ttr = unique_words / word_count if word_count > 0 else 0  # Type-token ratio
        features.append([word_count, unique_words, ttr])
    return np.array(features)

# Extract linguistic features
ad_linguistic = extract_linguistic_features(ad_transcripts)
mmse_linguistic = extract_linguistic_features(mmse_transcripts)
decline_linguistic = extract_linguistic_features(decline_transcripts)