<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Exploiting_linguistic_information_from_Persain_transcripts_for_early.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Step 1: Data Acquisition and Preparation for Alzheimer's Disease Detection
Pipeline using ADReSSo21 Dataset

This script handles the extraction, organization, and initial preparation of
transcripts from the ADReSSo21 dataset for early AD detection.
"""

import os
import pandas as pd
import tarfile
import shutil
from pathlib import Path
import csv
from typing import Dict, List, Tuple
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class ADReSSo21DataProcessor:
    """
    Class to handle ADReSSo21 dataset extraction and preparation
    """

    def __init__(self, base_path: str = "/content/drive/MyDrive/Voice/ADReSSo21"):
        """
        Initialize the data processor

        Args:
            base_path (str): Base path where ADReSSo21 data will be stored
        """
        self.base_path = Path(base_path)
        self.extracted_path = self.base_path / "extracted"

        # Create directories if they don't exist
        self.base_path.mkdir(parents=True, exist_ok=True)
        self.extracted_path.mkdir(parents=True, exist_ok=True)

        # Dataset file names
        self.dataset_files = {
            'progression_train': 'ADReSSo21-progression-train.tgz',
            'progression_test': 'ADReSSo21-progression-test.tgz',
            'diagnosis_train': 'ADReSSo21-diagnosis-train.tgz'
        }

        # Directory structure mapping
        self.directory_structure = {
            'progression_train': {
                'segmentation': ['no_decline', 'decline'],
                'audio': ['no_decline', 'decline']
            },
            'progression_test': {
                'segmentation': [''],  # test-dist has no subdirectories
                'audio': ['']
            },
            'diagnosis_train': {
                'segmentation': ['cn', 'ad'],
                'audio': ['cn', 'ad']
            }
        }

    def mount_google_drive(self):
        """
        Mount Google Drive in Colab environment
        """
        try:
            from google.colab import drive
            drive.mount('/content/drive')
            logger.info("Google Drive mounted successfully")
            return True
        except ImportError:
            logger.warning("Not running in Google Colab environment")
            return False
        except Exception as e:
            logger.error(f"Error mounting Google Drive: {e}")
            return False

    def extract_tgz_files(self) -> bool:
        """
        Extract all .tgz files to the extraction directory

        Returns:
            bool: True if extraction successful, False otherwise
        """
        try:
            for dataset_name, filename in self.dataset_files.items():
                file_path = self.base_path / filename

                if not file_path.exists():
                    logger.error(f"File not found: {file_path}")
                    logger.info(f"Please ensure {filename} is uploaded to {self.base_path}")
                    continue

                logger.info(f"Extracting {filename}...")

                # Extract to specific subdirectory
                extract_dir = self.extracted_path / dataset_name
                extract_dir.mkdir(parents=True, exist_ok=True)

                with tarfile.open(file_path, 'r:gz') as tar:
                    tar.extractall(path=extract_dir)

                logger.info(f"Successfully extracted {filename}")

            return True

        except Exception as e:
            logger.error(f"Error during extraction: {e}")
            return False

    def verify_directory_structure(self) -> Dict[str, bool]:
        """
        Verify that the extracted directories match expected structure

        Returns:
            Dict[str, bool]: Status of each dataset extraction
        """
        verification_results = {}

        for dataset_name, structure in self.directory_structure.items():
            dataset_path = self.extracted_path / dataset_name / "ADReSSo21"

            # Check if main dataset directory exists
            if not dataset_path.exists():
                verification_results[dataset_name] = False
                logger.error(f"Dataset directory not found: {dataset_path}")
                continue

            # Verify subdirectories
            all_dirs_exist = True

            for data_type, subdirs in structure.items():
                if dataset_name == 'progression_test':
                    # Special case for test data
                    seg_path = dataset_path / "progression" / "test-dist" / "segmentation"
                    audio_path = dataset_path / "progression" / "test-dist" / "audio"

                    if not (seg_path.exists() and audio_path.exists()):
                        all_dirs_exist = False
                        logger.error(f"Test directories missing in {dataset_name}")
                else:
                    # Regular structure for train data
                    base_type_path = dataset_path / ("progression" if "progression" in dataset_name else "diagnosis")

                    for subdir in subdirs:
                        if subdir:  # Skip empty strings
                            seg_path = base_type_path / "train" / "segmentation" / subdir
                            audio_path = base_type_path / "train" / "audio" / subdir

                            if not (seg_path.exists() and audio_path.exists()):
                                all_dirs_exist = False
                                logger.error(f"Missing directories for {dataset_name}/{subdir}")

            verification_results[dataset_name] = all_dirs_exist

            if all_dirs_exist:
                logger.info(f"✓ Directory structure verified for {dataset_name}")
            else:
                logger.warning(f"✗ Directory structure issues found for {dataset_name}")

        return verification_results

    def extract_transcripts_from_csv(self, csv_file_path: Path) -> List[Dict]:
        """
        Extract transcript data from a single CSV file following CHAT protocol

        Args:
            csv_file_path (Path): Path to the CSV file

        Returns:
            List[Dict]: List of transcript segments with metadata
        """
        transcripts = []

        try:
            # Try different encodings as CSV files might have various encodings
            encodings = ['utf-8', 'latin-1', 'cp1252']
            df = None

            for encoding in encodings:
                try:
                    df = pd.read_csv(csv_file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue

            if df is None:
                logger.error(f"Could not read CSV file with any encoding: {csv_file_path}")
                return transcripts

            # Log column names for inspection
            logger.info(f"CSV columns in {csv_file_path.name}: {list(df.columns)}")

            # Extract relevant columns (adjust based on actual CSV structure)
            # Common CHAT protocol columns might include: speaker, utterance, time, etc.
            for index, row in df.iterrows():
                transcript_entry = {
                    'file_id': csv_file_path.stem,
                    'row_index': index,
                    'data': dict(row)  # Store all columns for now
                }

                # Look for text/utterance columns (common names in CHAT protocol)
                text_columns = ['utterance', 'text', 'transcript', 'speech', 'content']
                for col in text_columns:
                    if col in df.columns and pd.notna(row[col]):
                        transcript_entry['transcript'] = str(row[col])
                        break

                transcripts.append(transcript_entry)

        except Exception as e:
            logger.error(f"Error processing CSV file {csv_file_path}: {e}")

        return transcripts

    def collect_all_transcripts(self) -> Dict[str, List[Dict]]:
        """
        Collect all transcripts from segmentation CSV files

        Returns:
            Dict[str, List[Dict]]: Organized transcripts by category
        """
        all_transcripts = {
            'progression_train_no_decline': [],
            'progression_train_decline': [],
            'progression_test': [],
            'diagnosis_train_cn': [],
            'diagnosis_train_ad': []
        }

        # Process progression training data
        prog_train_path = self.extracted_path / "progression_train" / "ADReSSo21" / "progression" / "train" / "segmentation"

        for category in ['no_decline', 'decline']:
            csv_dir = prog_train_path / category
            if csv_dir.exists():
                for csv_file in csv_dir.glob('*.csv'):
                    transcripts = self.extract_transcripts_from_csv(csv_file)
                    all_transcripts[f'progression_train_{category}'].extend(transcripts)
                    logger.info(f"Processed {len(transcripts)} entries from {csv_file.name}")

        # Process progression test data
        prog_test_path = self.extracted_path / "progression_test" / "ADReSSo21" / "progression" / "test-dist" / "segmentation"
        if prog_test_path.exists():
            for csv_file in prog_test_path.glob('*.csv'):
                transcripts = self.extract_transcripts_from_csv(csv_file)
                all_transcripts['progression_test'].extend(transcripts)
                logger.info(f"Processed {len(transcripts)} entries from {csv_file.name}")

        # Process diagnosis training data
        diag_train_path = self.extracted_path / "diagnosis_train" / "ADReSSo21" / "diagnosis" / "train" / "segmentation"

        for category in ['cn', 'ad']:
            csv_dir = diag_train_path / category
            if csv_dir.exists():
                for csv_file in csv_dir.glob('*.csv'):
                    transcripts = self.extract_transcripts_from_csv(csv_file)
                    all_transcripts[f'diagnosis_train_{category}'].extend(transcripts)
                    logger.info(f"Processed {len(transcripts)} entries from {csv_file.name}")

        return all_transcripts

    def save_transcripts_summary(self, transcripts: Dict[str, List[Dict]]) -> Path:
        """
        Save a summary of extracted transcripts for review

        Args:
            transcripts (Dict): Organized transcripts

        Returns:
            Path: Path to the saved summary file
        """
        summary_file = self.base_path / "transcripts_summary.txt"

        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write("ADReSSo21 Dataset Transcripts Summary\n")
            f.write("=" * 50 + "\n\n")

            total_transcripts = 0
            for category, transcript_list in transcripts.items():
                count = len(transcript_list)
                total_transcripts += count
                f.write(f"{category}: {count} transcript entries\n")

                # Show sample transcript if available
                if transcript_list and 'transcript' in transcript_list[0]:
                    sample = transcript_list[0]['transcript'][:100] + "..." if len(transcript_list[0]['transcript']) > 100 else transcript_list[0]['transcript']
                    f.write(f"  Sample: {sample}\n")
                f.write("\n")

            f.write(f"Total transcript entries: {total_transcripts}\n")
            f.write("\nNote: These English transcripts need to be translated to Persian for the study.\n")
            f.write("Translation should be done manually by native Persian speakers as per the methodology.\n")

        logger.info(f"Transcripts summary saved to: {summary_file}")
        return summary_file

    def prepare_for_translation(self, transcripts: Dict[str, List[Dict]]) -> Path:
        """
        Prepare transcript files for manual Persian translation

        Args:
            transcripts (Dict): Organized transcripts

        Returns:
            Path: Path to the translation directory
        """
        translation_dir = self.base_path / "for_translation"
        translation_dir.mkdir(exist_ok=True)

        for category, transcript_list in transcripts.items():
            if not transcript_list:
                continue

            # Create CSV file for translation
            csv_file = translation_dir / f"{category}_for_translation.csv"

            with open(csv_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['ID', 'Original_English', 'Persian_Translation', 'Notes'])

                for i, entry in enumerate(transcript_list):
                    if 'transcript' in entry:
                        transcript_id = f"{category}_{i+1}"
                        english_text = entry['transcript']
                        writer.writerow([transcript_id, english_text, '', ''])

            logger.info(f"Created translation file: {csv_file}")

        # Create translation instructions
        instructions_file = translation_dir / "TRANSLATION_INSTRUCTIONS.txt"
        with open(instructions_file, 'w', encoding='utf-8') as f:
            f.write("PERSIAN TRANSLATION INSTRUCTIONS\n")
            f.write("=" * 40 + "\n\n")
            f.write("IMPORTANT REQUIREMENTS:\n")
            f.write("1. Translation must be done by native Persian speakers\n")
            f.write("2. Translator should have at least 13 years of formal Persian education\n")
            f.write("3. Translation should be verified by an independent linguistic expert\n")
            f.write("4. PRESERVE ALL linguistic features:\n")
            f.write("   - Pause words (uhm, uhh, etc.) - translate equivalent Persian pause words\n")
            f.write("   - Repetitions - keep all repetitions\n")
            f.write("   - Linguistic errors - preserve grammatical/syntactic errors\n")
            f.write("   - Syntactic errors - maintain sentence structure issues\n")
            f.write("5. EXCLUDE annotations like [clears throat], [laughs], etc.\n")
            f.write("6. Do NOT use machine translation - manual translation only\n")
            f.write("7. Capture cultural and linguistic nuances specific to Persian\n\n")
            f.write("Fill in the 'Persian_Translation' column in each CSV file.\n")
            f.write("Use 'Notes' column for any translation decisions or concerns.\n")

        logger.info(f"Translation instructions saved to: {instructions_file}")
        return translation_dir


def main():
    """
    Main function to execute Step 1 of the pipeline
    """
    logger.info("Starting Step 1: Data Acquisition and Preparation")

    # Initialize the data processor
    processor = ADReSSo21DataProcessor()

    # Step 1.1: Mount Google Drive (if in Colab)
    logger.info("Step 1.1: Mounting Google Drive...")
    processor.mount_google_drive()

    # Step 1.2: Extract dataset files
    logger.info("Step 1.2: Extracting dataset files...")
    if not processor.extract_tgz_files():
        logger.error("Failed to extract dataset files. Please check file paths.")
        return False

    # Step 1.3: Verify directory structure
    logger.info("Step 1.3: Verifying directory structure...")
    verification_results = processor.verify_directory_structure()

    if not all(verification_results.values()):
        logger.warning("Some directory structure issues found. Proceeding with available data.")

    # Step 1.4: Extract transcripts from CSV files
    logger.info("Step 1.4: Extracting transcripts from segmentation CSV files...")
    all_transcripts = processor.collect_all_transcripts()

    # Step 1.5: Save summary and prepare for translation
    logger.info("Step 1.5: Saving transcripts summary...")
    processor.save_transcripts_summary(all_transcripts)

    logger.info("Step 1.6: Preparing files for Persian translation...")
    translation_dir = processor.prepare_for_translation(all_transcripts)

    # Final summary
    total_transcripts = sum(len(transcripts) for transcripts in all_transcripts.values())
    logger.info(f"\n{'='*50}")
    logger.info("STEP 1 COMPLETED SUCCESSFULLY!")
    logger.info(f"Total transcript entries extracted: {total_transcripts}")
    logger.info(f"Translation files prepared in: {translation_dir}")
    logger.info("NEXT STEPS:")
    logger.info("1. Have native Persian speakers translate the CSV files")
    logger.info("2. Verify translations with linguistic expert")
    logger.info("3. Return translated files for Step 2 (Data Preprocessing)")
    logger.info(f"{'='*50}")

    return True


# Example usage
if __name__ == "__main__":
    success = main()
    if success:
        print("\n✅ Step 1 completed successfully!")
        print("📁 Check the translation directory for files to be translated to Persian")
        print("🔄 Once translation is complete, you can proceed to Step 2")
    else:
        print("\n❌ Step 1 encountered errors. Please check the logs above.")

ERROR:__main__:Error mounting Google Drive: Mountpoint must not already contain files
ERROR:__main__:File not found: /content/drive/MyDrive/Voice/ADReSSo21/ADReSSo21-progression-train.tgz
ERROR:__main__:File not found: /content/drive/MyDrive/Voice/ADReSSo21/ADReSSo21-progression-test.tgz
ERROR:__main__:File not found: /content/drive/MyDrive/Voice/ADReSSo21/ADReSSo21-diagnosis-train.tgz
ERROR:__main__:Dataset directory not found: /content/drive/MyDrive/Voice/ADReSSo21/extracted/progression_train/ADReSSo21
ERROR:__main__:Dataset directory not found: /content/drive/MyDrive/Voice/ADReSSo21/extracted/progression_test/ADReSSo21
ERROR:__main__:Dataset directory not found: /content/drive/MyDrive/Voice/ADReSSo21/extracted/diagnosis_train/ADReSSo21



✅ Step 1 completed successfully!
📁 Check the translation directory for files to be translated to Persian
🔄 Once translation is complete, you can proceed to Step 2
