<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/Exploiting_linguistic_information_from_Persain_transcripts_for_early.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
"""
Step 1: Data Acquisition and Preparation for Alzheimer's Disease Detection
Pipeline using ADReSSo21 Dataset

This script handles the extraction, organization, and initial preparation of
transcripts from the ADReSSo21 dataset for early AD detection.
"""

import os
import pandas as pd
import tarfile
import shutil
from pathlib import Path
import csv
from typing import Dict, List, Tuple
import logging

# Configure logging once for the notebook: INFO level, timestamped lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the processor class and by main()
logger = logging.getLogger(__name__)

class ADReSSo21DataProcessor:
    """
    Class to handle ADReSSo21 dataset extraction and preparation
    """

    def __init__(self, base_path: str = "/content/drive/MyDrive/Voice/ADReSSo21"):
        """
        Initialize the data processor

        Args:
            base_path (str): Base path where ADReSSo21 data will be stored
        """
        self.base_path = Path(base_path)
        self.extracted_path = self.base_path / "extracted"

        # Create directories if they don't exist
        self.base_path.mkdir(parents=True, exist_ok=True)
        self.extracted_path.mkdir(parents=True, exist_ok=True)

        # Dataset file names
        self.dataset_files = {
            'progression_train': 'ADReSSo21-progression-train.tgz',
            'progression_test': 'ADReSSo21-progression-test.tgz',
            'diagnosis_train': 'ADReSSo21-diagnosis-train.tgz'
        }

        # Directory structure mapping
        self.directory_structure = {
            'progression_train': {
                'segmentation': ['no_decline', 'decline'],
                'audio': ['no_decline', 'decline']
            },
            'progression_test': {
                'segmentation': [''],  # test-dist has no subdirectories
                'audio': ['']
            },
            'diagnosis_train': {
                'segmentation': ['cn', 'ad'],
                'audio': ['cn', 'ad']
            }
        }

    def mount_google_drive(self):
        """
        Mount Google Drive in Colab environment
        """
        try:
            from google.colab import drive

            # Check if already mounted
            if os.path.exists('/content/drive'):
                logger.info("Google Drive already mounted")
                return True

            drive.mount('/content/drive')
            logger.info("Google Drive mounted successfully")
            return True
        except ImportError:
            logger.warning("Not running in Google Colab environment")
            return False
        except Exception as e:
            logger.error(f"Error mounting Google Drive: {e}")
            # Try to continue anyway if drive is accessible
            if os.path.exists('/content/drive'):
                logger.info("Drive appears to be accessible despite error")
                return True
            return False

    def find_dataset_files(self):
        """
        Search for ADReSSo21 dataset files in various formats and locations
        """
        logger.info(f"Searching for dataset files in: {self.base_path}")

        # Check if base directory exists
        if not self.base_path.exists():
            logger.error(f"Base directory does not exist: {self.base_path}")
            logger.info("Checking parent directories...")

            # Check common alternative paths
            alternative_paths = [
                Path("/content/drive/MyDrive/Voice/"),
                Path("/content/drive/MyDrive/"),
                Path("/content/drive/"),
                Path("/content/")
            ]

            for alt_path in alternative_paths:
                if alt_path.exists():
                    logger.info(f"Found directory: {alt_path}")
                    # List contents
                    items = list(alt_path.glob("*"))
                    for item in items[:10]:  # Show first 10 items
                        logger.info(f"  {item.name}")
                    if len(items) > 10:
                        logger.info(f"  ... and {len(items) - 10} more items")

            return None, []

        # List all files in the base directory
        all_files = list(self.base_path.glob("*"))
        logger.info(f"Found {len(all_files)} items in base directory:")

        dataset_files = []

        for item in all_files:
            if item.is_file():
                size_mb = item.stat().st_size / (1024*1024)
                logger.info(f"  FILE: {item.name} ({size_mb:.1f} MB)")

                # Check for various dataset file formats
                if any(keyword in item.name.lower() for keyword in ['adresso', 'alzheimer', 'dementia']):
                    dataset_files.append(item)
            else:
                logger.info(f"  DIR:  {item.name}/")

        # Look for specific file types
        file_types = {
            '.tgz': list(self.base_path.glob("*.tgz")),
            '.tar.gz': list(self.base_path.glob("*.tar.gz")),
            '.zip': list(self.base_path.glob("*.zip")),
            '.rar': list(self.base_path.glob("*.rar")),
            '.7z': list(self.base_path.glob("*.7z"))
        }

        found_archives = []
        for file_type, files in file_types.items():
            if files:
                logger.info(f"Found {len(files)} {file_type} files:")
                for file in files:
                    logger.info(f"  {file.name}")
                    found_archives.extend(files)

        if dataset_files:
            logger.info(f"Found {len(dataset_files)} potential dataset files:")
            for file in dataset_files:
                logger.info(f"  {file.name}")

        return found_archives, dataset_files

    def interactive_file_selection(self, found_archives, dataset_files):
        """
        Help user identify and select the correct dataset files
        """
        logger.info("\n" + "="*60)
        logger.info("DATASET FILE DETECTION RESULTS")
        logger.info("="*60)

        if not found_archives and not dataset_files:
            logger.error("❌ No archive files or potential dataset files found!")
            logger.info("\n📋 TROUBLESHOOTING STEPS:")
            logger.info("1. Verify you've uploaded the ADReSSo21 dataset files")
            logger.info("2. Check if files are in a different directory")
            logger.info("3. Ensure files are properly uploaded to Google Drive")
            logger.info("4. Check if files have different names or extensions")
            return None

        logger.info("🔍 FOUND FILES ANALYSIS:")

        # Analyze found files
        likely_candidates = []

        for file in found_archives + dataset_files:
            score = 0
            reasons = []

            # Check file name for ADReSSo21 indicators
            name_lower = file.name.lower()
            if 'adresso' in name_lower:
                score += 5
                reasons.append("Contains 'ADReSSo'")
            if 'progression' in name_lower:
                score += 3
                reasons.append("Contains 'progression'")
            if 'diagnosis' in name_lower:
                score += 3
                reasons.append("Contains 'diagnosis'")
            if 'train' in name_lower:
                score += 2
                reasons.append("Contains 'train'")
            if 'test' in name_lower:
                score += 2
                reasons.append("Contains 'test'")

            # Check file size (ADReSSo21 files should be reasonably large)
            size_mb = file.stat().st_size / (1024*1024)
            if size_mb > 10:  # Larger than 10MB
                score += 2
                reasons.append(f"Good size ({size_mb:.1f} MB)")
            elif size_mb > 1:
                score += 1
                reasons.append(f"Moderate size ({size_mb:.1f} MB)")

            if score > 0:
                likely_candidates.append((file, score, reasons))

        # Sort by score
        likely_candidates.sort(key=lambda x: x[1], reverse=True)

        if likely_candidates:
            logger.info(f"🎯 TOP CANDIDATES (sorted by likelihood):")
            for i, (file, score, reasons) in enumerate(likely_candidates[:5]):
                logger.info(f"  {i+1}. {file.name} (Score: {score})")
                logger.info(f"     Reasons: {', '.join(reasons)}")
                logger.info(f"     Path: {file}")

        # Return the most likely candidate for automatic processing
        if likely_candidates and likely_candidates[0][1] >= 5:
            return likely_candidates[0][0]

        return None
    def extract_any_archive(self, file_path: Path) -> bool:
        """
        Extract archive files in various formats

        Args:
            file_path (Path): Path to the archive file

        Returns:
            bool: True if extraction successful
        """
        try:
            file_extension = file_path.suffix.lower()
            extract_dir = self.extracted_path / file_path.stem
            extract_dir.mkdir(parents=True, exist_ok=True)

            logger.info(f"Attempting to extract: {file_path.name}")

            if file_extension in ['.tgz', '.gz'] or file_path.name.endswith('.tar.gz'):
                # Handle .tgz and .tar.gz files
                with tarfile.open(file_path, 'r:gz') as tar:
                    tar.extractall(path=extract_dir)

            elif file_extension == '.zip':
                # Handle .zip files
                import zipfile
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_dir)

            elif file_extension == '.tar':
                # Handle .tar files
                with tarfile.open(file_path, 'r') as tar:
                    tar.extractall(path=extract_dir)

            else:
                logger.error(f"Unsupported archive format: {file_extension}")
                return False

            logger.info(f"Successfully extracted {file_path.name} to {extract_dir}")

            # List extracted contents
            extracted_items = list(extract_dir.rglob("*"))
            logger.info(f"Extracted {len(extracted_items)} items")

            # Show directory structure
            dirs = [item for item in extracted_items if item.is_dir()]
            files = [item for item in extracted_items if item.is_file()]

            logger.info(f"  Directories: {len(dirs)}")
            logger.info(f"  Files: {len(files)}")

            return True

        except Exception as e:
            logger.error(f"Error extracting {file_path}: {e}")
            return False
    def extract_tgz_files(self) -> bool:
        """
        Extract all archive files to the extraction directory

        Returns:
            bool: True if extraction successful, False otherwise
        """
        try:
            # Find potential dataset files
            found_archives, dataset_files = self.find_dataset_files()

            # Try to identify the best candidates
            best_candidate = self.interactive_file_selection(found_archives, dataset_files)

            if best_candidate:
                logger.info(f"🎯 Attempting to extract most likely candidate: {best_candidate.name}")
                if self.extract_any_archive(best_candidate):
                    return True

            # If no clear candidate, try all archive files
            if found_archives:
                logger.info("🔄 Trying to extract all found archive files...")
                extracted_any = False

                for archive in found_archives:
                    if self.extract_any_archive(archive):
                        extracted_any = True

                return extracted_any

            # Fallback: try the original method for exact file names
            logger.info("🔄 Trying original extraction method...")
            extracted_any = False

            for dataset_name, filename in self.dataset_files.items():
                file_path = self.base_path / filename

                if not file_path.exists():
                    logger.warning(f"Expected file not found: {file_path}")
                    continue

                logger.info(f"Extracting {filename}...")

                # Extract to specific subdirectory
                extract_dir = self.extracted_path / dataset_name
                extract_dir.mkdir(parents=True, exist_ok=True)

                with tarfile.open(file_path, 'r:gz') as tar:
                    tar.extractall(path=extract_dir)

                logger.info(f"Successfully extracted {filename}")
                extracted_any = True

            return extracted_any

        except Exception as e:
            logger.error(f"Error during extraction: {e}")
            return False

    def verify_directory_structure(self) -> Dict[str, bool]:
        """
        Verify that the extracted directories match expected structure

        Returns:
            Dict[str, bool]: Status of each dataset extraction
        """
        verification_results = {}

        for dataset_name, structure in self.directory_structure.items():
            dataset_path = self.extracted_path / dataset_name / "ADReSSo21"

            # Check if main dataset directory exists
            if not dataset_path.exists():
                verification_results[dataset_name] = False
                logger.error(f"Dataset directory not found: {dataset_path}")
                continue

            # Verify subdirectories
            all_dirs_exist = True

            for data_type, subdirs in structure.items():
                if dataset_name == 'progression_test':
                    # Special case for test data
                    seg_path = dataset_path / "progression" / "test-dist" / "segmentation"
                    audio_path = dataset_path / "progression" / "test-dist" / "audio"

                    if not (seg_path.exists() and audio_path.exists()):
                        all_dirs_exist = False
                        logger.error(f"Test directories missing in {dataset_name}")
                else:
                    # Regular structure for train data
                    base_type_path = dataset_path / ("progression" if "progression" in dataset_name else "diagnosis")

                    for subdir in subdirs:
                        if subdir:  # Skip empty strings
                            seg_path = base_type_path / "train" / "segmentation" / subdir
                            audio_path = base_type_path / "train" / "audio" / subdir

                            if not (seg_path.exists() and audio_path.exists()):
                                all_dirs_exist = False
                                logger.error(f"Missing directories for {dataset_name}/{subdir}")

            verification_results[dataset_name] = all_dirs_exist

            if all_dirs_exist:
                logger.info(f"✓ Directory structure verified for {dataset_name}")
            else:
                logger.warning(f"✗ Directory structure issues found for {dataset_name}")

        return verification_results

    def extract_transcripts_from_csv(self, csv_file_path: Path) -> List[Dict]:
        """
        Extract transcript data from a single CSV file following CHAT protocol

        Args:
            csv_file_path (Path): Path to the CSV file

        Returns:
            List[Dict]: List of transcript segments with metadata
        """
        transcripts = []

        try:
            # Try different encodings as CSV files might have various encodings
            encodings = ['utf-8', 'latin-1', 'cp1252']
            df = None

            for encoding in encodings:
                try:
                    df = pd.read_csv(csv_file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue

            if df is None:
                logger.error(f"Could not read CSV file with any encoding: {csv_file_path}")
                return transcripts

            # Log column names for inspection
            logger.info(f"CSV columns in {csv_file_path.name}: {list(df.columns)}")

            # Extract relevant columns (adjust based on actual CSV structure)
            # Common CHAT protocol columns might include: speaker, utterance, time, etc.
            for index, row in df.iterrows():
                transcript_entry = {
                    'file_id': csv_file_path.stem,
                    'row_index': index,
                    'data': dict(row)  # Store all columns for now
                }

                # Look for text/utterance columns (common names in CHAT protocol)
                text_columns = ['utterance', 'text', 'transcript', 'speech', 'content']
                for col in text_columns:
                    if col in df.columns and pd.notna(row[col]):
                        transcript_entry['transcript'] = str(row[col])
                        break

                transcripts.append(transcript_entry)

        except Exception as e:
            logger.error(f"Error processing CSV file {csv_file_path}: {e}")

        return transcripts

    def collect_all_transcripts(self) -> Dict[str, List[Dict]]:
        """
        Collect all transcripts from segmentation CSV files

        Returns:
            Dict[str, List[Dict]]: Organized transcripts by category
        """
        all_transcripts = {
            'progression_train_no_decline': [],
            'progression_train_decline': [],
            'progression_test': [],
            'diagnosis_train_cn': [],
            'diagnosis_train_ad': []
        }

        # Process progression training data
        prog_train_path = self.extracted_path / "progression_train" / "ADReSSo21" / "progression" / "train" / "segmentation"

        for category in ['no_decline', 'decline']:
            csv_dir = prog_train_path / category
            if csv_dir.exists():
                for csv_file in csv_dir.glob('*.csv'):
                    transcripts = self.extract_transcripts_from_csv(csv_file)
                    all_transcripts[f'progression_train_{category}'].extend(transcripts)
                    logger.info(f"Processed {len(transcripts)} entries from {csv_file.name}")

        # Process progression test data
        prog_test_path = self.extracted_path / "progression_test" / "ADReSSo21" / "progression" / "test-dist" / "segmentation"
        if prog_test_path.exists():
            for csv_file in prog_test_path.glob('*.csv'):
                transcripts = self.extract_transcripts_from_csv(csv_file)
                all_transcripts['progression_test'].extend(transcripts)
                logger.info(f"Processed {len(transcripts)} entries from {csv_file.name}")

        # Process diagnosis training data
        diag_train_path = self.extracted_path / "diagnosis_train" / "ADReSSo21" / "diagnosis" / "train" / "segmentation"

        for category in ['cn', 'ad']:
            csv_dir = diag_train_path / category
            if csv_dir.exists():
                for csv_file in csv_dir.glob('*.csv'):
                    transcripts = self.extract_transcripts_from_csv(csv_file)
                    all_transcripts[f'diagnosis_train_{category}'].extend(transcripts)
                    logger.info(f"Processed {len(transcripts)} entries from {csv_file.name}")

        return all_transcripts

    def save_transcripts_summary(self, transcripts: Dict[str, List[Dict]]) -> Path:
        """
        Save a summary of extracted transcripts for review

        Args:
            transcripts (Dict): Organized transcripts

        Returns:
            Path: Path to the saved summary file
        """
        summary_file = self.base_path / "transcripts_summary.txt"

        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write("ADReSSo21 Dataset Transcripts Summary\n")
            f.write("=" * 50 + "\n\n")

            total_transcripts = 0
            for category, transcript_list in transcripts.items():
                count = len(transcript_list)
                total_transcripts += count
                f.write(f"{category}: {count} transcript entries\n")

                # Show sample transcript if available
                if transcript_list and 'transcript' in transcript_list[0]:
                    sample = transcript_list[0]['transcript'][:100] + "..." if len(transcript_list[0]['transcript']) > 100 else transcript_list[0]['transcript']
                    f.write(f"  Sample: {sample}\n")
                f.write("\n")

            f.write(f"Total transcript entries: {total_transcripts}\n")
            f.write("\nNote: These English transcripts need to be translated to Persian for the study.\n")
            f.write("Translation should be done manually by native Persian speakers as per the methodology.\n")

        logger.info(f"Transcripts summary saved to: {summary_file}")
        return summary_file

    def prepare_for_translation(self, transcripts: Dict[str, List[Dict]]) -> Path:
        """
        Prepare transcript files for manual Persian translation

        Args:
            transcripts (Dict): Organized transcripts

        Returns:
            Path: Path to the translation directory
        """
        translation_dir = self.base_path / "for_translation"
        translation_dir.mkdir(exist_ok=True)

        for category, transcript_list in transcripts.items():
            if not transcript_list:
                continue

            # Create CSV file for translation
            csv_file = translation_dir / f"{category}_for_translation.csv"

            with open(csv_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['ID', 'Original_English', 'Persian_Translation', 'Notes'])

                for i, entry in enumerate(transcript_list):
                    if 'transcript' in entry:
                        transcript_id = f"{category}_{i+1}"
                        english_text = entry['transcript']
                        writer.writerow([transcript_id, english_text, '', ''])

            logger.info(f"Created translation file: {csv_file}")

        # Create translation instructions
        instructions_file = translation_dir / "TRANSLATION_INSTRUCTIONS.txt"
        with open(instructions_file, 'w', encoding='utf-8') as f:
            f.write("PERSIAN TRANSLATION INSTRUCTIONS\n")
            f.write("=" * 40 + "\n\n")
            f.write("IMPORTANT REQUIREMENTS:\n")
            f.write("1. Translation must be done by native Persian speakers\n")
            f.write("2. Translator should have at least 13 years of formal Persian education\n")
            f.write("3. Translation should be verified by an independent linguistic expert\n")
            f.write("4. PRESERVE ALL linguistic features:\n")
            f.write("   - Pause words (uhm, uhh, etc.) - translate equivalent Persian pause words\n")
            f.write("   - Repetitions - keep all repetitions\n")
            f.write("   - Linguistic errors - preserve grammatical/syntactic errors\n")
            f.write("   - Syntactic errors - maintain sentence structure issues\n")
            f.write("5. EXCLUDE annotations like [clears throat], [laughs], etc.\n")
            f.write("6. Do NOT use machine translation - manual translation only\n")
            f.write("7. Capture cultural and linguistic nuances specific to Persian\n\n")
            f.write("Fill in the 'Persian_Translation' column in each CSV file.\n")
            f.write("Use 'Notes' column for any translation decisions or concerns.\n")

        logger.info(f"Translation instructions saved to: {instructions_file}")
        return translation_dir


def main():
    """
    Run Step 1 of the pipeline from start to finish

    The stages are: mount Drive, locate the archives, extract them, verify
    the extracted layout, collect the transcripts, then write the summary and
    the translation worksheets. Progress and problems are logged.

    Returns:
        bool: True when every stage completed, False as soon as a stage
            finds nothing to work with
    """
    logger.info("Starting Step 1: Data Acquisition and Preparation")

    # Processor with its default base path
    processor = ADReSSo21DataProcessor()

    # Step 1.1: Mount Google Drive (if in Colab); the outcome is only logged
    logger.info("Step 1.1: Mounting Google Drive...")
    processor.mount_google_drive()

    # Step 1.2: Search for dataset files
    logger.info("Step 1.2: Searching for dataset files...")
    found_archives, dataset_files = processor.find_dataset_files()

    if not (found_archives or dataset_files):
        logger.error("❌ No potential dataset files found!")
        for hint in (
            "\n📋 PLEASE CHECK:",
            "1. Are the ADReSSo21 files uploaded to Google Drive?",
            "2. Are they in the correct directory?",
            f"   Expected location: {processor.base_path}",
            "3. Do they have the expected names:",
            "   - ADReSSo21-progression-train.tgz",
            "   - ADReSSo21-progression-test.tgz",
            "   - ADReSSo21-diagnosis-train.tgz",
            "4. Or are they in a different format (.zip, .rar, etc.)?",
        ):
            logger.info(hint)
        return False

    # Step 1.3: Extract dataset files
    logger.info("Step 1.3: Extracting dataset files...")
    if not processor.extract_tgz_files():
        logger.error("❌ Failed to extract any dataset files.")
        for hint in (
            "\n🔧 POSSIBLE SOLUTIONS:",
            "1. Check if files are corrupted - try re-downloading",
            "2. Try extracting files manually first",
            "3. Ensure files are not password protected",
            "4. Check if files are in an unsupported format",
        ):
            logger.info(hint)
        return False

    # Step 1.4: Verify directory structure
    logger.info("Step 1.4: Verifying directory structure...")
    verification_results = processor.verify_directory_structure()
    successful_extractions = [name for name, passed in verification_results.items() if passed]

    if not successful_extractions:
        logger.error("No datasets were successfully extracted and verified.")
        return False

    logger.info(f"Successfully processed datasets: {successful_extractions}")

    # Step 1.5: Extract transcripts from CSV files
    logger.info("Step 1.5: Extracting transcripts from segmentation CSV files...")
    all_transcripts = processor.collect_all_transcripts()

    # Nothing collected means the later stages have no input
    total_transcripts = sum(map(len, all_transcripts.values()))
    if total_transcripts == 0:
        logger.error("No transcripts were extracted from CSV files!")
        for hint in (
            "This might indicate:",
            "  - CSV files are in a different format than expected",
            "  - Directory structure is different",
            "  - Files are corrupted",
        ):
            logger.info(hint)
        return False

    # Step 1.6: Save summary and prepare for translation
    logger.info("Step 1.6: Saving transcripts summary...")
    processor.save_transcripts_summary(all_transcripts)

    logger.info("Step 1.7: Preparing files for Persian translation...")
    translation_dir = processor.prepare_for_translation(all_transcripts)

    # Final summary
    banner = '=' * 50
    for line in (
        f"\n{banner}",
        "STEP 1 COMPLETED SUCCESSFULLY!",
        f"Successfully processed datasets: {successful_extractions}",
        f"Total transcript entries extracted: {total_transcripts}",
        f"Translation files prepared in: {translation_dir}",
        "NEXT STEPS:",
        "1. Have native Persian speakers translate the CSV files",
        "2. Verify translations with linguistic expert",
        "3. Return translated files for Step 2 (Data Preprocessing)",
        f"{banner}",
    ):
        logger.info(line)

    return True


# Script entry point: run Step 1 and print a short outcome message
if __name__ == "__main__":
    success = main()
    if not success:
        print("\n❌ Step 1 encountered errors. Please check the logs above.")
    else:
        print(
            "\n✅ Step 1 completed successfully!",
            "📁 Check the translation directory for files to be translated to Persian",
            "🔄 Once translation is complete, you can proceed to Step 2",
            sep="\n",
        )

ERROR:__main__:❌ No potential dataset files found!



❌ Step 1 encountered errors. Please check the logs above.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import tarfile
import pandas as pd
from google.colab import drive

# Attach Google Drive so the archives below become reachable
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Folder holding the archives, and the folder that receives extracted data
base_path = '/content/drive/MyDrive/Voice'
extracted_base_path = '/content/drive/MyDrive/Voice/Extracted_dataset'

# Archive location for each dataset, keyed by dataset name
dataset_files = dict(
    diagnosis_train='/content/drive/MyDrive/Voice/ADReSSo21-diagnosis-train.tgz',
    progression_test='/content/drive/MyDrive/Voice/ADReSSo21-progression-test.tgz',
    progression_train='/content/drive/MyDrive/Voice/ADReSSo21-progression-train.tgz',
)

def create_directory_structure():
    """Create the extraction root folder and one subfolder per dataset."""
    print("Creating directory structure...")

    # Root folder first, then one child folder for every dataset key
    targets = [extracted_base_path]
    targets.extend(os.path.join(extracted_base_path, name) for name in dataset_files)

    for target in targets:
        os.makedirs(target, exist_ok=True)

    print(f"Directory structure created at: {extracted_base_path}")

def extract_dataset(tgz_path, extract_to_path, dataset_name):
    """
    Unpack one gzip-compressed tar archive into a target directory.

    Returns True on success. A missing archive or any error raised while
    extracting is reported on stdout and yields False.
    """
    print(f"Extracting {dataset_name}...")

    archive_present = os.path.exists(tgz_path)
    if not archive_present:
        print(f"ERROR: File not found - {tgz_path}")
        return False

    try:
        with tarfile.open(tgz_path, 'r:gz') as archive:
            archive.extractall(path=extract_to_path)
    except Exception as exc:
        print(f"ERROR extracting {dataset_name}: {str(exc)}")
        return False
    else:
        print(f"Successfully extracted {dataset_name} to {extract_to_path}")
        return True

def inspect_extracted_structure():
    """Print an indented tree of every extracted dataset folder (at most five files per directory)."""
    print("\n" + "="*60)
    print("INSPECTING EXTRACTED DATASET STRUCTURE")
    print("="*60)

    max_files_shown = 5

    for dataset_name in dataset_files:
        dataset_path = os.path.join(extracted_base_path, dataset_name)
        print(f"\n--- {dataset_name.upper()} ---")

        if not os.path.exists(dataset_path):
            print(f"Directory not found: {dataset_path}")
            continue

        for current_dir, _subdirs, names in os.walk(dataset_path):
            # Depth below the dataset folder decides the indentation
            depth = current_dir.replace(dataset_path, '').count(os.sep)
            print(f"{'  ' * depth}{os.path.basename(current_dir)}/")

            file_indent = '  ' * (depth + 1)
            for name in names[:max_files_shown]:
                print(f"{file_indent}{name}")

            hidden = len(names) - max_files_shown
            if hidden > 0:
                print(f"{file_indent}... and {hidden} more files")

def find_and_inspect_csv_files():
    """Collect segmentation CSV files and print a preview of the first three.

    Returns:
        A list of ``(dataset_name, directory, filename, full_path)`` tuples,
        one per ``.csv`` file found in a directory whose path contains
        ``'segmentation'``.
    """
    banner = "="*60
    print("\n" + banner)
    print("INSPECTING CSV FILES (TRANSCRIPTS)")
    print(banner)

    csv_files_found = []
    for dataset_name in dataset_files:
        dataset_path = os.path.join(extracted_base_path, dataset_name)

        for folder, _subdirs, names in os.walk(dataset_path):
            if 'segmentation' not in folder:
                continue
            csv_files_found.extend(
                (dataset_name, folder, name, os.path.join(folder, name))
                for name in names
                if name.endswith('.csv')
            )

    print(f"Found {len(csv_files_found)} CSV files total")

    text_keywords = ['transcript', 'text', 'utterance', 'speech']

    # Only the first three files are previewed to keep the output short.
    preview = csv_files_found[:3]
    for number, (dataset_name, directory, filename, full_path) in enumerate(preview, start=1):
        print(f"\n--- CSV File {number}: {filename} ---")
        print(f"Dataset: {dataset_name}")
        print(f"Directory: {directory}")

        try:
            df = pd.read_csv(full_path)
            print(f"Shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")
            print("First few rows:")
            print(df.head(2).to_string())

            # Show a short sample from any column whose name hints at text.
            for col in df.columns:
                lowered = col.lower()
                if not any(keyword in lowered for keyword in text_keywords):
                    continue
                print(f"\nSample content from '{col}':")
                for content in df[col].dropna().head(2).tolist():
                    print(f"  '{str(content)[:100]}...'")

        except Exception as e:
            print(f"Error reading CSV: {str(e)}")

    return csv_files_found

def generate_summary_report(csv_files_found):
    """Print per-dataset CSV counts, broken down by category folder.

    Args:
        csv_files_found: Iterable of ``(dataset_name, directory, filename,
            full_path)`` tuples as returned by ``find_and_inspect_csv_files``.

    The category is taken from the path components of ``directory``: the
    first of ``cn``, ``ad``, ``decline``, ``no_decline`` that appears as a
    folder name, otherwise ``unknown``.
    """
    print("\n" + "="*60)
    print("SUMMARY REPORT")
    print("="*60)

    known_categories = ('cn', 'ad', 'decline', 'no_decline')

    # Count files by dataset and category
    summary = {}
    for dataset_name, directory, _filename, _full_path in csv_files_found:
        dataset_summary = summary.setdefault(
            dataset_name, {'total_csv': 0, 'categories': {}}
        )
        dataset_summary['total_csv'] += 1

        # Compare whole path components rather than searching for '/cn/':
        # a directory yielded by os.walk has no trailing separator, so a file
        # stored directly in '.../segmentation/cn' would never match, and
        # Windows paths use '\\' instead of '/'.
        components = set(directory.replace('\\', '/').split('/'))
        category = next(
            (name for name in known_categories if name in components),
            'unknown',
        )

        counts = dataset_summary['categories']
        counts[category] = counts.get(category, 0) + 1

    # Print summary
    for dataset_name, data in summary.items():
        print(f"\n{dataset_name.upper()}:")
        print(f"  Total CSV files: {data['total_csv']}")
        print("  Categories:")
        for category, count in data['categories'].items():
            print(f"    {category}: {count} files")

    print(f"\nExtracted dataset location: {extracted_base_path}")
    print("Ready for Step 2: Translation to Persian")

# Main execution
def main():
    """Run Step 1: create folders, extract archives, inspect and summarise.

    Every archive in ``dataset_files`` is attempted; if any extraction fails
    the function reports the error and stops before the inspection steps.
    """
    rule = "="*60
    print("Starting Step 1: Data Acquisition and Preparation")
    print(rule)

    # Step 1.1: Create directory structure
    create_directory_structure()

    # Step 1.2: Extract all datasets (each one is tried, even after a failure)
    print("\nExtracting datasets...")
    outcomes = [
        extract_dataset(
            tgz_path,
            os.path.join(extracted_base_path, dataset_name),
            dataset_name,
        )
        for dataset_name, tgz_path in dataset_files.items()
    ]

    if not all(outcomes):
        print("\nERROR: Some extractions failed. Please check the file paths and try again.")
        return

    # Step 1.3: Inspect extracted structure
    inspect_extracted_structure()

    # Step 1.4 / 1.5: Locate the transcript CSVs, then summarise them
    generate_summary_report(find_and_inspect_csv_files())

    print("\n" + rule)
    print("STEP 1 COMPLETED SUCCESSFULLY!")
    print(rule)
    print("Next step: Manual translation of English transcripts to Persian")
    print("Note: The paper emphasizes using native Persian speakers for translation")

# Run the Step 1 pipeline only when this code is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting Step 1: Data Acquisition and Preparation
Creating directory structure...
Directory structure created at: /content/drive/MyDrive/Voice/Extracted_dataset

Extracting datasets...
Extracting diagnosis_train...
Successfully extracted diagnosis_train to /content/drive/MyDrive/Voice/Extracted_dataset/diagnosis_train
Extracting progression_test...
Successfully extracted progression_test to /content/drive/MyDrive/Voice/Extracted_dataset/progression_test
Extracting progression_train...
Successfully extracted progression_train to /content/drive/MyDrive/Voice/Extracted_dataset/progression_train

INSPECTING EXTRACTED DATASET STRUCTURE

--- DIAGNOSIS_TRAIN ---
diagnosis_train/
  ADReSSo21/
    diagnosis/
      README.md
      train/
        adresso-train-mmse-scores.csv
        segmentation/
          cn/
            adrso281.csv
         

In [4]:
import os
import pandas as pd
import numpy as np
import json
from datetime import datetime
import re

# Root folders used by every Step 2 function below.
# extracted_base_path: extracted dataset tree that is scanned for CSV files.
# persian_translated_path: output root for the translation template, the
# guidelines file and the generated Persian CSV files.
extracted_base_path = '/content/drive/MyDrive/Voice/Extracted_dataset'
persian_translated_path = '/content/drive/MyDrive/Voice/Persian_Translated_Dataset'

def create_persian_dataset_structure():
    """Mirror the extracted dataset's folder tree under the Persian root.

    Only directories are created; no files are copied.
    """
    print("Creating Persian dataset directory structure...")

    os.makedirs(persian_translated_path, exist_ok=True)

    for dataset_type in ('diagnosis_train', 'progression_test', 'progression_train'):
        source_root = os.path.join(extracted_base_path, dataset_type)
        mirror_root = os.path.join(persian_translated_path, dataset_type)

        # Recreate every directory found below the source dataset folder.
        for current_dir, _subdirs, _files in os.walk(source_root):
            suffix = os.path.relpath(current_dir, source_root)
            target_dir = mirror_root if suffix == '.' else os.path.join(mirror_root, suffix)
            os.makedirs(target_dir, exist_ok=True)

    print(f"Persian dataset structure created at: {persian_translated_path}")

# Whole-word pattern for common English function words and fillers.
_SPEECH_WORD_RE = re.compile(r"\b(?:the|and|is|are|this|that|uhm|uh)\b")


def find_transcript_columns(df):
    """Return the columns of ``df`` that likely hold transcript text.

    A column qualifies when its name contains one of the transcript-related
    keywords, or when it is a text column whose first five non-null values
    contain a common English function word or filler as a whole word.

    Args:
        df: DataFrame read from a dataset CSV file.

    Returns:
        List of column labels, in the DataFrame's column order.
    """
    name_keywords = ('transcript', 'text', 'utterance', 'speech', 'content', 'words')
    transcript_cols = []

    for col in df.columns:
        # Column labels are not guaranteed to be strings (e.g. integers).
        col_lower = str(col).lower()
        if any(keyword in col_lower for keyword in name_keywords):
            transcript_cols.append(col)
            continue

        series = df[col]
        # Accept both the classic object dtype and pandas string dtypes.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue

        sample_data = series.dropna().head(5)
        if len(sample_data) == 0:
            continue

        # Whole-word matching: a plain substring test would flag values such
        # as "Mother" (contains "the") or "list" (contains "is").
        sample_text = ' '.join(sample_data.astype(str)).lower()
        if _SPEECH_WORD_RE.search(sample_text):
            transcript_cols.append(col)

    return transcript_cols

def analyze_linguistic_features(text):
    """Describe the speech features of ``text`` that translation must keep.

    Args:
        text: One transcript cell. Anything that is not a string (including
            NaN and None) yields the empty result.

    Returns:
        Dict with keys ``has_pause_words`` (bool), ``has_repetitions`` (bool),
        ``pause_words`` (list of fillers found, in the order of the internal
        filler list), ``word_count`` (number of whitespace-separated tokens)
        and ``requires_translation`` (bool).
    """
    # NaN/None are not strings, so one isinstance test covers missing values
    # and avoids calling pd.isna on list-like input.
    if not isinstance(text, str):
        return {
            'has_pause_words': False,
            'has_repetitions': False,
            'pause_words': [],
            'word_count': 0,
            'requires_translation': False
        }

    text_lower = text.lower()

    # Common English pause words/fillers that need to be preserved.
    # Match them as whole words: a substring test reports "er" for "her",
    # "um" for "number" and "ah" for "yeah".
    pause_words = ['uhm', 'uh', 'um', 'er', 'ah', 'hmm', 'mm']
    tokens = set(re.findall(r"[a-z]+", text_lower))
    found_pause_words = [pw for pw in pause_words if pw in tokens]

    # Simple repetition detection (same word appearing consecutively)
    words = text.split()
    has_repetitions = any(
        current.lower() == previous.lower()
        for previous, current in zip(words, words[1:])
    )

    # Basic check for English content. NOTE(review): this is still a
    # substring test (e.g. "is" matches inside "dishes"); it is left unchanged
    # because tightening it would drop short English utterances from the
    # translation template.
    english_indicators = ['the', 'and', 'is', 'are', 'this', 'that', 'with', 'for', 'to', 'of', 'in', 'on']
    requires_translation = any(word in text_lower for word in english_indicators)

    return {
        'has_pause_words': len(found_pause_words) > 0,
        'has_repetitions': has_repetitions,
        'pause_words': found_pause_words,
        'word_count': len(words),
        'requires_translation': requires_translation
    }

def create_translation_template(csv_files_info):
    """Build one pending-translation record per translatable transcript cell.

    Args:
        csv_files_info: Iterable of ``(dataset_name, directory, filename,
            full_path)`` tuples describing the CSV files to scan.

    Returns:
        List of dicts, in file order and then row order, each holding the
        cell's location, its original text, its detected speech features and
        empty fields for the translator to fill in.
    """
    entries = []

    print("Creating translation template...")

    for dataset_name, directory, filename, full_path in csv_files_info:
        try:
            frame = pd.read_csv(full_path)
            text_columns = find_transcript_columns(frame)
            if not text_columns:
                continue

            print(f"Processing {filename} - Found transcript columns: {text_columns}")

            for row_label, record in frame.iterrows():
                for column in text_columns:
                    cell = record[column]
                    # Skip missing, non-text and blank cells.
                    if not (pd.notna(cell) and isinstance(cell, str) and cell.strip()):
                        continue

                    features = analyze_linguistic_features(cell)
                    if not features['requires_translation']:
                        continue

                    entries.append({
                        'dataset': dataset_name,
                        'file': filename,
                        'directory': directory,
                        'row_index': row_label,
                        'column': column,
                        'original_english': cell,
                        'persian_translation': '',  # To be filled
                        'has_pause_words': features['has_pause_words'],
                        'pause_words': features['pause_words'],
                        'has_repetitions': features['has_repetitions'],
                        'word_count': features['word_count'],
                        'translation_notes': '',
                        'translation_status': 'pending'
                    })

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

    return entries

def save_translation_template(translation_data):
    """Write the translation template as JSON and as CSV.

    Args:
        translation_data: List of template records to persist.

    Returns:
        Tuple ``(json_path, csv_path)`` of the two files written under
        ``persian_translated_path``.
    """
    json_path = os.path.join(persian_translated_path, 'translation_template.json')

    # JSON copy, kept human-readable (non-ASCII text is not escaped)
    with open(json_path, 'w', encoding='utf-8') as handle:
        json.dump(translation_data, handle, ensure_ascii=False, indent=2)

    # Tabular copy of the same records
    csv_path = os.path.join(persian_translated_path, 'translation_template.csv')
    pd.DataFrame(translation_data).to_csv(csv_path, index=False, encoding='utf-8')

    print("Translation template saved:")
    print(f"  JSON format: {json_path}")
    print(f"  CSV format: {csv_path}")

    return json_path, csv_path

def create_translation_guidelines():
    """Write the translator guidelines text file and return its path.

    The file is saved as ``translation_guidelines.txt`` under
    ``persian_translated_path``.
    """
    guidelines_path = os.path.join(persian_translated_path, 'translation_guidelines.txt')

    guideline_text = """
PERSIAN TRANSLATION GUIDELINES
==============================

CRITICAL REQUIREMENTS (Based on Research Paper):

1. PRESERVE ALL LINGUISTIC FEATURES:
   - Keep ALL pause words: "uhm", "uh", "um", "er", "ah" etc.
   - Translate to Persian equivalents: "اوم", "اه", "ام", "ار", "آه"
   - MAINTAIN repetitions exactly as they appear
   - PRESERVE all linguistic and syntactic errors
   - Keep hesitations and false starts

2. EXCLUDE NON-LINGUISTIC ANNOTATIONS:
   - Remove: [clears throat], [laughs], [coughs]
   - Remove: [inaudible], [unclear]
   - Keep only actual speech content

3. TRANSLATION PRINCIPLES:
   - Translate meaning while preserving linguistic characteristics
   - Maintain natural Persian flow where possible
   - Keep cultural context appropriate to Persian speakers
   - Preserve sentence structure patterns when possible

4. SPECIFIC EXAMPLES:
   English: "The woman is uhm she is washing dishes"
   Persian: "زن اوم او ظرف می‌شوید" (keeping the pause word and structure)

   English: "The the boy is running"
   Persian: "پسر پسر دارد می‌دود" (preserving repetition)

5. QUALITY CONTROL:
   - Double-check each translation
   - Ensure pause words are correctly placed
   - Verify repetitions are maintained
   - Check that errors are preserved appropriately

PAUSE WORD EQUIVALENTS:
- "uhm" → "اوم"
- "uh" → "اه"
- "um" → "ام"
- "er" → "ار"
- "ah" → "آه"
- "hmm" → "هوم"
"""

    with open(guidelines_path, 'w', encoding='utf-8') as out_file:
        out_file.write(guideline_text)

    print(f"Translation guidelines saved: {guidelines_path}")
    return guidelines_path

def load_and_apply_translations(template_path):
    """Write Persian copies of the source CSVs from a filled-in JSON template.

    Args:
        template_path: Path of the JSON translation template.

    For every source file named in the template, the original CSV is read,
    cells with a non-blank ``persian_translation`` are overwritten, and the
    result is saved as ``persian_<filename>`` in the mirrored folder under
    ``persian_translated_path``. A failure on one file is reported and the
    remaining files are still processed.
    """
    print("Loading translations and creating Persian dataset...")

    with open(template_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    # Bucket the template records by the source file they belong to.
    per_file = {}
    for entry in entries:
        file_key = (entry['dataset'], entry['file'], entry['directory'])
        per_file.setdefault(file_key, []).append(entry)

    created = 0

    for (dataset_name, filename, directory), file_entries in per_file.items():
        source_csv = os.path.join(directory, filename)

        try:
            translated = pd.read_csv(source_csv).copy()

            for item in file_entries:
                persian_text = item['persian_translation']
                if not persian_text.strip():
                    continue  # nothing entered for this cell yet
                translated.at[item['row_index'], item['column']] = persian_text

            # Same relative location as the source, below the Persian root.
            output_dir = os.path.join(
                persian_translated_path,
                os.path.relpath(directory, extracted_base_path),
            )
            os.makedirs(output_dir, exist_ok=True)

            output_csv = os.path.join(output_dir, f"persian_{filename}")
            translated.to_csv(output_csv, index=False, encoding='utf-8')

            created += 1
            print(f"Created Persian version: {output_csv}")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

    print(f"Successfully created {created} Persian dataset files")

def get_translation_statistics(csv_files_info):
    """Count how much transcript text the given CSV files contain.

    Args:
        csv_files_info: Iterable of ``(dataset_name, directory, filename,
            full_path)`` tuples describing the CSV files to scan.

    Returns:
        Dict with ``total_files_with_transcripts``, ``total_entries``,
        ``entries_needing_translation`` and ``estimated_total_words``.
        A file that cannot be processed is reported and skipped; anything
        counted from it before the failure stays in the totals.
    """
    total_entries = 0
    entries_needing_translation = 0
    total_words = 0
    files_with_transcripts = 0

    for dataset_name, directory, filename, full_path in csv_files_info:
        try:
            df = pd.read_csv(full_path)
            transcript_cols = find_transcript_columns(df)

            if transcript_cols:
                files_with_transcripts += 1
                for col in transcript_cols:
                    for text in df[col].dropna():
                        if isinstance(text, str) and text.strip():
                            total_entries += 1
                            features = analyze_linguistic_features(text)
                            if features['requires_translation']:
                                entries_needing_translation += 1
                                total_words += features['word_count']
        except Exception as e:
            # Report the failure instead of hiding it: a bare ``except`` also
            # swallows KeyboardInterrupt and makes all-zero statistics
            # impossible to diagnose.
            print(f"Error processing {filename}: {str(e)}")
            continue

    return {
        'total_files_with_transcripts': files_with_transcripts,
        'total_entries': total_entries,
        'entries_needing_translation': entries_needing_translation,
        'estimated_total_words': total_words
    }

# Main execution function
def main():
    """Run the Step 2 setup: scan CSVs, build the template, write guidelines.

    Returns:
        Tuple ``(template_path, csv_template_path, guidelines_path)`` of the
        JSON template, its CSV copy and the guidelines text file.
    """
    print("Starting Step 2: Translation to Persian")
    print("="*60)

    # Step 2.1: Create Persian dataset structure
    create_persian_dataset_structure()

    # Step 2.2: Find all CSV files from Step 1
    print("\nScanning for CSV files...")
    csv_files_info = []

    for dataset_name in ['diagnosis_train', 'progression_test', 'progression_train']:
        dataset_path = os.path.join(extracted_base_path, dataset_name)

        for root, dirs, files in os.walk(dataset_path):
            if 'segmentation' in root:
                for file in files:
                    if file.endswith('.csv'):
                        full_path = os.path.join(root, file)
                        csv_files_info.append((dataset_name, root, file, full_path))

    print(f"Found {len(csv_files_info)} CSV files to analyze")

    # Step 2.3: Get translation statistics
    stats = get_translation_statistics(csv_files_info)
    print(f"\nTranslation Statistics:")
    print(f"  Files with transcripts: {stats['total_files_with_transcripts']}")
    print(f"  Total text entries: {stats['total_entries']}")
    print(f"  Entries needing translation: {stats['entries_needing_translation']}")
    print(f"  Estimated total words: {stats['estimated_total_words']}")

    # Step 2.4: Create translation template
    translation_data = create_translation_template(csv_files_info)

    # Step 2.5: Save translation template
    template_path, csv_template_path = save_translation_template(translation_data)

    # Step 2.6: Create translation guidelines
    guidelines_path = create_translation_guidelines()

    print("\n" + "="*60)
    print("STEP 2 SETUP COMPLETED!")
    print("="*60)
    print(f"Found {len(translation_data)} text entries requiring translation")
    print(f"\nNext Actions Required:")
    print(f"1. Review translation guidelines: {guidelines_path}")
    # Point the translator at the JSON template: it is the only file that
    # apply_completed_translations() reads back, so edits made in the CSV
    # copy would never be applied.
    print(f"2. Open translation template: {template_path}")
    print(f"3. Fill in 'persian_translation' column for each entry")
    print(f"4. Save the completed translations")
    # The follow-up function is apply_completed_translations(); there is no
    # apply_translations() in this notebook.
    print(f"5. Run the apply_completed_translations() function")

    print(f"\nIMPORTANT REMINDERS:")
    print(f"- Preserve ALL pause words (uhm, uh, etc.)")
    print(f"- Keep repetitions exactly as they appear")
    print(f"- Maintain linguistic errors and hesitations")
    print(f"- Use native Persian speaker expertise for accuracy")

    return template_path, csv_template_path, guidelines_path

def apply_completed_translations():
    """Build the Persian dataset from the filled-in JSON template.

    Intended to be run once the manual translations are done; prints a hint
    instead when the template file does not exist yet.
    """
    template_path = os.path.join(persian_translated_path, 'translation_template.json')

    if not os.path.exists(template_path):
        print("Translation template not found. Please complete Step 2 setup first.")
        return

    load_and_apply_translations(template_path)
    print("\nPersian dataset creation completed!")
    print("Ready for Step 3: Data Preprocessing")

# Run the Step 2 setup only when this code is executed as the main module,
# then tell the user which file to translate.
if __name__ == "__main__":
    template_path, csv_template_path, guidelines_path = main()
    print(f"\n{'='*60}")
    print("READY FOR TRANSLATION!")
    print(f"{'='*60}")
    print(f"Please open this file to start translating:")
    # The JSON template is the file apply_completed_translations() loads;
    # translations typed into the CSV copy would not be picked up.
    print(f"{template_path}")
    print(f"\nAfter completing translations, run:")
    print(f"apply_completed_translations()")

# Quick function to check translation progress
def check_translation_progress(template_path=None):
    """Report how many template entries already have a Persian translation.

    Args:
        template_path: Optional path of the JSON template. Defaults to
            ``translation_template.json`` under ``persian_translated_path``.

    Returns:
        Tuple ``(completed, total)``; ``(0, 0)`` when the template file does
        not exist or contains no entries.
    """
    if template_path is None:
        template_path = os.path.join(persian_translated_path, 'translation_template.json')

    if not os.path.exists(template_path):
        print("Translation template not found. Run main() first.")
        return 0, 0

    with open(template_path, 'r', encoding='utf-8') as f:
        translation_data = json.load(f)

    total = len(translation_data)
    # A missing or null translation counts as "not completed" rather than
    # raising on ``.strip()``.
    completed = sum(
        1 for entry in translation_data
        if str(entry.get('persian_translation') or '').strip()
    )

    # An empty template must not divide by zero.
    percent = completed / total * 100 if total else 0.0
    print(f"Translation Progress: {completed}/{total} ({percent:.1f}%)")
    return completed, total

Starting Step 2: Translation to Persian
Creating Persian dataset directory structure...
Persian dataset structure created at: /content/drive/MyDrive/Voice/Persian_Translated_Dataset

Scanning for CSV files...
Found 228 CSV files to analyze

Translation Statistics:
  Files with transcripts: 0
  Total text entries: 0
  Entries needing translation: 0
  Estimated total words: 0
Creating translation template...
Translation template saved:
  JSON format: /content/drive/MyDrive/Voice/Persian_Translated_Dataset/translation_template.json
  CSV format: /content/drive/MyDrive/Voice/Persian_Translated_Dataset/translation_template.csv
Translation guidelines saved: /content/drive/MyDrive/Voice/Persian_Translated_Dataset/translation_guidelines.txt

STEP 2 SETUP COMPLETED!
Found 0 text entries requiring translation

Next Actions Required:
1. Review translation guidelines: /content/drive/MyDrive/Voice/Persian_Translated_Dataset/translation_guidelines.txt
2. Open translation template: /content/drive/MyD