In [7]:
import os
import re

In [25]:
#Defining functions to create an output folder,extract header from CHA file, itirate through files
def create_output_folder(output_folder): 
    if not os.path.exists(output_folder): 
        os.makedirs(output_folder)
        
def process_cha(cha_file_path, output_folder):
    # Step 1: Read the content of the CHA file
    with open(cha_file_path, "r", encoding="utf-8") as cha_file:
        cha_content = cha_file.read()

    # Step 2: Extract header information from the CHA content
    max_words = 0
    header_info = None

    for line in cha_content.split('\n'):
        if line.startswith('@ID:'):
            fields = line.split('|')
            total_words = len([word for word in fields if word.isalnum() or word.isalpha()])
            
            if total_words > max_words:
                max_words = total_words
                header_info = fields

    
    # Step 3: Define the participant ID without the .cha extension & create the output file
    participant_id = os.path.splitext(os.path.basename(cha_file_path))[0]
    txt_file_path = os.path.join(output_folder, f"{participant_id}.txt")

    with open(txt_file_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(
            f"Participant's ID: {participant_id}\n"
            f"Age: {header_info[3]}\n"
            f"Sex: {header_info[4]}\n"
            f"Diagnosis: {header_info[5]}\n"
            f"Moca: {header_info[8]}\n\n"
            )

        # Step 4: Separate inv and par lines
        speaker = ""
        all_lines = []

        for line in cha_content.split('\n'):
            if line.startswith('*INV:'):
                speaker = "INV"
                line = line[len('*INV:'):]
            elif line.startswith("*PAR:"):
                speaker = "PAR"
                line = line[len('*PAR:'):]
            elif line.startswith("@G:"):
                speaker = "TASK"
                line = line[len('@G:'):]
            else:
                continue

            # Clean the text in the line
            line = re.sub(r'[0-9#%*\[\]&<>_]', '', line)

            all_lines.append((speaker, line))

        for speaker, line in all_lines:
            txt_file.write(f"{speaker}:{line}\n")

    print(f"Done processing {os.path.basename(cha_file_path)}, saved to {os.path.basename(txt_file_path)}")

def process_all_cha_files(input_folder, output_folder): 
    
    cha_files = [file for file in os.listdir(input_folder) if file.endswith('.cha')]
    
    for i in cha_files: 
        cha_file_path = os.path.join(input_folder, i)
        create_output_folder(output_folder)
        
        process_cha(cha_file_path, output_folder)
    
    print(f"FINISHED CLEANING ALL CHA FILES IN {os.path.basename(input_folder)} ")

In [27]:
corpus = 'Baycrest_PPA'

input_folder = os.path.join(os.getcwd(),f"Corpus/{corpus}")
output_folder = os.path.join(os.getcwd(),f"Corpus_TXT/{corpus}")

# Process all CHA files
process_all_cha_files(input_folder, output_folder)

Done processing Baycrest10285.cha, saved to Baycrest10285.txt
Done processing Baycrest13188.cha, saved to Baycrest13188.txt
Done processing Baycrest11014.cha, saved to Baycrest11014.txt
Done processing Baycrest12828.cha, saved to Baycrest12828.txt
Done processing Baycrest13074.cha, saved to Baycrest13074.txt
Done processing Baycrest12829.cha, saved to Baycrest12829.txt
Done processing Baycrest12756.cha, saved to Baycrest12756.txt
Done processing Baycrest13196.cha, saved to Baycrest13196.txt
Done processing Baycrest13156.cha, saved to Baycrest13156.txt
Done processing Baycrest13187.cha, saved to Baycrest13187.txt
FINISHED CLEANING ALL CHA FILES IN Baycrest_PPA 
