In [None]:
import os
import re
import pandas as pd
import csv

In [None]:
pip install pylangacq



In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory: later cells list and read .cha files relative to the current directory.
os.chdir("/content/drive/MyDrive/OneDrive/DementiaBank/Pitt/Pitt/Dementia/sentence")

In [None]:
# Define the folder containing .cha files
# NOTE(review): no cell visible here reads `folder_path`; process_cha_files() scans the
# current working directory — confirm whether this variable is still needed.
folder_path = "./cha_files"

In [None]:
import os
import re
import pandas as pd
import pylangacq

# Function to remove illegal characters for Excel
def remove_illegal_characters(text):
    """
    Strip the control characters that cannot be written to an Excel sheet.

    Every character matched by the pattern below (ASCII control codes 0-8,
    11-12 and 14-31, i.e. everything below space except tab, line feed and
    carriage return) is deleted; the rest of ``text`` is returned unchanged.
    """
    return re.sub(r'[\000-\010]|[\013-\014]|[\016-\037]', "", text)

# Function to clean text and extract Time_Start and Time_End
def clean_text_and_extract_time(text):
    """
    Split an utterance into its text and its time stamp.

    Returns a ``(text, time_start, time_end)`` tuple. The two times are the
    digit strings of the first 0x15-delimited ``<start>_<end>`` marker in
    ``text``; every such marker is removed and the remaining text is stripped.
    When no marker is present, ``text`` is returned untouched and both times
    are ``None``.
    """
    time_pattern = r'\x15(\d+)_(\d+)\x15'  # Pattern to match time stamps
    found = re.search(time_pattern, text)

    # Guard clause: nothing to extract, hand the text back as-is (not stripped).
    if found is None:
        return text, None, None

    begin, finish = found.group(1), found.group(2)
    without_stamps = re.sub(time_pattern, '', text).strip()
    return without_stamps, begin, finish

# Function to count annotations based on conditions
def count_annotations(text):
    """
    Tally the annotation markers that occur in one utterance.

    Returns a dict mapping each category label to the number of
    non-overlapping matches of its regular expression in ``text``. The keys
    always appear in the order listed below.
    """
    marker_regexes = (
        ("Incomplete sentences", r"\+\.\.\."),  # trailing off: +...
        ("Grammatical errors", r"\[\+ gram\]"),  # grammatical error code: [+ gram]
        ("filler words", r"@fp|&-"),  # fillers: @fp or &-
        ("repetitions", r"\[/\]"),  # repetition marker: [/]
    )

    tally = {}
    for label, regex in marker_regexes:
        tally[label] = sum(1 for _ in re.finditer(regex, text))
    return tally

In [None]:
# Define word groups for analysis
# Each key is the column label of a group; each value lists the words that make up the group.
word_groups = {
    "pencil": ["pencil"],
    "tree": ["tree"],
    "child, hospital": ["child", "hospital"],
    "cold, winter": ["cold", "winter"],
    "chair, doctor, sit": ["chair", "doctor", "sit"],
    "bureau, open, drawer": ["bureau", "open", "drawer"]
}

# Function to update INV-related sentences
def update_inv_sentences(file_name, utterances):
    """
    Collect, per word group, the INV utterances that contain every word of the group.

    Parameters
    ----------
    file_name : str
        Name of the transcript being processed. Not used by the function;
        kept so existing callers keep working.
    utterances : iterable
        Objects exposing a ``participant`` attribute and a ``tiers`` mapping
        keyed by participant code.

    Returns
    -------
    dict
        One entry per key of ``word_groups``; each value is the list of
        matching INV sentences (empty list when none matched). Matching is a
        case-sensitive substring test (``word in sentence``).
    """
    inv_sentences = {key: [] for key in word_groups}  # Collect INV sentences

    for utterance in utterances:
        if utterance.participant != 'INV':
            continue

        raw_text = utterance.tiers.get('INV', '')
        # Drop the \x15<start>_<end>\x15 time marker before storing the sentence.
        # Otherwise the Excel cleaning step removes only the \x15 control bytes
        # and leaves residue such as "1234_5678" inside the INV sentence columns
        # (PAR text already has its marker stripped).
        sentence = re.sub(r'\x15\d+_\d+\x15', '', raw_text).strip()

        for group, words in word_groups.items():
            if all(word in sentence for word in words):
                inv_sentences[group].append(sentence)

    return inv_sentences

# Function to process utterances and extract required metrics
def extract_annotations_from_utterances(utterances, file_name, inv_sentences):
    """
    Build one result row per PAR utterance of a transcript.

    Parameters
    ----------
    utterances : iterable
        Objects exposing a ``participant`` attribute and a ``tiers`` mapping
        keyed by participant code.
    file_name : str
        Stored in the ``file_name`` column of every row.
    inv_sentences : dict
        Word-group label -> list of INV sentences for this file. A missing or
        empty entry is reported as ``"N/A"``.

    Returns
    -------
    list of dict
        Each row holds the file name, participant, cleaned text, ``Time_Start``
        and ``Time_End``, the annotation counts, and for every word group a
        ``PAR_<group>_Count`` and an ``INV_<group>_Sentences`` column.
        The PAR counts use ``str.count``, so they are case-sensitive substring
        counts (e.g. "sit" is also counted inside "sitting").
    """
    # The INV columns are identical for every row of a file, so join them once
    # here instead of once per utterance and group.
    inv_columns = {}
    for group in word_groups:
        sentences = inv_sentences.get(group)
        inv_columns[group] = "; ".join(sentences) if sentences else "N/A"

    all_annotations = []

    for utterance in utterances:
        if utterance.participant != 'PAR':
            continue

        raw_text = utterance.tiers.get('PAR', '')
        cleaned_text, time_start, time_end = clean_text_and_extract_time(raw_text)

        # Compile the row for this utterance
        row = {
            "file_name": file_name,
            "participant": utterance.participant,
            "utterance_text": cleaned_text,
            "Time_Start": time_start,
            "Time_End": time_end,
            **count_annotations(cleaned_text)
        }

        # Keep the PAR/INV columns interleaved per group so the column order is unchanged.
        for group, words in word_groups.items():
            row[f"PAR_{group}_Count"] = sum(cleaned_text.count(word) for word in words)
            row[f"INV_{group}_Sentences"] = inv_columns[group]

        all_annotations.append(row)

    return all_annotations

# Function to clean data before saving to Excel
def clean_data_for_excel(data):
    """
    Remove Excel-illegal control characters from the text columns of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient reassignment.
    """
    for column in data.columns:
        series = data[column]
        # Text may be stored as generic "object" or as pandas' dedicated string
        # dtype; comparing against "object" alone would skip the latter and let
        # illegal characters through to the Excel writer.
        holds_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if holds_text:
            # Non-string cells (None, NaN, numbers) pass through untouched.
            data[column] = series.apply(
                lambda x: remove_illegal_characters(x) if isinstance(x, str) else x
            )
    return data

# Process all .cha files in a folder (the current folder by default)
def process_cha_files(folder=".", output_file="Sentence_with_Timestamps_and_PAR_INV_Counts.xlsx"):
    """
    Parse every .cha transcript in ``folder`` and export the PAR annotations to Excel.

    Parameters
    ----------
    folder : str, optional
        Directory scanned for files ending in ``.cha``. Defaults to the
        current working directory.
    output_file : str, optional
        Path of the Excel workbook to write.

    Returns
    -------
    pandas.DataFrame
        The cleaned table that was written to ``output_file``, so later cells
        can inspect it without re-reading the workbook.
    """
    all_annotations = []

    # os.listdir returns names in arbitrary order; sort so the row order is
    # the same on every run.
    for file_name in sorted(os.listdir(folder)):
        if not file_name.endswith(".cha"):
            continue

        print(f"Processing file: {file_name}")
        corpus = pylangacq.read_chat(os.path.join(folder, file_name))
        utterances = corpus.utterances()

        # Update INV-related data
        inv_sentences = update_inv_sentences(file_name, utterances)

        # Extract annotations with INV-related updates
        all_annotations.extend(
            extract_annotations_from_utterances(utterances, file_name, inv_sentences)
        )

    # Convert to a DataFrame and remove characters Excel cannot store
    df = clean_data_for_excel(pd.DataFrame(all_annotations))

    # Write to Excel
    df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Annotations with timestamps and PAR INV counts saved to {output_file}")
    return df

# Run the processing function and keep the table at notebook level for later cells
df = process_cha_files()

Processing file: 164-3.cha
Processing file: 172-3.cha
Processing file: 067-1.cha
Processing file: 057-2.cha
Processing file: 212-1.cha
Processing file: 049-3.cha
Processing file: 183-0.cha
Processing file: 283-1.cha
Processing file: 342-0.cha
Processing file: 094-1.cha
Processing file: 125-0.cha
Processing file: 216-1.cha
Processing file: 010-0.cha
Processing file: 319-0.cha
Processing file: 338-0.cha
Processing file: 058-0.cha
Processing file: 283-0.cha
Processing file: 051-1.cha
Processing file: 010-2.cha
Processing file: 252-2.cha
Processing file: 035-1.cha
Processing file: 183-1.cha
Processing file: 046-0.cha
Processing file: 181-0.cha
Processing file: 010-4.cha
Processing file: 341-0.cha
Processing file: 154-1.cha
Processing file: 157-2.cha
Processing file: 257-2.cha
Processing file: 181-2.cha
Processing file: 051-2.cha
Processing file: 043-0.cha
Processing file: 213-1.cha
Processing file: 164-1.cha
Processing file: 237-2.cha
Processing file: 157-1.cha
Processing file: 306-0.cha
P

In [None]:
# Preview the first rows of the annotations table.
# NOTE(review): this needs a notebook-level `df`; confirm one is defined before this cell runs.
df.head()

Unnamed: 0,file_name,participant,utterance_text,Incomplete sentences,Grammatical errors,filler words,repetitions
0,164-3.cha,PAR,hmhunh . [+ exc],0,0,0,0
1,164-3.cha,PAR,I'm writing a letter . [+ exc],0,0,0,0
2,164-3.cha,PAR,sure . [+ exc],0,0,0,0
3,164-3.cha,PAR,I need a pencil .,0,0,0,0
4,164-3.cha,PAR,the tree (i)s [/] (.) is gettin(g) (.) taller .,0,0,0,1
