## Major Part-1

In [23]:
import os
import re
import pandas as pd
import csv

In [24]:
pip install pylangacq



In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
os.chdir("/content/drive/MyDrive/OneDrive/DementiaBank/Pitt/Pitt/Dementia/sentence")

In [27]:
# Define the folder containing .cha files
folder_path = "./cha_files"

In [28]:
import re

def major_annotations(text):
    major_patterns = {
        "@fp": "Filled pause",
        "@n": "Neologism",
        r"\[:: [^\]]+\]": "Real-word reformulation",  # updated: replaced [:: word] with flexible text matching
        r"\[\*\]": "Error in speech",
        r"\[paraling\]": "Paralinguistic features1",
        # removed: r"\[= explanation\]" — this was a fixed string, replaced below
        r"\[= [^\]]+\]": "Word explanation",  # new: supports patterns like [= closet]
        r"\w*\([^()]+\)\w*": "Noncompletion of word",  # updated: generalized pattern for (text)
        "xxx": "Unintelligible speech",
        r"&-": "Fillers",
        r"&=0\w+": "Omitted word",  # updated: general pattern for &=0word
        r"\^": "Pause within word",
        r"\(\.\)": "Short pause",  # new: added short pause
        r"\(\.\.\)": "Medium pause",  # new: added medium pause
        r"\(\.\.\.\)": "Long pause",  # updated: replaced ... with (...) convention
        r"\+//\?": "Self-interrupted question",
        r"< aft >": "Occurrence after",
        r"< bef >": "Occurrence before",
        r"\+\.\.\.": "Trailing off",
        r"\+\/\.": "Interruption",
        r"&\{l=\*\.\.\.&\}l=": "Long vocal event",
        r"&\{n=\*\.\.\.&\}n=": "Long nonvocal event",
        r"\[\^ [^\]]+\]": "Complex local event",  # updated: generalized to match any content
        r"\[=! [^\]]+\]": "Paralinguistic features2",  # updated: generalized to match [=! cries]
        r"\[# [^\]]+\]": "Duration",  # updated: generalized from [# time]
        r"\[: [^\]]+\]": "Standardized replacement",  # updated: generalized for [: text]
        r"\[/\]": "Repetition",
        r"\[//\]": "Retracing with correction",
        r"\[///\]": "Full reformulation",
        r"\[/-\]": "False start without retracing",
        r"@i": "Interjection",
        r"@o": "Onomatopoeia",
        r"@p": "Phonologically consistent form",
        r"@si": "Singing",
        r"@wp": "Word play",
        r"@x": "Excluded words",
        r"&\+": "Phonological fragments",
        r"&~": "Nonwords",
        r"↫": "Repeated segment",
        r"\[=\? [^\]]+\]": "Alternative transcription",  # updated: supports actual [=? one too]
        "www": "Untranscribed material",
        r"\[\?\]": "Uncertain word",
        r"\[\+ gram\]": "Grammatical error",
        r"\[\+ jar\]": "Jar-related annotation",
        r"\[\+ es\]": "ES-related annotation",
        r"\[\+ per\]": "PER-related annotation",
        r"\[\+ cir\]": "CIR-related annotation",
        r"&=": "Simple Event/Paralinguistic3",
        r"\+<": "Lazy overlap",
        r"‡": "Initial interactional marker",
        r"„": "Blocking",
        r"@q": "Metalinguistic use"
    }



    major_found = {}
    for pattern, description in major_patterns.items():
        matches = re.findall(pattern, text)
        major_found[description] = len(matches)

    return major_found


In [None]:
import pylangacq
# Function to clean time marks but retain annotations
def clean_text(text):
    """ Clean time marks but retain annotations for detection. """
    text = re.sub(r'\x15\d+_\d+\x15', '', text)  # Remove time marks
    return text.strip()
# Function to process utterances and extract annotations
def extract_annotations_from_utterances(utterances, file_name):
    all_annotations = []

    for utterance in utterances:
        if utterance.participant == 'PAR':
            raw_text = utterance.tiers.get('PAR', '')
            cleaned_text = clean_text(raw_text)  # Clean time marks

            # Detect annotations
            annotations = major_annotations(cleaned_text)

            # Compile the row for this utterance
            row = {
                "file_name": file_name,
                "participant": utterance.participant,
                "utterance_text": cleaned_text,
                **annotations
            }
            all_annotations.append(row)

    return all_annotations

# Process all .cha files in the current folder
def process_cha_files():
    all_annotations = []

    for file_name in os.listdir("."):
        if file_name.endswith(".cha"):
            print(f"Processing file: {file_name}")
            corpus = pylangacq.read_chat(file_name)
            utterances = corpus.utterances()

            # Extract annotations for this file
            file_annotations = extract_annotations_from_utterances(utterances, file_name)
            all_annotations.extend(file_annotations)

    # Convert to a DataFrame
    global df  # Declare df as global
    df = pd.DataFrame(all_annotations)

    # Write to Excel
    output_file = "annotations_major_sentence.xlsx"
    df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Annotations extracted and saved to {output_file}")

# Run the processing function
process_cha_files()


Processing file: 164-3.cha
Processing file: 172-3.cha
Processing file: 067-1.cha
Processing file: 057-2.cha
Processing file: 212-1.cha
Processing file: 049-3.cha
Processing file: 183-0.cha
Processing file: 283-1.cha
Processing file: 342-0.cha
Processing file: 094-1.cha
Processing file: 125-0.cha
Processing file: 216-1.cha
Processing file: 010-0.cha
Processing file: 319-0.cha
Processing file: 338-0.cha
Processing file: 058-0.cha
Processing file: 283-0.cha
Processing file: 051-1.cha
Processing file: 010-2.cha
Processing file: 252-2.cha
Processing file: 035-1.cha
Processing file: 183-1.cha
Processing file: 046-0.cha
Processing file: 181-0.cha
Processing file: 010-4.cha
Processing file: 341-0.cha
Processing file: 154-1.cha
Processing file: 157-2.cha
Processing file: 257-2.cha
Processing file: 181-2.cha
Processing file: 051-2.cha
Processing file: 043-0.cha
Processing file: 213-1.cha
Processing file: 164-1.cha
Processing file: 237-2.cha
Processing file: 157-1.cha
Processing file: 306-0.cha
P

In [None]:
df.head()