## Minor Part- From Ash Code (Sentence)


In [None]:
import os
import re
import pandas as pd
import csv

In [None]:
pip install pylangacq



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
os.chdir("/content/drive/MyDrive/OneDrive/DementiaBank/Pitt/Pitt/Dementia/sentence")

In [None]:
# Define the folder containing .cha files
folder_path = "./cha_files"

In [None]:
# Function to detect annotations
def minor_annotations(text):
    minor_patterns = {
        "@t": "Test word",
        r"\+\!\?": "Emphatic question",
        r"\[\!\]": "Stress",
        r"\[\!\!\]": "Contrastive stress",
        "@b": "Babbling sounds",
        "@c": "Child-invented word",
        "@d": "Dialect form",
        "@f": "Family-specific form",
        "@g": "General special form",
        "@k": "Multiple letters",
        "@l": "Single letter",
        "@q": "Metalinguistic use",
        r"@s:": "Second-language form",
        r"@s\$n": "Second-language noun",
        r"@sl": "Signed language",
        r"@sas": "Sign and speech",
        r"@z:": "User-defined code",
        "yyy": "Phonological coding",
        r"‡": "Initial interactional marker",
        r"„": "Final interactional marker",
        r"≠": "Blocking",
        r"&\*": "Interposed word",
        r'\+"\./': "Quotation follows",
        r"\$sc=n": "Scope on main tier",
        r"[↓↑]": "Tone direction",
        r"\+\^": "Quick uptake",
        r"\+,": "Self-completion",
        r"\+\+": "Other completion",
        r'\+"\\"': "Quoted utterance start",
        r'\+”/\.\'': "Quotation follows",
        r'\+"\."': "Quotation precedes",
        r"\[% text\]": "Comment on main line",
        r"\[>\]": "Overlap follows",
        r"\[<\]": "Overlap precedes",
        r"\+<": "Lazy overlap",
        r"\[/\?\]": "Unclear retracing type",
        r"\[\+exc\], \[e\]": "Excluded material",
        r"\[\^c\]": "Clause delimiter",
        r"\[- text\]": "Language precodes",
        r"\[\+ text\]": "Postcodes",
        r"\[\+ bch\]": "Excluded utterance",
        r"\[\+ trn\]": "Included utterance"
    }

    minor_found = {}
    for pattern, description in minor_patterns.items():
        matches = re.findall(pattern, text)
        minor_found[description] = len(matches)

    return  minor_found

In [None]:
import pylangacq
# Function to clean time marks but retain annotations
def clean_text(text):
    """ Clean time marks but retain annotations for detection. """
    text = re.sub(r'\x15\d+_\d+\x15', '', text)  # Remove time marks
    return text.strip()
# Function to process utterances and extract annotations
def extract_annotations_from_utterances(utterances, file_name):
    all_annotations = []

    for utterance in utterances:
        if utterance.participant == 'PAR':
            raw_text = utterance.tiers.get('PAR', '')
            cleaned_text = clean_text(raw_text)  # Clean time marks

            # Detect annotations
            annotations = detect_annotations(cleaned_text)

            # Compile the row for this utterance
            row = {
                "file_name": file_name,
                "participant": utterance.participant,
                "utterance_text": cleaned_text,
                **annotations
            }
            all_annotations.append(row)

    return all_annotations

# Process all .cha files in the current folder
# Process all .cha files in the current folder
def process_cha_files():
    all_annotations = []

    for file_name in os.listdir("."):
        if file_name.endswith(".cha"):
            print(f"Processing file: {file_name}")
            corpus = pylangacq.read_chat(file_name)
            utterances = corpus.utterances()

            # Extract annotations for this file
            file_annotations = extract_annotations_from_utterances(utterances, file_name)
            all_annotations.extend(file_annotations)

    # Convert to a DataFrame
    global df  # Declare df as global
    df = pd.DataFrame(all_annotations)

    # Write to Excel
    output_file = "annotations_minor_sentence.xlsx"
    df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Annotations extracted and saved to {output_file}")

# Run the processing function
process_cha_files()


Processing file: 164-3.cha
Processing file: 172-3.cha
Processing file: 067-1.cha
Processing file: 057-2.cha
Processing file: 212-1.cha
Processing file: 049-3.cha
Processing file: 183-0.cha
Processing file: 283-1.cha
Processing file: 342-0.cha
Processing file: 094-1.cha
Processing file: 125-0.cha
Processing file: 216-1.cha
Processing file: 010-0.cha
Processing file: 319-0.cha
Processing file: 338-0.cha
Processing file: 058-0.cha
Processing file: 283-0.cha
Processing file: 051-1.cha
Processing file: 010-2.cha
Processing file: 252-2.cha
Processing file: 035-1.cha
Processing file: 183-1.cha
Processing file: 046-0.cha
Processing file: 181-0.cha
Processing file: 010-4.cha
Processing file: 341-0.cha
Processing file: 154-1.cha
Processing file: 157-2.cha
Processing file: 257-2.cha
Processing file: 181-2.cha
Processing file: 051-2.cha
Processing file: 043-0.cha
Processing file: 213-1.cha
Processing file: 164-1.cha
Processing file: 237-2.cha
Processing file: 157-1.cha
Processing file: 306-0.cha
P

In [None]:
df.head()

Unnamed: 0,file_name,participant,utterance_text,Test word,Emphatic question,Stress,Contrastive stress,Babbling sounds,Child-invented word,Dialect form,...,Overlap follows,Overlap precedes,Lazy overlap,Unclear retracing type,Excluded material,Clause delimiter,Language precodes,Postcodes,Excluded utterance,Included utterance
0,164-3.cha,PAR,hmhunh . [+ exc],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,164-3.cha,PAR,I'm writing a letter . [+ exc],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,164-3.cha,PAR,sure . [+ exc],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,164-3.cha,PAR,I need a pencil .,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,164-3.cha,PAR,the tree (i)s [/] (.) is gettin(g) (.) taller .,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Major Code1

In [None]:
import os
import re
import pandas as pd

# Define paths
os.chdir("/content/drive/MyDrive/OneDrive/DementiaBank/Pitt/Pitt/Dementia/sentence")
folder_path = "/content/drive/MyDrive/OneDrive/DementiaBank/Pitt/Pitt/Dementia/sentence"

# Function to detect major annotations
def detect_major_annotations(text):
    major_patterns = {
        "@fp": "Filled pause",
        "@n": "Neologism",
        r"\[:: word\]": "Real-word reformulation",
        r"\[\*\]": "Error in speech",
        r"\[paraling\]": "Paralinguistic features",
        r"\[= explanation\]": "Explanation of a term",
        r"&=0word": "Omitted word",
        "xxx": "Unintelligible speech",
        r"&-": "Fillers",
        r"text\(text\)text": "Noncompletion of word",
        r"\^": "Pause within word",
        r"\.\.\.": "Pause duration",
        r"\+//\?": "Self-interrupted question",
        r"\:": "Lengthened syllable",
        r"< aft >": "Occurrence after",
        r"< bef >": "Occurrence before",
        r"\+\.\.\.": "Trailing off",
        r"\+\/\.": "Interruption",
        r"&\{l=\*\.\.\.&\}l=": "Long vocal event",
        r"&\{n=\*\.\.\.&\}n=": "Long nonvocal event",
        r"\[ \^ text \]": "Complex local event",
        r"\[=! text\]": "Paralinguistic features"
    }

    major_found = {}
    for pattern, description in major_patterns.items():
        matches = re.findall(pattern, text)
        major_found[description] = len(matches)

    return major_found

# Process files
def process_files():
    if not os.path.exists(folder_path):
        print(f"Error: Directory {folder_path} does not exist.")
        return

    if not any(file.endswith(".cha") for file in os.listdir(folder_path)):
        print(f"No .cha files found in {folder_path}.")
        return

    all_annotations = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".cha"):
            print(f"Processing file: {file_name}")
            try:
                with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8', errors='ignore') as file:
                    text = file.read()
                    detected = detect_major_annotations(text)

                    row = {
                        "File Name": file_name,
                        **detected
                    }
                    all_annotations.append(row)
            except Exception as e:
                print(f"Error processing file {file_name}: {e}")

    if not all_annotations:
        print("No annotations found.")
        return

    df2 = pd.DataFrame(all_annotations)
    output_file = "annotations_major_sentence.xlsx"
    df2.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Annotations saved to {output_file}")

# Run the function
process_files()


Processing file: 164-3.cha
Processing file: 172-3.cha
Processing file: 067-1.cha
Processing file: 057-2.cha
Processing file: 212-1.cha
Processing file: 049-3.cha
Processing file: 183-0.cha
Processing file: 283-1.cha
Processing file: 342-0.cha
Processing file: 094-1.cha
Processing file: 125-0.cha
Processing file: 216-1.cha
Processing file: 010-0.cha
Processing file: 319-0.cha
Processing file: 338-0.cha
Processing file: 058-0.cha
Processing file: 283-0.cha
Processing file: 051-1.cha
Processing file: 010-2.cha
Processing file: 252-2.cha
Processing file: 035-1.cha
Processing file: 183-1.cha
Processing file: 046-0.cha
Processing file: 181-0.cha
Processing file: 010-4.cha
Processing file: 341-0.cha
Processing file: 154-1.cha
Processing file: 157-2.cha
Processing file: 257-2.cha
Processing file: 181-2.cha
Processing file: 051-2.cha
Processing file: 043-0.cha
Processing file: 213-1.cha
Processing file: 164-1.cha
Processing file: 237-2.cha
Processing file: 157-1.cha
Processing file: 306-0.cha
P

In [None]:
df2

NameError: name 'df2' is not defined