Test feature: number of pauses that follow an article and precede a content word (Vincze et al., 2022), as a candidate addition to the feature set


In [None]:
# test function for one participant
import pandas as pd
import spacy

# Load the `en_core_web_sm` spaCy pipeline once at module level; the tagging
# helper inside debug_article_pause_contentword calls it for every word.
nlp = spacy.load("en_core_web_sm")

def debug_article_pause_contentword(word_timestamp_file, pause=0.15):
    """Count article -> content-word links that are separated by a long gap.

    Reads a CSV with at least the columns ``word``, ``start`` and ``end``,
    tags every word on its own with the module-level ``nlp`` pipeline, and
    walks the rows looking for two shapes:

    * ``DT`` directly followed by a content word, and
    * ``DT`` followed by one ``UH`` token and then a content word.

    In both shapes the gap is measured from the end of the article to the
    start of the content word (so in the second shape it spans the ``UH``
    token as well). A link counts as a match when that gap is strictly
    greater than ``pause``. Progress is printed along the way.

    Args:
        word_timestamp_file: Path of the CSV file to load.
        pause: Gap threshold, in the same unit as the ``start``/``end``
            columns.

    Returns:
        A tuple ``(matches, links, ratio)`` where ``ratio`` is
        ``matches / links``, or ``None`` when no link was found.
    """
    print(f"\nloading file: {word_timestamp_file}")
    df = pd.read_csv(word_timestamp_file)

    def tag_of(token):
        # Blank cells and non-string values (e.g. NaN) get no tag at all.
        if not isinstance(token, str) or not token.strip():
            return None
        return nlp(token)[0].tag_

    df["tag"] = df["word"].apply(tag_of)

    print("\nfirst few rows with tags:")
    print(df[["word", "start", "end", "tag"]].head(10))

    article_tags = {"DT"}
    content_tags = {"NN", "NNS", "NNP", "NNPS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "JJ", "JJR", "JJS"}
    filled_pauses = {"UH"}

    # Plain lists make the positional look-ahead below easier to read.
    words = df["word"].tolist()
    tags = df["tag"].tolist()
    starts = df["start"].tolist()
    ends = df["end"].tolist()
    n_rows = len(df)

    hits = 0
    links = 0

    print("\nchecking for DT ➝ pause ➝ content OR DT ➝ UH ➝ pause ➝ content")

    pos = 0
    while pos < n_rows - 1:
        if tags[pos] not in article_tags:
            pos += 1
            continue

        article = words[pos]
        target = pos + 1

        # An interjection right after the article is skipped over once.
        via_filler = tags[target] in filled_pauses
        if via_filler:
            target += 1
            if target >= n_rows:
                break

        target_tag = tags[target]
        if target_tag in content_tags:
            gap = starts[target] - ends[pos]
            if via_filler:
                print(f"→ '{article}' (DT) ➝ '{words[pos + 1]}' (UH) ➝ '{words[target]}' ({target_tag}) | Pause: {gap:.3f}s")
            else:
                print(f"→ '{article}' (DT) ➝ '{words[target]}' ({target_tag}) | Pause: {gap:.3f}s")
            links += 1
            if gap > pause:
                if via_filler:
                    print(f"matched DT ➝ UH ➝ pause ➝ content: '{article}' ➝ '{words[pos + 1]}' ➝ '{words[target]}'")
                else:
                    print(f"matched DT ➝ pause ➝ content: '{article}' ➝ '{words[target]}'")
                hits += 1

        # After a filler, resume at the word behind it; otherwise step by one.
        pos = target if via_filler else pos + 1

    print(f"\ntotal matches: {hits}")
    print(f"total DT ➝ content links: {links}")

    if links > 0:
        ratio = hits / links
        print(f"ratio: {ratio:.2f}")
    else:
        ratio = None
        print("ratio: N/A (no DT ➝ content links found)")

    return hits, links, ratio


In [None]:
# Run the article-pause check on a single example recording (subject 41 of the
# cookieTheft task). The input path is hard-coded to this location.
subject_id = "41"
timestamp_file = f"/Volumes/methlab/Students/Gila/word_timestamps/cookieTheft/google/timestamps/{subject_id}.csv"
debug_article_pause_contentword(timestamp_file)

In [None]:
import pandas as pd
import spacy

# Load the `en_core_web_sm` spaCy pipeline used by debug_article_pause_patterns.
# NOTE(review): this repeats the load from the earlier cell — presumably so this
# cell can be run on its own.
nlp = spacy.load("en_core_web_sm")

_PAUSE_TAGS = frozenset({"UH", "PAUSE"})
_CONTENT_TAGS = frozenset({
    "NN", "NNS", "NNP", "NNPS",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "JJ", "JJR", "JJS",
})


def _pos_category(tag):
    """Map a fine-grained tag to PAUSE, ARTICLE, CONTENT or OTHER."""
    if tag in _PAUSE_TAGS:
        return "PAUSE"
    if tag == "DT":
        return "ARTICLE"
    if tag in _CONTENT_TAGS:
        return "CONTENT"
    return "OTHER"


def _insert_pause_rows(df, pause_threshold):
    """Return *df* with a "[pause]" row spliced in at every long gap.

    A gap is the ``start`` of a row minus the ``end`` of the row before it;
    a pause row is inserted between the two rows when the gap is strictly
    greater than *pause_threshold*. The frame is returned untouched when no
    gap qualifies.
    """
    starts = df["start"].tolist()
    ends = df["end"].tolist()

    pieces = []
    last_cut = 0
    for pos, (prev_end, next_start) in enumerate(zip(ends, starts[1:]), start=1):
        if next_start - prev_end > pause_threshold:
            pieces.append(df.iloc[last_cut:pos])
            pieces.append(pd.DataFrame([{
                "word": "[pause]",
                "start": prev_end,
                "end": next_start,
                "pos": "PAUSE",
            }]))
            last_cut = pos

    if not pieces:
        return df

    pieces.append(df.iloc[last_cut:])
    # One concat for the whole frame instead of one per inserted pause.
    return pd.concat(pieces, ignore_index=True)


def debug_article_pause_patterns(word_timestamp_file, pause_threshold=0.15):
    """Count article/pause/content-word patterns in a word-timestamp CSV.

    The CSV needs the columns ``word``, ``start`` and ``end``. Every word is
    tagged on its own with the module-level ``nlp`` pipeline, explicit
    ``[pause]`` rows are inserted wherever the gap between two consecutive
    words exceeds ``pause_threshold``, and each row is reduced to one of the
    categories ARTICLE (``DT``), PAUSE (``UH`` or an inserted pause), CONTENT
    (noun, verb and adjective tags) or OTHER. The category sequence is then
    scanned for these patterns:

    * ARTICLE, PAUSE, CONTENT
    * ARTICLE, PAUSE, ARTICLE, CONTENT
    * ARTICLE, PAUSE, ARTICLE, PAUSE, CONTENT

    Intermediate tables and every match are printed.

    NOTE(review): the denominator only counts ARTICLE rows that are
    *immediately* followed by CONTENT after pause rows have been inserted, so
    links interrupted by a pause are not part of it and the ratio can exceed
    1. Confirm this is the intended normalisation.
    NOTE(review): matches may overlap -- the tail of the five-element pattern
    is itself an ARTICLE, PAUSE, CONTENT match and is counted again. Confirm
    this is intended.

    Args:
        word_timestamp_file: Path of the CSV file to load.
        pause_threshold: Minimum gap (exclusive), in the unit of the
            ``start``/``end`` columns, that is turned into a pause row.

    Returns:
        A tuple ``(match_count, total_article_content, ratio)`` where
        ``ratio`` is ``match_count / total_article_content`` or ``None`` when
        there is no direct ARTICLE -> CONTENT transition.
    """
    print(f"\nloading file: {word_timestamp_file}")
    df = pd.read_csv(word_timestamp_file)

    # POS-tag row by row
    def get_pos(word):
        # Missing cells arrive as NaN (a float); they cannot be tagged.
        # Checking explicitly replaces a bare ``except`` that also hid
        # genuine pipeline errors and keyboard interrupts.
        if not isinstance(word, str):
            return "X"
        if word == "[pause]":
            return "PAUSE"
        doc = nlp(word)
        return doc[0].tag_ if doc else "X"

    df["pos"] = df["word"].apply(get_pos)

    df = _insert_pause_rows(df, pause_threshold)
    df["pos_category"] = df["pos"].apply(_pos_category)

    # Show output. The row limit is lifted only for this print so the
    # caller's global pandas display settings are left as they were.
    with pd.option_context('display.max_rows', None):
        print("\nfirst few rows (with POS tags and categories):")
        print(df[["word", "start", "end", "pos", "pos_category"]].head(130))

    sequence = df["pos_category"].tolist()
    print("\nPOS category sequence:")
    print(sequence)

    # Match patterns
    patterns = [
        ["ARTICLE", "PAUSE", "CONTENT"],
        ["ARTICLE", "PAUSE", "ARTICLE", "CONTENT"],
        ["ARTICLE", "PAUSE", "ARTICLE", "PAUSE", "CONTENT"]
    ]

    total_article_content = sum(
        1 for current, following in zip(sequence, sequence[1:])
        if current == "ARTICLE" and following == "CONTENT"
    )

    words = df["word"].tolist()
    match_count = 0
    print("\nChecking for pattern matches...")
    for i, category in enumerate(sequence):
        if category != "ARTICLE":
            # Every pattern starts with an article.
            continue
        for pattern in patterns:
            window_end = i + len(pattern)
            if sequence[i:window_end] == pattern:
                words_matched = words[i:window_end]
                print(f"Matched pattern {pattern}: {words_matched}")
                match_count += 1
                break

    print(f"\nTotal pattern matches: {match_count}")
    print(f"Total ARTICLE ➝ CONTENT transitions: {total_article_content}")
    ratio = match_count / total_article_content if total_article_content > 0 else None
    print(f"Ratio: {ratio:.2f}" if ratio is not None else "Ratio: N/A")

    return match_count, total_article_content, ratio

# Run the pattern-based check on a single example recording (subject 41 of the
# cookieTheft task). The input path is hard-coded to this location.
subject_id = "41"
task = "cookieTheft"
timestamp_file = f"/Volumes/methlab/Students/Gila/word_timestamps/{task}/google/timestamps/{subject_id}.csv"
debug_article_pause_patterns(timestamp_file)
