<a href="https://colab.research.google.com/github/eliseobao/redsm5/blob/main/analysis/linguistic/pronouns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

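# Pronoun usage

Share of each subject-pronoun category (I, we, you, he/she/it, they) among the posts of every symptom label.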

In [1]:
import os

# Point the SHELL environment variable at bash for this process and its children.
# NOTE(review): presumably needed so the `!` shell commands in the install cell
# run under bash — confirm against the target runtime.
os.environ["SHELL"] = "/bin/bash"

In [2]:
%%capture
!pip install spacy
!python3 -m spacy download en_core_web_sm

In [8]:
import spacy
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Load spaCy's small English pipeline once at notebook level; get_pronouns_counts
# runs every text through this shared `nlp` object to obtain part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Symptom label names analysed in this notebook.
# Each name is matched as a substring of the "labels" column when the texts are
# grouped, and the list order is the order in which the final report is printed.
SYMPTOMS = [
    "NO_SYMPTOMS",
    "DEPRESSED_MOOD",
    "ANHEDONIA",
    "APPETITE_CHANGE",
    "SLEEP_ISSUES",
    "PSYCHOMOTOR",
    "FATIGUE",
    "WORTHLESSNESS",
    "COGNITIVE_ISSUES",
    "SUICIDAL_THOUGHTS",
]

In [10]:
# Load the annotated posts. This cell relies on a "labels" column holding the
# symptom name(s) of each row and a "text" column holding the post itself.
data = pd.read_csv("data/redsm5.csv")

# Group the texts by symptom. A row belongs to a symptom when the symptom name
# occurs anywhere in its "labels" value, so one row can land in several groups.
#   - regex=False: symptom names are literal strings, not regex patterns.
#   - na=False: a row with a missing label matches no symptom; without it the
#     NaN would end up in the boolean mask and `.loc` would raise.
texts_per_symptom = {}
for symptom in SYMPTOMS:
    has_symptom = data["labels"].str.contains(symptom, regex=False, na=False)
    texts_per_symptom[symptom] = data.loc[has_symptom, "text"].tolist()

In [None]:
def get_pronouns_counts(text):
    """
    Tally subject pronouns in a text, grouped by grammatical person and number.

    The text is run through the module-level spaCy pipeline ``nlp``. Only tokens
    tagged ``PRON`` are considered, compared case-insensitively, and only these
    exact forms are counted:

    - first-person singular: "i"
    - first-person plural: "we"
    - second person: "you"
    - third-person singular: "he", "she", "it"
    - third-person plural: "they"

    Other pronoun forms (e.g. "me", "my", "them") are not counted.

    Parameters:
    - text (str): Input text to analyze.

    Returns:
    Tuple[int, int, int, int, int]: Counts in the order first-person singular,
    first-person plural, second person, third-person singular, third-person plural.
    """
    tally = Counter()
    for tok in nlp(text):
        if tok.pos_ == "PRON":
            tally[tok.text.lower()] += 1

    # One entry per returned slot, in return order; a slot's count is the sum
    # over its forms (a Counter yields 0 for forms that never occurred).
    forms_per_slot = (
        ("i",),
        ("we",),
        ("you",),
        ("he", "she", "it"),
        ("they",),
    )

    return tuple(sum(tally[form] for form in forms) for forms in forms_per_slot)

In [24]:
# Keys of the per-symptom totals, listed in the same order as the tuple
# returned by get_pronouns_counts so the two can be zipped together.
TOTAL_COUNT_KEYS = (
    "total_first_person_singular_count",
    "total_first_person_plural_count",
    "total_second_person_singular_count",
    "total_third_person_singular_count",
    "total_third_person_plural_count",
)

# For every symptom, add up the pronoun counts of all its texts.
results = {}
for symptom, texts in texts_per_symptom.items():
    print(f"Analyzing pronouns for {symptom}...")

    running_totals = [0] * len(TOTAL_COUNT_KEYS)
    for text in tqdm(texts):
        per_text_counts = get_pronouns_counts(text)
        running_totals = [
            total + count for total, count in zip(running_totals, per_text_counts)
        ]

    results[symptom] = dict(zip(TOTAL_COUNT_KEYS, running_totals))

Analyzing pronouns for NO_SYMPTOMS...


100%|██████████| 392/392 [00:26<00:00, 14.95it/s]


Analyzing pronouns for DEPRESSED_MOOD...


100%|██████████| 328/328 [00:14<00:00, 23.04it/s]


Analyzing pronouns for ANHEDONIA...


100%|██████████| 124/124 [00:03<00:00, 33.40it/s]


Analyzing pronouns for APPETITE_CHANGE...


100%|██████████| 44/44 [00:01<00:00, 24.17it/s]


Analyzing pronouns for SLEEP_ISSUES...


100%|██████████| 102/102 [00:03<00:00, 26.23it/s]


Analyzing pronouns for PSYCHOMOTOR...


100%|██████████| 35/35 [00:01<00:00, 18.06it/s]


Analyzing pronouns for FATIGUE...


100%|██████████| 124/124 [00:05<00:00, 22.62it/s]


Analyzing pronouns for WORTHLESSNESS...


100%|██████████| 311/311 [00:12<00:00, 24.96it/s]


Analyzing pronouns for COGNITIVE_ISSUES...


100%|██████████| 59/59 [00:01<00:00, 30.17it/s]


Analyzing pronouns for SUICIDAL_THOUGHTS...


100%|██████████| 165/165 [00:05<00:00, 32.00it/s]


In [25]:
# Turn the raw per-symptom totals in `results` into percentages of all counted
# pronouns for that symptom.
#
# Category stems: "total_<stem>_count" is the key read from `results`, and
# "<stem>_percentage" is the key written to `results_percentage`.
PRONOUN_CATEGORIES = (
    "first_person_singular",
    "first_person_plural",
    "second_person_singular",
    "third_person_singular",
    "third_person_plural",
)

results_percentage = {}

# The loop variable must NOT be called `results`: a `for` target is an ordinary
# assignment, so it would rebind the global `results` to the last symptom's
# counts, breaking any re-run of this cell and any later use of `results`.
for symptom, symptom_counts in results.items():
    total_pronouns = sum(
        symptom_counts[f"total_{category}_count"] for category in PRONOUN_CATEGORIES
    )

    # A symptom with no counted pronouns gets 0 everywhere instead of dividing by zero.
    results_percentage[symptom] = {
        f"{category}_percentage": (
            ((symptom_counts[f"total_{category}_count"] / total_pronouns) * 100)
            if total_pronouns > 0
            else 0
        )
        for category in PRONOUN_CATEGORIES
    }

In [27]:
# (printed label, key in results_percentage[symptom]) pairs, in print order.
REPORT_ROWS = (
    ("First Person Singular", "first_person_singular_percentage"),
    ("First Person Plural", "first_person_plural_percentage"),
    ("Second Person Singular", "second_person_singular_percentage"),
    ("Third Person Singular", "third_person_singular_percentage"),
    ("Third Person Plural", "third_person_plural_percentage"),
)

# Print one block per symptom, each percentage rounded to two decimals.
for symptom in SYMPTOMS:
    print(f"\nSymptom: {symptom}")
    symptom_percentages = results_percentage[symptom]
    for label, key in REPORT_ROWS:
        print(f"{label}: {symptom_percentages[key]:.2f}")


Symptom: NO_SYMPTOMS
First Person Singular: 51.84
First Person Plural: 5.21
Second Person Singular: 8.41
Third Person Singular: 30.37
Third Person Plural: 4.16

Symptom: DEPRESSED_MOOD
First Person Singular: 63.52
First Person Plural: 3.84
Second Person Singular: 6.11
Third Person Singular: 22.43
Third Person Plural: 4.09

Symptom: ANHEDONIA
First Person Singular: 69.86
First Person Plural: 1.69
Second Person Singular: 6.57
Third Person Singular: 19.18
Third Person Plural: 2.69

Symptom: APPETITE_CHANGE
First Person Singular: 68.98
First Person Plural: 1.17
Second Person Singular: 5.38
Third Person Singular: 19.95
Third Person Plural: 4.52

Symptom: SLEEP_ISSUES
First Person Singular: 65.23
First Person Plural: 2.53
Second Person Singular: 7.29
Third Person Singular: 21.11
Third Person Plural: 3.84

Symptom: PSYCHOMOTOR
First Person Singular: 64.60
First Person Plural: 2.48
Second Person Singular: 7.13
Third Person Singular: 22.23
Third Person Plural: 3.56

Symptom: FATIGUE
First Pers