<a href="https://colab.research.google.com/github/eliseobao/redsm5/blob/main/analysis/lexical/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

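# Named Entity Recognition (NER)

Distribution of spaCy named-entity types across the texts of each symptom label.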

In [1]:
import os

# Force the SHELL environment variable for this process.
# NOTE(review): presumably so that `!` shell commands in later cells run under
# bash — confirm.
os.environ.update(SHELL="/bin/bash")

In [2]:
%%capture
!pip install spacy
!python3 -m spacy download en_core_web_sm

In [3]:
import spacy
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Load the small English spaCy pipeline (the model fetched by the
# `spacy download en_core_web_sm` step); this single instance is reused for
# every text analyzed below.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Label names looked up in the dataset's `labels` column, in reporting order:
# the "no symptoms" label first, followed by the nine symptom labels.
SYMPTOMS = """
    NO_SYMPTOMS
    DEPRESSED_MOOD
    ANHEDONIA
    APPETITE_CHANGE
    SLEEP_ISSUES
    PSYCHOMOTOR
    FATIGUE
    WORTHLESSNESS
    COGNITIVE_ISSUES
    SUICIDAL_THOUGHTS
""".split()

In [None]:
# Load the annotated dataset; the code below reads its `labels` and `text`
# columns.
data = pd.read_csv("data/redsm5.csv")

# Group texts by symptom: a text is listed under every symptom whose name
# occurs in its `labels` value, so one text may appear under several symptoms.
# regex=False matches the symptom name literally, and na=False treats rows
# with a missing label as "no match" rather than putting NaN into the boolean
# mask (which would make the .loc selection raise).
texts_per_symptom = {
    symptom: data.loc[
        data["labels"].str.contains(symptom, regex=False, na=False), "text"
    ].tolist()
    for symptom in SYMPTOMS
}

In [None]:
def analyze_named_entities(text):
    """
    Analyze named entities in the given text.

    Parameters:
    - text (str): The input text to analyze.

    Returns:
    Tuple[dict, Union[tuple, None]]: A tuple containing:
        - A dictionary mapping each named entity type found in the text to
          its share (0-100) of all entity mentions in the text.
        - The most frequently mentioned entity as an
          ``(entity_text, entity_type)`` tuple (on a tie, the one mentioned
          first wins), or None if no entities are found.
    """
    doc = nlp(text)

    named_entities = [(ent.text, ent.label_) for ent in doc.ents]

    if not named_entities:
        return {}, None

    total_entities = len(named_entities)
    entity_type_counts = Counter(ent_type for _, ent_type in named_entities)
    entity_type_percentages = {
        ent_type: count / total_entities * 100
        for ent_type, count in entity_type_counts.items()
    }

    # Rank by mention count. Iterating the Counter directly would yield only
    # its keys, so max() would end up comparing entity labels instead of
    # frequencies; .items() exposes the (entity, count) pairs we need.
    most_referenced_entity, _ = max(
        Counter(named_entities).items(), key=lambda item: item[1]
    )

    return entity_type_percentages, most_referenced_entity

In [None]:
# Run the entity analysis over every text of every symptom. Each value keeps
# the same order as texts_per_symptom[symptom], one (percentages, top entity)
# result per text.
results = {}
for symptom in SYMPTOMS:
    print(f"Analyzing {symptom} texts")
    results[symptom] = list(
        map(analyze_named_entities, tqdm(texts_per_symptom[symptom]))
    )

Analyzing NO_SYMPTOMS texts


100%|██████████| 392/392 [00:27<00:00, 14.23it/s]


Analyzing DEPRESSED_MOOD texts


100%|██████████| 328/328 [00:14<00:00, 22.36it/s]


Analyzing ANHEDONIA texts


100%|██████████| 124/124 [00:03<00:00, 32.73it/s]


Analyzing APPETITE_CHANGE texts


100%|██████████| 44/44 [00:01<00:00, 23.68it/s]


Analyzing SLEEP_ISSUES texts


100%|██████████| 102/102 [00:03<00:00, 25.78it/s]


Analyzing PSYCHOMOTOR texts


100%|██████████| 35/35 [00:01<00:00, 17.74it/s]


Analyzing FATIGUE texts


100%|██████████| 124/124 [00:05<00:00, 22.06it/s]


Analyzing WORTHLESSNESS texts


100%|██████████| 311/311 [00:12<00:00, 24.67it/s]


Analyzing COGNITIVE_ISSUES texts


100%|██████████| 59/59 [00:01<00:00, 29.99it/s]


Analyzing SUICIDAL_THOUGHTS texts


100%|██████████| 165/165 [00:05<00:00, 32.12it/s]


In [None]:
# Aggregate the per-text results of each symptom and print the overall
# entity-type distribution.
for symptom, result in results.items():
    # Element-wise sum of the per-text percentage dictionaries (texts without
    # entities contribute nothing because their dictionary is empty).
    total_entity_type_percentages = Counter()
    # Maps each text that has at least one entity to its most referenced entity.
    most_referenced_entities = {}

    # results[symptom] was built in the same order as texts_per_symptom[symptom],
    # so zip pairs every text with its own result. Looking the result up with
    # list.index() would be quadratic and would return the first *equal*
    # result, attributing the entity to the wrong text whenever two texts
    # produce identical results.
    for text, (entity_type_percentages, most_referenced_entity) in zip(
        texts_per_symptom[symptom], result
    ):
        total_entity_type_percentages.update(entity_type_percentages)

        if most_referenced_entity:
            most_referenced_entities[text] = most_referenced_entity

    # Note: this is the sum of per-text percentages, not a raw entity count;
    # dividing by it re-normalizes the summed percentages to total 100.
    total_entities_across_texts = sum(total_entity_type_percentages.values())
    overall_entity_type_percentages = {
        ent_type: count / total_entities_across_texts * 100
        for ent_type, count in total_entity_type_percentages.items()
    }

    print(f"\n{symptom}")
    for ent_type, percentage in overall_entity_type_percentages.items():
        print(f"{ent_type}: {percentage:.2f}%")


NO_SYMPTOMS
ORG: 8.76%
DATE: 33.57%
CARDINAL: 15.16%
ORDINAL: 5.63%
GPE: 4.21%
PERCENT: 1.91%
TIME: 11.95%
QUANTITY: 0.80%
PERSON: 13.92%
NORP: 1.19%
PRODUCT: 0.77%
FAC: 0.38%
LANGUAGE: 0.04%
WORK_OF_ART: 0.53%
LOC: 0.24%
LAW: 0.13%
MONEY: 0.80%
EVENT: 0.01%

DEPRESSED_MOOD
ORG: 6.84%
DATE: 47.17%
CARDINAL: 16.14%
PERSON: 8.37%
TIME: 8.50%
GPE: 3.48%
ORDINAL: 4.71%
NORP: 1.39%
FAC: 0.08%
MONEY: 0.66%
PRODUCT: 0.45%
LOC: 0.43%
EVENT: 0.02%
PERCENT: 0.63%
QUANTITY: 0.62%
WORK_OF_ART: 0.37%
LANGUAGE: 0.02%
LAW: 0.11%

ANHEDONIA
ORDINAL: 3.16%
DATE: 49.84%
CARDINAL: 18.89%
PERSON: 5.29%
TIME: 7.06%
ORG: 8.90%
LOC: 0.37%
GPE: 3.05%
NORP: 1.27%
QUANTITY: 0.65%
MONEY: 0.18%
PERCENT: 0.45%
PRODUCT: 0.73%
EVENT: 0.10%
LANGUAGE: 0.08%

APPETITE_CHANGE
CARDINAL: 19.98%
ORG: 9.56%
GPE: 1.86%
DATE: 46.94%
TIME: 6.19%
QUANTITY: 2.17%
PERSON: 6.64%
ORDINAL: 4.32%
PERCENT: 0.61%
LOC: 0.08%
FAC: 0.08%
MONEY: 0.34%
NORP: 0.55%
EVENT: 0.67%

SLEEP_ISSUES
TIME: 33.17%
CARDINAL: 13.97%
ORG: 6.01%
DATE: 31