In [1]:
from tqdm import tqdm
import json
import pandas as pd
from pathlib import Path
import numpy as np

import spacy
import scispacy
from scispacy.umls_linking import UmlsEntityLinker
import emoji

In [2]:
posts_folder = Path('dataset') / 'emetophobia_posts'
with open(posts_folder / 'emetophobia_all_posts_one_label_normalized.json', 'r') as f:
    all_posts = json.load(f)

len(all_posts)

986

In [3]:
ner_dir = Path('ner')

In [4]:
nlp = spacy.load("en_core_sci_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [5]:
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
linker = UmlsEntityLinker(resolve_abbreviations=True, filter_for_definitions=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
def minimal_normalziation_noemoji(text):
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = emoji.replace_emoji(text, replace='')
    return text.strip()

In [7]:
for post in tqdm(all_posts):
    doc = nlp(minimal_normalziation_noemoji(post['content']))
    post['entities'] = [(ent, ent._.kb_ents[0][0]) for ent in doc.ents if len(ent._.kb_ents) > 0 and ent._.kb_ents[0][0] in linker.kb.cui_to_entity]

  0%|          | 0/986 [00:00<?, ?it/s]

100%|██████████| 986/986 [00:44<00:00, 22.22it/s]


## Top entities

In [8]:
from collections import Counter

In [9]:
counter = Counter()
for post in tqdm(all_posts):
    for ent, cui in post['entities']:
        counter[cui] += 1

100%|██████████| 986/986 [00:00<00:00, 155983.24it/s]


In [10]:
counter.most_common(20)

[('C0212009', 1988),
 ('C0003467', 471),
 ('C0221423', 352),
 ('C0332173', 316),
 ('C0027497', 313),
 ('C0038351', 302),
 ('C0015726', 294),
 ('C0240526', 188),
 ('C0027361', 187),
 ('C0439234', 171),
 ('C0040364', 164),
 ('C0013470', 163),
 ('C0442519', 152),
 ('C0086769', 147),
 ('C0439230', 132),
 ('C0439227', 132),
 ('C0040223', 131),
 ('C0349231', 131),
 ('C0016452', 127),
 ('C0439228', 121)]

In [11]:
for cui, count in counter.most_common(20):
    print(f"{linker.kb.cui_to_entity[cui]}=====")

CUI: C0212009, Name: I-antigen
Definition: None
TUI(s): T116, T129
Aliases (abbreviated, total: 12): 
	 I BLOOD GROUP ANTIGENS, Blood group antigen I, I antigen, I, Blood group antigen I (substance), I Antigen, I-antigen, Blood group antigen I, NOS, I NOS Ag, I Ag=====
CUI: C0003467, Name: Anxiety
Definition: Feelings or emotions of dread, apprehension, and impending disaster but not disabling as with ANXIETY DISORDERS.
TUI(s): T048
Aliases (abbreviated, total: 21): 
	 anxious, Angst, anxieties, Anxiety, Anxiety Reaction, Feel anxious, Excessive, persistent worry and fear, unfocused anxiety, Anxiety (finding), angst=====
CUI: C0221423, Name: Illness (finding)
Definition: A state of ill health, bodily malfunction, or discomfort.
TUI(s): T184
Aliases (abbreviated, total: 13): 
	 sicknesses, sickness, Illness, NOS, Illness (finding), Sickness, ailments, feeling ill, Sick, Illness, illnesses=====
CUI: C0332173, Name: Daily
Definition: Occurring or done each day.
TUI(s): T079
Aliases (abbre

## Top entities per label

In [12]:
labels = set()
for post in tqdm(all_posts):
    labels.update([post['labels']])
labels

100%|██████████| 986/986 [00:00<00:00, 1623070.54it/s]


{'Does Anyone Else...?',
 'Interesting info/Articles',
 'It Happened (TW)',
 'Moderator',
 'Needing Support - Anxious about FP',
 'Needing Support - In Acute Crisis (at risk of self injury)',
 'Needing Support - N, V, D etc',
 'Needing Support - N, V, D etc NO REASSURANCE',
 'Needing Support - Non-Emet related',
 'Needing support - Panic attack',
 'Needing support: Just not feeling good',
 'None',
 'Positive Reminder',
 'Potentially Triggering',
 'Question',
 'Rant',
 'Recovery',
 'Success!',
 'Techniques, tips and tricks',
 'Therapy info!',
 'Venting - Advice wanted',
 'Venting - No advice please'}

In [13]:
for label in labels:
    print(label)
    lbl_counter = Counter()
    for post in all_posts:
        if label in post['labels']:
            for ent, cui in post['entities']:
                lbl_counter[cui] += 1
    print(lbl_counter.most_common(5))

    for cui, count in lbl_counter.most_common(5):
        print(f"{linker.kb.cui_to_entity[cui]}=====")
    print('\n\n')

Does Anyone Else...?
[('C0212009', 192), ('C0003467', 62), ('C0221423', 33), ('C0027497', 28), ('C0038351', 22)]
CUI: C0212009, Name: I-antigen
Definition: None
TUI(s): T116, T129
Aliases (abbreviated, total: 12): 
	 I BLOOD GROUP ANTIGENS, Blood group antigen I, I antigen, I, Blood group antigen I (substance), I Antigen, I-antigen, Blood group antigen I, NOS, I NOS Ag, I Ag=====
CUI: C0003467, Name: Anxiety
Definition: Feelings or emotions of dread, apprehension, and impending disaster but not disabling as with ANXIETY DISORDERS.
TUI(s): T048
Aliases (abbreviated, total: 21): 
	 anxious, Angst, anxieties, Anxiety, Anxiety Reaction, Feel anxious, Excessive, persistent worry and fear, unfocused anxiety, Anxiety (finding), angst=====
CUI: C0221423, Name: Illness (finding)
Definition: A state of ill health, bodily malfunction, or discomfort.
TUI(s): T184
Aliases (abbreviated, total: 13): 
	 sicknesses, sickness, Illness, NOS, Illness (finding), Sickness, ailments, feeling ill, Sick, Illne

## Total entities per label

In [14]:
for label in labels:
    lbl_counter = 0
    for post in all_posts:
        if label in post['labels']:
            lbl_counter += len(post['entities'])
    
    print(f"{label}: {lbl_counter} entities")

Does Anyone Else...?: 1407 entities
Techniques, tips and tricks: 461 entities
Needing Support - Anxious about FP: 453 entities
Success!: 1122 entities
Interesting info/Articles: 21 entities
Venting - Advice wanted: 1040 entities
Venting - No advice please: 46 entities
Needing support - Panic attack: 2504 entities
Question: 2581 entities
It Happened (TW): 362 entities
Positive Reminder: 221 entities
Needing Support - N, V, D etc NO REASSURANCE: 72 entities
Therapy info!: 28 entities
Needing Support - In Acute Crisis (at risk of self injury): 713 entities
Recovery: 342 entities
None: 169 entities
Needing Support - N, V, D etc: 667 entities
Moderator: 160 entities
Needing Support - Non-Emet related: 106 entities
Rant: 2616 entities
Potentially Triggering: 1849 entities
Needing support: Just not feeling good: 1240 entities


## Statistics entities per post

In [15]:
all_num_entities = []

for post in tqdm(all_posts):
    all_num_entities.append(len(post['entities']))

all_num_entities = np.array(all_num_entities)
print(f"Mean number of entities per post: {np.mean(all_num_entities)}")
print(f"Median number of entities per post: {np.median(all_num_entities)}")
print(f"Max number of entities per post: {np.max(all_num_entities)}")
print(f"Min number of entities per post: {np.min(all_num_entities)}")
print(f"Stdev number of entities per post: {np.std(all_num_entities)}")

100%|██████████| 986/986 [00:00<00:00, 2355116.03it/s]

Mean number of entities per post: 18.365111561866126
Median number of entities per post: 14.0
Max number of entities per post: 160
Min number of entities per post: 0
Stdev number of entities per post: 16.834493223950325





## Statistics entities per label

In [16]:
lbl_dict = {}

for label in labels:
    print(label)
    lbl_num_entities = []
    for post in all_posts:
        if label in post['labels']:
            lbl_num_entities.append(len(post['entities']))
    lbl_num_entities = np.array(lbl_num_entities)

    lbl_dict[label] = {
        'mean': np.mean(lbl_num_entities),
        'median': np.median(lbl_num_entities),
        'max': np.max(lbl_num_entities),
        'min': np.min(lbl_num_entities),
        'stdev': np.std(lbl_num_entities),
        'num_posts': len(lbl_num_entities)
    }

    print(f"Mean number of entities per post: {np.mean(lbl_num_entities)}")
    print(f"Median number of entities per post: {np.median(lbl_num_entities)}")
    print(f"Max number of entities per post: {np.max(lbl_num_entities)}")
    print(f"Min number of entities per post: {np.min(lbl_num_entities)}")
    print(f"Stdev number of entities per post: {np.std(lbl_num_entities)}")
    print("\n\n")

Does Anyone Else...?
Mean number of entities per post: 18.03846153846154
Median number of entities per post: 14.5
Max number of entities per post: 53
Min number of entities per post: 1
Stdev number of entities per post: 13.112376525479908



Techniques, tips and tricks
Mean number of entities per post: 25.61111111111111
Median number of entities per post: 13.0
Max number of entities per post: 85
Min number of entities per post: 3
Stdev number of entities per post: 25.351175490801843



Needing Support - Anxious about FP
Mean number of entities per post: 14.612903225806452
Median number of entities per post: 13.0
Max number of entities per post: 44
Min number of entities per post: 1
Stdev number of entities per post: 9.690958271318745



Success!
Mean number of entities per post: 29.526315789473685
Median number of entities per post: 22.0
Max number of entities per post: 119
Min number of entities per post: 1
Stdev number of entities per post: 24.419294401316588



Interesting info/Arti

In [17]:
df_lbl_scispacy = pd.DataFrame.from_dict(lbl_dict, orient='index')

df_lbl_scispacy.to_excel(ner_dir / 'emetophobia_labels_ner_scispacy.xlsx', index=True, index_label='label')

In [18]:
all_serializable_posts = []

for post in tqdm(all_posts):
    new_post = post.copy()
    new_ents = []
    for ent in new_post['entities']:
        new_ents.append([ent[0].text, ent[1]])
    new_post['entities'] = new_ents
    all_serializable_posts.append(new_post)

len(all_serializable_posts)

100%|██████████| 986/986 [00:00<00:00, 30474.58it/s]


986

In [19]:
with open(ner_dir / 'emetophobia_posts_ner_scispacy.json', 'w') as f:
    json.dump(all_serializable_posts, f, indent=4)

## Posts with 0 entities

In [20]:
posts_0_ents = []

for post in tqdm(all_posts):
    if len(post['entities']) == 0:
        posts_0_ents.append(post['content'])

len(posts_0_ents)

100%|██████████| 986/986 [00:00<00:00, 2634129.77it/s]


10

In [21]:
df_posts_0_ents = pd.DataFrame(posts_0_ents, columns=['content'])
df_posts_0_ents.to_excel(ner_dir / 'emetophobia_posts_0_entities_scispacy.xlsx', index=False)