In [7]:
from tqdm import tqdm
import json
import pandas as pd
from pathlib import Path
import numpy as np
import emoji

In [8]:
posts_folder = Path('dataset') / 'emetophobia_posts'
with open(posts_folder / 'emetophobia_all_posts_one_label_normalized.json', 'r') as f:
    all_posts = json.load(f)

len(all_posts)

986

In [9]:
ner_dir = Path('ner')

In [10]:
from transformers import pipeline

pipe = pipeline("token-classification", model="d4data/biomedical-ner-all", aggregation_strategy='simple')

Device set to use cuda:0


In [11]:
def minimal_normalziation_noemoji(text):
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = emoji.replace_emoji(text, replace='')
    return text.strip()

In [12]:
for post in tqdm(all_posts):
    post['entities'] = [[e['word'], e['entity_group']] for e in pipe(minimal_normalziation_noemoji(post['content'])) if e['score'] > 0.5]

100%|██████████| 986/986 [00:07<00:00, 124.94it/s]


## Top entities

In [13]:
from collections import Counter

In [14]:
counter = Counter()
for post in tqdm(all_posts):
    for ent, ent_group in post['entities']:
        counter[ent_group] += 1

100%|██████████| 986/986 [00:00<00:00, 210430.15it/s]


In [15]:
counter.most_common(20)

[('Sign_symptom', 4069),
 ('Activity', 1271),
 ('Detailed_description', 1199),
 ('Time', 536),
 ('Medication', 449),
 ('Subject', 435),
 ('Disease_disorder', 424),
 ('Biological_structure', 389),
 ('Duration', 384),
 ('Nonbiological_location', 366),
 ('History', 337),
 ('Therapeutic_procedure', 292),
 ('Lab_value', 272),
 ('Coreference', 271),
 ('Frequency', 226),
 ('Diagnostic_procedure', 182),
 ('Clinical_event', 160),
 ('Date', 128),
 ('Severity', 91),
 ('Other_entity', 72)]

## Top entities per label

In [16]:
labels = set()
for post in tqdm(all_posts):
    labels.update([post['labels']])
labels

100%|██████████| 986/986 [00:00<00:00, 1630750.69it/s]


{'Does Anyone Else...?',
 'Interesting info/Articles',
 'It Happened (TW)',
 'Moderator',
 'Needing Support - Anxious about FP',
 'Needing Support - In Acute Crisis (at risk of self injury)',
 'Needing Support - N, V, D etc',
 'Needing Support - N, V, D etc NO REASSURANCE',
 'Needing Support - Non-Emet related',
 'Needing support - Panic attack',
 'Needing support: Just not feeling good',
 'None',
 'Positive Reminder',
 'Potentially Triggering',
 'Question',
 'Rant',
 'Recovery',
 'Success!',
 'Techniques, tips and tricks',
 'Therapy info!',
 'Venting - Advice wanted',
 'Venting - No advice please'}

In [17]:
for label in labels:
    print(label)
    lbl_counter = Counter()
    for post in all_posts:
        if label in post['labels']:
            for ent, ent_group in post['entities']:
                lbl_counter[ent_group] += 1
    print(f"{lbl_counter.most_common(5)}\n\n")

Needing Support - N, V, D etc
[('Sign_symptom', 195), ('Activity', 44), ('Time', 40), ('Detailed_description', 30), ('Nonbiological_location', 28)]


Does Anyone Else...?
[('Sign_symptom', 399), ('Activity', 112), ('Detailed_description', 96), ('Coreference', 49), ('Time', 34)]


Venting - Advice wanted
[('Sign_symptom', 210), ('Activity', 52), ('Detailed_description', 40), ('Disease_disorder', 24), ('Subject', 24)]


Needing Support - Non-Emet related
[('Sign_symptom', 10), ('Activity', 3), ('Medication', 3), ('Time', 3), ('Biological_structure', 2)]


Needing Support - N, V, D etc NO REASSURANCE
[('Sign_symptom', 10), ('Disease_disorder', 4), ('Therapeutic_procedure', 3), ('Other_event', 2), ('Activity', 2)]


Potentially Triggering
[('Sign_symptom', 367), ('Detailed_description', 120), ('Activity', 117), ('Subject', 50), ('Time', 44)]


Rant
[('Sign_symptom', 559), ('Activity', 217), ('Detailed_description', 138), ('Subject', 81), ('Time', 62)]


Venting - No advice please
[('Subjec

## Total entities per label

In [18]:
for label in labels:
    lbl_counter = 0
    for post in all_posts:
        if label in post['labels']:
            lbl_counter += len(post['entities'])
    
    print(f"{label}: {lbl_counter} entities")

Needing Support - N, V, D etc: 530 entities
Does Anyone Else...?: 966 entities
Venting - Advice wanted: 544 entities
Needing Support - Non-Emet related: 30 entities
Needing Support - N, V, D etc NO REASSURANCE: 28 entities
Potentially Triggering: 1120 entities
Rant: 1657 entities
Venting - No advice please: 24 entities
Needing Support - Anxious about FP: 340 entities
Success!: 548 entities
Question: 1578 entities
Techniques, tips and tricks: 173 entities
Needing support - Panic attack: 2101 entities
Therapy info!: 24 entities
None: 50 entities
Moderator: 12 entities
Recovery: 129 entities
Interesting info/Articles: 10 entities
Needing support: Just not feeling good: 1000 entities
It Happened (TW): 199 entities
Positive Reminder: 101 entities
Needing Support - In Acute Crisis (at risk of self injury): 692 entities


## Statistics entities per post

In [19]:
all_num_entities = []

for post in tqdm(all_posts):
    all_num_entities.append(len(post['entities']))

all_num_entities = np.array(all_num_entities)
print(f"Mean number of entities per post: {np.mean(all_num_entities)}")
print(f"Median number of entities per post: {np.median(all_num_entities)}")
print(f"Max number of entities per post: {np.max(all_num_entities)}")
print(f"Min number of entities per post: {np.min(all_num_entities)}")
print(f"Stdev number of entities per post: {np.std(all_num_entities)}")

100%|██████████| 986/986 [00:00<00:00, 2722569.94it/s]

Mean number of entities per post: 11.995943204868155
Median number of entities per post: 10.0
Max number of entities per post: 56
Min number of entities per post: 0
Stdev number of entities per post: 8.0226595781395





## Statistics entities per label

In [20]:
lbl_dict = {}

for label in labels:
    print(label)
    lbl_num_entities = []
    for post in all_posts:
        if label in post['labels']:
            lbl_num_entities.append(len(post['entities']))
    lbl_num_entities = np.array(lbl_num_entities)

    lbl_dict[label] = {
        'mean': np.mean(lbl_num_entities),
        'median': np.median(lbl_num_entities),
        'max': np.max(lbl_num_entities),
        'min': np.min(lbl_num_entities),
        'stdev': np.std(lbl_num_entities),
        'num_posts': len(lbl_num_entities)
    }

    print(f"Mean number of entities per post: {np.mean(lbl_num_entities)}")
    print(f"Median number of entities per post: {np.median(lbl_num_entities)}")
    print(f"Max number of entities per post: {np.max(lbl_num_entities)}")
    print(f"Min number of entities per post: {np.min(lbl_num_entities)}")
    print(f"Stdev number of entities per post: {np.std(lbl_num_entities)}")
    print("\n\n")

Needing Support - N, V, D etc
Mean number of entities per post: 12.325581395348838
Median number of entities per post: 10.0
Max number of entities per post: 35
Min number of entities per post: 1
Stdev number of entities per post: 7.099691870918023



Does Anyone Else...?
Mean number of entities per post: 12.384615384615385
Median number of entities per post: 9.0
Max number of entities per post: 53
Min number of entities per post: 0
Stdev number of entities per post: 9.719437001251759



Venting - Advice wanted
Mean number of entities per post: 12.952380952380953
Median number of entities per post: 11.5
Max number of entities per post: 32
Min number of entities per post: 2
Stdev number of entities per post: 7.705756601869555



Needing Support - Non-Emet related
Mean number of entities per post: 7.5
Median number of entities per post: 8.5
Max number of entities per post: 9
Min number of entities per post: 4
Stdev number of entities per post: 2.0615528128088303



Needing Support - N, V,

In [21]:
df_lbl_scispacy = pd.DataFrame.from_dict(lbl_dict, orient='index')

df_lbl_scispacy.to_excel(ner_dir / 'emetophobia_labels_ner_biobert.xlsx', index=True, index_label='label')

In [22]:
with open(ner_dir / 'emetophobia_posts_ner_biobert.json', 'w') as f:
    json.dump(all_posts, f, indent=4)

## Posts with 0 entities

In [23]:
posts_0_ents = []

for post in tqdm(all_posts):
    if len(post['entities']) == 0:
        posts_0_ents.append(post['content'])

len(posts_0_ents)

100%|██████████| 986/986 [00:00<00:00, 1905798.96it/s]


7

In [24]:
df_posts_0_ents = pd.DataFrame(posts_0_ents, columns=['content'])
df_posts_0_ents.to_excel(ner_dir / 'emetophobia_posts_0_entities_biobert.xlsx', index=False)