# Explore label examples
This notebook explores the annotations for each individual label (participants, interventions, outcomes) to gain better insight into the data.

In [1]:
from spacy.tokens import DocBin
import spacy
import random

# Blank English pipeline; in this cell only its vocab is used, to deserialize the docs
nlp = spacy.blank("en")

# Locations of the serialized training corpora, one file per label
participants_path = "../corpus/ner_train_p.spacy"
intervention_path = "../corpus/ner_train_i.spacy"
outcome_path = "../corpus/ner_train_o.spacy"

# Participants: read the DocBin from disk and materialize its docs into a list
participants_docBin = DocBin().from_disk(participants_path)
participants_docs = [doc for doc in participants_docBin.get_docs(nlp.vocab)]

# Interventions: same procedure on the intervention corpus
intervention_docBin = DocBin().from_disk(intervention_path)
intervention_docs = [doc for doc in intervention_docBin.get_docs(nlp.vocab)]

# Outcomes: same procedure on the outcome corpus
outcome_docBin = DocBin().from_disk(outcome_path)
outcome_docs = [doc for doc in outcome_docBin.get_docs(nlp.vocab)]

## Label [Participants]

In [22]:
# Shuffle the participant docs in place, then, for the first 10 docs of the
# shuffled list, print every annotated span as: index within the doc, span
# text, span length in tokens
random.shuffle(participants_docs)
for doc in participants_docs[:10]:
    for span_idx, span in enumerate(doc.ents):
        span_size = len(span)
        print(span_idx, span, span_size)

0 metastatic colorectal cancer : 4
1 southern Italy cooperative oncology group trials . 7
2 metastatic colorectal cancer ( CRC ) . 7
3 254 patients 2
4 elderly patients . 3
0 older heart failure patients : 5
0 patients with large or locally advanced HER2-positive ( HER2+ ) breast cancers 12
1 core biopsy samples from 27 HER2+ breast cancer patients enrolled in a preoperative clinical trial using trastuzumab , nab-paclitaxel and carboplatin combination therapy ( BrUOG BR-211B ( NCT00617942 ) ) . 31
2 18 weeks of treatment , 5
0 health education intervention trials : 5
1 health education research . 4
2 mixed RCT/preference trial comparing two formats 6
3 Women take PRIDE heart disease management program . 8
4 women 1
0 elderly oncologic hypertensive patients : 5
1 Elderly neoplastic patients 3
2 hypertension and hyperuricemia 3
3 hypertensive neoplastic elderly patients . 5
4 Seventy patients with cancer 4
5 hypertension and hyperuricemia in elderly patients under chemotherapeutic treatm

In [28]:
# Build a histogram of participant span lengths.
# The "length" of a span is its number of non-stop-word tokens, so a span made
# up only of stop words is counted under length 0.
# participant_lengths maps length -> number of spans with that length.
participant_lengths = {}
for doc in participants_docs:
    for ent in doc.ents:
        ent_len = sum(1 for token in ent if not token.is_stop)
        participant_lengths[ent_len] = participant_lengths.get(ent_len, 0) + 1

# Total number of participant spans across all docs
participant_sum = sum(participant_lengths.values())

# Number of spans whose length falls in 0..range_max (inclusive).
# .get(i, 0) treats a length with no spans as 0 instead of raising KeyError.
range_max = 7
range_sum = sum(participant_lengths.get(i, 0) for i in range(range_max + 1))

if participant_sum == 0:
    # Nothing to report; also avoids dividing by zero in the percentages below
    print("No participant spans found.")
else:
    print(f"0-{range_max}: {range_sum} ({round((range_sum/participant_sum)*100,2)}%)")

    # One line per observed length, in ascending order: count and share of all spans
    for ent_len in sorted(participant_lengths):
        print(f"{ent_len}: {participant_lengths[ent_len]} ({round((participant_lengths[ent_len]/participant_sum)*100,2)}%)")

0-7: 9479 (69.65%)
0: 3 (0.02%)
1: 604 (4.44%)
2: 1424 (10.46%)
3: 2033 (14.94%)
4: 1914 (14.06%)
5: 1548 (11.37%)
6: 1067 (7.84%)
7: 886 (6.51%)
8: 656 (4.82%)
9: 467 (3.43%)
10: 379 (2.78%)
11: 332 (2.44%)
12: 244 (1.79%)
13: 202 (1.48%)
14: 201 (1.48%)
15: 181 (1.33%)
16: 158 (1.16%)
17: 135 (0.99%)
18: 126 (0.93%)
19: 114 (0.84%)
20: 96 (0.71%)
21: 95 (0.7%)
22: 83 (0.61%)
23: 64 (0.47%)
24: 77 (0.57%)
25: 45 (0.33%)
26: 31 (0.23%)
27: 40 (0.29%)
28: 40 (0.29%)
29: 29 (0.21%)
30: 24 (0.18%)
31: 19 (0.14%)
32: 27 (0.2%)
33: 24 (0.18%)
34: 19 (0.14%)
35: 19 (0.14%)
36: 15 (0.11%)
37: 13 (0.1%)
38: 15 (0.11%)
39: 17 (0.12%)
40: 8 (0.06%)
41: 6 (0.04%)
42: 11 (0.08%)
43: 10 (0.07%)
44: 7 (0.05%)
45: 8 (0.06%)
46: 8 (0.06%)
47: 9 (0.07%)
48: 10 (0.07%)
49: 5 (0.04%)
50: 6 (0.04%)
51: 4 (0.03%)
52: 2 (0.01%)
53: 7 (0.05%)
54: 1 (0.01%)
55: 5 (0.04%)
56: 4 (0.03%)
57: 4 (0.03%)
58: 1 (0.01%)
62: 1 (0.01%)
63: 3 (0.02%)
64: 2 (0.01%)
65: 2 (0.01%)
66: 1 (0.01%)
67: 2 (0.01%)
69: 1 (0.01%)


## Label [Intervention]

In [15]:
# Shuffle the intervention docs in place, then print every annotated span
# (index within the doc, span text) from the first 15 docs of the shuffled list
random.shuffle(intervention_docs)
for doc in intervention_docs[:15]:
    for span_idx, span in enumerate(doc.ents):
        print(span_idx, span)

0 inpatient Dialectical Behavior Therapy ( DBT )
1 standard outpatient DBT ,
2 receive 12 weeks of intensified inpatient DBT plus six months of standard DBT ,
0 all-norgestrel
1 dl-norgestrel alone
2 estradiol-17 beta alone
3 combined hormones
4 placebo control
5 dl-norgestrel
6 dl-norgestrel
0 careful estimates
1 public commitment , self-consistency
2 unique causal risk models .
3 risk anchor based on downward social comparison processes
4 comparison anchors
0 New drug trials
0 recombinant human granulocyte-macrophage colony-stimulating factor
1 Recombinant murine GM-CSF administration
2 recombinant human ( rhu ) GM-CSF
3 placebo
4 rhuGM-CSF
5 rhuGM-CSF .
6 rhuGM-CSF
7 rhuGM-CSF .
8 rhuGM-CSF
9 rhuGM-CSF
10 rhuGM-CSF
0 adenosine myocardial perfusion imaging
1 adenosine myocardial perfusion imaging
2 Coronary angiography was conducted within 6 weeks of an adenosine thallium-201 myocardial perfusion imaging study .
3 adenosine thallium-201 myocardial perfusion imaging
0 remote ischemic 

## Label [Outcome]

In [2]:
# Shuffle the outcome docs in place, then print every annotated span
# (index within the doc, span text) from the first 15 docs of the shuffled list
random.shuffle(outcome_docs)
for doc in outcome_docs[:15]:
    for span_idx, span in enumerate(doc.ents):
        print(span_idx, span)

0 proteinuria
1 proteinuria-lowering effect of a renin inhibitor ( aliskiren )
2 reduced proteinuria . These
3 Furthermore , 24-h proteinuria was
4 significantly reduced proteinuria . The antiproteinuric effect is
5 with chronic proteinuric non-diabetic kidney disease .
0 advanced colorectal cancer :
1 5-fluorouracil
2 partial response
3 complete
4 partial response
5 Time to failure
6 median survival time
7 Diarrhea , stomatitis and vomiting
8 nonhematologic toxicities
9 hematologic toxicity was leukopenia ;
10 advanced colorectal cancer
0 language
1 behavioral symptoms
2 assessments of language , behavior , and autism symptomatology .
3 mean scores on any measure of language , behavior , or autism symptom severity
0 incidence of clinical sepsis .
1 incidence of positive blood cultures , necrotising enterocolitis ( NEC ) stage II or III , or death , and the duration of hospital stay .
2 incidence of clinical sepsis
3 Mortality
4 blood cultures
5 incidence of NEC and the duration of hos