# SAMPLE 1

This notebook was used to select a sample of 200 files for an exploratory annotation round. This round was performed by the project team before the beginning of the official annotations. The goals of this round were (a) to try out and refine the annotation guidelines, (b) to check whether keywords are useful / necessary for selection of files for annotation.

**There is no need to re-run this notebook since its outputs are stored:**

- the sample is stored here: `../../data/to_inception_conll/sample1.pkl`
- the results of the keyword search are stored here: `../../data/keyword_results/`

The sample contains 2 types of files:

- Files that contain keywords from at least 6 different domains (**kwd**)
- Randomly selected files (**rndm**)
   
Each of the 5 annotators received the following composition of files:

- 2017 data: 5 kwd + 5 rndm
- 2018 data: 5 kwd + 5 rndm
- 2020 COVID diagnosis: 5 kwd + 5 rndm
- 2020 other diagnoses: 5 kwd + 5 rndm

In [1]:
import spacy
import pandas as pd
from pathlib import Path
from datetime import datetime
from functools import partial

import sys
sys.path.insert(0, '../..')
from src.data_process.keyword_search import *
from src.data_process.text_to_conll import row_to_conllfile
from src.utils.df_funcs import remove_on_multikeys

# Load data

In [2]:
# Root folder of the project data; all data paths in this notebook are built from it.
path = Path('../../data')

# Processed notes, one pickle per year.
all_2017 = pd.read_pickle(path.joinpath('2017_raw/processed.pkl'))
all_2018 = pd.read_pickle(path.joinpath('2018_raw/processed.pkl'))
all_2020 = pd.read_pickle(path.joinpath('2020_raw/processed.pkl'))

# 2020 notes stored under the ICD_U07.1 folder (the COVID-diagnosis set).
cov_2020 = pd.read_pickle(
    path.joinpath('2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl')
)

In [3]:
# 2020 notes without the COVID-diagnosis notes.
# NOTE(review): presumably `remove_on_multikeys` drops the rows of the first frame whose
# (MDN, NotitieID) pair also occurs in the second frame — confirm in src.utils.df_funcs.
non_cov_2020 = remove_on_multikeys(all_2020, cov_2020, ['MDN', 'NotitieID'])

In [4]:
# IDs of notes that were annotated earlier; MDN and NotitieID are read as strings.
annotated = pd.read_csv(path / 'annotated_notes_ids.csv', dtype={'MDN': str, 'NotitieID': str})

In [5]:
# Version 1 of the keyword list that drives the keyword search.
keywords = pd.read_excel('../../keywords/keywords_v1.xlsx')

# Sample for keyword search

From each of the datasets (except cov_2020, see below), a random sample of 50,000 files was selected for the keyword search.

**NOTE**: len(cov_2020) < 50000, therefore the full dataset is used rather than a sample.

In [8]:
# Exclude the 2017 notes that were already annotated (matched on NotitieID).
annotated_ids_2017 = annotated.query("year==2017")["NotitieID"]
is_annotated_2017 = all_2017["NotitieID"].isin(annotated_ids_2017)
non_annot_2017 = all_2017.loc[~is_annotated_2017]

In [19]:
# Draw 50,000 notes from each dataset; the fixed seed makes the draw repeatable.
samp_2017, samp_2018, samp_non_cov_2020 = (
    frame.sample(n=50000, random_state=19)
    for frame in (non_annot_2017, all_2018, non_cov_2020)
)

# Keyword search

In [30]:
# Build one regex per keyword from its keyword text and regex template id,
# then collect them into the structure expected by `find_keywords`.
keywords['regex'] = keywords.apply(lambda row: get_regex(row.keyword, row.regex_template_id), axis=1)
reg_dict = get_reg_dict(keywords)
domains = ['ENR', 'ATT', 'STM', 'ADM', 'INS', 'MBW', 'FAC', 'BER']

# Name every dataset explicitly. A DataFrame has no `.name` attribute, so relying on
# `df.name` only works if it was set by hand earlier in the session (hidden kernel state)
# and raises AttributeError on a fresh kernel.
named_dfs = {
    'samp_2017': samp_2017,
    'samp_2018': samp_2018,
    'samp_non_cov_2020': samp_non_cov_2020,
    'cov_2020': cov_2020,
}

for name, df in named_dfs.items():
    print(f"{name}: {len(df)=}")
    outfile = path / f"keyword_results/{name}_kwd_v1.pkl"

    start_time = datetime.now()

    # Search the notes and store the results for this dataset.
    kwd_results = find_keywords(df, reg_dict)
    save_kwd_results(kwd_results, domains, outfile)

    # Elapsed time of the search + save for this dataset.
    print(datetime.now() - start_time)

samp_2017: len(df)=50000
Results len(df)=28705 are saved to ../../data/keyword_results/samp_2017_kwd_v1.pkl
0:01:03.431036
samp_2018: len(df)=50000
Results len(df)=29166 are saved to ../../data/keyword_results/samp_2018_kwd_v1.pkl
0:01:07.358767
samp_non_cov_2020: len(df)=50000
Results len(df)=30695 are saved to ../../data/keyword_results/samp_non_cov_2020_kwd_v1.pkl
0:01:11.967306
cov_2020: len(df)=44938
Results len(df)=35160 are saved to ../../data/keyword_results/cov_2020_kwd_v1.pkl
0:01:06.797263


# Select notes

In [31]:
# Reload the stored keyword-search results, one frame per dataset.
kwd_2017, kwd_2018, kwd_non_cov_2020, kwd_cov_2020 = (
    pd.read_pickle(path / results_file)
    for results_file in (
        'keyword_results/samp_2017_kwd_v1.pkl',
        'keyword_results/samp_2018_kwd_v1.pkl',
        'keyword_results/samp_non_cov_2020_kwd_v1.pkl',
        'keyword_results/cov_2020_kwd_v1.pkl',
    )
)

In [34]:
# Exclude the COVID 2020 notes that were already annotated.
# NOTE(review): presumably rows are dropped when their (MDN, NotitieID) pair occurs in
# `annotated` — confirm in src.utils.df_funcs. The name `kwd_cov_2020` is rebound here.

kwd_cov_2020 = remove_on_multikeys(kwd_cov_2020, annotated, ['MDN', 'NotitieID'])

In [46]:
# Stack the four result frames; the outer index level records which dataset a row came from.
kwd_frames = [kwd_2017, kwd_2018, kwd_non_cov_2020, kwd_cov_2020]
kwd_sources = ['2017', '2018', 'non_cov_2020', 'cov_2020']

kwd_df = pd.concat(kwd_frames, keys=kwd_sources, names=['source', 'source_idx'])

In [47]:
# Domain codes used as column names in the keyword results.
domains = [
    'ENR', 'ATT', 'STM', 'ADM',
    'INS', 'MBW', 'FAC', 'BER',
]

# Names of the derived per-domain columns: a boolean flag and a match count.
matched_domains = [
    f"matched_{domain}"
    for domain in domains
]
count_domains = [
    f"n_{domain}"
    for domain in domains
]

def op_count(df, domain):
    """Return, per row, the number of entries in the `domain` column.

    `len` is applied to every cell of `df[domain]`, so each cell must be a sized
    container (e.g. a list of keyword matches).
    """
    matches = df[domain]
    return matches.apply(len)

def op_bool(df, domain):
    """Return, per row, whether the `domain` column holds a truthy value.

    `bool` is applied to every cell of `df[domain]`; an empty container therefore
    yields False and a non-empty one True.
    """
    matches = df[domain]
    return matches.apply(bool)

# One column builder per domain, keyed by the name of the column it produces.
ops_count = {
    f"n_{domain}": partial(op_count, domain=domain)
    for domain in domains
}
ops_bool = {
    f"matched_{domain}": partial(op_bool, domain=domain)
    for domain in domains
}

# First add the per-domain count and flag columns, then derive `n_domains` as the
# number of `matched_*` flags that are True in each row.
with_domain_cols = kwd_df.assign(**ops_count, **ops_bool)
kwd_df = with_domain_cols.assign(
    n_domains=with_domain_cols[matched_domains].sum(axis=1)
)

In [49]:
# Number of notes per dataset (columns) and per number of matched domains (rows),
# with row/column totals.
(
    kwd_df
    .reset_index()
    .pivot_table(
        values='NotitieID',
        index=['n_domains'],
        columns=['source'],
        aggfunc='count',
        margins=True,
        margins_name='Totals',
    )
)

source,2017,2018,cov_2020,non_cov_2020,Totals
n_domains,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,12373,12017,10066,11647,46103
2,7672,7695,8810,8202,32379
3,4557,4702,6673,5361,21293
4,2319,2538,4077,3142,12076
5,1095,1361,2151,1460,6067
6,456,572,961,629,2618
7,184,211,458,205,1058
8,49,70,281,49,449
Totals,28705,29166,33477,30695,122043


In [61]:
# COVID notes with the already annotated ones removed (same key columns as elsewhere).
cov_2020_non_annot = remove_on_multikeys(cov_2020, annotated, ['MDN', 'NotitieID'])

# Frames to select from, keyed by the same source labels that `kwd_df` uses.
data = {
    '2017': samp_2017,
    '2018': samp_2018,
    'cov_2020': cov_2020_non_annot,
    'non_cov_2020': samp_non_cov_2020,
}

def select_kwds_and_rndm_from_data(data, kwd_df, n=25, min_domains=6, random_state=19):
    """Select `n` keyword-based and `n` random rows from every source in `data`.

    Parameters
    ----------
    data : dict
        Maps a source label to the DataFrame to select rows from.
    kwd_df : pandas.DataFrame
        Keyword results with a two-level index: level 0 is the source label (the keys
        of `data`), level 1 holds row labels of the corresponding frame in `data`.
        Must contain an `n_domains` column.
    n : int, default 25
        Number of rows per source and per sampling method.
    min_domains : int, default 6
        Minimum `n_domains` for a row to be eligible for the keyword-based selection
        (the default equals the original `n_domains > 5` filter).
    random_state : int or None, default 19
        Seed passed to every `sample` call so that the selection is reproducible;
        19 is the seed used for the other draws in this notebook. Pass None for an
        unseeded draw.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a three-level index
        (`source`, `samp_meth` in {'kwds', 'rndm'}, `source_idx`).

    Raises
    ------
    ValueError
        Raised by pandas when a source has fewer than `n` eligible rows to sample.
    """
    eligible = kwd_df.loc[kwd_df['n_domains'] >= min_domains]
    kwd_samples = eligible.groupby(level=0).sample(n, random_state=random_state)

    all_selected = []
    for source, df in data.items():
        # Keyword-based rows are looked up in the source frame by their row labels.
        selected_kwds = df.loc[kwd_samples.xs(source).index]
        # Random rows are drawn from what is left, so the two groups never overlap.
        remaining = df.loc[~df.index.isin(selected_kwds.index)]
        selected_rndm = remaining.sample(n, random_state=random_state)

        per_source = pd.concat([selected_kwds, selected_rndm], keys=['kwds', 'rndm'])
        # Wrapping in a one-element concat prepends the source label as outer index level.
        all_selected.append(pd.concat([per_source], keys=[source]))

    return pd.concat(all_selected).rename_axis(['source', 'samp_meth', 'source_idx'])

In [63]:
# The stored sample is the one the annotation round was built on, so it must never be
# silently replaced by a new draw: load it when it exists, create it only when missing.
sample_file = path / 'to_inception_conll/sample1.pkl'

if sample_file.exists():
    sample = pd.read_pickle(sample_file)
else:
    sample = select_kwds_and_rndm_from_data(data, kwd_df)
    # Make sure the target folder exists before writing.
    sample_file.parent.mkdir(parents=True, exist_ok=True)
    sample.to_pickle(sample_file)

# Convert to CoNLL

In [8]:
# Reload the stored sample and turn its index levels into regular columns, so that
# rows can be addressed by position below.
sample = pd.read_pickle(path.joinpath('to_inception_conll/sample1.pkl'))
sample = sample.reset_index()

In [47]:
# Output root; every annotator gets a sub-folder.
conllpath = path / 'to_inception_conll/sample1'

annotators = ['edwin', 'sabina', 'carel', 'caroline', 'marike']

nlp = spacy.load('nl_core_news_sm')

for annotator_pos, annotator in enumerate(annotators):

    outdir = conllpath / annotator
    outdir.mkdir(parents=True, exist_ok=True)

    # The sample is treated as 8 consecutive groups of 25 rows. Each annotator takes
    # 5 consecutive rows from every group, shifted by 5 per annotator, so that no row
    # is handed out twice.
    offset = 5 * annotator_pos
    selection = [
        25 * group + offset + i
        for group in range(8)
        for i in range(5)
    ]

    # Write one CoNLL file per selected row into the annotator's folder.
    annotator_rows = sample.iloc[selection]
    annotator_rows.apply(row_to_conllfile, axis=1, nlp=nlp, outdir=outdir, batch='sample1')