In [1]:
import spacy
import pandas as pd
from pathlib import Path
from datetime import datetime

import sys
sys.path.insert(0, '../..')
from src.data_process.keyword_search import get_regex, get_reg_dict, find_keywords
from src.data_process.select_batch_for_annotation import select_notes
from src.data_process.text_to_conll import row_to_conllfile
from src.utils.df_funcs import remove_on_multikeys

# Load data

In [2]:
# Root of the data directory, given relative to the notebook's working directory.
# Every pickle/CSV read or written below is resolved against this path.
path = Path('../../data')

In [3]:
# Processed note dumps, one pickle per raw-data year.
all_2017, all_2018, all_2020 = (
    pd.read_pickle(path / processed_pkl)
    for processed_pkl in (
        '2017_raw/processed.pkl',
        '2018_raw/processed.pkl',
        '2020_raw/processed.pkl',
    )
)

# Notes stored under the 2020 ICD U07.1 selection.
cov_2020 = pd.read_pickle(path / '2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl')

# Presumably the 2020 notes that are NOT in cov_2020, matched on (MDN, NotitieID)
# -- confirm against src.utils.df_funcs.remove_on_multikeys.
non_cov_2020 = remove_on_multikeys(
    all_2020,
    cov_2020,
    ['MDN', 'NotitieID'],
)

In [42]:
# IDs of notes that were already annotated; both ID columns are read as strings.
annotated = pd.read_csv(path / 'annotated_notes_ids.csv', dtype={'MDN': str, 'NotitieID': str})

# Batches of weeks 18 and 19 stored under to_inception_conll (going by the
# variable name, these are currently in annotation -- confirm).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps the original index labels.
in_annotation = pd.concat([
    pd.read_pickle(path / 'to_inception_conll/week_18.pkl'),
    pd.read_pickle(path / 'to_inception_conll/week_19.pkl'),
])

# All NotitieIDs that must not be selected again (Series.append -> pd.concat).
exclude = pd.concat([annotated.NotitieID, in_annotation.NotitieID])

In [43]:
# Keyword table (version 3). Its `keyword` and `regex_template_id` columns are
# read in the keyword-search cell to build one regex per row.
keywords = pd.read_excel('../../keywords/keywords_v3.xlsx')

# Exclude annotated and select note type

In [44]:
# Note frames keyed by source label; later cells replace the values in place
# with filtered / keyword-annotated versions.
data = {
    '2017': all_2017,
    '2018': all_2018,
    'cov_2020': cov_2020,
    'non_cov_2020': non_cov_2020,
}

def exclude_annotated_and_sample(df, annotated, n_sample=50000, random_state=20):
    """Drop already-annotated notes and cap the frame at ``n_sample`` rows.

    Parameters
    ----------
    df : DataFrame with a ``NotitieID`` column.
    annotated : collection of NotitieID values to exclude.
    n_sample : maximum number of rows to keep; larger frames are randomly
        down-sampled to exactly this size.
    random_state : seed passed to ``DataFrame.sample``.

    Returns
    -------
    DataFrame
        The remaining (possibly down-sampled) notes. Row counts are printed
        before exclusion, after exclusion and after sampling.
    """
    print(f"Before exclusion: {len(df)=}")

    already_annotated = df.NotitieID.isin(annotated)
    df = df.loc[~already_annotated].copy()
    print(f"After exclusion: {len(df)=}")

    # Only sample when there is more than the requested amount; otherwise keep all rows.
    too_many = len(df) > n_sample
    df = df.sample(n_sample, random_state=random_state) if too_many else df
    print(f"After sampling: {len(df)=}")

    return df

def exclude_annotated_and_select_type(df, annotated, note_types):
    """Drop already-annotated notes and keep only the requested note types.

    Parameters
    ----------
    df : DataFrame with ``NotitieID`` and ``Typenotitie`` columns.
    annotated : collection of NotitieID values to exclude.
    note_types : note type(s) to keep. Any iterable of values (list, tuple,
        set, Series, ...) or a single string.

    Returns
    -------
    DataFrame
        Rows whose NotitieID is not in ``annotated`` and whose Typenotitie is
        one of ``note_types``. Row counts are printed before exclusion, after
        exclusion and after type selection.
    """
    # A bare string is treated as a single note type; `isin` would reject it.
    if isinstance(note_types, str):
        note_types = [note_types]

    print(f"Before exclusion: {len(df)=}")
    df = df.loc[~df.NotitieID.isin(annotated)].copy()
    print(f"After exclusion: {len(df)=}")

    # Use a boolean mask rather than interpolating repr(note_types) into a query
    # string: that only produced a valid expression for plain lists.
    df = df.loc[df.Typenotitie.isin(list(note_types))]
    print(f"After type selection: {len(df)=}")
    return df

# Replace every source frame by its not-yet-annotated notes of the selected type.
# Iterating over a snapshot makes the in-loop reassignment of data[source] explicit.
for source, df in list(data.items()):
    print(f"{source}:")
    data[source] = exclude_annotated_and_select_type(
        df,
        exclude,
        note_types=['Consulten (niet-arts)'],
    )

2017:
Before exclusion: len(df)=4244705
After exclusion: len(df)=4240551
After type selection: len(df)=65700
2018:
Before exclusion: len(df)=2451973
After exclusion: len(df)=2451643
After type selection: len(df)=22853
cov_2020:
Before exclusion: len(df)=44938
After exclusion: len(df)=42069
After type selection: len(df)=916
non_cov_2020:
Before exclusion: len(df)=2603090
After exclusion: len(df)=2602676
After type selection: len(df)=33490


# Keyword search

In [45]:
# One regex per keyword row, built from the keyword and its template id.
keywords['regex'] = keywords.apply(
    lambda kw_row: get_regex(kw_row.keyword, kw_row.regex_template_id),
    axis=1,
)
reg_dict = get_reg_dict(keywords)

# Run the keyword search per source and report how long each one took.
for source, df in list(data.items()):
    print(f"{source}: {len(df)=}")
    started = datetime.now()
    data[source] = find_keywords(df, reg_dict)
    elapsed = datetime.now() - started
    print(elapsed)

2017: len(df)=65700
0:01:17.873327
2018: len(df)=22853
0:00:26.020856
cov_2020: len(df)=916
0:00:01.052641
non_cov_2020: len(df)=33490
0:00:45.982318


# Select notes

In [47]:
# Week 20 is the only batch in this notebook that includes 'ze_edwin'.
annotators = [
    'avelli',
    'katsburg',
    'meskers',
    'opsomer',
    'swartjes',
    'vervaart',
    'ze_edwin',
]

week_20 = select_notes(
    data,
    annotators=annotators,
    min_matched_domains=1,
)

In [48]:
# Number of selected notes per annotator, split by source and sampling method.
week_20.pivot_table(
    values='NotitieID',
    index=['annotator'],
    columns=['source', 'samp_meth'],
    aggfunc='count',
    margins_name='Total',
    margins=True,
)

source,2017,2017,2018,2018,cov_2020,cov_2020,cov_2020,non_cov_2020,non_cov_2020,Total
samp_meth,kwd,rndm,kwd,rndm,kwd,kwd_iaa,rndm,kwd,rndm,Unnamed: 10_level_1
annotator,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
avelli,9,5,9,5,10,3,5,9,5,60
katsburg,9,5,9,5,10,3,5,9,5,60
meskers,9,5,9,5,10,3,5,9,5,60
opsomer,9,5,9,5,10,3,5,9,5,60
swartjes,9,5,9,5,10,3,5,9,5,60
vervaart,9,5,9,5,10,3,5,9,5,60
ze_edwin,9,5,9,5,10,3,5,9,5,60
Total,63,35,63,35,70,21,35,63,35,420


In [49]:
# Remove the week 20 sample from every source so week 21 cannot draw the same notes.
data = {
    source: df.loc[~df.NotitieID.isin(week_20.NotitieID)]
    for source, df in data.items()
}

annotators = ['avelli', 'katsburg', 'meskers', 'opsomer', 'swartjes', 'vervaart']
week_21 = select_notes(
    data,
    annotators=annotators,
    min_matched_domains=1,
    iaa_sources=['2017'],
)

In [50]:
# Number of selected notes per annotator, split by source and sampling method.
week_21.pivot_table(
    values='NotitieID',
    index=['annotator'],
    columns=['source', 'samp_meth'],
    aggfunc='count',
    margins_name='Total',
    margins=True,
)

source,2017,2017,2017,2018,2018,cov_2020,cov_2020,non_cov_2020,non_cov_2020,Total
samp_meth,kwd,kwd_iaa,rndm,kwd,rndm,kwd,rndm,kwd,rndm,Unnamed: 10_level_1
annotator,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
avelli,7,3,4,9,5,12,6,9,5,60
katsburg,7,3,4,9,5,12,6,9,5,60
meskers,7,3,4,9,5,12,6,9,5,60
opsomer,7,3,4,9,5,12,6,9,5,60
swartjes,7,3,4,9,5,12,6,9,5,60
vervaart,7,3,4,9,5,12,6,9,5,60
Total,42,18,24,54,30,72,36,54,30,360


In [51]:
# Remove the week 20 and week 21 samples from every source before drawing week 22.
# "not in week 20 and not in week 21" is written as "not (in week 20 or in week 21)".
data = {
    source: df.loc[
        ~(df.NotitieID.isin(week_20.NotitieID) | df.NotitieID.isin(week_21.NotitieID))
    ]
    for source, df in data.items()
}

annotators = ['avelli', 'katsburg', 'meskers', 'opsomer', 'swartjes', 'vervaart']
week_22 = select_notes(
    data,
    annotators=annotators,
    min_matched_domains=1,
    iaa_sources=['2018'],
)

In [52]:
# Number of selected notes per annotator, split by source and sampling method.
week_22.pivot_table(
    values='NotitieID',
    index=['annotator'],
    columns=['source', 'samp_meth'],
    aggfunc='count',
    margins_name='Total',
    margins=True,
)

source,2017,2017,2018,2018,2018,cov_2020,cov_2020,non_cov_2020,non_cov_2020,Total
samp_meth,kwd,rndm,kwd,kwd_iaa,rndm,kwd,rndm,kwd,rndm,Unnamed: 10_level_1
annotator,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
avelli,9,5,7,3,4,12,6,9,5,60
katsburg,9,5,7,3,4,12,6,9,5,60
meskers,9,5,7,3,4,12,6,9,5,60
opsomer,9,5,7,3,4,12,6,9,5,60
swartjes,9,5,7,3,4,12,6,9,5,60
vervaart,9,5,7,3,4,12,6,9,5,60
Total,54,30,42,18,24,72,36,54,30,360


In [53]:
# Persist the three selected batches next to the earlier weekly pickles.
for batch_notes, target in (
    (week_20, 'to_inception_conll/week_20.pkl'),
    (week_21, 'to_inception_conll/week_21.pkl'),
    (week_22, 'to_inception_conll/week_22.pkl'),
):
    batch_notes.to_pickle(path / target)

# Convert to CoNLL

In [54]:
# Regular annotators receive weeks 20-22; 'ze_edwin' only receives the week 20
# selection, written out under the batch name 'ze_batch3'.
annotators = ['avelli', 'katsburg', 'meskers', 'opsomer', 'swartjes', 'vervaart']
ze = ['ze_edwin']

conllpath = path / 'to_inception_conll'

nlp = spacy.load('nl_core_news_sm')

# (batch name, selected notes) pairs; the batch name is both the output
# sub-directory and the `batch` argument passed to row_to_conllfile.
weekly_batches = [
    ('week_20', week_20),
    ('week_21', week_21),
    ('week_22', week_22),
]

for annotator in annotators:
    for batch, batch_notes in weekly_batches:
        # One directory per batch and annotator: <conllpath>/<batch>/<annotator>
        outdir = conllpath / batch / annotator
        outdir.mkdir(exist_ok=True, parents=True)

        # row_to_conllfile is called once per note row; its return value is not used.
        df = batch_notes.query("annotator == @annotator")
        df.apply(row_to_conllfile, axis=1, nlp=nlp, outdir=outdir, batch=batch)

for annotator in ze:

    outdir = conllpath / 'ze_batch3' / annotator
    outdir.mkdir(exist_ok=True, parents=True)

    df = week_20.query("annotator == @annotator")
    df.apply(row_to_conllfile, axis=1, nlp=nlp, outdir=outdir, batch='ze_batch3')