In [1]:
import spacy
import pandas as pd
from pathlib import Path
from datetime import datetime

import sys
sys.path.insert(0, '../..')
from src.data_process.keyword_search import get_regex, get_reg_dict, find_keywords
from src.data_process.select_batch_for_annotation import select_notes
from src.data_process.text_to_conll import row_to_conllfile
from src.utils.df_funcs import remove_on_multikeys

# Load data

In [3]:
# Root data directory, given relative to the notebook's working directory.
path = Path('../../data')

In [2]:
# Load the processed notes per year.
# NOTE(review): unpickling can execute arbitrary code, so these files must come from a trusted source.
all_2017 = pd.read_pickle(path / '2017_raw/processed.pkl')
all_2018 = pd.read_pickle(path / '2018_raw/processed.pkl')
all_2020 = pd.read_pickle(path / '2020_raw/processed.pkl')
# Notes from the 2020 ICD U07.1 selection (q1-q3 according to the file name).
cov_2020 = pd.read_pickle(path / '2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl')
# Presumably keeps the 2020 notes whose (MDN, NotitieID) pair does not occur in cov_2020
# -- TODO confirm against remove_on_multikeys.
non_cov_2020 = remove_on_multikeys(all_2020, cov_2020, ['MDN', 'NotitieID'])

In [4]:
# IDs from annotated_notes_ids.csv; MDN and NotitieID are forced to str so they are not parsed as numbers.
annotated = pd.read_csv(path / 'annotated_notes_ids.csv', dtype={'MDN': str, 'NotitieID': str})

In [5]:
# Keyword sheet; the keyword search further down reads its `keyword` and `regex_template_id` columns.
keywords = pd.read_excel('../../keywords/keywords_v2.xlsx')

# Exclude annotated notes and sample

In [10]:
# One frame per source; the keys are reused as source labels in the printed logs.
data = {'2017': all_2017, '2018': all_2018, 'cov_2020': cov_2020, 'non_cov_2020': non_cov_2020}

def exclude_annotated_and_sample(df, annotated, n_sample=50000, random_state=20):
    """Drop the notes listed in `annotated` and cap the remainder at `n_sample` rows.

    The number of rows before and after the exclusion is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Notes to filter; must have a `NotitieID` column. The input is not modified.
    annotated : pd.DataFrame
        Frame whose `NotitieID` column lists the notes to exclude.
    n_sample : int
        Maximum number of rows to return; a random sample of this size is
        drawn only when more rows than this remain after the exclusion.
    random_state : int
        Seed passed to `DataFrame.sample`, which makes the sample reproducible.

    Returns
    -------
    pd.DataFrame
        A new frame holding the remaining (and possibly sampled) notes.
    """
    # NOTE(review): matching is done on NotitieID alone -- confirm that this
    # ID is unique across patients (MDN).
    print(f"Before exclusion: {len(df)=}")
    already_annotated = df.NotitieID.isin(annotated.NotitieID)
    df = df.loc[~already_annotated].copy()
    print(f"After exclusion: {len(df)=}")
    if len(df) <= n_sample:
        return df
    return df.sample(n_sample, random_state=random_state)

# Replace each source's frame by its filtered (and, if needed, down-sampled) version.
# Only existing keys are reassigned, so the dict does not change size while iterating.
for source, df in data.items():
    print(f"{source}:")
    data[source] = exclude_annotated_and_sample(df, annotated)

2017:
Before exclusion: len(df)=4244705
After exclusion: len(df)=4240944
2018:
Before exclusion: len(df)=2451973
After exclusion: len(df)=2451973
cov_2020:
Before exclusion: len(df)=44938
After exclusion: len(df)=43182
non_cov_2020:
Before exclusion: len(df)=2603090
After exclusion: len(df)=2603090


# Keyword search

In [13]:
def _row_to_regex(row):
    """Call get_regex with the keyword and regex template id of one keyword row."""
    return get_regex(row.keyword, row.regex_template_id)


keywords['regex'] = keywords.apply(_row_to_regex, axis=1)
reg_dict = get_reg_dict(keywords)

# Run the keyword search per source and print how long each search took.
for source in list(data):
    df = data[source]
    print(f"{source}: {len(df)=}")
    start_time = datetime.now()
    data[source] = find_keywords(df, reg_dict)
    print(datetime.now() - start_time)

2017: len(df)=50000
0:01:11.106147
2018: len(df)=50000
0:01:16.138884
cov_2020: len(df)=43182
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[k] = df.all_text.str.findall(v, flags=re.IGNORECASE)
0:01:11.273365
non_cov_2020: len(df)=50000
0:01:20.571392


# Select notes

In [21]:
# Draw the week 14 batch using select_notes' default settings.
week_14 = select_notes(data)

In [23]:
# Count the selected notes per annotator (rows) and per source / sampling method (columns),
# with row and column totals labelled 'Total'.
week_14.pivot_table(
    index=['annotator'],
    columns=['source', 'samp_meth'],
    values='NotitieID',
    aggfunc='count',
    margins=True,
    margins_name='Total',
)

source,2017,2017,2018,2018,cov_2020,cov_2020,cov_2020,non_cov_2020,non_cov_2020,Total
samp_meth,kwd,rndm,kwd,rndm,kwd,kwd_iaa,rndm,kwd,rndm,Unnamed: 10_level_1
annotator,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
avelli,8,2,8,2,20,5,5,8,2,60
katsburg,8,2,8,2,20,5,5,8,2,60
meskers,8,2,8,2,20,5,5,8,2,60
opsomer,8,2,8,2,20,5,5,8,2,60
swartjes,8,2,8,2,20,5,5,8,2,60
vervaart,8,2,8,2,20,5,5,8,2,60
ze_edith,8,2,8,2,20,5,5,8,2,60
ze_hinke,8,2,8,2,20,5,5,8,2,60
ze_ron,8,2,8,2,20,5,5,8,2,60
Total,72,18,72,18,180,45,45,72,18,540


In [25]:
# Exclude the week 14 sample from every source before selecting notes for week 15,
# so that a note selected for week 14 cannot be selected again.
data = {source:df.query("NotitieID not in @week_14.NotitieID") for source, df in data.items()}
# For week 15 the iaa_sources argument is set to the 2018 source.
week_15 = select_notes(data, iaa_sources=['2018'])

In [26]:
# Count the selected notes per annotator (rows) and per source / sampling method (columns),
# with row and column totals labelled 'Total'.
week_15.pivot_table(
    index=['annotator'],
    columns=['source', 'samp_meth'],
    values='NotitieID',
    aggfunc='count',
    margins=True,
    margins_name='Total',
)

source,2017,2017,2018,2018,2018,cov_2020,cov_2020,non_cov_2020,non_cov_2020,Total
samp_meth,kwd,rndm,kwd,kwd_iaa,rndm,kwd,rndm,kwd,rndm,Unnamed: 10_level_1
annotator,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
avelli,8,2,4,5,1,24,6,8,2,60
katsburg,8,2,4,5,1,24,6,8,2,60
meskers,8,2,4,5,1,24,6,8,2,60
opsomer,8,2,4,5,1,24,6,8,2,60
swartjes,8,2,4,5,1,24,6,8,2,60
vervaart,8,2,4,5,1,24,6,8,2,60
ze_edith,8,2,4,5,1,24,6,8,2,60
ze_hinke,8,2,4,5,1,24,6,8,2,60
ze_ron,8,2,4,5,1,24,6,8,2,60
Total,72,18,36,45,9,216,54,72,18,540


In [27]:
# Persist both selections as pickles in the to_inception_conll folder.
week_14.to_pickle(path / 'to_inception_conll/week_14.pkl')
week_15.to_pickle(path / 'to_inception_conll/week_15.pkl')

# Convert to CoNLL

In [4]:
# Reload the pickled selections from the to_inception_conll folder (requires `path` to be defined).
week_14 = pd.read_pickle(path / 'to_inception_conll/week_14.pkl')
week_15 = pd.read_pickle(path / 'to_inception_conll/week_15.pkl')

In [12]:
annotators = ['avelli', 'katsburg', 'meskers', 'opsomer', 'swartjes', 'vervaart']
ze = ['ze_edith', 'ze_hinke', 'ze_ron']

conllpath = path / 'to_inception_conll'

nlp = spacy.load('nl_core_news_sm')

# Batch label -> selection. For the regular annotators the label is used both
# as the output folder name and as the `batch` argument of row_to_conllfile.
weekly_batches = {'week_14': week_14, 'week_15': week_15}


def _rows_to_conll(notes, annotator, outdir, batch):
    """Apply row_to_conllfile to every row of `notes` assigned to `annotator`.

    `outdir` is created first (including parents) if it does not exist yet;
    `outdir` and `batch` are passed through to row_to_conllfile unchanged.
    """
    outdir.mkdir(exist_ok=True, parents=True)
    # apply is used only for its side effects here; its return value is discarded.
    notes.query("annotator == @annotator").apply(
        row_to_conllfile, axis=1, nlp=nlp, outdir=outdir, batch=batch,
    )


# Regular annotators: one folder per week and annotator.
for annotator in annotators:
    for batch, notes in weekly_batches.items():
        _rows_to_conll(notes, annotator, conllpath / batch / annotator, batch)

# 'ze' annotators: the notes of both weeks go into a single 'ze_batch1' folder per annotator.
for annotator in ze:
    for notes in weekly_batches.values():
        _rows_to_conll(notes, annotator, conllpath / 'ze_batch1' / annotator, 'ze_batch1')