In [1]:
import spacy
import pandas as pd
from pathlib import Path
from datetime import datetime

import sys
sys.path.insert(0, '../..')
from src.data_process.keyword_search import get_regex, get_reg_dict, find_keywords
from src.data_process.select_batch_for_annotation import select_notes
from src.data_process.text_to_conll import row_to_conllfile
from src.utils.df_funcs import remove_on_multikeys

# Load data

In [2]:
# Data directory, given as a relative path (resolved against the kernel's working directory).
path = Path('../../data')

In [3]:
# Load the pickled note tables, one 'processed.pkl' per '<year>_raw' folder.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted location.
all_2017 = pd.read_pickle(path / '2017_raw/processed.pkl')
all_2018 = pd.read_pickle(path / '2018_raw/processed.pkl')
all_2020 = pd.read_pickle(path / '2020_raw/processed.pkl')
# Separate 2020 table; the file name suggests notes linked to ICD code U07.1 for Q1-Q3 — TODO confirm.
cov_2020 = pd.read_pickle(path / '2020_raw/ICD_U07.1/notes_[U07.1]_2020_q1_q2_q3.pkl')
# Presumably all_2020 minus the rows that match cov_2020 on (MDN, NotitieID) — verify against remove_on_multikeys.
non_cov_2020 = remove_on_multikeys(all_2020, cov_2020, ['MDN', 'NotitieID'])

In [4]:
# Note IDs listed in 'annotated_notes_ids.csv'; both ID columns are read as strings.
annotated = pd.read_csv(path / 'annotated_notes_ids.csv', dtype={'MDN': str, 'NotitieID': str})

# Earlier batch files (presumably notes already handed out for annotation — confirm).
# To exclude another batch, add its pickle to this list.
# pd.concat is used because DataFrame.append / Series.append were removed in pandas 2.0.
in_annotation = pd.concat([
    pd.read_pickle(path / 'to_inception_conll/week_22.pkl'),
    pd.read_pickle(path / 'to_inception_conll/week_22-26.pkl'),
])

# All NotitieID values that must not be selected again.
exclude = pd.concat([annotated.NotitieID, in_annotation.NotitieID])

In [5]:
# Keyword table; its 'keyword' and 'regex_template_id' columns are read later in this notebook to build regexes.
keywords = pd.read_excel('../../keywords/keywords_v4.xlsx')

# Exclude annotated notes and sample

In [6]:
# Map each source label to its notes table; later cells overwrite the values of this dict in place.
data = {'2017': all_2017, '2018': all_2018, 'cov_2020': cov_2020, 'non_cov_2020': non_cov_2020}

def exclude_annotated_and_sample(df, annotated, n_sample=50000, random_state=45):
    """Drop notes whose ID is in `annotated`, then downsample if the rest is too large.

    Parameters
    ----------
    df : pandas.DataFrame
        Notes table with a `NotitieID` column. It is not modified.
    annotated : list-like
        `NotitieID` values to remove.
    n_sample : int, default 50000
        Maximum number of rows to return; sampling only happens when more
        rows than this remain after the exclusion.
    random_state : int, default 45
        Seed passed to `DataFrame.sample`.

    Returns
    -------
    pandas.DataFrame
        The remaining (and possibly sampled) rows.

    The row count is printed after each step.
    """
    print(f"Before exclusion: len(df)={len(df)}")

    not_yet_annotated = ~df.NotitieID.isin(annotated)
    remaining = df.loc[not_yet_annotated].copy()
    print(f"After exclusion: len(df)={len(remaining)}")

    if len(remaining) > n_sample:
        result = remaining.sample(n_sample, random_state=random_state)
    else:
        result = remaining
    print(f"After sampling: len(df)={len(result)}")

    return result

def exclude_annotated_and_select_type(df, annotated, note_types):
    """Drop notes whose ID is in `annotated`, then keep only the given note types.

    Parameters
    ----------
    df : pandas.DataFrame
        Notes table with `NotitieID` and `Typenotitie` columns. It is not modified.
    annotated : list-like
        `NotitieID` values to remove.
    note_types : str or list-like
        Value(s) of `Typenotitie` to keep. A single string is treated as a
        one-element list.

    Returns
    -------
    pandas.DataFrame
        Rows that are not excluded and whose `Typenotitie` is in `note_types`.

    The row count is printed after each step.
    """
    # A bare string is not list-like for `isin`; wrap it so a single type can be passed directly.
    if isinstance(note_types, str):
        note_types = [note_types]

    print(f"Before exclusion: {len(df)=}")
    df = df.loc[~df.NotitieID.isin(annotated)].copy()
    print(f"After exclusion: {len(df)=}")
    # `isin` instead of interpolating the values into a query string: the interpolated form
    # breaks for a single string (parsed as a column name) and for values containing quotes.
    df = df.loc[df.Typenotitie.isin(list(note_types))]
    print(f"After type selection: {len(df)=}")
    return df

# Replace every source's table by its filtered (and possibly downsampled) version.
# Iterating over a snapshot of the items keeps the in-place reassignment explicit.
for source, df in tuple(data.items()):
    print(f"{source}:")
    data[source] = exclude_annotated_and_sample(df, annotated=exclude)

2017:
Before exclusion: len(df)=4244705
After exclusion: len(df)=4239723
After sampling: len(df)=50000
2018:
Before exclusion: len(df)=2451973
After exclusion: len(df)=2450887
After sampling: len(df)=50000
cov_2020:
Before exclusion: len(df)=44938
After exclusion: len(df)=40960
After sampling: len(df)=40960
non_cov_2020:
Before exclusion: len(df)=2603090
After exclusion: len(df)=2601753
After sampling: len(df)=50000


# Keyword search

In [7]:
def _regex_for_row(row):
    """Build the regex for one keyword row from its keyword and template id."""
    return get_regex(row.keyword, row.regex_template_id)


keywords['regex'] = keywords.apply(_regex_for_row, axis=1)
reg_dict = get_reg_dict(keywords)

# Run the keyword search per source and print how long each one took.
for source, df in data.items():
    print(f"{source}: len(df)={len(df)}")
    start_time = datetime.now()
    data[source] = find_keywords(df, reg_dict)
    elapsed = datetime.now() - start_time
    print(elapsed)

2017: len(df)=50000
0:01:18.640919
2018: len(df)=50000
0:01:23.759306
cov_2020: len(df)=40960
0:01:11.170389
non_cov_2020: len(df)=50000
0:01:29.985735


# Select notes

In [8]:
# Annotators that receive files in this batch.
annotators = ['opsomer']
# Domain codes handed to select_notes; presumably the domains a note's keyword matches are checked against — TODO confirm.
matched_domains = ['ENR', 'ATT', 'STM', 'INS', 'MBW', 'FAC', 'BER', 'ETN']

# Select the notes for this batch.
# NOTE(review): the parameter meanings noted below are inferred from their names and from the
# pivot table output in this notebook (240 rows in total, 96 of them from cov_2020);
# confirm against select_notes before relying on them.
week_27_30 = select_notes(
    data,
    annotators=annotators,
    n_files=240,  # presumably the number of notes per annotator
    pct_covid=0.4,  # presumably the share taken from the cov_2020 source
    pct_kwd=0.8,  # presumably the share picked via keyword matches rather than at random
    matched_domains=matched_domains,
    min_matched_domains=3,  # presumably the minimum number of distinct matched domains per keyword-selected note
    n_iaa=0,  # presumably no inter-annotator-agreement notes in this batch
    iaa_sources=[],
)

In [9]:
# Count the selected notes per annotator, broken down by source and sampling method,
# with a 'Total' row and column.
pd.pivot_table(
    week_27_30,
    values='NotitieID',
    index=['annotator'],
    columns=['source', 'samp_meth'],
    aggfunc='count',
    margins=True,
    margins_name='Total',
)

source,2017,2017,2018,2018,cov_2020,cov_2020,non_cov_2020,non_cov_2020,Total
samp_meth,kwd,rndm,kwd,rndm,kwd,rndm,kwd,rndm,Unnamed: 9_level_1
annotator,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
opsomer,38,10,38,10,76,20,38,10,240
Total,38,10,38,10,76,20,38,10,240


In [10]:
# Store the selected batch next to the earlier batch pickles in 'to_inception_conll'.
week_27_30.to_pickle(path / 'to_inception_conll/week_27_30.pkl')

# Convert to CoNLL

In [11]:
annotators = ['opsomer']
conllpath = path / 'to_inception_conll'

# spaCy pipeline that is passed to row_to_conllfile for every note.
nlp = spacy.load('nl_core_news_sm')

# Write each annotator's notes into that annotator's own output folder.
for annotator in annotators:
    outdir = conllpath / 'week_27_30' / annotator
    outdir.mkdir(parents=True, exist_ok=True)

    assigned_to_annotator = week_27_30.annotator == annotator
    df = week_27_30.loc[assigned_to_annotator]
    # apply is used for its side effect only: row_to_conllfile is called once per row.
    df.apply(row_to_conllfile, axis=1, nlp=nlp, outdir=outdir, batch='week_27_30')