In [25]:
import json
import re
import pandas as pd
from pathlib import Path

# Annotated data

**TODO:** check that all 2020 data has a COVID diagnosis.

COVID / non-COVID / total

- number of notes (total, per institution)
- number of disregarded notes
- mean/min/max number of sentences per note
- mean/min/max number of notes/sentences per annotator
- number of sentences (total)
- number of sentences by domain label
- number of sentences by domain-level label

## Parse labels

In [2]:
# Read the tag definitions (the 'tags' entry) from the legacy tagset file.
# The path is relative, so it is resolved against the kernel's working directory.
with open('../../tagsets/legacy_stella.json', 'r') as f:
    tagset = json.load(f)['tags']
# Display the number of tags as the cell output.
len(tagset)

35

In [36]:
def categorize_tags(tagset):
    """
    Sort the tag names of a tagset into the categories used for label parsing.

    Only the part of each 'tag_name' before the first ':' is used. Names are
    categorized by matching at the start of the name:
    - domains: names starting with '.'
    - levels: names starting with three capital letters, a space and a digit
    - disregard: names starting with 'disregard_file'
    - other: names that are in none of the three lists above

    Parameters
    ----------
    tagset: list of dict
        tag definitions; every item needs a 'tag_name' string

    Returns
    -------
    dict
        keys 'domains', 'levels', 'disregard' and 'other'; every value is a
        list of tag names in the order in which they appear in `tagset`
    """
    tag_names = [tag['tag_name'].split(':')[0] for tag in tagset]
    # define regexes
    # (raw strings: '\.' and '\d' are invalid escape sequences in a normal string literal)
    rdomain = re.compile(r'\..*')
    rlevel = re.compile(r'[A-Z]{3} \d')
    rdisregard = re.compile(r'disregard_file')
    # find categories
    domains = [tag for tag in tag_names if rdomain.match(tag)]
    levels = [tag for tag in tag_names if rlevel.match(tag)]
    disregard = [tag for tag in tag_names if rdisregard.match(tag)]
    # build the lookup once instead of concatenating the three lists for every tag
    categorized = set(domains + levels + disregard)
    other = [tag for tag in tag_names if tag not in categorized]
    return dict(
        domains=domains,
        levels=levels,
        disregard=disregard,
        other=other,
    )

def create_parse_index(conversions):
    """
    Build the index of the parsed label columns.

    Every value of `conversions` contributes two consecutive entries: the value
    itself and the value followed by '_lvl'. The entries 'disregard' and 'other'
    close the index.

    Parameters
    ----------
    conversions: dict
        mapping whose values are used as column names

    Returns
    -------
    pd.Index
    """
    columns = []
    for name in conversions.values():
        columns += [name, f"{name}_lvl"]
    columns += ['disregard', 'other']
    return pd.Index(columns)

def parse_label(label, parse_index, cols_to_lbl, reg_others):
    """
    Parse one label string into a Series indexed by `parse_index`.

    For every entry of `parse_index`:
    - if the entry is a key of `cols_to_lbl`: True/False, whether the mapped
      text occurs in `label` (substring test)
    - else, if the entry contains '_lvl': the digit that follows the first three
      characters of the entry and a space in `label`, as int; the value stays
      NaN if `label` has no such match
    - otherwise: the list of all matches of `reg_others` in `label`

    Parameters
    ----------
    label: str
        the label text to parse
    parse_index: pd.Index
        index of the returned Series
    cols_to_lbl: dict
        maps entries of `parse_index` to the text to look for in `label`
    reg_others: compiled regex
        pattern whose matches are collected for the remaining entries

    Returns
    -------
    pd.Series
        dtype object
    """
    # Explicit object dtype: the values are a mix of bool, int and list, and the
    # default dtype of a Series created without data differs between pandas versions.
    s = pd.Series(index=parse_index, dtype=object)
    for idx in s.index:
        if idx in cols_to_lbl:
            s[idx] = cols_to_lbl[idx] in label
        elif '_lvl' in idx:
            # raw f-string: '\d' is an invalid escape sequence in a normal string literal
            level = re.search(rf"{idx[:3]} (\d)", label)
            if level:
                s[idx] = int(level.group(1))
        else:
            s[idx] = reg_others.findall(label)
    return s

def parse_df(df, tagset):
    """
    Add the parsed label columns to a dataframe of annotations.

    Rows whose `label` is missing or equal to '_' are not parsed; they get NaN
    in all parsed columns.

    Parameters
    ----------
    df: pd.DataFrame
        dataframe with a `label` column
    tagset: list of dict
        tag definitions, passed to `categorize_tags`

    Returns
    -------
    pd.DataFrame
        `df` joined (on the index) with one column per entry of the parse index
    """
    cat_tags = categorize_tags(tagset)
    conversions = {
        '.B152': 'STM',
        '.B455': 'INS',
        '.D450': 'FAC',
        '.D840-859': 'BER',
    }
    parse_index = create_parse_index(conversions)
    cols_to_lbl = {col: code for code, col in conversions.items()}
    cols_to_lbl['disregard'] = 'disregard\\_file'

    # An empty alternation compiles to '' and matches at every position of every
    # label; '(?!)' never matches, so 'other' is an empty list if there are no such tags.
    # NOTE(review): the tag names are used as regex patterns without escaping - confirm
    # that none contains regex metacharacters and that they are spelled as in `df.label`.
    other_tags = cat_tags['other']
    reg_others = re.compile('|'.join(other_tags) if other_tags else r'(?!)')

    select_labels = (df.label != '_') & df.label.notna()
    labelled = df.loc[select_labels]
    if labelled.empty:
        # with no labelled rows `apply` has nothing to expand into the parsed
        # columns, so create them empty
        parsed = pd.DataFrame(index=labelled.index, columns=parse_index, dtype=object)
    else:
        parsed = labelled.apply(
            lambda row: parse_label(row.label, parse_index, cols_to_lbl, reg_others),
            result_type='expand',
            axis=1,
        )
    return df.join(parsed)

In [5]:
def preprocessing(df):
    """
    Return a copy of `df` with the columns `sen_id` and `tok` added.

    `sen_tok` is split on '-': `sen_id` is `NotitieID` (as string), an
    underscore and the first part of the split; `tok` is the second part.
    (Presumably `sen_tok` has the form '<sentence>-<token>' - TODO confirm.)

    Parameters
    ----------
    df: pd.DataFrame
        dataframe with the columns `NotitieID` and `sen_tok`

    Returns
    -------
    pd.DataFrame
    """
    parts = df.sen_tok.str.split('-')
    sentence_ids = df.NotitieID.astype(str) + '_' + parts.str[0]
    return df.assign(sen_id=sentence_ids, tok=parts.str[1])

## Non-COVID (2017)

In [44]:
# Directory holding the pickled non-COVID dataframes (relative path, resolved
# against the kernel's working directory).
noncovpath = Path('../../../Non_covid_data_15oct/from_inception_tsv')

In [32]:
# One-off step, kept commented out: preprocess and parse the annotated dataframe
# and save the result as '..._parsed.pkl'.
# noncov = pd.read_pickle(noncovpath / 'annotated_df_Batch1_pilot.pkl').pipe(preprocessing).pipe(parse_df, tagset)
# noncov.to_pickle(noncovpath / 'annotated_df_Batch1_pilot_parsed.pkl')

In [None]:
# Load the parsed non-COVID dataframe from its pickle.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
noncov = pd.read_pickle(noncovpath / 'annotated_df_Batch1_pilot_parsed.pkl')

In [45]:
# (number of rows, number of columns) of the parsed non-COVID dataframe
noncov.shape

(1589820, 24)

## COVID (2020)

In [41]:
# Directory holding the pickled COVID dataframes (relative path, resolved
# against the kernel's working directory).
covpath = Path('../../../Covid_data_11nov/from_inception_tsv')

In [40]:
# One-off step, kept commented out: preprocess and parse the annotated dataframe
# and save the result as '..._parsed.pkl'.
# cov = pd.read_pickle(covpath / 'annotated_df_CovidBatch_pilot.pkl').pipe(preprocessing).pipe(parse_df, tagset)
# cov.to_pickle(covpath / 'annotated_df_CovidBatch_pilot_parsed.pkl')

In [42]:
# Load the parsed COVID dataframe from its pickle.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
cov = pd.read_pickle(covpath / 'annotated_df_CovidBatch_pilot_parsed.pkl')

In [43]:
# (number of rows, number of columns) of the parsed COVID dataframe
cov.shape

(589329, 24)