In [1]:
import os
import shutil
import sys
import pandas as pd

In [2]:
# Directory the original train.csv / dev.csv are read from.
input_dir = '../data/N2C2-Track3-May3/'
# Directory the pseudonymized copies of those CSVs are written to.
output_dir = '../data/N2C2-Track3-May3_pseudo/'

### Load original dataset

In [3]:
# Read both splits from input_dir; low_memory=False makes pandas parse each
# file in a single pass instead of chunk by chunk.
df_train, df_dev = (
    pd.read_csv(os.path.join(input_dir, f'{split}.csv'), low_memory=False)
    for split in ('train', 'dev')
)

In [4]:
# Report, per split, how many examples contain anonymized fields and how many
# such fields there are in total. A field is delimited by '[**' and '**]';
# taking the max of the two marker counts also counts fields where one of the
# markers is missing.
for split_name, frame in (('train', df_train), ('dev', df_dev)):
    fields_per_example = [
        max(assessment.count('[**'), assessment.count('**]'))
        + max(plan.count('[**'), plan.count('**]'))
        for assessment, plan in zip(frame['Assessment'], frame['Plan Subsection'])
    ]
    examples_with_anon = sum(1 for n_fields in fields_per_example if n_fields)
    total_anon_fields = sum(fields_per_example)
    print(f'{split_name}: {examples_with_anon}/{len(frame)} examples, {total_anon_fields} anon fields')

train: 2952/4633 examples, 6652 anon fields
dev: 349/597 examples, 798 anon fields


In [5]:
# Print every text that contains an incomplete anonymized field (an opening
# '[**' or closing '**]' marker without its partner), since such fields cause
# mimic-tools to malfunction.
all_texts = (
    df_train['Assessment'].tolist()
    + df_train['Plan Subsection'].tolist()
    + df_dev['Assessment'].tolist()
    + df_dev['Plan Subsection'].tolist()
)
for text in all_texts:
    first_open, first_close = text.find('[**'), text.find('**]')
    if first_open == -1 and first_close == -1:
        # No anonymization markers at all.
        continue
    last_open, last_close = text.rfind('[**'), text.rfind('**]')

    # Broken field at the beginning of the text: the first closing marker
    # has no opening marker in front of it.
    broken_at_start = (first_open == -1 and first_close != -1) or first_open > first_close

    # Broken field at the end of the text: the last opening marker has no
    # closing marker after it.
    broken_at_end = (last_open != -1 and last_close == -1) or last_open > last_close

    if broken_at_start or broken_at_end:
        print(text + '\n')

**Age over 90 **] yo with pancreatic CA obstructing common bile duct, s/p stent, p/w
   with abd pain and fever, found to have cholecystitis.

**Age over 90 **] yo with pancreatic CA obstructing common bile duct, s/p stent, p/w
   with abd pain and fever, found to have cholecystitis.

**Age over 90 **] yo with pancreatic CA obstructing common bile duct, s/p stent, p/w
   with abd pain and fever, found to have cholecystitis.

**Age over 90 **] yo with pancreatic CA obstructing common bile duct, s/p stent, p/w
   with abd pain and fever, found to have cholecystitis.

**Age over 90 **] yo with pancreatic CA obstructing common bile duct, s/p stent, p/w
   with abd pain and fever, found to have cholecystitis.

**Age over 90 **] yo with pancreatic CA obstructing common bile duct, s/p stent, p/w
   with abd pain and fever, found to have cholecystitis.

**Age over 90 **] yo with pancreatic CA obstructing common bile duct, s/p stent, p/w
   with abd pain and fever, found to have cholecystitis.


In [6]:
def fix_wrong_anon_field(text):
    """Restore anonymized-field brackets that are missing at the edges of `text`.

    An anonymized field is written as '[** ... **]'. When a text begins with
    '**' (the opening '[' is missing) a '[' is prepended, and when it ends
    with '**' (the closing ']' is missing) a ']' is appended. A run of three
    or more asterisks at either edge is not treated as a field marker and is
    left alone.

    Args:
        text: The string to repair.

    Returns:
        The repaired string, or `text` unchanged when no edge marker is
        incomplete.
    """
    # Texts shorter than three characters (e.g. '' or a bare '**') have no
    # character next to the marker to inspect; return them untouched rather
    # than raising IndexError on text[2] / text[-3].
    if len(text) < 3:
        return text
    if text.startswith('**') and text[2] != '*':
        text = '[' + text
    if text.endswith('**') and text[-3] != '*':
        text = text + ']'
    return text

# Separator placed between examples so that each column can be written to a
# single text file and split back into rows afterwards.
delimiter = '\n######\n'

# Repair the edge markers of every text, then concatenate each column.
train_assessment_text = delimiter.join(
    fix_wrong_anon_field(text) for text in df_train['Assessment']
)
train_plan_text = delimiter.join(
    fix_wrong_anon_field(text) for text in df_train['Plan Subsection']
)
dev_assessment_text = delimiter.join(
    fix_wrong_anon_field(text) for text in df_dev['Assessment']
)
dev_plan_text = delimiter.join(
    fix_wrong_anon_field(text) for text in df_dev['Plan Subsection']
)

### Pseudonymization

In [7]:
# Location of the mimic-tools checkout and of the working directories used
# as its input and output.
mimic_tools_dpath = 'mimic-tools'
pseudo_in_dir = 'pseudo_in'
pseudo_out_dir = 'pseudo_out'

# Start from an empty input directory.
if os.path.exists(pseudo_in_dir):
    shutil.rmtree(pseudo_in_dir)
os.makedirs(pseudo_in_dir)

# One text file per (split, column) pair.
for file_name, joined_text in (
    ('train_assessment.txt', train_assessment_text),
    ('train_plan.txt', train_plan_text),
    ('dev_assessment.txt', dev_assessment_text),
    ('dev_plan.txt', dev_plan_text),
):
    with open(os.path.join(pseudo_in_dir, file_name), 'w') as fd:
        fd.write(joined_text)

In [8]:
# Delete the output directory left over from a previous run, if any.
if os.path.exists(pseudo_out_dir):
    shutil.rmtree(pseudo_out_dir)

# Packages needed by mimic-tools: pip install requests joblib sqlalchemy gensim
# Run mimic-tools' main.py in REPLACE mode on the files in pseudo_in_dir,
# writing the results to pseudo_out_dir. Input and output directories are
# passed as absolute paths based on the current working directory.
# NOTE(review): the paths are interpolated into the shell command unquoted;
# presumably this breaks if the working directory contains spaces - confirm.
! python {os.path.join(mimic_tools_dpath, 'main.py')} REPLACE \
    --input-dir {os.path.join(os.getcwd(), pseudo_in_dir)} \
    --output-dir {os.path.join(os.getcwd(), pseudo_out_dir)} \
    --list-dir {os.path.join(mimic_tools_dpath, 'lists')}

2022-05-12 13:10:27,110 Starting placeholder replacing
2022-05-12 13:10:27,110 Loading lists
2022-05-12 13:10:27,138 * Postal addresses: 20000 [656C Newport Court Coatesville, PA 19320 ...]
2022-05-12 13:10:27,339 * Last names: 88799 [SMITH, JOHNSON, WILLIAMS, JONES, BROWN ...]
2022-05-12 13:10:27,342 * Male first names: 1219 [JAMES, JOHN, ROBERT, MICHAEL, WILLIAM ...]
2022-05-12 13:10:27,351 * Female first names: 4275 [MARY, PATRICIA, LINDA, BARBARA, ELIZABETH ...]
2022-05-12 13:10:27,373 * Phone numbers: 20000 [(666) 372-7835, (923) 739-2644 ...]
2022-05-12 13:10:27,395 * Companies: 20000 [Ligula Aenean Gravida Ltd, Non Bibendum Sed LLC ...]
2022-05-12 13:10:27,395 * Countries: 264 [Afghanistan, Albania, Algeria, American Samoa ...]
2022-05-12 13:10:27,427 * Emails: 20000 [enim.Suspendisse.aliquet@Crasdictum.com, sapien.Cras.dolor@Curabitur.org ...]
2022-05-12 13:10:27,427 * Holiday names: 187 [Administrative Professionals Day, Air Force Birthday ...]
2022-05-12 13:10:27,434 * Hospit

In [9]:
# Read back the four files produced in pseudo_out_dir, in the same order in
# which the inputs were written.
pseudo_texts = []
for file_name in (
    'train_assessment.txt',
    'train_plan.txt',
    'dev_assessment.txt',
    'dev_plan.txt',
):
    with open(os.path.join(pseudo_out_dir, file_name), 'r') as fd:
        pseudo_texts.append(fd.read())

(
    train_assessment_text_pseudo,
    train_plan_text_pseudo,
    dev_assessment_text_pseudo,
    dev_plan_text_pseudo,
) = pseudo_texts

In [10]:
# Preview the first 50000 characters of the joined original training
# assessments, for side-by-side comparison with the pseudonymized text.
print(train_assessment_text[:50000])

51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, BRONCHI

In [11]:
# Preview the first 50000 characters of the pseudonymized training
# assessments, for side-by-side comparison with the original text.
print(train_assessment_text_pseudo[:50000])

51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
51 yr old F with a history of 3V CAD, confirmed on C. cath during this
   admission, EF of 40%, who is transferred to CCU for monitoring of
   recurrent chest pain. Plan for CABG today to revascularize due to
   3-vessel disease.
######
CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, BRONCHI

### Output new dataset

In [12]:
# Split each pseudonymized text back into one entry per example and
# overwrite the corresponding column of the loaded frames.
for frame, column, pseudo_text in (
    (df_train, 'Assessment', train_assessment_text_pseudo),
    (df_train, 'Plan Subsection', train_plan_text_pseudo),
    (df_dev, 'Assessment', dev_assessment_text_pseudo),
    (df_dev, 'Plan Subsection', dev_plan_text_pseudo),
):
    frame[column] = pseudo_text.split(delimiter)

In [13]:
# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(output_dir, exist_ok=True)

# Write both splits without the pandas index column.
df_train.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
df_dev.to_csv(os.path.join(output_dir, 'dev.csv'), index=False)