In [363]:
import json
import csv
import os
import pandas as pd
import re
import numpy as np
import pickle

In [419]:
# Root of the MEDSTS sentence-pair data and the cross-validation fold to process.
# Change K_FOLD (not the path string) to switch folds, then Restart & Run All so
# that every downstream cell and saved output refers to the same fold.
MEDSTS_ROOT = '/home/dc925/project/data/seq_pair/MEDSTS'
K_FOLD = 0
data_dir = '{}/k_fold_{}'.format(MEDSTS_ROOT, K_FOLD)

In [420]:
# Echo the directory that the rest of the notebook reads from and writes to.
# NOTE(review): the saved output of this cell shows .../k_fold_2 while the
# assignment above selects k_fold_0, so the stored output is stale; re-run from
# a fresh kernel before relying on it.
data_dir

'/home/dc925/project/data/seq_pair/MEDSTS/k_fold_2'

In [421]:
## file names of the filtered metamap mappings (pickle files, resolved against data_dir when opened)
dev_mapping_path, train_mapping_path = 'filtered_dev.p', 'filtered_train.p'

In [422]:
def _load_pickle(file_name):
    """Unpickle and return the object stored in `file_name` under `data_dir`."""
    # SECURITY: pickle.load can execute arbitrary code; only use it on files
    # produced by a trusted step of this pipeline.
    with open(os.path.join(data_dir, file_name), 'rb') as handle:
        return pickle.load(handle)


dev_mapping = _load_pickle(dev_mapping_path)
train_mapping = _load_pickle(train_mapping_path)


In [423]:
## locations of the original sentence dataset (JSON-Lines files inside data_dir)
dev_path, train_path = (
    os.path.join(data_dir, file_name) for file_name in ('dev.jsonl', 'train.jsonl')
)

In [424]:
def _read_jsonl(path):
    """Read a JSON-Lines file and return its records as a list.

    Each non-blank line is decoded with json.loads. Blank or whitespace-only
    lines (for example a stray trailing newline) are skipped instead of
    raising a JSONDecodeError.
    """
    with open(path, 'r') as handle:
        return [json.loads(row) for row in handle if row.strip()]


dev = _read_jsonl(dev_path)
train = _read_jsonl(train_path)

In [425]:
# Tokens that are never emitted as augmentation text by extract_strings.
# Order is kept exactly as originally written.
banned_words = [
    'contextual', 'qualifier', 'value', 'action', '-', 'patients',
    'of', 'person', 'feels', 'time', 'to',
    'contents', 'escalation', 'cavity', 'region', 'medical',
    'discussion', 'procedure', 'unit', 'dose', 'appearance',
    'feelings', 'has', 'does', 'finding', 'function',
    'in', 'qualitative', 'changing', 'publication', 'educational',
    'by', 'for', 'with', 'from', 'continuous', 'transducers',
    'process', 'needs', 'individual', 'reporting', 'chief',
    'relationships', '6', '10', 'syncytial', 'human',
    'masks', 'muscle', 'training', 'virus',
]
               

In [426]:
def extract_strings(mappings, sentence, banned=None):
    """Build the augmentation text for one sentence from its concept mappings.

    Parameters
    ----------
    mappings : iterable of dict
        One dict per phrase. ``phrase['mapping']`` is either falsy or a list
        of concept dicts, each providing ``'preferred'`` (str) and
        ``'semtypes'`` (a container of semantic type names).
    sentence : str
        The sentence the mappings belong to. A token is dropped when it occurs
        anywhere in this string. This is a *substring* test (``tok in
        sentence``), so e.g. ``'or'`` is dropped when the sentence contains
        ``'oriented'``.
    banned : iterable of str, optional
        Tokens to exclude. Defaults to the notebook-level ``banned_words``.

    Returns
    -------
    str
        The surviving tokens, lower-cased, de-duplicated and joined with
        single spaces, in order of first appearance. The order is
        deterministic; the previous ``list(set(...))`` implementation produced
        an order that depended on string hash randomisation and therefore
        changed between kernel runs.
    """
    if banned is None:
        banned = banned_words
    banned = frozenset(banned)

    words = []
    for phrase in mappings:
        # A phrase without any mapped concept contributes nothing.
        for concept in phrase['mapping'] or ():
            words.extend(concept['preferred'].lower().split())

            # Add the semantic type itself as extra context for drug concepts.
            semtypes = concept['semtypes']
            if 'Clinical Drug' in semtypes:
                words.append('clinical drug')
            if 'Pharmacologic Substance' in semtypes:
                words.append('pharmacologic substance')

    # Replace punctuation with spaces, then re-tokenise on whitespace.
    tokens = re.sub(r'[^\w\s]', ' ', " ".join(words)).split()

    kept = []
    seen = set()
    for tok in tokens:
        if tok in seen or tok in banned or tok in sentence:
            continue
        seen.add(tok)
        kept.append(tok)
    return " ".join(kept)

In [427]:
def augment_sentences(mm_output, data):
    """Attach augmentation text to every sentence pair in `data`, in place.

    `mm_output` holds two consecutive entries per example: index ``2*k`` is
    passed to extract_strings together with ``data[k]['sentence1']`` and index
    ``2*k + 1`` together with ``data[k]['sentence2']``. The results are stored
    under the keys ``'augment1'`` and ``'augment2'`` of each example dict.

    Returns None; the dicts inside `data` are mutated.

    Raises
    ------
    AssertionError
        If `mm_output` does not contain exactly two entries per example.
    """
    assert len(mm_output) == len(data)*2, \
        "expected two mapping entries per example"
    for k, example in enumerate(data):
        example['augment1'] = extract_strings(mm_output[2 * k], example['sentence1'])
        example['augment2'] = extract_strings(mm_output[2 * k + 1], example['sentence2'])

In [428]:
# Inspect the first dev example as loaded from dev.jsonl.
dev[0]

{'sentence1': 'neurologic: alert and oriented x 3, normal strength and tone.',
 'sentence2': 'male, alert and oriented in no apparent distress.',
 'label': 4.5,
 'pid': 1294,
 'label_c': 2,
 'label_type': 0}

In [429]:
# Inspect the first entry of the dev mappings: a list of phrase dicts, each with
# 'text' and a 'mapping' list of concepts ('cui', 'preferred', 'semtypes',
# 'in_cui2vec'), as shown in the saved output below.
dev_mapping[0]

[{'text': 'neurologic alert and oriented x normal strength and tone',
  'mapping': [{'cui': 'C0577134',
    'preferred': 'Normal abdominal muscle tone',
    'semtypes': ['Finding'],
    'in_cui2vec': False},
   {'cui': 'C0205494',
    'preferred': 'Neurologic (qualifier value)',
    'semtypes': ['Qualitative Concept'],
    'in_cui2vec': True},
   {'cui': 'C0239110',
    'preferred': 'Consciousness clear',
    'semtypes': ['Finding'],
    'in_cui2vec': False},
   {'cui': 'C1961028',
    'preferred': 'Oriented to place',
    'semtypes': ['Finding'],
    'in_cui2vec': True},
   {'cui': 'C0442821',
    'preferred': 'Strong',
    'semtypes': ['Qualitative Concept'],
    'in_cui2vec': True}]}]

In [430]:
# Add 'augment1'/'augment2' to every example of both splits. augment_sentences
# mutates the example dicts in place and returns nothing.
# NOTE(review): the numbers in this cell's saved output are not printed by the
# current augment_sentences definition; the output is stale.
for _mm_output, _examples in ((dev_mapping, dev), (train_mapping, train)):
    augment_sentences(_mm_output, _examples)

656
328
2628
1314


In [431]:
def _dump_jsonl(path, records):
    """Write `records` to `path` as JSON-Lines: one JSON document per line."""
    with open(path, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in records)


# NOTE: dev_path / train_path are the same files the dataset was read from
# earlier in this notebook, so the originals are overwritten with the augmented
# examples.
_dump_jsonl(dev_path, dev)
_dump_jsonl(train_path, train)