In [1]:
import csv
import json
import os
import argparse
import logging
import random
import numpy as np
import re


In [4]:
# Fix the seed of Python's `random` module so any use of it below is repeatable.
random.seed(5)
# Directory holding the MEDSTS sentence-pair files.
# NOTE(review): absolute, machine-specific path -- consider making this configurable.
medsts_path = '/home/dc925/project/data/seq_pair/MEDSTS'
# Path of the annotation CSV that sits alongside the MEDSTS data.
type_annotation = os.path.join(medsts_path, 'annotated_data.csv')

In [8]:


def preprocess_medsts(medsts_path):
    """Convert the tab-separated MEDSTS test split into a JSON-lines file.

    Reads ``test.txt`` under ``medsts_path``, normalizes the first two columns
    of every row with ``preprocess_text`` and writes one JSON object per line
    (keys ``sentence1`` and ``sentence2``) to ``test.jsonl`` in the same
    directory, overwriting any existing file.

    The sentence-type annotation CSV is intentionally not read here: nothing
    derived from it ever reached the output, so loading it only made the
    function depend on a module-level path and fail when that file was absent
    or lacked the expected header row.

    Args:
        medsts_path: Directory that contains ``test.txt``.

    Raises:
        FileNotFoundError: If ``test.txt`` does not exist.
        IndexError: If a non-empty row has fewer than two tab-separated columns.
    """
    original = os.path.join(medsts_path, 'test.txt')
    original_out = os.path.join(medsts_path, 'test.jsonl')

    # newline='' is what the csv module expects from its input file object.
    with open(original, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        lines = [
            {'sentence1': preprocess_text(row[0]),
             'sentence2': preprocess_text(row[1])}
            for row in reader
            if row  # csv.reader yields [] for blank lines (e.g. a trailing newline)
        ]

    with open(original_out, 'w') as writer:
        for line in lines:
            json.dump(line, fp=writer)
            writer.write("\n")


In [9]:
def preprocess_text(sent):
    """Lowercase a sentence, drop its digits and blank out punctuation.

    Digits are deleted outright, while every character that is neither a word
    character nor whitespace is replaced by a single space. Whitespace runs are
    left as they are.

    Args:
        sent: The raw sentence string.

    Returns:
        The normalized sentence string.
    """
    lowered = sent.lower()
    digits_removed = re.sub(r'[\d]*', '', lowered)
    return re.sub(r'[^\w\s]', ' ', digits_removed)

# Run the conversion for the configured MEDSTS directory (writes test.jsonl there).
preprocess_medsts(medsts_path)

In [10]:
## METAMAP UTIL

In [None]:

def generate_metamap_input_files_clean(data_dir):
    """Write the MetaMap input file for the sentence pairs in ``test.jsonl``.

    Each JSON line of ``<data_dir>/test.jsonl`` contributes its ``sentence1``
    and ``sentence2`` values, each passed through ``preprocess_mm_input``.
    Every resulting sentence is written to ``<data_dir>/test_mm_input.txt``
    followed by an empty line, and the file ends with one more newline.

    Args:
        data_dir: Directory containing ``test.jsonl``; the output is written there.
    """
    source_path = os.path.join(data_dir, 'test.jsonl')
    target_path = os.path.join(data_dir, 'test_mm_input.txt')

    sentences = []
    with open(source_path, 'r') as source:
        for raw_line in source:
            record = json.loads(raw_line)
            first = preprocess_mm_input(record['sentence1'])
            second = preprocess_mm_input(record['sentence2'])
            sentences.extend((first, second))

    with open(target_path, 'w') as target:
        # One sentence per entry, each followed by a blank separator line.
        target.writelines(sentence + '\n' + '\n' for sentence in sentences)
        target.write('\n')

def preprocess_mm_input(sent):
    """Normalize a sentence before it is written to the MetaMap input file.

    Steps, in order:
      1. lowercase;
      2. remove the hyphenated term ``back-up`` (must happen before step 4,
         which turns the hyphen into a space);
      3. spell out standalone numbers 0-10 (delimited by whitespace or the
         string boundaries) as words;
      4. replace punctuation with spaces so hyphenated terms do not collapse
         into one token;
      5. remove dosage/unit stopwords, but only when they are not embedded in a
         longer alphabetic word (``5mg`` loses ``mg``; ``firmly`` is untouched);
      6. drop remaining digits and collapse whitespace runs to one space;
      7. apply the hand-tuned phrase substitutions.

    Args:
        sent: The sentence string to normalize.

    Returns:
        The normalized sentence string. Leading/trailing spaces and gaps left
        by the phrase substitutions are not trimmed.
    """
    digit_to_word = {'0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4':'four','5':'five',
                     '6':'six','7':'seven','8':'eight','9':'nine', '10': 'ten'}
    stopwords = ['mg','mcg','ml', 'liter', "tablets", "tablet",'capsules', 'capsule']
    # Literal phrase rewrites, applied sequentially in this order.
    replacements = (
        ('male and female', 'male and female,'),
        ('location', ', location,'),
        ('discussed', 'discussion'),
        ('discussing', 'discussion'),
        # Keep the trailing space so the following word is not fused on.
        ('discuss ', 'discussion '),
        ('as well as', 'and'),
        ('albuterol', ', albuterol,'),
        ('vitamin b', ', vitamin b,'),
        ('d ay', 'day'),
        ('call back', 'call'),
        ('immunization', ', immunization,'),
        ('diabetes', ', diabetes,'),
        ('release', ''),
        ('vaginal discharge', ', vaginal discharge,'),
        ('back here', 'here'),
        ('suspicious', 'suspected'),
        ('care unit', 'care unit, pacu,'),
    )

    sent = sent.lower()
    # Must run while the hyphen is still present; after punctuation stripping
    # the pattern could never match.
    sent = sent.replace('back-up', '')
    # Replace only the matched standalone token. A plain str.replace on the
    # digit would also rewrite digits inside larger numbers ("20" -> "two0").
    sent = re.sub(r'(?<!\S)(?:10|[0-9])(?!\S)',
                  lambda match: digit_to_word[match.group()], sent)
    sent = re.sub(r'[^\w\s]', ' ', sent) #dash should be replace w a space to prevent collapsing separate terms, that confuses metamap
    # [^\W\d_] matches a letter: a stopword is removed only when it is not
    # glued to other letters, while digits may still touch it (e.g. "5mg").
    filter_words = re.compile(
        r'(?<![^\W\d_])(?:' + '|'.join(map(re.escape, stopwords)) + r')(?![^\W\d_])')
    sent = filter_words.sub('', sent)
    sent = re.sub(r'\d+', '', sent)
    sent = re.sub(r'\s+', ' ', sent)
    for old, new in replacements:
        sent = sent.replace(old, new)

    return sent
