-
Notifications
You must be signed in to change notification settings - Fork 0
/
metamap.py
258 lines (195 loc) · 9.11 KB
/
metamap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
import os,re
from pymm import Metamap
import subprocess
from tqdm import tqdm
from jparser import convert
import sys, getopt,shutil
from pathlib import Path
import multiprocessing
from joblib import Parallel, delayed
# Help text printed by main() when command-line parsing fails.
# NOTE: "-m" is the short form of --metamap only; the getopt short-option
# string resolves "-m" to the value-taking Metamap option, so
# --use_multiprocessing has no usable short form.
ABOUT = '''
This takes the following arguments
    --patient_record or -p
        Path to patient record file or a directory containing patient files.
    --output_dir or -o
        Optional. Directory to save the JSON-parsed Metamap output.
    --metamap or -m
        Metamap path.
    --mmi or -i
        Whether or not to use MMI.
        Default: False.
        If you set this to True, then --parse should be False and vice versa.
    --parse or -j
        Whether or not to parse the metamap output into a JSON output.
        Default: False
    --use_neg or -n
        Whether or not to consider the negative markers when parsing.
        Default: False
    --keep_temp or -k
        Keep TEMP directory.
        Default: False.
        If False, the TEMP directory will be deleted after operation.
    --use_multiprocessing
        Whether or not to use multiprocessing.
        Default: False.
        (No short form: -m is reserved for --metamap.)
'''
# Example usage:
#   python metamap.py --patient_record /mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/data/output/patient6_en.txt --metamap /mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/public_mm/bin/metamap20 --output_dir /mnt/c/Users/USER/Desktop/PRETTY --parse --keep_temp
def conv_command(input_file):
    """Build the shell command line that runs ``replace_utf8.jar`` on a file.

    The command redirects the jar's output to a sibling file whose name is
    the input name with ``conv`` inserted before the extension
    (``foo.txt`` -> ``fooconv.txt``).

    Args:
        input_file: Path of the file to convert.

    Returns:
        The command as a single space-joined string. Nothing is executed.
    """
    print('----Code to convert file---')
    # splitext only splits off the final extension, so dots in directory
    # names or a leading "./" are preserved, and a name without any
    # extension no longer raises IndexError.
    root, ext = os.path.splitext(input_file)
    output_file = f'{root}conv{ext}'
    cmd = ['java', '-jar', 'replace_utf8.jar', input_file, '>', output_file]
    return ' '.join(cmd)
def read_lines(file_name, fast_forward_to, batch_size):
    """Return every line of ``file_name`` as a list of strings.

    Line terminators are kept. ``fast_forward_to`` and ``batch_size`` are
    part of the signature but are not consulted: the whole file is always
    read.
    """
    with open(file_name, 'r') as handle:
        lines = handle.readlines()
    return lines
def replace_utf8(input_file,filepart,TEMP_DIR):
    """Run ``replace_utf8.jar`` on a file and save its stdout in ``TEMP_DIR``.

    Args:
        input_file: Path of the file handed to the jar.
        filepart: File name to create inside ``TEMP_DIR`` for the jar output.
        TEMP_DIR: Existing directory that receives the output file.

    Returns:
        Path of the written output file (``TEMP_DIR`` joined with
        ``filepart``). The path is returned even when the jar exits with a
        non-zero status; in that case a diagnostic is printed.
    """
    output_file_save = os.path.join(TEMP_DIR,f'{filepart}')
    cmd = ['java','-jar','replace_utf8.jar',input_file]
    with open(output_file_save,'w+') as o_f:
        proc = subprocess.Popen(cmd, stdout=o_f, stderr=subprocess.PIPE)
        # communicate() drains stderr while waiting; a bare wait() with
        # stderr=PIPE can deadlock once the child fills the pipe buffer.
        _, err = proc.communicate()
    if proc.returncode != 0:
        # Report the failure but keep the best-effort contract: callers
        # still receive the output path, exactly as before.
        details = err.decode(errors='replace').strip() if err else ''
        print(f'replace_utf8.jar exited with status {proc.returncode} for {input_file}. {details}')
    return output_file_save
def extract(file,TEMP_DIR,useMMI,OUTPUT_METAMAP_DIR,METAMAP_PATH,DEBUG,use_only_snomed,OUTPUT_PARSED_DIR,PARSE,USED_PATIENT_FILES):
    """Run Metamap on one patient file and optionally convert the result.

    Args:
        file: Path of the patient record to process.
        TEMP_DIR: Directory for the intermediate file produced by
            replace_utf8().
        useMMI: When true the output file name gets an ``_mmi`` suffix and
            the flag is forwarded to Metamap as ``usemmi``.
        OUTPUT_METAMAP_DIR: Directory of the Metamap output file.
        METAMAP_PATH: Forwarded to the Metamap constructor.
        DEBUG: Forwarded to the Metamap constructor as ``debug``.
        use_only_snomed: Forwarded to the Metamap constructor.
        OUTPUT_PARSED_DIR: Directory for the converted JSON; only read when
            ``PARSE`` is true.
        PARSE: When true, convert() is called on the Metamap output.
        USED_PATIENT_FILES: Output files that already exist and should be
            skipped. Entries may be bare file names or full paths.

    Returns:
        ``[file]`` when parsing or conversion raised, otherwise ``[]``
        (including when the file was skipped).

    Raises:
        Exception: If the Metamap object cannot be constructed.
    """
    filepart = str(Path(file).name).split('.txt')[0]
    output_name = f"{filepart}_mmi.json" if useMMI else f"{filepart}.json"
    output_file = os.path.join(OUTPUT_METAMAP_DIR,output_name)

    # Callers collect bare file names, so the full path alone never matched
    # and nothing was ever skipped. Compare by name as well, and do it before
    # any external process is started.
    if output_name in USED_PATIENT_FILES or output_file in USED_PATIENT_FILES:
        return []

    CLINICAL_TEXT_FILE = replace_utf8(file,filepart+'.txt',TEMP_DIR)

    try:
        mm = Metamap(METAMAP_PATH,use_only_snomed,debug=DEBUG,usemmi=useMMI)
    except Exception as metamap_error:
        raise Exception(f'Problem loading Metamap. See error below. \n {metamap_error}') from metamap_error

    timeout=1000  # forwarded to mm.parse; units are defined by pymm
    try:
        mm.parse(CLINICAL_TEXT_FILE,output_file,timeout=timeout)
        if PARSE:
            json_output= os.path.join(OUTPUT_PARSED_DIR,f"{filepart}.json")
            convert(output_file,json_output)
    except Exception as e:
        # Best effort: report the problem and let the caller carry on with
        # the remaining files.
        print(e)
        return [file]
    return []
def main(args):
    """Run Metamap over one patient record file or a directory of them.

    Parses the command line, validates the paths, creates the ``TEMP``,
    ``FROM_METAMAP`` and (with ``--parse``) ``PARSED_OUTPUT`` directories
    under the output directory, then calls extract() for every patient file,
    sequentially or through joblib.

    Args:
        args: Command-line arguments without the program name.

    Raises:
        Exception: If the Metamap path or patient record is missing or does
            not exist, if the output directory does not exist, or if both
            ``--parse`` and ``--mmi`` are given.
        SystemExit: With status 2 when the options cannot be parsed.
    """
    DEBUG=False
    use_only_snomed=True
    useMMI = False
    OUTPUT_DIR = None
    HOME_FILE=None
    METAMAP_PATH = None
    PARSE=False
    USE_NEG=False
    KEEP_TEMP = False
    USE_MULT = False

    try:
        # "-m" takes a value and always means the Metamap path, so
        # multiprocessing is exposed through its long option only.
        opts, _ = getopt.getopt(
            args,
            'p:o:m:ijnk',
            ["patient_record=","output_dir=","metamap=","mmi","parse","use_neg","keep_temp","use_multiprocessing"],
        )
    except getopt.GetoptError:
        print ('metamap.py'+'\n'+ABOUT)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('--patient_record','-p'):
            HOME_FILE = arg
        elif opt in ("--output_dir", "-o"):
            OUTPUT_DIR = arg
        elif opt in ("--metamap", "-m"):
            METAMAP_PATH = arg
        elif opt in ("--parse", "-j"):
            PARSE = True
        elif opt in ("--use_neg", "-n"):
            # Accepted on the command line, but nothing below reads it.
            USE_NEG = True
        elif opt in ("--mmi", "-i"):
            useMMI = True
        elif opt in ("--keep_temp", "-k"):
            KEEP_TEMP = True
        elif opt == "--use_multiprocessing":
            USE_MULT = True

    if METAMAP_PATH is None:
        raise Exception('Metamap path not specified!')
    if not os.path.exists(METAMAP_PATH):
        raise Exception(f'Metamap path does not exist. The Metamap path specified was {METAMAP_PATH}')
    if PARSE and useMMI:
        raise Exception('Cannot use MMI when --parse is set because parsing cannot be done on MMI output.')
    if HOME_FILE is None:
        raise Exception("--patient_record (-p) cannot be empty!")
    if not os.path.exists(HOME_FILE):
        # Retry relative to the current working directory.
        HOME_FILE = os.path.join(os.getcwd(),HOME_FILE)
        if not os.path.exists(HOME_FILE):
            raise Exception('Could not find the given patient record file.')
    if OUTPUT_DIR is None:
        OUTPUT_DIR = Path(HOME_FILE).parent.absolute()
    elif not os.path.exists(OUTPUT_DIR):
        raise Exception(f'Output directory {OUTPUT_DIR} does not exist. Please create the directory and run again.')

    if os.path.isdir(HOME_FILE):
        # A directory of patient records: sub-directories (such as the
        # working directories created below) are not records.
        with os.scandir(HOME_FILE) as entries:
            PATIENT_FILES = [os.path.join(HOME_FILE,f.name) for f in entries if f.is_file()]
    else:
        PATIENT_FILES = [HOME_FILE]

    if use_only_snomed:
        print('Using only SNOMED CT for extraction...')

    TEMP_DIR = os.path.join(OUTPUT_DIR,'TEMP')
    os.makedirs(TEMP_DIR, exist_ok=True)
    OUTPUT_METAMAP_DIR = os.path.join(OUTPUT_DIR,'FROM_METAMAP')
    os.makedirs(OUTPUT_METAMAP_DIR, exist_ok=True)

    # Both names must exist even without --parse; they used to be defined
    # only inside the PARSE branch, which made every other run fail with a
    # NameError.
    OUTPUT_PARSED_DIR = None
    USED_PATIENT_FILES = []
    if PARSE:
        OUTPUT_PARSED_DIR = os.path.join(OUTPUT_DIR,'PARSED_OUTPUT')
        os.makedirs(OUTPUT_PARSED_DIR, exist_ok=True)
        with os.scandir(OUTPUT_PARSED_DIR) as entries:
            USED_PATIENT_FILES = [f.name for f in entries]

    ERR_FILES=[]
    if not USE_MULT:
        with tqdm(total = len(PATIENT_FILES)) as pbar:
            for file in PATIENT_FILES:
                ERR_FILES.extend(extract(file,TEMP_DIR,useMMI,OUTPUT_METAMAP_DIR,METAMAP_PATH,DEBUG,use_only_snomed,OUTPUT_PARSED_DIR,PARSE,USED_PATIENT_FILES))
                # Advance for every file, including failed ones.
                pbar.update(1)
    else:
        num_cores=multiprocessing.cpu_count()
        print(f'Using multiprocessing with {num_cores} cores.')
        results = Parallel(n_jobs=num_cores)(delayed(extract)(file,TEMP_DIR,useMMI,OUTPUT_METAMAP_DIR,METAMAP_PATH,DEBUG,use_only_snomed,OUTPUT_PARSED_DIR,PARSE,USED_PATIENT_FILES) for file in tqdm(PATIENT_FILES))
        # Each worker returns a (possibly empty) list; flatten them.
        ERR_FILES = [failed for result in results for failed in result]

    if ERR_FILES:
        print(f'I had issues working on the following files: \n {ERR_FILES}')
    if not KEEP_TEMP:
        shutil.rmtree(TEMP_DIR)


if __name__ == "__main__":
    main(sys.argv[1:])
# Example Metamap binary path:
#   '/mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/public_mm/bin/metamap20'
# Example direct Metamap invocation:
#   /mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/publiclinuxmain2018/publicmm/bin/metamap18 -c -Q 4 -y -K --sldi -I --XMLf1 --negex /mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/random/rsample3clean.txt /mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/random/metamap_output.txt
# Example direct replace_utf8.jar invocation:
#   java -jar replace_utf8.jar /mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/random/rsample3.txt > /mnt/c/Users/USER/Desktop/MASTERS/MILA/DIALOGUE/random/rsample3.txt