-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
274 lines (233 loc) · 10.6 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import os
import re
import pandas as pd
import sys, getopt
from pathlib import Path
from tqdm import tqdm
from langdetect import detect
from random import sample
# Help text describing the command-line options. main() appends it to the
# usage line it prints when getopt rejects the command line.
# NOTE(review): the --output_dir entry says the current working directory is
# the default, but main() falls back to the parent directory of the --file
# path - confirm which one is intended.
ABOUT = '''
This takes the following arguments
--file or -f
path to CSV file with the patient records, just like the one given for this project.
Each row is a patient record.
--output_dir or -o
Optional. Directory to save the preprocessed patient records. If not given, then the current working directory is used.
--check_lang or -l
Default False.
Whether or not to check the language of the patient record. If True, the language code ('en','fr','undefined') will be appended to the patient file
--columns or -c
Columns to use from the CSV files for the patient records. Should be a string with columns seperated by |
For example, if you want to consider the 'CONCERN', 'HISTORY' and 'PHYSICAL_EXAM' columns then pass:
-- columns 'CONCERN|HISTORY|PHYSICAL_EXAM'
If the columns are not passed, it uses the default columns: 'CONCERN|HISTORY|ASSESSMENT_AND_PLAN|PHYSICAL_EXAM|DX_DESCRIPTIONS'
--txt or -t
Default False.
Set to true if your patient records are in TXT files instead of CSV with columns.
--keep_only_en or -e
Default False
Whether or not to keep only patient records detected as English
'''
def read_txt(f, enc='utf8'):
    """Return the lines of the text file *f* as a list of strings.

    The file is opened in text mode and decoded with the encoding *enc*.
    Every returned line keeps its trailing newline, exactly as produced by
    ``readlines()``.
    """
    with open(f, mode='r', encoding=enc) as handle:
        lines = handle.readlines()
    return lines
def read(f):
    """Load the delimited text file *f* into a pandas DataFrame.

    The file is first parsed as comma-separated values. If that fails for
    any reason, it is parsed again as tab-separated values. Both attempts
    use ``keep_default_na=False`` so that empty cells are read the same way
    whichever delimiter ends up being used.

    Args:
        f: path of the file to read.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Exception: if the file cannot be parsed with either delimiter. The
            error from the tab-separated attempt is attached as the cause.
    """
    try:
        return pd.read_csv(f, keep_default_na=False)
    except Exception:
        # Comma parsing failed (e.g. ragged rows in a TSV file); retry with
        # tabs before giving up.
        try:
            return pd.read_csv(f, sep='\t', keep_default_na=False)
        except Exception as err:
            raise Exception(f'Could not read file {f}') from err
def cleanhtml(raw_html):
    """Return *raw_html* with HTML tags and character entities removed.

    Anything between a ``<`` and the next ``>`` is dropped, as are named
    (``&amp;``), decimal (``&#39;``) and hexadecimal (``&#x1f;``) entities
    written in lowercase.
    """
    # Pattern taken from https://stackoverflow.com/a/12982689/11814682
    tag_or_entity = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', raw_html)
def preprocess_sentence(w):
    """Clean one patient record while keeping its line breaks.

    The record is passed through an ordered chain of substitutions; the
    order matters because later patterns act on the output of earlier ones:

    1. HTML tags/entities are removed with ``cleanhtml``.
    2. The English/French disclaimer boilerplate and everything before the
       "Reason for consultation" / "Raison de consultation" heading are
       collapsed to the heading itself.
    3. "PLAN" is placed on its own line and a ". " is inserted where a
       lowercase letter is directly followed by a capitalised word.
    4. The text is stripped, cleaned of HTML again, and then symbols,
       repeated spaces, " - " separators and runs of dots are normalised.

    Returns the cleaned string.
    """
    # (pattern, replacement) pairs applied, in order, before stripping.
    # NOTE(review): the trailing "." of the second DISCLAIMER pattern is
    # unescaped, so it matches any character - confirm this is intended.
    leading_steps = (
        (r"DISCLAIMER[\w\W]+ASSESSMENT", 'ASSESSMENT'),
        (r"DISCLAIMER[\w\W]+support based on the results of these outcome measures.", 'ASSESSMENT'),
        (r"AVERTISSEMENT[\w\W]+ÉVALUATION ", 'ÉVALUATION '),
        (r'^[\w\W]+Reason for consultation', 'Reason for consultation '),
        (r'^[\w\W]+Raison de consultation', 'Raison de consultation '),
        (r'(.)PLAN(.\s)', r'\1\nPLAN\n\2'),
        (r'([a-z])([A-Z][a-z])', r'\1. \2'),
    )
    # (pattern, replacement) pairs applied, in order, after the second
    # HTML clean-up.
    trailing_steps = (
        (r'[\․●□•☑☐�≥ⓝ💤☺😭↑◆®…😊▪↓©ø!"#$%&\(\)\*\+;<=>?@\[\]^_`‘→{|}~«»”“]+', ''),
        (r' [ ]+', ' '),
        (r' - ', ' '),
        (r' [ ]+', ' '),
        (r"' '", ''),
        (r'\.[ \.]+', '. '),
        (r'\.([a-zA-Z])', r'. \1'),
    )

    text = cleanhtml(w)
    # re.UNICODE is already the default for str patterns, so no flags are
    # needed for the accented French patterns.
    for pattern, replacement in leading_steps:
        text = re.sub(pattern, replacement, text)
    text = cleanhtml(text.strip())
    for pattern, replacement in trailing_steps:
        text = re.sub(pattern, replacement, text)
    return text
def preprocess_sentence_rn(w):
    """Clean one patient record and flatten it onto a single line.

    The substitutions run in a fixed order:

    1. The English/French disclaimer boilerplate and everything before the
       "Reason for consultation" / "Raison de consultation" heading are
       collapsed to the heading itself.
    2. Every "PLAN" is padded with a space on each side.
    3. The text is stripped and HTML tags/entities are removed with
       ``cleanhtml``.
    4. Runs of newlines, carriage returns and tabs become one space, then
       symbols, repeated spaces, " - " separators and runs of dots are
       normalised.

    Returns the cleaned string.
    """
    # (pattern, replacement) pairs applied, in order, before stripping.
    # NOTE(review): the trailing "." of the second DISCLAIMER pattern is
    # unescaped, so it matches any character - confirm this is intended.
    leading_steps = (
        (r"DISCLAIMER[\w\W]+ASSESSMENT", 'ASSESSMENT'),
        (r"DISCLAIMER[\w\W]+support based on the results of these outcome measures.", 'ASSESSMENT'),
        (r"AVERTISSEMENT[\w\W]+ÉVALUATION ", 'ÉVALUATION '),
        (r'^[\w\W]+Reason for consultation', 'Reason for consultation '),
        (r'^[\w\W]+Raison de consultation', 'Raison de consultation '),
        (r'PLAN', ' PLAN '),
    )
    # (pattern, replacement) pairs applied, in order, after HTML clean-up.
    trailing_steps = (
        (r'[\n\r\t]+', ' '),
        (r'[\․●□•☑☐�≥ⓝ💤☺😭↑◆®…😊▪↓©ø!"#$%&\(\)\*\+;<=>?@\[\]^_`‘→{|}~«»”“]+', ''),
        (r' [ ]+', ' '),
        (r' - ', ' '),
        (r' [ ]+', ' '),
        (r"' '", ''),
        (r'\.[ \.]+', '. '),
    )

    text = w
    # re.UNICODE is already the default for str patterns, so no flags are
    # needed for the accented French patterns.
    for pattern, replacement in leading_steps:
        text = re.sub(pattern, replacement, text)
    text = cleanhtml(text.strip())
    for pattern, replacement in trailing_steps:
        text = re.sub(pattern, replacement, text)
    return text
def separate(s):
    """Split words joined by a '‣' bullet onto separate lines.

    Every '‣' that sits directly between two word characters is replaced
    by a newline, so ``'foo‣bar'`` becomes ``'foo\\nbar'``. Bullets that are
    not surrounded by word characters on both sides are left untouched.

    Lookarounds are used instead of capturing the neighbouring words so
    that chains such as ``'a‣b‣c'`` are split at every bullet, and so that
    no backreference is needed in the replacement (a non-raw ``'\\1'``
    would silently insert a control character instead of the word).
    """
    return re.sub(r'(?<=\w)‣(?=\w)', '\n', s)
def preprocess_all(w):
    """Fully preprocess one patient record into line-separated sentences.

    The record is cleaned with ``preprocess_sentence``; then a newline is
    inserted after each period that follows a word character and precedes
    a non-word character, leading spaces/carriage returns at the very start
    are removed, and runs of newlines are collapsed to one. Finally
    ``separate`` is applied to the result, which is returned.
    """
    # Applied in order; re.UNICODE is the default for str patterns.
    line_break_steps = (
        (r'(\w)\.(\W)', r'\1.\n\2'),
        (r'^[ \r]+', ''),
        (r'\n[\n]+', '\n'),
    )
    cleaned = preprocess_sentence(w)
    for pattern, replacement in line_break_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return separate(cleaned)
def is_number(x):
    """Return True when ``int(x)`` succeeds, False when it raises."""
    try:
        int(x)
    except Exception:
        return False
    else:
        return True
def fast_detect_lang(s, num=7, detector=None):
    """Guess whether the text *s* is mostly English or mostly French.

    Only lines that are non-blank and contain more than three
    space-separated tokens are considered. Up to *num* of those lines are
    picked at random and each one is classified; the majority between
    'en' and 'fr' decides the result.

    Args:
        s: the text to classify; candidate lines are split on newlines.
        num: maximum number of candidate lines to classify.
        detector: callable taking a line and returning a language code.
            Defaults to ``langdetect.detect`` as imported by this module.

    Returns:
        '_en' when more sampled lines are English than French, '_fr' when
        French lines are at least as frequent as English ones (and at
        least one of the two was seen), and '_undefined' when there are no
        candidate lines, detection raises, or no sampled line is English
        or French.
    """
    cands = [line for line in s.split('\n')
             if line.strip() != '' and len(line.split(' ')) > 3]
    if not cands:
        return '_undefined'
    if detector is None:
        detector = detect

    # Use every candidate when there are fewer than `num`; sampling
    # len(cands) - 1 would classify nothing for a single-line text.
    chosen = sample(cands, min(num, len(cands)))
    try:
        lang_detected = [detector(line) for line in chosen]
    except Exception:
        # Detection is best-effort: any failure means "unknown".
        return '_undefined'

    en = sum(1 for lang in lang_detected if lang == 'en')
    fr = sum(1 for lang in lang_detected if lang == 'fr')
    if en == 0 and fr == 0:
        # Neither language was seen, so do not label the text as French.
        return '_undefined'
    # Ties between English and French resolve to French.
    return '_en' if en > fr else '_fr'
def main(args):
    """Command-line entry point: preprocess patient records into TXT files.

    Parses *args*, loads the patient records (from one CSV/TXT file or from
    every file of a directory), optionally detects each record's language,
    and writes one preprocessed ``patient<i>[_<lang>].txt`` file per record
    into the output directory.

    Args:
        args: command-line arguments without the program name, e.g.
            ``sys.argv[1:]``. See ``ABOUT`` for the supported options.

    Raises:
        SystemExit: with status 2 when the options cannot be parsed.
        Exception: when --file is missing or does not exist, the output
            directory does not exist, no input file is found in a CSV
            directory, or the requested columns are not in the data.
    """
    try:
        # Long options that take a value must end with '=' and no spaces;
        # "file =" only worked through getopt's unique-prefix matching.
        opts, _ = getopt.getopt(
            args,
            'f:o:c:lte',
            ["file=", "output_dir=", "columns=", "check_lang", "txt", "keep_only_en"],
        )
    except getopt.GetoptError:
        print('preprocess.py --file <CSV file containing patient records> --output_dir <path to dir to save preprocessed patient records>' + '\n' + ABOUT)
        sys.exit(2)

    output_dir = None
    home_file = None
    check_lang = False
    columns = None
    is_txt = False
    keep_only_en = False
    for opt, arg in opts:
        if opt in ('--file', '-f'):
            home_file = arg
        elif opt in ('--output_dir', '-o'):
            output_dir = arg
        elif opt in ('--check_lang', '-l'):
            check_lang = True
        elif opt in ('--columns', '-c'):
            columns = arg
        elif opt in ('--txt', '-t'):
            is_txt = True
        elif opt in ('--keep_only_en', '-e'):
            keep_only_en = True

    # Keeping only English records requires knowing each record's language.
    if keep_only_en and not check_lang:
        print('Checking language has been set to True since you specified to keep only English.')
        check_lang = True

    if home_file is None:
        raise Exception("--file or -f cannot be empty")
    if not os.path.exists(home_file):
        home_file = os.path.join(os.getcwd(), home_file)
        if not os.path.exists(home_file):
            raise Exception('CSV file for patient records not given')

    if output_dir is None:
        # Default: write next to the input file/directory.
        output_dir = Path(home_file).parent.absolute()
    elif not os.path.exists(output_dir):
        raise Exception(f'Output directory {output_dir} does not exist. Please create the directory and run again.')

    if os.path.isdir(home_file):
        # A directory of record files: take regular files only, in sorted
        # order so that the patient numbering is reproducible.
        with os.scandir(home_file) as entries:
            names = sorted(entry.name for entry in entries if entry.is_file())
        paths = [os.path.join(home_file, name) for name in names]
    else:
        paths = [home_file]

    if is_txt:
        # Each TXT file is one patient record.
        all_sents = ['\n'.join(read_txt(path)) for path in paths]
    else:
        if not paths:
            raise Exception(f'No files found in {home_file}')
        if os.path.isdir(home_file):
            df = pd.concat([read(path) for path in paths], ignore_index=True)
        else:
            df = read(home_file)

        if columns is None:
            columns = ['CONCERN', 'HISTORY', 'ASSESSMENT_AND_PLAN', 'PHYSICAL_EXAM', 'DX_DESCRIPTIONS']
        else:
            columns = [c.strip() for c in columns.split('|') if c.strip()]
        if not columns:
            raise Exception(f'Error with columns: {columns}.\n Please check that the columns chosen exist in the CSV file')
        try:
            df = df[columns]
        except Exception as err:
            raise Exception(f'Error with columns: {columns}.\n Please check that the columns chosen exist in the CSV file') from err
        print(f'Using columns: {columns}')

        # Build each record in a separate Series (one line per column)
        # instead of writing a 'text' column into the sliced frame: that
        # avoids pandas' chained-assignment warning and does not clobber a
        # selected column that happens to be called 'text'.
        text = df[columns[0]].astype(str)
        for col in columns[1:]:
            text = text + '\n' + df[col].astype(str)
        # NOTE(review): with more than one column the joined text always
        # contains '\n', so this filter only drops rows when a single
        # column is used - confirm whether all-empty rows should be skipped.
        all_sents = text[text != ''].tolist()

    print('Extracting patient records...')
    with tqdm(total=len(all_sents)) as pbar:
        for i, record in enumerate(all_sents):
            filename = f'patient{i}'
            if check_lang:
                lang_det = fast_detect_lang(record)
                if keep_only_en and lang_det != '_en':
                    filename = None  # skip non-English records
                else:
                    filename = filename + lang_det + '.txt'
            else:
                filename += '.txt'
            if filename is not None:
                with open(os.path.join(output_dir, filename), 'w', encoding='utf8') as f:
                    f.write(preprocess_all(record))
            pbar.update(1)
    print(f'Saved patient records to {output_dir}')
    print('ALL DONE!')
# Script entry point: forward the command-line arguments that follow the
# program name to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)