-
Notifications
You must be signed in to change notification settings - Fork 0
/
eval_utils.py
159 lines (124 loc) · 5.21 KB
/
eval_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os
import pickle
import glob
import xml.etree.ElementTree as et
def get_disease_names(xml, exclude=set()):
"""Get list of diseases from standoff files"""
disease_names = set()
tree = et.parse(xml)
for disease_elem in tree.iter('disease'):
disease_name = disease_elem.attrib['name']
if disease_name not in exclude:
disease_names.add(disease_name)
return sorted(list(disease_names))
class DatasetProvider():
def __init__(self,
corpus_path,
annot_xml,
disease,
judgement,
# use_pickled_alphabet=False,
# alphabet_pickle=None,
min_token_freq=0):
"""Index words by frequency in a file"""
self.corpus_path = corpus_path
self.annot_xml = annot_xml
self.min_token_freq = min_token_freq
self.disease = disease
self.judgement = judgement
self.alpha_pkl = 'data/lookups/alphabet.pkl'
self.label2int = {'Y':0, 'N':1, 'Q':2, 'U':3}
self.token2int = {}
# alpha_pkl = 'data/lookups/alphabet.pkl'
# when training, make alphabet and pickle it
# when testing, load it from pickle
# if use_pickled_alphabet:
# if os.path.exists(alpha_pkl):
try:
print('reading alphabet alphabet pickle file')
pkl = open(self.alpha_pkl, 'rb')
self.token2int = pickle.load(pkl)
except FileNotFoundError as e:
print(e)
print("Alphabet Pickle file doesn't exist. Please run preprocess file to prepare datasets.")
def parse_standoff_file(self, xml, disease, task):
doc2label = {} # key: doc id, value: label
tree = et.parse(xml)
for task_elem in tree.iter('diseases'):
if task_elem.attrib['source'] == task:
for disease_elem in task_elem:
if disease_elem.attrib['name'] == disease:
for doc_elem in disease_elem:
id = doc_elem.attrib['id']
label = doc_elem.attrib['judgment']
doc2label[id] = label
return doc2label
def parse_standoff(self, pattern, disease, task):
doc2label = {} # key: doc id, value: label
for xml_file in sorted(glob.glob(pattern)):
print('loading annotations from', xml_file)
d2l = self.parse_standoff_file(xml_file, disease, task)
doc2label.update(d2l)
return doc2label
def extract_tokens(self, text, ignore_negation=False):
"""Return a file as a list of CUIs"""
tokens = []
for token in text.lower().split():
if token.isalpha():
tokens.append(token)
return tokens
# if ignore_negation:
# tokens = []
# for token in text.split():
# if token.startswith('n'):
# tokens.append(token[1:])
# else:
# tokens.append(token)
# return tokens
# else:
# return text.split()
def load(self, maxlen=float('inf')):
labels = [] # int labels
examples = [] # examples as int sequences
no_labels = [] # docs with no labels
# document id -> label mapping
doc2label = self.parse_standoff(
self.annot_xml,
self.disease,
self.judgement
)
# load examples and labels
for f in os.listdir(self.corpus_path):
# doc_id = f.split('.')[0]
file_path = os.path.join(self.corpus_path, f)
# file = open(file_path).read()
tree = et.parse(file_path)
root = tree.getroot()
for elem in root:
for doc in elem:
doc_id = doc.attrib['id']
for txt in doc:
file_feat_list = self.extract_tokens(txt.text)
example = []
for token in set(file_feat_list):
if token in self.token2int:
example.append(self.token2int[token])
else:
example.append(self.token2int['oov_word'])
if len(example) > maxlen:
example = example[0:maxlen]
# no labels for some documents for some reason
if doc_id in doc2label:
string_label = doc2label[doc_id]
int_label = self.label2int[string_label]
labels.append(int_label)
examples.append(example)
else:
no_labels.append(doc_id)
print('%d documents with labels for %s/%s in %s' \
% (len(labels), self.disease,
self.judgement, self.annot_xml.split('/')[-1]))
print('%d documents with no labels for %s/%s in %s' \
% (len(no_labels), self.disease,
self.judgement, self.annot_xml.split('/')[-1]))
return examples, labels