-
Notifications
You must be signed in to change notification settings - Fork 0
/
transmit_anno.py
119 lines (91 loc) · 4.86 KB
/
transmit_anno.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import json
def start_x(box):
    """Return the x coordinate of the box's first (start) corner, ``box[0][0]``."""
    first_corner = box[0]
    return first_corner[0]
def start_y(box):
    """Return the y coordinate of the box's first (start) corner, ``box[0][1]``."""
    first_corner = box[0]
    return first_corner[1]
def end_x(box):
    """Return the x coordinate of the box's second (end) corner, ``box[1][0]``."""
    second_corner = box[1]
    return second_corner[0]
def end_y(box):
    """Return the y coordinate of the box's second (end) corner, ``box[1][1]``."""
    second_corner = box[1]
    return second_corner[1]
def box_overlaps(this_box, other_box, overlap_tol_fract):
    """Tell whether two boxes overlap by more than a given fraction.

    Each box is indexed as ``box[0] = (start_x, start_y)`` and
    ``box[1] = (end_x, end_y)``.  The overlap fraction is the intersection
    area divided by the area of the *smaller* of the two boxes.

    Args:
        this_box: first box.
        other_box: second box.
        overlap_tol_fract: threshold the overlap fraction must strictly exceed.

    Returns:
        True if the boxes intersect and the overlap fraction is greater than
        ``overlap_tol_fract``; False otherwise, including when the smaller
        box has zero area (the fraction is undefined in that case).
    """
    def corners(box):
        # Flatten to (start_x, start_y, end_x, end_y).
        return box[0][0], box[0][1], box[1][0], box[1][1]

    def area(coords):
        return (coords[3] - coords[1]) * (coords[2] - coords[0])

    this_coords = corners(this_box)
    other_coords = corners(other_box)

    # Width and height of the intersection; negative means the boxes are
    # separated along that axis.
    dx = min(this_coords[2], other_coords[2]) - max(this_coords[0], other_coords[0])
    dy = min(this_coords[3], other_coords[3]) - max(this_coords[1], other_coords[1])
    if dx < 0 or dy < 0:
        return False

    smaller_area = min(area(this_coords), area(other_coords))
    if smaller_area == 0:
        # A degenerate (zero-area) box would otherwise divide by zero.
        return False

    # float() keeps true division under Python 2 with integer coordinates.
    return float(dx * dy) / smaller_area > overlap_tol_fract
def find_constituent_boxes(unmerged_anno, merged_anno_box, overlap_tol_fract, box_type):
    """Collect the unmerged boxes of ``box_type`` that overlap a merged box.

    Every box in ``unmerged_anno[box_type]`` whose ``'rectangle'`` overlaps
    ``merged_anno_box['rectangle']`` (per ``box_overlaps``) is tagged with the
    merged box's ``'category'`` and returned.

    Note that matching boxes are modified in place: the returned dicts are the
    same objects held by ``unmerged_anno``.
    """
    matches = []
    for candidate in unmerged_anno[box_type].values():
        if not box_overlaps(candidate['rectangle'], merged_anno_box['rectangle'], overlap_tol_fract):
            continue
        candidate['category'] = merged_anno_box['category']
        matches.append(candidate)
    return matches
def transmit_labels(unmerged_annotations, merged_annotations, overlap_tol_fract):
    """Carry the categories of merged text boxes over to the unmerged boxes.

    For each box in ``merged_annotations['text']`` the overlapping unmerged
    text boxes are gathered via ``find_constituent_boxes``.  The gathered boxes
    are ordered by the y coordinate of their start corner, named ``T1``,
    ``T2``, ... in that order, and each box's ``'box_id'`` is set to its name.

    Returns:
        dict mapping the generated names to the box dicts.
    """
    gathered = []
    for merged_box in merged_annotations['text'].values():
        gathered += find_constituent_boxes(unmerged_annotations, merged_box,
                                           overlap_tol_fract, box_type='text')

    gathered.sort(key=lambda box: box['rectangle'][0][1])

    named_boxes = {}
    for position, box in enumerate(gathered, 1):
        named_boxes['T' + str(position)] = box

    # NOTE(review): a box matched by more than one merged box is listed under
    # several names but is a single shared dict, so it ends up with only one
    # 'box_id' -- confirm whether that is intended.
    for label, box in named_boxes.items():
        box['box_id'] = label
    return named_boxes
def transmit_question_labels(unmerged_annotations, merged_annotations, overlap_tol_fract):
    """Carry the categories of merged question boxes over to the unmerged boxes.

    For each box in ``merged_annotations['question']`` the overlapping unmerged
    question boxes are gathered via ``find_constituent_boxes``.  The gathered
    boxes are ordered by the y coordinate of their start corner, named ``Q1``,
    ``Q2``, ... in that order, and each box's ``'box_id'`` is set to its name.

    Returns:
        dict mapping the generated names to the box dicts, or None (after
        printing ``no con``) when ``merged_annotations`` has no ``'question'``
        entry.
    """
    if 'question' not in merged_annotations:
        # Parenthesised single-argument form prints identically on Python 2
        # and is valid syntax on Python 3.
        print('no con')
        return None

    new_annotations = []
    for box_val in merged_annotations['question'].values():
        new_annotations.extend(
            find_constituent_boxes(unmerged_annotations, box_val, overlap_tol_fract, 'question'))

    sorted_boxes = sorted(new_annotations, key=lambda x: x['rectangle'][0][1])
    unmerged_text_named = {'Q' + str(i): box for i, box in enumerate(sorted_boxes, 1)}
    for name, detection in unmerged_text_named.items():
        detection['box_id'] = name
    return unmerged_text_named
def write_transmitted_annotations(unmerged_text_anno, new_file_path):
    """Write the text annotations to ``new_file_path`` as indented, key-sorted JSON.

    The output object holds ``unmerged_text_anno`` under ``"text"`` together
    with empty ``"figure"`` and ``"relationship"`` sections.
    """
    payload = dict(text=unmerged_text_anno, figure={}, relationship={})
    with open(new_file_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4, sort_keys=True)
def write_transmitted_question_annotations(non_q_annotations, unmerged_text_anno, new_file_path):
    """Write question and text annotations to ``new_file_path`` as indented, key-sorted JSON.

    The output object holds ``unmerged_text_anno`` under ``"question"``,
    ``non_q_annotations['text']`` under ``"text"``, and empty ``"figure"`` and
    ``"relationship"`` sections.
    """
    payload = dict(
        question=unmerged_text_anno,
        text=non_q_annotations['text'],
        figure={},
        relationship={},
    )
    with open(new_file_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4, sort_keys=True)
def transmit_boxes_single_page(page_image, overlap_tol_fract,
                               base_path, overmerged_dir, unmerged_dir, lessmerged_dir, question_flag):
    """Transmit labels for one page and write the resulting annotation file.

    The annotation file name is ``page_image`` with ``.jpeg`` replaced by
    ``.json``.  It is read from ``base_path + overmerged_dir`` (merged
    annotations) and ``base_path + unmerged_dir`` (unmerged annotations); the
    result is written under ``base_path + lessmerged_dir``.  Paths are built by
    plain string concatenation, so the directory arguments must carry their
    own separators.

    When ``question_flag`` is true the question labels are transmitted and
    written alongside the unmerged text annotations; otherwise the text labels
    are transmitted.
    """
    anno_name = page_image.replace('.jpeg', '.json')
    source_path, atomic_path, target_path = [
        base_path + folder + anno_name
        for folder in (overmerged_dir, unmerged_dir, lessmerged_dir)
    ]

    with open(source_path, 'rb') as merged_file:
        merged_anno = json.load(merged_file)
    with open(atomic_path, 'rb') as unmerged_file:
        unmerged_anno = json.load(unmerged_file)

    if not question_flag:
        labelled = transmit_labels(unmerged_anno, merged_anno, overlap_tol_fract)
        write_transmitted_annotations(labelled, target_path)
        return

    labelled = transmit_question_labels(unmerged_anno, merged_anno, overlap_tol_fract)
    write_transmitted_question_annotations(unmerged_anno, labelled, target_path)
def transmit_anno_single_textbook(book_name, page_range, overlap_tol_fract,
                                  base_path, overmerged_dir, unmerged_dir, destination_dir, question_round=False):
    """Transmit labels for every page of a textbook in a page-number range.

    Args:
        book_name: book file name; any ``.pdf`` in it is stripped to form the
            page image names ``<book>_<page>.jpeg``.
        page_range: ``(start_n, stop_n)`` pair; pages ``start_n`` up to but not
            including ``stop_n`` are processed.
        overlap_tol_fract: overlap threshold passed through to the per-page
            transmission.
        base_path, overmerged_dir, unmerged_dir, destination_dir: path pieces
            passed through to ``transmit_boxes_single_page``.
        question_round: when true, question labels are transmitted instead of
            text labels.

    Pages whose annotation files cannot be opened are reported with ``print``
    and skipped.
    """
    # Unpacked here rather than in the signature: tuple parameters were
    # removed in Python 3 (PEP 3113).  Positional callers are unaffected.
    start_n, stop_n = page_range
    book_stem = book_name.replace('.pdf', '')
    for page_n in range(start_n, stop_n):
        page_image = book_stem + '_' + str(page_n) + '.jpeg'
        try:
            transmit_boxes_single_page(page_image, overlap_tol_fract,
                                       base_path, overmerged_dir, unmerged_dir, destination_dir, question_round)
        except IOError as e:
            # Best effort: a missing or unreadable page must not stop the book.
            print(e)