In [1]:
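"""
Process the public thyroid datasets: draw the nodule regions annotated in each
XML file onto the corresponding original JPG image.

Input directories:
- thyroid directory
- maligns directory
- benigns directory
"""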

import xml.etree.ElementTree as ET
import cv2
import os
import json
import numpy
import matplotlib.pyplot as plt
import imageio


In [2]:
def find_file_thyroid(directory, extension):
    """List the files directly inside ``directory`` whose names end with ``extension``.

    Every match is logged as it is found (in ``os.listdir`` order).

    Args:
        directory: Folder to scan (not recursive).
        extension: Suffix to match, e.g. ``'.xml'``.

    Returns:
        The matching file names (without the directory part), sorted ascending.
    """
    print("[INFO] directory: " + directory)
    matches = []
    for entry in os.listdir(directory):
        if not entry.endswith(extension):
            continue
        print("[INFO] " + extension + "_file: " + directory + "/" + entry)
        matches.append(entry)
    return sorted(matches)

In [3]:
def find_file(directory, extension):
    """List files ending with ``extension`` found one level below ``directory``.

    ``directory`` is expected to contain one sub-folder per case; each sub-folder
    is scanned (not recursively) and every match is logged as it is found.
    Entries of ``directory`` that are not folders (stray files such as
    ``.DS_Store``) are skipped instead of aborting the scan.

    Args:
        directory: Folder whose immediate sub-folders are scanned.
        extension: Suffix to match, e.g. ``'.xml'``.

    Returns:
        The matching file names (without any directory part), sorted ascending.
    """
    print("[INFO] directory: " + directory)
    target_files = []
    for target_dir in os.listdir(directory):
        sub_directory = directory + "/" + target_dir
        # os.listdir() on a regular file raises NotADirectoryError.
        if not os.path.isdir(sub_directory):
            continue
        for target_file in os.listdir(sub_directory):
            if target_file.endswith(extension):
                print("[INFO] " + extension + "_file: " + sub_directory + "/" + target_file)
                target_files.append(target_file)
    target_files.sort()
    return target_files

In [4]:
def read_xml_file(directory, filename,output_directory, prefix_filename):
    """Write the nodule mask and the source image for each annotated mark of one case.

    The XML is read from ``<directory>/<case>/<filename>``, where ``<case>`` is
    ``filename`` without its extension. Each ``mark`` element is read by
    position: its first child holds the image id and its second child holds the
    annotation as JSON text. For every mark that has annotation text and whose
    image ``<directory>/<case>/<image id>.jpg`` exists, two files named
    ``<prefix_filename>_<case>_<image id>.jpg`` are written via ``save_jpg_file``:

    * ``<output_directory>/mask/``  - 255 inside the annotated contour, 0 elsewhere
    * ``<output_directory>/image/`` - the source image

    Only the first region of the annotation (``region_json[0]``) is used.

    Args:
        directory: Root folder holding one sub-folder per case.
        filename: Name of the case XML file, e.g. ``"576.xml"``.
        output_directory: Folder containing the ``mask`` and ``image`` sub-folders.
        prefix_filename: Label prepended to every output file name.

    Marks with malformed annotation JSON or without contour points are reported
    and skipped. Errors from parsing the XML or decoding the image propagate.
    """
    base_name = os.path.splitext(filename)[0]
    case_directory = directory + "/" + base_name
    xml_path = case_directory + "/" + filename
    print("[INFO] read_xml_file: " + xml_path)
    root = ET.parse(xml_path).getroot()
    for mark in root.iter('mark'):
        image_id = mark[0].text
        annotation_text = mark[1].text
        # Marks without an image id or without annotation text carry nothing to draw.
        if image_id is None or annotation_text is None:
            continue
        try:
            region_json = json.loads(annotation_text)
        except json.decoder.JSONDecodeError as e:
            print("[WARN] invalid annotation JSON in " + xml_path + " (image " + image_id + "): " + str(e))
            continue

        jpg_file = case_directory + "/" + image_id + ".jpg"
        if not os.path.exists(jpg_file):
            continue
        print("[INFO] find_jpg_file: " + jpg_file)

        # NOTE(review): any further regions in region_json are ignored - confirm
        # that a single region per image is intended for the mask.
        points = region_json[0]['points'] if region_json else []
        if not points:
            print("[WARN] no contour points for " + jpg_file)
            continue

        # numpy.asarray strips any ndarray subclass returned by imageio.
        image = numpy.asarray(imageio.imread(jpg_file))
        height, width = image.shape[:2]

        polygon = numpy.array([[int(p['x']), int(p['y'])] for p in points], numpy.int32)
        mask_image = numpy.zeros((height, width), numpy.uint8)
        # fillPoly handles arbitrary (non-convex) contours; fillConvexPoly is only
        # guaranteed for convex/monotone polygons.
        cv2.fillPoly(mask_image, [polygon], 255)

        # imageio decodes colour images as RGB while cv2.imwrite expects BGR.
        if image.ndim == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        output_jpg_name = prefix_filename + "_" + base_name + "_" + image_id + ".jpg"
        save_jpg_file(output_directory + "/mask/" + output_jpg_name, mask_image)
        save_jpg_file(output_directory + "/image/" + output_jpg_name, image)

In [5]:
def read_xml_file_thyroid(directory, filename, output_directory):
    """Write the nodule mask and the source image for each annotated mark of one XML.

    Flat-layout variant: the XML is read from ``<directory>/<filename>`` and the
    image of each mark from ``<directory>/<case>_<image id>.jpg``, where
    ``<case>`` is ``filename`` without its extension. Each ``mark`` element is
    read by position: its first child holds the image id and its second child
    holds the annotation as JSON text. For every mark that has annotation text
    and an existing image, two files with the same name as the input image are
    written via ``save_jpg_file``:

    * ``<output_directory>/mask/``  - 255 inside the annotated contour, 0 elsewhere
    * ``<output_directory>/image/`` - the source image

    Only the first region of the annotation (``region_json[0]``) is used.

    Args:
        directory: Folder holding the XML files and their JPG images.
        filename: Name of the XML file, e.g. ``"10.xml"``.
        output_directory: Folder containing the ``mask`` and ``image`` sub-folders.

    Processing is best-effort per mark: any error raised while handling one mark
    is reported and the next mark is processed. Errors from parsing the XML
    itself propagate.
    """
    xml_path = directory + "/" + filename
    print("[INFO] read_xml_file: " + xml_path)
    prefix_filename = os.path.splitext(filename)[0]
    root = ET.parse(xml_path).getroot()
    for mark in root.iter('mark'):
        if mark[1].text is None:
            continue
        try:
            region_json = json.loads(mark[1].text)
            jpg_name = prefix_filename + "_" + mark[0].text + ".jpg"
            jpg_file = directory + "/" + jpg_name
            if not os.path.exists(jpg_file):
                continue
            print("[INFO] find_jpg_file: " + jpg_file)

            # NOTE(review): any further regions in region_json are ignored -
            # confirm that a single region per image is intended for the mask.
            points = region_json[0]['points'] if region_json else []
            if not points:
                print("[WARN] no contour points for " + jpg_file)
                continue

            # numpy.asarray strips any ndarray subclass returned by imageio.
            image = numpy.asarray(imageio.imread(jpg_file))
            height, width = image.shape[:2]

            polygon = numpy.array([[int(p['x']), int(p['y'])] for p in points], numpy.int32)
            mask_image = numpy.zeros((height, width), numpy.uint8)
            # fillPoly handles arbitrary (non-convex) contours; fillConvexPoly is
            # only guaranteed for convex/monotone polygons.
            cv2.fillPoly(mask_image, [polygon], 255)

            # imageio decodes colour images as RGB while cv2.imwrite expects BGR.
            if image.ndim == 3 and image.shape[2] == 3:
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            save_jpg_file(output_directory + "/mask/" + jpg_name, mask_image)
            save_jpg_file(output_directory + "/image/" + jpg_name, image)
        except Exception as e:
            # Deliberate best-effort: one bad mark must not stop the batch, but
            # say which file it was so the failure can be investigated.
            print("[WARN] skipped a mark of " + xml_path + ": " + repr(e))


In [6]:
def create_image(jpg_file):
    """Read ``jpg_file`` with imageio and return it passed through ``cv2.resize``.

    The target size is the image's own width and height, so the dimensions of
    the returned array equal those of the decoded image.
    """
    source = imageio.imread(jpg_file)
    rows, cols = source.shape[0], source.shape[1]
    # cv2.resize takes the size as (width, height).
    return cv2.resize(source, (cols, rows), interpolation=cv2.INTER_LINEAR)


In [7]:
def save_jpg_file(jpg_output_file, output_image):
    """Write ``output_image`` to ``jpg_output_file`` using ``cv2.imwrite``.

    ``cv2.imwrite`` signals failure through its return value rather than an
    exception (for example when the target directory does not exist), so the
    result is checked and a failed write is reported instead of passing silently.

    Args:
        jpg_output_file: Destination path; the extension selects the format.
        output_image: Image array to encode.

    Returns:
        True when the file was written, False otherwise.
    """
    written = bool(cv2.imwrite(jpg_output_file, output_image))
    if not written:
        print("[ERROR] failed to write image: " + jpg_output_file)
    return written

In [8]:
# Public dataset input folders.
# NOTE(review): 'maling' / 'bening' are kept exactly as written; presumably they
# match the folder names on disk - confirm before "correcting" the spelling.
maligns_folder_name = '../../data/origin/public/maligns/maling'
benigns_folder_name = '../../data/origin/public/benigns/bening'
thyroid_folder_name = '../../data/origin/public/thyroid'

# Root folder for the preprocessed public-dataset outputs.
output_folder_name = '../../data/preprocess/public/'

# One sub-folder per output kind: contour overlay, mask, cropped ROI,
# bounding-box overlay and the source image. exist_ok makes the cell re-runnable.
for sub_dir in ("annotation", "mask", "roi", "box", "image"):
    os.makedirs(output_folder_name + "/" + sub_dir + "/", exist_ok=True)

for folder_name in [maligns_folder_name, benigns_folder_name, thyroid_folder_name]:
    if folder_name == thyroid_folder_name:
        # Flat layout: XML files and images sit directly in the folder.
        xml_files = find_file_thyroid(folder_name, '.xml')
        for xml_file in xml_files:
            read_xml_file_thyroid(folder_name, xml_file, output_folder_name)
    else:
        # Nested layout: one sub-folder per case; outputs are prefixed with the class.
        prefix = "maligns" if folder_name == maligns_folder_name else "benigns"
        xml_files = find_file(folder_name, '.xml')
        for xml_file in xml_files:
            read_xml_file(folder_name, xml_file, output_folder_name, prefix)

# image = misc.imread("data/thyroid_mask/maligns_562_1.jpg")
# print(image[100][440])
# plt.imshow(image)

[INFO] directory: ../../data/origin/public/maligns/maling
[INFO] .xml_file: ../../data/origin/public/maligns/maling/576/576.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/569/569.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/631/631.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/608/608.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/552/552.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/564/564.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/566/566.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/619/619.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/625/625.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/616/616.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/613/613.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/639/639.xml
[INFO] .xml_file: ../../data/origin/public/maligns/maling/644/644.xml
[INFO] .xml_file: ../../data/ori

[INFO] read_xml_file: ../../data/origin/public/maligns/maling/616/616.xml
[INFO] find_jpg_file: ../../data/origin/public/maligns/maling/616/1.jpg
[INFO] read_xml_file: ../../data/origin/public/maligns/maling/618/618.xml
[INFO] find_jpg_file: ../../data/origin/public/maligns/maling/618/1.jpg
[INFO] read_xml_file: ../../data/origin/public/maligns/maling/619/619.xml
[INFO] find_jpg_file: ../../data/origin/public/maligns/maling/619/1.jpg
[INFO] read_xml_file: ../../data/origin/public/maligns/maling/622/622.xml
[INFO] find_jpg_file: ../../data/origin/public/maligns/maling/622/1.jpg
[INFO] read_xml_file: ../../data/origin/public/maligns/maling/623/623.xml
[INFO] find_jpg_file: ../../data/origin/public/maligns/maling/623/1.jpg
[INFO] read_xml_file: ../../data/origin/public/maligns/maling/624/624.xml
[INFO] find_jpg_file: ../../data/origin/public/maligns/maling/624/1.jpg
[INFO] read_xml_file: ../../data/origin/public/maligns/maling/625/625.xml
[INFO] find_jpg_file: ../../data/origin/public/mal

[INFO] read_xml_file: ../../data/origin/public/thyroid/10.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/10_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/100.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/100_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/101.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/101_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/102.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/102_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/103.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/103_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/104.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/104_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/105.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/105_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/106.xml
[INFO] find_jpg_file: ../../data/origin/pub

[INFO] read_xml_file: ../../data/origin/public/thyroid/153.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/153_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/154.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/154_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/155.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/155_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/156.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/156_2.jpg
[INFO] find_jpg_file: ../../data/origin/public/thyroid/156_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/157.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/157_2.jpg
[INFO] find_jpg_file: ../../data/origin/public/thyroid/157_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/158.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/158_2.jpg
[INFO] find_jpg_file: ../../data/origin/public/thyroid/158_1.jpg
[INFO] find_jpg_file: ../../data/orig

[INFO] read_xml_file: ../../data/origin/public/thyroid/209.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/209_1.jpg
[INFO] find_jpg_file: ../../data/origin/public/thyroid/209_2.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/21.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/21_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/210.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/210_3.jpg
[INFO] find_jpg_file: ../../data/origin/public/thyroid/210_1.jpg
[INFO] find_jpg_file: ../../data/origin/public/thyroid/210_2.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/211.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/211_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/212.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/212_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/213.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/213_3.jpg
[INFO] find_jpg_file: ../../data/origin

[INFO] find_jpg_file: ../../data/origin/public/thyroid/266_2.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/267.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/267_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/268.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/268_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/269.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/269_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/27.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/27_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/270.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/270_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/271.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/271_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/272.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/272_1.jpg
[INFO] read_xml_file: ../../data/origin/p

[INFO] read_xml_file: ../../data/origin/public/thyroid/32.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/32_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/320.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/320_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/321.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/321_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/323.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/323_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/326.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/326_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/329.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/329_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/33.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/33_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/330.xml
[INFO] find_jpg_file: ../../data/origin/publi

[INFO] read_xml_file: ../../data/origin/public/thyroid/375.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/375_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/376.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/376_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/377.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/377_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/378.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/378_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/379.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/379_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/38.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/38_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/380.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/380_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/381.xml
[INFO] find_jpg_file: ../../data/origin/pub

[INFO] read_xml_file: ../../data/origin/public/thyroid/70.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/70_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/71.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/71_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/72.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/72_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/73.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/73_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/74.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/74_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/75.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/75_2.jpg
[INFO] find_jpg_file: ../../data/origin/public/thyroid/75_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid/76.xml
[INFO] find_jpg_file: ../../data/origin/public/thyroid/76_1.jpg
[INFO] read_xml_file: ../../data/origin/public/thyroid