/
prepare_meta_dicom.py
143 lines (113 loc) · 4.32 KB
/
prepare_meta_dicom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import pickle
import random
import glob
import datetime
import pandas as pd
import numpy as np
import torch
import cv2
import pydicom
from tqdm import tqdm
from joblib import delayed, Parallel
import zipfile
from pydicom.filebase import DicomBytesIO
import sys
sys.path.insert(0, 'scripts')
from logs import get_logger, dumpobj, loadobj
# Create the module-level logger used by every step of this script and emit a
# start-up timestamp.
# get_logger comes from the project-local `logs` module; its two arguments are
# presumably the logger name and the level -- TODO confirm against logs.py.
logger = get_logger('Prepare Data', 'INFO') # noqa
# NOTE(review): the message text says "Cuda set up", but no CUDA call is
# visible in this script; the statement only logs the current wall-clock time.
logger.info('Cuda set up : time {}'.format(datetime.datetime.now().time()))
def get_dicom_value(x, cast=int):
    """Return a DICOM element value as a single scalar converted by ``cast``.

    Multi-valued input (a pydicom ``MultiValue`` or a tuple) is reduced to
    its first item before conversion.

    Args:
        x: Raw element value, either a scalar or a multi-valued sequence.
        cast: Callable applied to the selected value; defaults to ``int``.

    Returns:
        ``cast(x[0])`` for multi-valued input, otherwise ``cast(x)``.

    Raises:
        IndexError: If ``x`` is an empty multi-valued sequence.
    """
    # isinstance (instead of an exact type match) also accepts subclasses,
    # e.g. namedtuples. The tuple test runs first so plain tuples are
    # recognised without touching pydicom at all.
    if isinstance(x, tuple) or isinstance(x, pydicom.multival.MultiValue):
        return cast(x[0])
    return cast(x)
def cast(value):
    """Convert a pydicom ``MultiValue`` into a plain tuple.

    Args:
        value: Any attribute value read from a DICOM dataset.

    Returns:
        ``tuple(value)`` when ``value`` is a ``MultiValue`` (or a subclass of
        it); otherwise ``value`` itself, unchanged.
    """
    # Reference the class through pydicom.multival, the same path
    # get_dicom_value uses, instead of reaching it via pydicom.valuerep.
    # isinstance also covers subclasses, which an exact `type(...) is` test
    # would let through unconverted.
    if isinstance(value, pydicom.multival.MultiValue):
        return tuple(value)
    return value
def get_dicom_raw(dicom):
    """Collect the capitalised attributes of ``dicom`` into a dict.

    Every name returned by ``dir(dicom)`` whose first character is upper-case
    is included, except ``PixelData``. Each value is passed through ``cast``
    before being stored.

    Args:
        dicom: Object whose attributes are read with ``getattr``.

    Returns:
        A dict mapping attribute name to the cast attribute value.
    """
    raw = {}
    for attr in dir(dicom):
        # Skip the pixel payload and anything not starting with a capital.
        if not attr[0].isupper() or attr == 'PixelData':
            continue
        raw[attr] = cast(getattr(dicom, attr))
    return raw
def rescale_image(image, slope, intercept):
    """Apply the linear transform ``image * slope + intercept``.

    Args:
        image: Value or array to transform.
        slope: Multiplicative factor.
        intercept: Additive offset applied after scaling.

    Returns:
        The transformed value; the input is not modified.
    """
    scaled = image * slope
    return scaled + intercept
def apply_window(image, center, width):
    """Clamp a copy of ``image`` to the window around ``center``.

    The window spans ``center - width // 2`` to ``center + width // 2``.
    Values below the lower bound are set to it and values above the upper
    bound are set to it.

    Args:
        image: Array supporting ``.copy()`` and boolean-mask assignment.
        center: Centre of the window.
        width: Full width of the window; halved with integer division.

    Returns:
        A clamped copy; the input array is left untouched.
    """
    half_width = width // 2
    lower = center - half_width
    upper = center + half_width

    windowed = image.copy()
    windowed[windowed < lower] = lower
    windowed[windowed > upper] = upper
    return windowed
def get_dicom_meta(dicom):
    """Extract identifier, window and rescale fields from a DICOM dataset.

    Args:
        dicom: Object exposing the attributes read below.

    Returns:
        A dict with the keys ``PatientID``, ``StudyInstanceUID``,
        ``SeriesInstanceUID``, ``WindowWidth``, ``WindowCenter``,
        ``RescaleIntercept`` and ``RescaleSlope``, in that order.
    """
    meta = {}

    # Identifiers are copied as-is. Notes carried over from the original
    # comments: PatientID can be grouped (20-548); the two UIDs can be
    # grouped (20-60).
    for key in ('PatientID', 'StudyInstanceUID', 'SeriesInstanceUID'):
        meta[key] = getattr(dicom, key)

    # Window settings go through get_dicom_value.
    for key in ('WindowWidth', 'WindowCenter'):
        meta[key] = get_dicom_value(getattr(dicom, key))

    # Rescale parameters are stored as floats. Note carried over from the
    # original comments: RescaleSlope is all the same (1.0).
    for key in ('RescaleIntercept', 'RescaleSlope'):
        meta[key] = float(getattr(dicom, key))

    return meta
def apply_window_policy(image):
    """Build a three-channel image from three fixed windows of ``image``.

    For each window the image is clamped with ``apply_window``, shifted by the
    window's lower bound, divided by the window width, and then has its own
    mean subtracted. The three results are stacked and the leading channel
    axis is moved to the end with ``transpose(1, 2, 0)``.

    Args:
        image: Array to window; passed unchanged to ``apply_window``.

    Returns:
        Array with the channels on the last axis, ordered brain, subdural,
        bone.
    """
    # (center, width) of each channel, in output order.
    windows = (
        (40, 80),    # brain
        (80, 200),   # subdural
        (40, 380),   # bone
    )

    channels = []
    for center, width in windows:
        clipped = apply_window(image, center, width)
        # center - width // 2 is the lower clamp bound (0, -20 and -150 for
        # the three windows), so the clamped range is rescaled to [0, 1].
        scaled = (clipped - (center - width // 2)) / width
        channels.append(scaled - scaled.mean())

    return np.array(channels).transpose(1, 2, 0)
def convert_dicom_to_jpg(name):
    """Convert one archive member into a windowed JPEG under ``PATHPROC``.

    Relies on module-level state: ``f`` (the open zip archive the member is
    read from), ``rescaledict`` (per-image ``RescaleSlope`` and
    ``RescaleIntercept`` lookups keyed by image name), ``PATHPROC`` (output
    root) and ``logger``.

    The output goes to ``PATHPROC/train`` when ``name`` contains ``'train'``,
    otherwise to ``PATHPROC/test``. The file name is the member's base name
    with ``.dcm`` removed and ``.jpg`` appended.

    Conversion is best-effort: any failure is logged with the member name and
    the error, and the function returns normally so the caller's loop
    continues.

    Args:
        name: Member name inside the zip archive.

    Returns:
        None.
    """
    try:
        data = f.read(name)
        dirtype = 'train' if 'train' in name else 'test'
        imgnm = (name.split('/')[-1]).replace('.dcm', '')
        dicom = pydicom.dcmread(DicomBytesIO(data))
        image = dicom.pixel_array
        image = rescale_image(
            image,
            rescaledict['RescaleSlope'][imgnm],
            rescaledict['RescaleIntercept'][imgnm],
        )
        image = apply_window_policy(image)
        # Shift every channel so its minimum is zero before scaling to 8 bits.
        image -= image.min((0, 1))
        image = (255 * image).astype(np.uint8)
        target = os.path.join(PATHPROC, dirtype, imgnm) + '.jpg'
        # cv2.imwrite reports failure through its return value, so check it
        # instead of assuming the file was written.
        if not cv2.imwrite(target, image):
            logger.info('{} failed: could not write {}'.format(name, target))
    except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # stop the run; the error is logged so skipped members can be traced.
        logger.info('{} failed: {!r}'.format(name, err))
def generate_df(base, files):
    """Read DICOM files and tabulate their element values, one row per file.

    Each keyword reported by ``dcm.dir()`` becomes a column, except ``Rows``,
    ``Columns`` and ``PixelData``. A keyword that a file does not provide is
    recorded as ``None`` for that file, so every column always has exactly
    one entry per file and rows stay aligned with ``files``.

    Args:
        base: Directory that contains the files.
        files: Iterable of file names relative to ``base``.

    Returns:
        A ``pandas.DataFrame`` with one row per file, in input order.
    """
    ignored = {'Rows', 'Columns', 'PixelData'}
    columns = {}
    rows_seen = 0
    for filename in tqdm(files):
        dcm = pydicom.dcmread(os.path.join(base, filename))
        for name in dcm.dir():
            if name in ignored:
                continue
            if name not in columns:
                # First sighting of this keyword: back-fill earlier files.
                columns[name] = [None] * rows_seen
            columns[name].append(dcm[name].value)
        rows_seen += 1
        # Pad keywords this file did not provide so all columns keep the
        # same length.
        for values in columns.values():
            if len(values) < rows_seen:
                values.append(None)
    return pd.DataFrame(columns)
# Input and output locations, all relative to the working directory.
DATAPATH = 'data'
TRAIN_DIR = os.path.join(DATAPATH, 'raw/stage_2_train_images')
TEST_DIR = os.path.join(DATAPATH, 'raw/stage_2_test_images')
PATHPROC = os.path.join(DATAPATH, 'proc')

# Step 1: dump the DICOM header fields of every test and train file to CSV.
logger.info('Create test meta files')
test_files = os.listdir(TEST_DIR)
test_df = generate_df(TEST_DIR, test_files)
test_df.to_csv(os.path.join(DATAPATH, 'test_metadata.csv'))

logger.info('Create train meta files')
train_files = os.listdir(TRAIN_DIR)
train_df = generate_df(TRAIN_DIR, train_files)
train_df.to_csv(os.path.join(DATAPATH, 'train_metadata.csv'))

# Step 2: reload the CSVs and build the rescale lookup that
# convert_dicom_to_jpg reads, keyed by SOPInstanceUID.
logger.info('Load meta files')
trnmdf = pd.read_csv(os.path.join(DATAPATH, 'train_metadata.csv'))
tstmdf = pd.read_csv(os.path.join(DATAPATH, 'test_metadata.csv'))
# `axis` is passed by keyword: pandas 2.x accepts only `objs` positionally.
mdf = pd.concat([trnmdf, tstmdf], axis=0)
rescaledict = mdf.set_index('SOPInstanceUID')[['RescaleSlope', 'RescaleIntercept']].to_dict()

# Step 3: convert every archive member to a windowed JPEG.
logger.info('Create windowed images')
# Create the output folders up front; cv2.imwrite does not create missing
# directories.
os.makedirs(os.path.join(PATHPROC, 'train'), exist_ok=True)
os.makedirs(os.path.join(PATHPROC, 'test'), exist_ok=True)
# The archive handle must be named `f`: convert_dicom_to_jpg reads it as a
# module-level global.
with zipfile.ZipFile(os.path.join(DATAPATH, "raw/rsna-intracranial-hemorrhage-detection.zip"), "r") as f:
    for name in tqdm(f.namelist()):
        convert_dicom_to_jpg(name)