#  sarcopenia preprocess

### 준비 사항
- dcm 파일 혹은 img
- dcm인 경우 몇 번째 dcm이 l3인지 알려주는 정보
- mask 파일
- 

## 포맷

- [segmentation_models_pytorch](https://segmentation-modelspytorch.readthedocs.io/en/latest/) 라이브러리를 채택
- 위 라이브러리의 인풋과 아웃풋을 따름

```md
code_home
├── annotations
│   ├── trimaps
│   └── xmls
└── images

```

- annotations/trimaps: PNG files, 0(배경) or 1(obj)로 전처리하여 사용
- annotations/xmls: 이미지 메타 정보
- images: JPG images


## Training set 처리

```md
train_data_home
├── SarcopeniaCase1
│   ├── 20061220_DCM
│   ├── 20061220_ROI
│   ├── 20061220_RAW.txt
│   ├── [날짜]_DCM
│   ├── [날짜]_ROI
│   └── [날짜]_RAW.txt
├── SarcopeniaCase2
├── SarcopeniaCase3
└── SarcopeniaCase4
```

In [7]:
import os
from glob import glob
from PIL import Image
import pydicom
import numpy as np

### image 처리

#### img 처리 함수들

In [8]:
def norm_img(img):
    norm_image = np.array(img, dtype=np.float64)
    norm_image -= np.min(norm_image)
    norm_image /= np.max(norm_image)
    normalized_image = norm_image * 255
    normalized_image = normalized_image.astype(np.uint8)
    return normalized_image
    
def dcm_windowing(ds, center=40, width=400):
    dcm_array = ds.pixel_array
    hu_image = dcm_array * ds.RescaleSlope + ds.RescaleIntercept
    # try:
    #     hu_image = dcm_array * dcm.RescaleSlope + dcm.RescaleIntercept
    # except AttributeError:
    #     hu_image = dcm_array
    # hu_image[hu_image < -1024] = -1024

    # img_min = dcm.WindowCenter - dcm.WindowWidth // 2
    # img_max = dcm.WindowCenter + dcm.WindowWidth // 2
    img_min = center - width // 2
    img_max = center + width // 2
    
    window_image = hu_image.copy()
    window_image[window_image < img_min] = img_min
    window_image[window_image > img_max] = img_max

    normalized_img = norm_img(window_image)
    p_image = Image.fromarray(normalized_img)
    p_image = p_image.resize((512, 512))
    
    return p_image


def get_slice_no(raw_path):
    l3_dcm_id = None
    with open(raw_path, 'r', encoding="utf-8") as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        if line.strip() == "#SliceNo":
            l3_dcm_id = int(lines[i+1])
            return l3_dcm_id
    return l3_dcm_id

#### 원본 이미지 저장(train_data)

In [None]:
train_data_home = "C:\\Users\\qwe14\\0.code\\sarcopenia\\data_train_raw\\#AsanNas\\"
train_img_save_path = "C:\\Users\\qwe14\\0.code\\sarcopenia\\data_train\\images\\"

In [9]:
train_data_dirs = os.listdir(train_data_home)
data_numbering = 0
relation_table = ""
error_table = ""
for train_dir in train_data_dirs:
    dir_path = os.path.join(train_data_home, train_dir)
    data_ids = glob(f"{dir_path}\\*")
    data_ids = list(set([date.split("\\")[-1].split(".")[0][:-4] for date in data_ids]))
    for data_id in data_ids:
        data_numbering += 1
        raw_path = os.path.join(dir_path, f"{data_id}_RAW.txt")
        try:
            dcm_no = get_slice_no(raw_path)
        except FileNotFoundError as e:
            error_table += f"FileNotFound(RAW.txt): {raw_path}\n"
        
        # image code
        dcm_path = os.path.join(dir_path, f"{data_id}_DCM{os.sep}{dcm_no:05d}.dcm")
        try:
            ds = pydicom.dcmread(dcm_path)
        except pydicom.errors.InvalidDicomError as e:
            error_table += f"InvalidDicom: {dcm_path}\n"
        except FileNotFoundError as e:
            error_table += f"FileNotFoundError(dcm): {dcm_path}\n"

        jpg_path = os.path.join(train_img_save_path, f"{data_numbering:05d}_{data_id}.jpg")

        generate_jpg = jpg_path.split("\\")[-1]
        dcm_path = "/".join(dcm_path.split("\\")[-3:-1])
        relation_table += f"{dcm_path}\t{generate_jpg}\n"
        img_array = dcm_windowing(ds)
        img_array.save(jpg_path)
    with open("data_relation.txt", "w", encoding='utf-8') as f:
        f.write(relation_table)
    with open("data_error.txt", "w", encoding='utf-8') as f:
        f.write(error_table)


  _warn_about_invalid_encoding(encoding)


### annotation 처리

In [10]:
import numpy as np
import copy
from PIL import Image

In [11]:
data_home = "C:\\Users\\qwe14\\0.code\\sarcopenia\\data_train_raw\\#AsanNas\\"
annot_home = "C:\\Users\\qwe14\\0.code\\sarcopenia\\data_train\\annotations\\"

In [16]:
with open("data_relation.txt", "r", encoding='utf-8') as f:
    lines = f.readlines()

err_msg = ""
train_annot_data = ""
for line in lines:
    # SarcopeniaCase138/20150717_DCM	00081_20150717.jpg
    dcm_dir_path, jpg_path = line.strip().split("\t")
    dcm_roi_path = dcm_dir_path.replace("DCM", "ROI") # SarcopeniaCase138/20150717_DOI
    try:
        roi_img_path = glob(os.path.join(data_home, dcm_roi_path, "Mask*.png"))[0]
        hu_val = np.array(Image.open(roi_img_path))
        
        hu_val = hu_val.astype(np.int8)
        unique = np.unique(hu_val)
        if len(unique) != 4:
            err_msg += f"Not enough annot:\t{dcm_dir_path}\t{jpg_path}\t{unique}\n"
        else:
            class_d = ["S", "M", "V"]# 1 == Subcutaneous Fat, 2 == Muscle, 3 == Visceral Fat
            for i, class_init in enumerate(class_d):
                mask_img = np.zeros_like(hu_val)
                mask_img[hu_val == i + 1] = hu_val[hu_val == i + 1]
                mask_img[mask_img == i+1] = np.array(255, dtype=np.int8)
                # mask_img = (mask_img - mask_img.min()) / (mask_img.max() - mask_img.min())
                p_image = Image.fromarray(mask_img)
                p_image = p_image.resize((512, 512))
                new_annot_img = f"{annot_home}{class_init}{jpg_path}"
                p_image = p_image.convert('RGB')
                p_image.save(new_annot_img)
                train_annot_data += f"{class_init}{jpg_path} {i+1} {i+1} {1}\n"
    except IndexError:
        err = os.path.join(data_home, dcm_roi_path, "Mask*.jpg")
        err_msg += f"Index:\t{err}\n"
    
with open("annot_error.txt", "w", encoding='utf-8') as f:
    f.write(err_msg)
with open("annot_train.txt", "w", encoding='utf-8') as f:
    f.write(train_annot_data)

For the old behavior, usually:
    np.array(value).astype(dtype)
will give the desired result (the cast overflows).
  mask_img[mask_img == i+1] = np.array(255, dtype=np.int8)
