# FEATURED : **RSNA 2024 Lumbar Spine Degenerative Classification**

## >> **BUILD DATASET**

- [LSDC Gen Yolo Data SS](https://www.kaggle.com/code/namgalielei/lsdc-gen-yolo-data-ss)

## 00. **SET WORK ENVIRONMENT**

#### 00.1. **DEFINE PRE-VARIABLE**

In [2]:
# Seed constant for the notebook.
seed_num = 2024
# Directory that holds the competition CSV files and the `train_images` folder.
dataset_paths = '../data/raw'

#### 00.2. **IMPORT PACKAGES AND SET OPTIONS**

In [4]:
#(1) Import packages
import os
import warnings
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import iterstrat.ml_stratifiers
import pydicom
import matplotlib.pyplot as plt
import cv2
import glob

#(2) Set system options
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# NumPy: print plain decimals (no scientific notation) with 3 digits.
np.set_printoptions(suppress=True, precision=3)

# pandas: show up to 150 rows and render floats with 4 decimals.
pd.options.display.max_rows = 150
pd.options.display.float_format = '{:.4f}'.format

#### 00.3. **CREATE FUNCTIONS**

In [33]:
#(1)
def get_level(text:str) -> str :
    """Extract the spinal level embedded in a label column name.

    Args:
        text: A string containing one of the level tokens ``l1_l2``,
            ``l2_l3``, ``l3_l4``, ``l4_l5`` or ``l5_s1``
            (e.g. ``'spinal_canal_stenosis_l1_l2'``).

    Returns:
        The level formatted as ``'L1/L2'``, ``'L5/S1'``, ...

    Raises:
        ValueError: If none of the known level tokens occurs in ``text``.
    """
    for lev in ('l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1'):
        if lev in text:
            upper, lower = lev.split('_')
            return f'{upper.capitalize()}/{lower.capitalize()}'
    # Report the offending input, not the last candidate that was tried.
    raise ValueError(f'Level not found in {text!r}')
    
#(2) 
def get_condition(text:str) -> str :
    """Turn a label column name into a human-readable condition name.

    The name is split on ``'_'``, the last two tokens are discarded (in the
    notebook's usage they are the level suffix such as ``l1``/``l2``), and the
    remaining tokens are capitalized and joined with single spaces.

    Args:
        text: Column name, e.g. ``'left_subarticular_stenosis_l1_l2'``.

    Returns:
        The condition name, e.g. ``'Left Subarticular Stenosis'``. An empty
        string is returned when ``text`` has two tokens or fewer.
    """
    words = text.split('_')[:-2]
    return ' '.join(word.capitalize() for word in words)

#(3)
def query_train_xy_row(tar_df, std_df, study_id, series_id=None, instance_num=None):
    """Select the rows of ``tar_df`` that belong to a study / series / instance.

    Args:
        tar_df: DataFrame to filter; must have ``study_id`` and, depending on
            the arguments given, ``series_id`` and ``instance_number`` columns.
        std_df: Unused. Kept only so existing callers keep working.
        study_id: Study identifier to match.
        series_id: Optional series identifier.
        instance_num: Optional instance number.

    Returns:
        A filtered view of ``tar_df``:
        * both ``series_id`` and ``instance_num`` given -> match all three keys;
        * neither given -> match ``study_id`` only;
        * otherwise -> match ``study_id`` and ``series_id``.
    """
    same_study = tar_df.study_id == study_id
    if series_id is not None and instance_num is not None:
        return tar_df[
            same_study
            & (tar_df.series_id == series_id)
            & (tar_df.instance_number == instance_num)
        ]
    if series_id is None and instance_num is None:
        return tar_df[same_study]
    # The mask must come from the frame being indexed; a mask taken from another
    # frame is aligned on index labels and selects unrelated (or no) rows.
    return tar_df[same_study & (tar_df.series_id == series_id)]

#(4) 
def read_dcm(src_path):
    """Load a DICOM file and return it as a 3-channel ``uint8`` image.

    The pixel array is min-max scaled to the 0..255 range (a small epsilon in
    the denominator avoids division by zero for constant images), then the
    result is repeated three times along a new last axis and cast to ``uint8``.

    Args:
        src_path: Path of the ``.dcm`` file to read.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` with a trailing axis of size 3.
    """
    pixels = pydicom.dcmread(src_path).pixel_array
    lowest, highest = pixels.min(), pixels.max()
    scaled = (pixels - lowest) / (highest - lowest + 1e-6) * 255
    return np.stack([scaled, scaled, scaled], axis=-1).astype('uint8')

#(5)
def get_accronym(text):
    """Build an acronym from the first character of each space-separated word.

    Empty tokens (produced by an empty string or by leading, trailing or
    repeated spaces) are skipped instead of raising ``IndexError``.

    Args:
        text: Phrase such as ``'Left Subarticular Stenosis'``.

    Returns:
        The concatenated initials, e.g. ``'LSS'``; ``''`` for an empty phrase.
    """
    return ''.join(word[0] for word in text.split(' ') if word)

#(6)
def gen_yolo_format(OUT_DIR, IMG_DIR, ann_df, phase='train'):
    """Export images and YOLO-format label files for one dataset phase.

    ``ann_df`` is grouped by ``(study_id, series_id, instance_number)``. For
    each group the DICOM slice ``{IMG_DIR}/{study_id}/{series_id}/{instance}.dcm``
    is converted with ``read_dcm`` and written to
    ``{OUT_DIR}/images/{phase}/{study}_{series}_{instance}.jpg``; the matching
    label file ``{OUT_DIR}/labels/{phase}/...txt`` gets one line per row:
    ``class_id x_center y_center width height`` (coordinates normalized by the
    image size).

    Groups that contain at least one null ``label`` are skipped entirely, so no
    image and no partial label file is produced for them. Rows whose class
    label is missing from ``lb2id`` are skipped and reported once at the end
    instead of being silently dropped.

    This function reads the module-level names ``lb2id``, ``OD_INPUT_SIZE`` and
    ``STD_BOX_SIZE``; they must be defined before it is called.

    Args:
        OUT_DIR: Root directory of the generated YOLO dataset.
        IMG_DIR: Root directory of the source DICOM images.
        ann_df: Annotation table with the columns ``study_id``, ``series_id``,
            ``instance_number``, ``condition``, ``level``, ``label``, ``x``
            and ``y``.
        phase: Sub-directory name under ``images``/``labels`` (e.g. ``'train'``
            or ``'val'``).
    """
    img_dir = os.path.join(OUT_DIR, 'images', phase)
    ann_dir = os.path.join(OUT_DIR, 'labels', phase)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(ann_dir, exist_ok=True)

    unknown_labels = set()
    skipped_rows = 0

    for name, group in tqdm(ann_df.groupby(['study_id', 'series_id', 'instance_number'])):
        study_id, series_id, instance_num = name

        # Decide before any file is written: an incompletely labelled slice
        # must not end up in the dataset at all.
        if group['label'].isnull().any():
            continue

        img = read_dcm(f'{IMG_DIR}/{study_id}/{series_id}/{instance_num}.dcm')
        H, W = img.shape[:2]

        # Same box for every row of this image: STD_BOX_SIZE pixels measured at
        # OD_INPUT_SIZE resolution, expressed as a fraction of the image size.
        width = W / OD_INPUT_SIZE * STD_BOX_SIZE / W
        height = H / OD_INPUT_SIZE * STD_BOX_SIZE / H

        lines = []
        for _, row in group.iterrows():
            class_label = (
                f"{row['condition'].lower().replace(' ', '_')}_"
                f"{row['level'].lower().replace('/', '_')}_"
                f"{row['label'].lower().replace('/', '_')}"
            )
            class_id = lb2id.get(class_label)
            if class_id is None:
                # Keep exporting the other boxes, but remember what was dropped.
                unknown_labels.add(class_label)
                skipped_rows += 1
                continue
            x_center = row['x'] / W
            y_center = row['y'] / H
            lines.append(f'{class_id} {x_center} {y_center} {width} {height}\n')

        stem = f'{study_id}_{series_id}_{instance_num}'
        cv2.imwrite(os.path.join(img_dir, f'{stem}.jpg'), img)
        with open(os.path.join(ann_dir, f'{stem}.txt'), 'w') as f:
            f.writelines(lines)

    if skipped_rows:
        print(
            f'>> [{phase}] skipped {skipped_rows} annotation(s) whose class label '
            f'is not in lb2id: {sorted(unknown_labels)}'
        )

#### 00.4. **CREATE CLASSES**

In [4]:
# Placeholder cell: no classes are defined in this section.
pass

<b></b>

## 01. **READ AND CONCATENATE DATASETS**

##### 01.1. **READ DATASETS**

In [7]:
#(1) Read datasets
# Per-series description -- presumably the MRI acquisition type; verify against the data.
train_desc_raw = pd.read_csv(f'{dataset_paths}/train_series_descriptions.csv')
# (x, y) coordinates of the annotated points on the images.
train_coord_raw = pd.read_csv(f'{dataset_paths}/train_label_coordinates.csv')
# Multi-label targets (one row of ground-truth values per study).
train_label_raw = pd.read_csv(f'{dataset_paths}/train.csv')
# Test-side series descriptions and the sample submission file.
test_desc_raw = pd.read_csv(f'{dataset_paths}/test_series_descriptions.csv')
submission_raw = pd.read_csv(f'{dataset_paths}/sample_submission.csv')

In [None]:
# Display the raw label table.
train_label_raw

##### 01.2. **SPLIT DATASETS**

In [None]:
#(1) Replace missing labels by the explicit category 'Unknown'
train_label = train_label_raw.fillna('Unknown')

#(2) Multilabel-stratified 5-fold splitter with a fixed random state
mskf = iterstrat.ml_stratifiers.MultilabelStratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

#(3) Tag every row with the index of the fold in which it is held out.
#    Every column except the first one is used as the stratification target.
for fold, (_, holdout_index) in enumerate(
    mskf.split(X=train_label, y=train_label.iloc[:, 1:])
):
    train_label.loc[holdout_index, 'fold_num'] = fold

#(4) The column was filled through `.loc`, so cast it to an integer dtype
train_label['fold_num'] = train_label['fold_num'].astype('int')

#(5) Lookup table: study_id -> fold_num
fold_idx = train_label[['study_id', 'fold_num']]

#(6) Show the fold assignment
fold_idx

##### 01.3. **_**

In [None]:
#(1) Unpivot the wide label table: one output row per (study, label column)
study_col, condition_col, level_col, label_col = [], [], [], []
for _, wide_row in train_label_raw.iterrows():
    study_id = wide_row['study_id']
    # Every column after the first one is a '<condition>_<level>' label column.
    for column_name, label in wide_row.iloc[1:].to_dict().items():
        level = get_level(column_name)
        condition = get_condition(column_name)
        study_col.append(study_id)
        condition_col.append(condition)
        level_col.append(level)
        label_col.append(label)

#(2) Assemble the long-format frame
train_label_lf = pd.DataFrame(data={
    'study_id'  : study_col,
    'condition' : condition_col,
    'level'     : level_col,
    'label'     : label_col,
})

#(3) Attach the fold number, then the coordinates joined with their series description
train_label_lf = pd.merge(left=train_label_lf, right=fold_idx, on='study_id')
train_xy = pd.merge(
    left=train_coord_raw, right=train_desc_raw,
    how='inner', on=['study_id', 'series_id'],
)
train_label_lf = pd.merge(
    left=train_label_lf, right=train_xy,
    how='inner', on=['study_id', 'condition', 'level'],
)

#(4) Show the merged table
train_label_lf

<b></b>

## 02.

##### 02.1. **CHECK SAMPLE**

In [None]:
#(1) Draw one random annotated row and build the path of its DICOM slice
example = train_label_lf.sample(n=1).iloc[0, :]
study_id, series_id, instance_num = (
    example.study_id,
    example.series_id,
    example.instance_number,
)
src_path = f'{dataset_paths}/train_images/{study_id}/{series_id}/{instance_num}.dcm'

#(2) Load the slice as a 3-channel uint8 image
img = read_dcm(src_path)

#(3) Collect every annotation that sits on this exact slice
tmp_df = query_train_xy_row(
    train_label_lf,
    train_xy,
    study_id,
    series_id=series_id,
    instance_num=instance_num,
)

#(4) Show the annotations
tmp_df

In [None]:
#(5) Overlay a labelled square on the sample image for each annotation
WIDTH = 10           # half side length (pixels) of the square drawn around (x, y)
OD_INPUT_SIZE = 384  # also read as a global by gen_yolo_format
STD_BOX_SIZE = 20    # also read as a global by gen_yolo_format

# Colors are RGB because the image is displayed with matplotlib below.
SEVERITY_COLORS = {
    'Normal/Mild': (0, 255, 0),    # GREEN
    'Moderate':    (255, 255, 0),  # YELLOW
    'Severe':      (255, 0, 0),    # RED
}
# Missing (NaN) or unexpected labels still get drawn instead of passing
# color=None to OpenCV.
FALLBACK_COLOR = (255, 255, 255)   # WHITE

fontFace = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 0.5
thickness = 1

for _, row in tmp_df.iterrows():
    lbl = f"{get_accronym(row['condition'])}_{row['level']}"
    x, y = row['x'], row['y']
    x1, y1 = int(x - WIDTH), int(y - WIDTH)
    x2, y2 = int(x + WIDTH), int(y + WIDTH)
    color = SEVERITY_COLORS.get(row['label'], FALLBACK_COLOR)

    cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
    cv2.putText(img, lbl, (x1, y1), fontFace, fontScale, color, thickness, cv2.LINE_AA)

#(6) Show the annotated image
plt.imshow(img)
plt.show()

##### 02.2. **_**

In [None]:
#(1) Vocabulary used to build the detection classes
# To use every condition instead: conditions = train_label_lf.loc[:, 'condition'].unique()
conditions = ['Left Subarticular Stenosis', 'Right Subarticular Stenosis']
severities = ['Normal/Mild', 'Moderate', 'Severe']
levels = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']

#(2) Keep only the annotations of the selected conditions
is_selected = train_label_lf['condition'].isin(conditions)
train_label_lf_flt = train_label_lf[is_selected]

#(3) Show the filtered table
train_label_lf_flt

In [None]:
#(4) Build the class-name <-> class-id maps (condition x level x severity)
lb2id = {}
id2lb = {}
for cond in conditions :
    cond_key = cond.lower().replace(' ', '_')
    for level in levels :
        for severity in severities :
            # gen_yolo_format looks classes up with '/' replaced by '_', so the
            # keys must be normalized the same way ('Normal/Mild' -> 'normal_mild');
            # otherwise every Normal/Mild annotation fails the lookup.
            severity_key = severity.lower().replace('/', '_')
            cls_ = f"{cond_key}_{level}_{severity_key}"
            class_id = len(lb2id)  # ids are assigned consecutively from 0
            lb2id[cls_] = class_id
            id2lb[class_id] = cls_

#(5) Show the id -> class-name map
id2lb

In [None]:
#(6) Fold numbers to export and the root directory of the source DICOM images
folds = [0, 1, 2, 3, 4]
img_path = f'{dataset_paths}/train_images'

#(7) For each fold, write a YOLO dataset in which that fold is the validation split
for fold in folds :
    print('>> Gen data fold', fold)
    output_path = f'../data/yolo/fold{fold}'
    os.makedirs(output_path, exist_ok=True)

    # Rows of the current fold go to 'val', all remaining rows go to 'train'.
    in_val_fold = train_label_lf_flt.fold_num == fold
    train_df = train_label_lf_flt[~in_val_fold]
    val_df = train_label_lf_flt[in_val_fold]

    for phase, phase_df in (('train', train_df), ('val', val_df)) :
        gen_yolo_format(OUT_DIR=output_path, IMG_DIR=img_path, ann_df=phase_df, phase=phase)