# FEATURED : **RSNA 2024 Lumbar Spine Degenerative Classification**

## >> **EDA**

## 00. **SET WORK ENVIRONMENT**

#### 00.1. **DEFINE PRE-VARIABLES**

In [1]:
# Seed constant defined up front; none of the cells shown in this notebook reads it.
seed_num = 2024

#### 00.2. **IMPORT PACKAGES AND SET OPTIONS**

In [2]:
#(1) Import packages
import os
import warnings
import glob
# import re
import numpy as np
import pandas as pd
import pydicom
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import rc

#(2) Set system options
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Print NumPy floats with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Render pandas floats with 4 decimals and display up to 150 rows.
pd.options.display.float_format = '{:.4f}'.format
pd.options.display.max_rows = 150
# Have matplotlib animations render as JavaScript/HTML players in the notebook.
rc(group='animation', html='jshtml')

#### 00.3. **CREATE FUNCTIONS**

In [3]:
#(1) Create `relocate_col()` function
def relocate_col(df:pd.DataFrame, tar_col:str, std_col:str, how:str='after') -> pd.DataFrame :
    '''
    Reorder columns in a DataFrame by moving a target column relative to a standard column.

    Parameters :
    - df (pd.DataFrame): The DataFrame from which the column will be relocated.
    - tar_col (str): The name of the column to be relocated.
    - std_col (str): The column relative to which `tar_col` will be relocated.
    - how (str, optional): Specifies the placement of `tar_col` relative to `std_col`.
      It can be 'after' (default) or 'before'.

    Returns :
    - pd.DataFrame: A new DataFrame with the column `tar_col` placed directly after
      (or directly before) `std_col`; all other columns keep their relative order.

    Raises :
    - ValueError: If `how` is neither 'after' nor 'before'.
    - KeyError: If `tar_col` or `std_col` is not a column of `df`.
    '''
    if how not in ('after', 'before') :
        # Reject unknown modes up front; otherwise the target column would be silently dropped.
        raise ValueError(f"`how` must be 'after' or 'before', got {how!r}")

    col_list = list(df.columns)
    for col in (tar_col, std_col) :
        if col not in col_list :
            raise KeyError(f'column {col!r} not found in DataFrame')

    if tar_col == std_col :
        # Nothing to move; return the same column selection so the result type is unchanged.
        return df.loc[:, col_list]

    # Remove the target first and look the anchor up in the shortened list, so the
    # insertion index is right whether the target sat before or after the anchor.
    col_list.remove(tar_col)
    std_col_idx = col_list.index(std_col)
    insert_at = std_col_idx + 1 if how == 'after' else std_col_idx
    col_list.insert(insert_at, tar_col)
    return df.loc[:, col_list]

#(2) Create `diagnose_df()` function
def diagnose_df(df:pd.DataFrame) -> pd.DataFrame :
    '''
    Build a per-column diagnostic table for a DataFrame.

    Parameters :
    - df (pd.DataFrame): The DataFrame to be diagnosed.

    Returns :
    - pd.DataFrame: One row per column of `df` with the columns
      COLUMN_NM, DATA_TYPE, ROW_CNT, NA_CNT, NA_RATE, UNIQUE_CNT and UNIQUE_RATE.
      The three *_CNT columns are returned as thousands-separated strings.
    '''
    summary = df.dtypes.to_frame().reset_index()
    summary.columns = ['COLUMN_NM', 'DATA_TYPE']

    # Raw counts first; the rates are derived from them before any formatting.
    summary['ROW_CNT'] = len(df)
    summary['NA_CNT'] = df.isna().sum().to_numpy()
    summary['NA_RATE'] = summary['NA_CNT'] / summary['ROW_CNT']
    summary['UNIQUE_CNT'] = df.nunique().to_numpy()
    summary['UNIQUE_RATE'] = summary['UNIQUE_CNT'] / summary['ROW_CNT']

    # Counts become display strings such as "1,234" only after the rates are computed.
    for cnt_col in ('ROW_CNT', 'NA_CNT', 'UNIQUE_CNT') :
        summary[cnt_col] = summary[cnt_col].apply(lambda cnt: f'{cnt:,.0f}')
    return summary

#(3) Create `check_grby_cnt()` function
def check_grby_cnt(df: pd.DataFrame, grby_cols: list, cnt_col: str, cnt_col_nm: str) -> pd.DataFrame :
    '''
    Count the distinct values of one column within each group.

    Parameters :
    - df (pd.DataFrame): The DataFrame to aggregate.
    - grby_cols (list): Columns that define the groups.
    - cnt_col (str): Column whose distinct values are counted per group.
    - cnt_col_nm (str): Name given to the resulting count column.

    Returns :
    - pd.DataFrame: One row per group, holding the grouping columns followed by `cnt_col_nm`.
    '''
    grouped = df.groupby(by=grby_cols, as_index=False)
    distinct_cnt = grouped.agg({cnt_col: 'nunique'})
    return distinct_cnt.rename(columns={cnt_col: cnt_col_nm})

#(4) Create `get_grby_outlier_pk()` function
def get_grby_outlier_pk(df:pd.DataFrame, grby_cols:list, cnt_col:str, tar_col:str, condition:str) -> np.ndarray : 
    '''
    Return the unique `tar_col` values of the groups whose distinct count satisfies `condition`.

    Parameters :
    - df (pd.DataFrame): The DataFrame to aggregate.
    - grby_cols (list): Columns that define the groups.
    - cnt_col (str): Column whose distinct values are counted per group.
    - tar_col (str): Column of the grouped table whose values are returned; it has to be
      one of the columns present after grouping.
    - condition (str): Comparison source text appended to the count column, e.g. '!= 3'.

    Returns :
    - Unique `tar_col` values (as produced by `Series.unique()`) of the matching groups.
    '''
    # Distinct-count table; the count is stored in a throwaway column named "_".
    temp = check_grby_cnt(df=df, grby_cols=grby_cols, cnt_col=cnt_col, cnt_col_nm='_') 
    # NOTE(review): `condition` is spliced into Python source and executed with eval(),
    # so it must only ever come from trusted notebook code. The evaluated expression
    # refers to the local name `temp`, which therefore must not be renamed.
    CON = eval(f'(temp.loc[:, "_"] {condition})')
    output = temp.loc[CON, tar_col].unique()
    return output

In [4]:
#(5) Create `check_img()` function
def check_img(dir_path:str, coord_df:pd.DataFrame, desc_df:pd.DataFrame, study_id:str, series_id:str) -> None :
    '''
    Show every labelled DICOM slice of one series in a grid, with each label coordinate circled in red.

    Parameters :
    - dir_path (str): Directory that contains the `train_images` folder.
    - coord_df (pd.DataFrame): Label coordinates; the columns `series_id`, `instance_number`,
      `condition`, `level`, `x` and `y` are read.
    - desc_df (pd.DataFrame): Series descriptions; the columns `series_id` and
      `series_description` are read.
    - study_id (str or int): Name of the study folder under `train_images`.
    - series_id (str or int): Name of the series folder under the study folder; it is
      converted with `int()` for both DataFrame look-ups.

    Returns :
    - None: The figure is displayed with `plt.show()`. When no slice of the series has a
      label, a message is printed and nothing is drawn.
    '''
    img_path = f'{dir_path}/train_images/{study_id}/{series_id}'

    # Use the same integer key for both tables so str and int ids behave alike.
    series_key = int(series_id)
    CON = (coord_df.loc[:, 'series_id'] == series_key)
    series_label_coordinates = coord_df.loc[CON, :]
    CON = (desc_df.loc[:, 'series_id'] == series_key)
    series_description = desc_df.loc[CON, 'series_description'].unique()

    # Keep only the slices that carry at least one label, remembering their instance number.
    labelled_slices = []
    for dicom_file in os.listdir(img_path) :
        if not dicom_file.endswith('.dcm') :
            continue
        instance_number = int(dicom_file.split('.')[0])
        CON = (series_label_coordinates.loc[:, 'instance_number'] == instance_number)
        corresponding_coordinates = series_label_coordinates.loc[CON, :]
        if not corresponding_coordinates.empty :
            labelled_slices.append((instance_number, dicom_file, corresponding_coordinates))

    if not labelled_slices :
        # plt.subplots() cannot build a grid with zero rows, so stop early.
        print(f'>> No labelled images found for {study_id}/{series_id}')
        return None

    # os.listdir() order is arbitrary; show the slices in instance-number order.
    labelled_slices.sort(key=lambda item: item[0])

    num_images = len(labelled_slices)
    num_columns = 5
    num_rows = (num_images + num_columns - 1) // num_columns

    fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(4 * num_columns, 4 * num_rows))
    axs = axs.flatten()

    for ax, (_, dicom_file, label_coordinates) in zip(axs, labelled_slices) :
        dicom_data = pydicom.dcmread(fp=os.path.join(img_path, dicom_file))
        ax.imshow(X=dicom_data.pixel_array, cmap='gray')
        ax.set_title(label=f'{study_id}/{series_id}/{dicom_file}')

        conditions = ','.join(label_coordinates.loc[:, 'condition'].unique())
        level = ','.join(map(str, label_coordinates.loc[:, 'level'].unique()))
        ax.set_xlabel(xlabel=f'Series Descriptions: {series_description} \n Conditions: {conditions} \n Level: {level}', fontsize=8)

        # One hollow red circle per labelled coordinate on this slice.
        for _, row in label_coordinates.iterrows() :
            ax.plot(row['x'], row['y'], 'o', markerfacecolor='none', markeredgecolor='red', markersize=10, markeredgewidth=2)

    # Hide the grid cells that were not filled with an image.
    for ax in axs[num_images:] :
        ax.set_visible(False)

    plt.subplots_adjust(hspace=0.5, wspace=0.5)  # Keep labels and titles from overlapping
    plt.show()

#(6) Create `check_vid()` function
def check_vid(dir_path:str, coord_df, desc_df, study_id:int, series_id:int) -> animation.FuncAnimation :
    '''
    Build an animation that steps through the slices of one series and circles the label coordinates.

    Parameters :
    - dir_path (str): Directory that contains the `train_images` folder.
    - coord_df (pd.DataFrame): Label coordinates; the columns `series_id`, `instance_number`,
      `condition`, `x` and `y` are read.
    - desc_df (pd.DataFrame): Series descriptions; the columns `series_id` and
      `series_description` are read.
    - study_id (int): Name of the study folder under `train_images`.
    - series_id (int): Name of the series folder under the study folder.

    Returns :
    - animation.FuncAnimation: One frame per slice whose pixel maximum is non-zero, ordered by
      the number parsed from the file name, shown at 500 ms per frame.

    Raises :
    - ValueError: If the series folder yields no slice with a non-zero pixel.
    '''
    img_path = f'{dir_path}/train_images/{study_id}/{series_id}'
    series_key = int(series_id)

    # Parse the instance number from each file name once, then order the files by it.
    numbered_paths = sorted(
        (
            (int(os.path.splitext(os.path.basename(path))[0].split("-")[-1]), path)
            for path in glob.glob(os.path.join(img_path, "*"))
        ),
        key=lambda pair: pair[0]
    )

    series_description = desc_df[desc_df['series_id'] == series_key]['series_description'].unique()
    # Restrict the coordinates to this series once instead of once per frame.
    series_coords = coord_df[coord_df['series_id'] == series_key]
    conditions = series_coords['condition'].unique()

    images = []
    coordinates = []  # coordinates[k] holds the (x, y) labels of images[k]
    for instance_number, filename in numbered_paths :
        data = pydicom.dcmread(filename).pixel_array
        if data.max() == 0 :  # Skip slices whose pixels are all zero
            continue
        images.append(data)
        instance_coords = series_coords[series_coords['instance_number'] == instance_number]
        coordinates.append(list(zip(instance_coords['x'], instance_coords['y'])))

    if not images :
        raise ValueError(f'no displayable image found in "{img_path}"')

    # Create an animation of the images with markers
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.axis('off')
    im = ax.imshow(images[0], cmap="gray")
    title_text = ax.text(x=0.5, y=1.05, s='', transform=ax.transAxes, ha="center", fontsize=16)
    label_text = ax.text(0.5, -0.1, "", transform=ax.transAxes, ha="center", fontsize=12)

    # Initialize markers (empty at start)
    markers, = ax.plot([], [], 'o', markerfacecolor='none', markeredgecolor='red', markersize=10, markeredgewidth=2)

    def animate_func(i) :
        im.set_array(images[i])
        title_text.set_text(f'{study_id}/{series_id}')
        label_text.set_text(f'Series Descriptions: {series_description} \nConditions: {conditions}')
        if coordinates[i] :  # Draw markers only on frames that carry labels
            x_coords, y_coords = zip(*coordinates[i])
            markers.set_data(x_coords, y_coords)
        else :
            markers.set_data([], [])  # No markers for this frame
        return [im, markers, title_text, label_text]

    anim = animation.FuncAnimation(fig, animate_func, frames=len(images), interval=500)
    # Close the static figure so that only the returned animation is rendered.
    plt.close(fig)
    return anim

In [5]:
#(7) Create `reshape_row()` function
def reshape_row(row) -> pd.DataFrame :
    '''
    Turn one wide label row into a long table with one line per label column.

    Every entry of `row` whose name is not an id / coordinate / description field is read as
    `<condition words>_<level part>_<level part>`: the last two underscore-separated parts
    form the level (e.g. "L1/L2") and the remaining parts the condition (e.g. "Spinal Canal Stenosis").

    Parameters :
    - row: A mapping-like row (such as a pd.Series) exposing `.items()` and a 'study_id' entry.

    Returns :
    - pd.DataFrame: Columns `study_id`, `condition`, `level` and `severity`.
    '''
    non_label_cols = ('study_id', 'series_id', 'instance_number', 'x', 'y', 'series_description')
    label_items = [(name, severity) for name, severity in row.items() if name not in non_label_cols]
    name_parts = [name.split('_') for name, _ in label_items]

    return pd.DataFrame({
        'study_id': [row['study_id'] for _ in label_items],
        'condition': [' '.join(word.capitalize() for word in parts[:-2]) for parts in name_parts],
        'level': [parts[-2].capitalize() + '/' + parts[-1].capitalize() for parts in name_parts],
        'severity': [severity for _, severity in label_items],
    })

#### 00.4. **CREATE CLASSES**

In [6]:
pass

<b></b>

## 01. **READ AND CONCATENATE DATASETS**

##### 01.1. **READ DATASETS**

In [42]:
#(1) Define file-path
# Raw data directory, resolved relative to the current working directory.
file_path = f'{os.getcwd()}/../data/raw'

#(2) Read Datasets
train_desc_raw = pd.read_csv(filepath_or_buffer=f'{file_path}/train_series_descriptions.csv') # description per id? (seems to describe the MRI acquisition technique)
train_coord_raw = pd.read_csv(filepath_or_buffer=f'{file_path}/train_label_coordinates.csv')  # (x, y) coordinates on the image data
train_label_raw = pd.read_csv(filepath_or_buffer=f'{file_path}/train.csv')                    # multi-label targets (array of ground-truth values)
test_desc_raw = pd.read_csv(filepath_or_buffer=f'{file_path}/test_series_descriptions.csv')
submission_raw = pd.read_csv(filepath_or_buffer=f'{file_path}/sample_submission.csv')

<b></b>

## 02. **EDA**

#### 02.1. **`train_series_descriptions` 파악**

- 총 $ \, 1975 $ 개의 study_id 중 $ \, 3 $ 개의 유일한 촬영기법(description)을 갖지 않는 study_id 파악 : $ 343 $ 개

$ \hspace{0.5cm} $ ($ 340 $ 개의 study_id는 $ \, 3 $ 개 이상의 중복된 촬영기법, $ \, 3 $ 개의 study_id는 $ \, 2 $ 개 이하의 촬영기법)

- 총 $ \, 6294 $ 개의 series_id 중 중복된 촬영 기법을 갖는 series_id 파악 : $ \, 714 $ 개

$ \hspace{0.5cm} $ ($ 700 $ 개의 series_id는 axial T2 촬영기법, $ \, 14 $ 개의 series_id는 Sagittal T1 촬영기법)

$ \hspace{0.5cm} \Rightarrow{} $ 02.2. `coord` 데이터셋 탐색

In [None]:
#(1) Per-column summary (dtype, missing count, unique count) of the series-description table
diagnose_df(df=train_desc_raw)

In [None]:
#(2) How many studies have 1, 2, 3, ... distinct series
pd.DataFrame(
    data=check_grby_cnt(df=train_desc_raw, grby_cols=['study_id'], cnt_col='series_id', cnt_col_nm='series_cnt').value_counts(subset='series_cnt')
).reset_index()

In [None]:
# Studies whose number of distinct series_id is not exactly 3.
# The count is taken over `series_id`, not over `series_description`;
# sorting by description beforehand does not change these per-study counts.
study_id_nt3_desc = get_grby_outlier_pk(
    df=train_desc_raw.sort_values(by='series_description'), 
    grby_cols=['study_id'], 
    cnt_col='series_id', 
    tar_col='study_id', 
    condition='!= 3'
)
# Sorted, de-duplicated array of the flagged study ids.
study_id_nt3_desc = np.unique(study_id_nt3_desc)
print(f'>> study id that has not 3 descriptions : {len(study_id_nt3_desc)}')

In [None]:
# Description rows of the flagged studies; `temp` is read again by the next cell.
CON = (train_desc_raw.loc[:, 'study_id'].isin(values=study_id_nt3_desc))
temp = train_desc_raw.loc[CON, :]
temp

In [None]:
#(3)
# Series whose (study_id, series_description) pair occurs more than once within `temp`.
CON = temp.duplicated(subset=['study_id', 'series_description'], keep=False)
series_id_nt3_desc = temp.loc[CON, 'series_id'].values
# Studies owning at least one of those duplicated series ...
CON = train_desc_raw.loc[:, 'series_id'].isin(values=series_id_nt3_desc)
study_id_ov3_desc = train_desc_raw.loc[CON, 'study_id'].unique()
# ... and the flagged studies that own none of them.
study_id_un3_desc = study_id_nt3_desc[~np.isin(element=study_id_nt3_desc, test_elements=study_id_ov3_desc)]
# Number of duplicated series per description.
pd.DataFrame(data=train_desc_raw.loc[CON, :].value_counts(subset='series_description')).reset_index().rename(columns={'count':'series_cnt'})

#### 02.2. **`train_label_coordinates` 파악**

##### 02.2.1. **정형 데이터 파악**

- 중복된 촬영기법을 갖는 $ \, 343 $ 개의 study_id 의 이유 

$ \hspace{0.5cm} \rightarrow{} $ axial T2 에서 증상(condition) LSS, RSS 를 파악하기 위함 (단, 한 series_id 로도 잡을 때도 있음)

$ \hspace{0.975cm} $ 추가로 증상 판단을 위한 모든 디스크 위치(level) 이 한 series_id 에 안잡힐 때도 있어 여러 번 촬영

$ \hspace{0.975cm} $ ($ 631 $ 개의 series_id 는 **일부 디스크 위치가 존재하지 않음**)

$ \hspace{0.6cm} \Rightarrow{} $ series_id 는 **중요하지 않음**

- `train_series_description` 데이터셋 보다 study_id, series_id **적음** $ \rightarrow{} $ 추후 결합할 때 inner join 해야할 듯

- 총 $ \, 1974 $ 개의 study_id, condition 결합키 중 $ \, 5 $ 개의 디스크 위치를 갖지 않는 키 파악 : $ \, 183 $ 개

In [None]:
#(1) Per-column summary of the label-coordinate table
diagnose_df(df=train_coord_raw)

In [None]:
#(2) Coordinates that belong to the flagged studies and to their duplicated series
CON = (
    (train_coord_raw.loc[:, 'study_id'].isin(values=study_id_nt3_desc)) &
    (train_coord_raw.loc[:, 'series_id'].isin(values=series_id_nt3_desc)) 
)
# Sort first, then preview the first 20 rows.
train_coord_raw.loc[CON, :].sort_values(by=['study_id', 'condition', 'level']).head(n=20)

In [15]:
#(3)
# pd.DataFrame(
#     data=check_grby_cnt(df=train_coord_raw, grby_cols=['study_id', 'series_id'], cnt_col='level', cnt_col_nm='level_cnt').value_counts(subset='level_cnt')
# ).reset_index()

In [16]:
# series_id_nt5_level = get_grby_outlier_pk(
#     df=train_coord_raw, 
#     grby_cols=['study_id', 'series_id'], 
#     cnt_col='level', 
#     tar_col='series_id', 
#     condition='!= 5'
# )
# print(f'>> series id that has not 5 level : {len(series_id_nt5_level)}')

In [17]:
# pd.DataFrame(data=temp.value_counts(subset='condition')).reset_index()

In [18]:
# temp.loc[CON, :].head(n=12)

In [None]:
#(4) How many (study_id, condition) pairs have 1, 2, ... distinct levels
pd.DataFrame(
    data=check_grby_cnt(df=train_coord_raw, grby_cols=['study_id', 'condition'], cnt_col='level', cnt_col_nm='level_cnt').value_counts(subset='level_cnt')
).reset_index()

In [None]:
# Unique study ids that own at least one (study_id, condition) pair
# whose number of distinct levels is not exactly 5.
study_id_nt5_level = get_grby_outlier_pk(
    df=train_coord_raw, 
    grby_cols=['study_id', 'condition'], 
    cnt_col='level', 
    tar_col='study_id', 
    condition='!= 5'
)

# The count is taken over levels, so the message says "levels" rather than "descriptions".
print(f'>> study id (and condition) that has not 5 levels : {len(study_id_nt5_level)}')

In [None]:
# Coordinate rows of the studies flagged for not having 5 levels.
CON = (train_coord_raw.loc[:, 'study_id'].isin(values=study_id_nt5_level))
# NOTE(review): `head` runs before `sort_values`, so only the first 21 matching rows
# are sorted — confirm this is intended rather than "sort, then take the head".
train_coord_raw.loc[CON, :].head(n=21).sort_values(by=['study_id', 'condition', 'level'])

##### 02.2.2. **이미지 파악**

- 여러 이미지(instance) 중 일부만 `train_label_coordinates` 데이터셋의 좌표가 찍혀있는 이유 $ \rightarrow{} $ 디스크 위치의 정확한 식별이 안 돼서

In [None]:
#(4) Pick one study / series pair by position for visual inspection
# tar_study_list = np.array(object=os.listdir(path=f'{file_path}/train_images/'))
tar_study_list = train_coord_raw.loc[:, 'study_id'].unique()
tar_study_idx = 1900
tar_study = int(tar_study_list[tar_study_idx])
# Series folders of that study on disk, ignoring macOS `.DS_Store` entries.
tar_series_list = [f for f in os.listdir(path=f'{file_path}/train_images/{tar_study}') if not f.endswith('.DS_Store')]
tar_series_idx = 2
tar_series = int(tar_series_list[tar_series_idx])
CON = train_desc_raw.loc[:, 'series_id'] == tar_series
tar_desc = train_desc_raw.loc[CON, 'series_description'].values
# img_path = f'{file_path}/train_images/{tar_study}/{tar_series}'
print(f'>> Target study id : "{tar_study}"')
# The largest valid index is len - 1, not len.
print(f'   (index : "{tar_study_idx}", max index : "{len(tar_study_list) - 1}")')
print(f'>> Target series id : "{tar_series}"')
print(f'   (index : "{tar_series_idx}", max index : "{len(tar_series_list) - 1}")')
print(f'>> Target Description : {tar_desc}')
# print(f'>> Image Count : {len(os.listdir(path=file_path))}')

In [None]:
# Grid of the labelled slices of the selected series.
check_img(dir_path=file_path, coord_df=train_coord_raw, desc_df=train_desc_raw, study_id=tar_study, series_id=tar_series)

In [None]:
#(4) Animation over the slices of the selected series
check_vid(dir_path=file_path, coord_df=train_coord_raw, desc_df=train_desc_raw, study_id=tar_study, series_id=tar_series)

##### PLUS. **$ \, 343 $ 개의 중복된 촬영 기법(description)을 갖는 `study_id` 파악**

In [None]:
#(1) Position of the study to inspect inside `study_id_nt3_desc`
tar_study_idx = 175

#(2) Duplicated-description series that belong to that study
CON = (
    (train_desc_raw.loc[:, 'series_id'].isin(values=series_id_nt3_desc)) &
    (train_desc_raw.loc[:, 'study_id'] == study_id_nt3_desc[tar_study_idx])
)
# NOTE(review): `[0]` raises IndexError when the chosen study has no duplicated series.
tar_study = train_desc_raw.loc[CON, 'study_id'].unique()[0]
tar_series = train_desc_raw.loc[CON, 'series_id'].unique()

#(3) Show the matching description rows
train_desc_raw.loc[CON, :]

In [None]:
# Label coordinates of the selected series (the study filter is left disabled).
CON = (
    # (train_coord_raw.loc[:, 'study_id'] == tar_study) 
    (train_coord_raw.loc[:, 'series_id'].isin(values=tar_series))
)
train_coord_raw.loc[CON, :]

In [None]:
# One labelled-slice grid per duplicated series of the selected study.
for dup_series_id in tar_series : 
    check_img(dir_path=file_path, coord_df=train_coord_raw, desc_df=train_desc_raw, study_id=tar_study, series_id=dup_series_id)

#### 02.3. **`train` 파악**

- 일부 컬럼에서 결측값 파악 $ \rightarrow{} $ **대부분** study_id 별 전체 이미지 파일(해당되는 모든 study_id)에서 디스크 위치가 파악이 안되서

$ \hspace{0.5cm} $ (파악되지 않는 부분은 추후 모델링하며 성능 개선이 필요하다고 판단되면 더 탐색)

In [None]:
#(1) Per-column summary of the wide label table read from train.csv
diagnose_df(df=train_label_raw)

<b></b>

## 03. **PREPARE DATASETS**

##### 03.1. **WIDE FORM TO LONG FORM**

In [None]:
#(1) Reshape every wide label row into long form and stack the pieces into one frame
train_label_lf = pd.concat(
    objs=[reshape_row(row) for _, row in train_label_raw.iterrows()], 
    ignore_index=True
)

#(2) Display the long-form labels
train_label_lf

#### 03.2. **MERGE**

$ \hspace{0.5cm} $ ① `train_label_lf` (inner_join) `train_coord_raw` $ \Rightarrow{} $ `train_raw`

$ \hspace{0.5cm} $ ② `train_raw` (inner_join) `train_desc_raw` $ \Rightarrow{} $ `train_raw`

$ \hspace{0.5cm} $ ③ `train_raw` (in) `train_image_study_ids`(*) $ \Rightarrow{} $ `train_raw`

$ \hspace{0.5cm} $ ④ `train_raw` (in) `train_image_series_ids`(*) $ \Rightarrow{} $ `train_raw`

$ \hspace{0.5cm} $ (*) : train_image 디렉토리에 존재하는 모든 study_id, series_id 리스트들

In [None]:
#(1) Attach the label coordinates to the long-form labels;
#    the inner join keeps only (study_id, condition, level) keys present in both tables
train_raw = pd.merge(
    left=train_label_lf,
    right=train_coord_raw,
    on=['study_id', 'condition', 'level'],
    how='inner' # left ?
)

#(2) Attach the series description of every (study_id, series_id) pair
train_raw = pd.merge(
    left=train_raw,
    right=train_desc_raw,
    on=['study_id', 'series_id'],
    how='inner'
)

#(3) Disabled: restrict to the studies that have an image folder on disk
# train_img_study_ids = np.array(object=os.listdir(path=f'{file_path}/train_images/'))
# CON = train_raw.loc[:, 'study_id'].isin(values=train_img_study_ids)
# train_raw = train_raw.loc[CON, :]

#(4) Build `row_id` as "<study_id>_<condition in snake case>_<level with '/' replaced by '_'>"
train_raw.loc[:, 'row_id'] = (
    train_raw['study_id'].astype(str) + '_' +
    train_raw['condition'].str.lower().str.replace(' ', '_') + '_' +
    train_raw['level'].str.lower().str.replace('/', '_')
) 

#(5) Rearrange the columns: ids and series info first, condition and severity after the coordinates
train_raw = relocate_col(df=train_raw, tar_col='row_id', std_col='study_id', how='before')
train_raw = relocate_col(df=train_raw, tar_col='series_id', std_col='study_id', how='after')
train_raw = relocate_col(df=train_raw, tar_col='series_description', std_col='series_id', how='after')
train_raw = relocate_col(df=train_raw, tar_col='instance_number', std_col='series_description', how='after')
train_raw = relocate_col(df=train_raw, tar_col='condition', std_col='y', how='after')
train_raw = relocate_col(df=train_raw, tar_col='severity', std_col='y', how='after')

#(6) Preview the merged table
train_raw.head(n=5)

#### 03.3. **CHECK**

In [None]:
#(1) Per-column summary of the merged table
diagnose_df(df=train_raw)

In [None]:
#(2) Studies and series that contain at least one row with a missing severity
CON = (train_raw.loc[:, 'severity'].isna())
temp_01 = train_raw.loc[CON, 'study_id'].unique()
temp_02 = train_raw.loc[CON, 'series_id'].unique()
print(f'>> study id that has NA severitiy : {len(temp_01)}, series_id : {len(temp_02)}')

In [None]:
# Preview rows selected by the `CON` mask left over from the previous cell (missing severity).
train_raw.loc[CON, :].head(n=5)

In [None]:
# Summary of the merged rows that belong to studies flagged for not having 5 levels.
CON = (train_raw.loc[:, 'study_id'].isin(values=study_id_nt5_level))
diagnose_df(df=train_raw.loc[CON, :])

In [None]:
# Coordinate rows of the studies that have a missing severity.
# `temp` is a DataFrame, and passing it to `isin` compares study ids with its column
# labels, so nothing ever matched. NOTE(review): assumes the study-id array `temp_01`
# (studies with NA severity) was the intended filter — confirm.
CON = (train_coord_raw.loc[:, 'study_id'].isin(values=temp_01))
train_coord_raw.loc[CON, :]

In [None]:
# Summary of the long-form labels that belong to studies flagged for not having 5 levels.
CON = (train_label_lf.loc[:, 'study_id'].isin(values=study_id_nt5_level))
diagnose_df(df=train_label_lf.loc[CON, :])

In [None]:
#(3) Number of distinct studies for every (series_description, condition) combination
train_raw.loc[:, ['study_id', 'series_description', 'condition']].drop_duplicates().groupby(by=['series_description', 'condition']).count()

In [None]:
#(3-PLUS) Rows where Spinal Canal Stenosis is labelled on a Sagittal T1 series
CON = (
    (train_raw.loc[:, 'series_description'] == 'Sagittal T1') &
    (train_raw.loc[:, 'condition'] == 'Spinal Canal Stenosis')
)   
train_raw.loc[CON, :]

In [None]:
# Animate one hard-coded study/series pair (presumably picked from the table above — confirm).
check_vid(dir_path=file_path, coord_df=train_coord_raw, desc_df=train_desc_raw, study_id=3637444890, series_id=3951475160)

In [None]:
# Distinct (study_id, series_id) pairs that carry a Spinal Canal Stenosis label.
CON = (train_raw.loc[:, 'condition'] == 'Spinal Canal Stenosis')
train_raw.loc[CON, ['study_id', 'series_id', 'condition']].drop_duplicates()

In [None]:
# Animate another hard-coded study/series pair (presumably picked from the table above — confirm).
check_vid(dir_path=file_path, coord_df=train_coord_raw, desc_df=train_desc_raw, study_id=4282019580, series_id=1547999333)

<b></b>

## 04. **EDA 결론 및 추후 모델링 고려사항**

##### 04.1. **EDA 결론**

- 총 $ \, 1975 $ 개의 study_id 중 $ \, 3 $ 개의 유일한 촬영기법(description)을 갖지 않는 study_id 파악 : $ 343 $ 개

$ \hspace{0.5cm} $ ($ 340 $ 개의 study_id는 $ \, 3 $ 개 이상의 중복된 촬영기법, $ \, 3 $ 개의 study_id는 $ \, 2 $ 개 이하의 촬영기법)

- 총 $ \, 6294 $ 개의 series_id 중 중복된 촬영 기법을 갖는 series_id 파악 : $ \, 714 $ 개

$ \hspace{0.5cm} $ ($ 700 $ 개의 series_id는 axial T2 촬영기법, $ \, 14 $ 개의 series_id는 Sagittal T1 촬영기법)

$ \hspace{0.5cm} \Rightarrow{} $ 02.2.axial T2는 L(좌)SS, R(우)SS 구분을 위해 중복 생성됨, **단  sagittal T1은 좀 더 봐야할 듯(추후 확인 필요)**

- 총 $ \, 1974 $ 개의 study_id 중 $ \, 5 $ 개의 디스크 위치를 갖지 않는 study_id 파악 : $ \, 183 $ 개

$ \hspace{0.5cm} \Rightarrow{} $ **inner join 과정에서 삭제됨(추후 확인 필요)**

- 여러 이미지(instance) 중 일부만 `train_label_coordinates` 데이터셋의 좌표가 찍혀있는 이유 $ \rightarrow{} $ 디스크 위치의 정확한 식별이 안되기 때문

- 촬영 기법(description) 별로 증상(condition)이 그룹화됨

- description = "Sagittal T1" & condition="Spinal Canal Stenosis" 인 건 확인 $ \rightarrow{} $ 추후 확인 필요

##### 04.2. **모델링 고려 사항**

(1) 확인된 목표 

- 여러 이미지 중 유의미한 (좌표가 찍힌?, 증상을 알 수 있는?) 이미지 구분

- 유의미한 이미지 중 심각도(severity) 를 $ \, 3 $ 단계의 확률(소프트맥스) 파악 

(2) 방안

- 1 모델 처리? : 무의미한 이미지(좌표가 없는 이미지)에 대하여 좌표를 (0, 0), 증상을 `unknown` 처리 

- 2 모델 처리? : 무의미한 이미지와 유의미한 이미지를 객체 탐지를 통해 식별 ? 