# Move label
Data cleaning from `raw_data` to `label_data`
## 0. Prepare
### Library and path

In [1]:
import time, random
import os, glob, ntpath
import re
import logging
from shutil import copyfile, copytree, copy
from datetime import datetime as ddt

import numpy as np
import pandas as pd
import pydicom as dicom

from lxml import etree
from tqdm import tqdm

In [2]:
# basically you don't need to change these
detail_descrip_path = '/home/d/pancreas/raw_data/detail_pancreas.csv'
brief_descrip_normal_path = '/home/d/pancreas/raw_data/brief_normal.xlsx'
brief_descrip_tumor_path = '/home/d/pancreas/raw_data/brief_pancreas.xlsx'
source_label_path = '/home/d/pancreas/raw_data/label/'
source_scan_path = '/home/d/pancreas/raw_data/'

# target path
target_base_path = '/home/d/pancreas/new_label_data/'

### Black list
add the id that need to removed

In [26]:
black_list = ['PT3', 'PT5', 'PT43']
# PT3: wrong disease
# PT5: cystic
# PT43: cystic

### Read Description 

In [15]:
detail_df = pd.read_csv(detail_descrip_path, converters={'Code': str}).fillna('')

### Move Labeled Data function

In [127]:
def move_labeldata(label, brief_descrip_path, data_type):
    brief_df = pd.read_excel(brief_descrip_path).fillna('')
    '''
    Usage: Move DICOM and label (nrrd) to specific location.
    '''
    # st = time.time()
    tumor_id = ntpath.basename(label).split('_')[0]
    dtumor_df = detail_df[detail_df['Number'] == tumor_id].reset_index()
    assert dtumor_df.shape[0] == 1, "Tumor id duplicated!"
    
    if tumor_id in black_list:
        print('Skip {} from black list!'.format(tumor_id))
        return False
    
    # Basic Info
    patient_id = dtumor_df['Code'][0]
    btumor_df = brief_df[brief_df['No.'] == tumor_id].reset_index()
    
    # TODO: Check exam date from brief and detail are same or not
    exam_date = ddt.strftime(ddt.strptime(btumor_df['Date'][0], '%Y.%m.%d'), '%Y%m%d')
    series_no = str(int(btumor_df['Series Number'][0]))
    
    # Find
    tumor_parent_path = source_scan_path + '{}/{}/{}/'.format(data_type, patient_id, exam_date)
    target_tumor_parent_path = target_base_path + '{}/{}/'.format(patient_id, tumor_id)
    
    if not os.path.exists(target_tumor_parent_path):
        os.makedirs(target_tumor_parent_path)
    
    check_copy = False
    for dcmpath in glob.glob(tumor_parent_path+'*/*0001.dcm'):
        # Get DICOM series number and avoid dose description
        try:
            dcm_series_no = str(dicom.read_file(dcmpath)[0x0020, 0x0011].value)
        except:
            continue
        if dcm_series_no == series_no:
            # Check if A phase and V phase mix up
            time_list = []
            dcmfiles = glob.glob(os.path.dirname(dcmpath)+'/*.dcm')
            for dcmfile in dcmfiles:
                time_list.append(str(dicom.read_file(dcmfile)[0x0008, 0x0032].value))

            if len(set(time_list)) == 2:
                os.makedirs(target_tumor_parent_path + 'scans')
                for (file, time) in zip(dcmfiles, time_list):
                    if time == max(time_list):
                        copy(file, target_tumor_parent_path + 'scans/')
                copyfile(label, target_base_path + '{}/{}/label.nrrd'.format(patient_id, tumor_id))
                check_copy = True
                continue
            else:
                copytree(os.path.dirname(dcmpath), target_tumor_parent_path + 'scans/')
                copyfile(label, target_base_path + '{}/{}/label.nrrd'.format(patient_id, tumor_id))
                check_copy = True
                continue
    # print('Done {} in {:<5.3f} seconds'.format(tumor_id, time.time()-st))
    if check_copy == False:
        print(tumor_id)
    return check_copy

In [28]:
def move_labeldata_PC(label, data_type):
    '''
    Usage: Move DICOM and label (nrrd) to specific location.
    '''
    # st = time.time()
    tumor_id = ntpath.basename(label).split('_')[0]
    dtumor_df = detail_df[detail_df['Number'] == tumor_id].reset_index()
    assert dtumor_df.shape[0] == 1, "Tumor id duplicated!"
    
    if tumor_id in black_list:
        print('Skip {} from black list!'.format(tumor_id))
        return False
    
    # Basic Info
    patient_id = dtumor_df['Code'][0]
    
    # TODO: Check exam date from brief and detail are same or not
    exam_date = dtumor_df['Exam Date'][0]
    series_no = ntpath.basename(label).split('_')[-1].split('.')[0]
    
    # Find
    tumor_parent_path = source_scan_path + '{}/{}/{}/'.format(data_type, patient_id, exam_date)
    target_tumor_parent_path = target_base_path + '{}/{}/'.format(patient_id, tumor_id)
    
    if not os.path.exists(target_tumor_parent_path):
        os.makedirs(target_tumor_parent_path)
    
    check_copy = False
#     print(len())
    for dcmpath in glob.glob(tumor_parent_path+'*/*I1.dcm'):
        # Get DICOM series number and avoid dose description
        try:
            dcm_series_no = str(dicom.read_file(dcmpath, force = True)[0x0020, 0x0011].value)
        except:
            continue
        if dcm_series_no == series_no:
            # Check if A phase and V phase mix up
            time_list = []
            dcmfiles = glob.glob(os.path.dirname(dcmpath)+'/*.dcm')
            for dcmfile in dcmfiles:
                time_list.append(str(dicom.read_file(dcmfile, force = True)[0x0008, 0x0032].value))
            if len(set(time_list)) == 2:
                print("Find different phase in ", tumor_id)
                os.makedirs(target_tumor_parent_path + 'scans')
                for (file, time) in zip(dcmfiles, time_list):
                    if time == max(time_list):
                        copy(file, target_tumor_parent_path + 'scans/')
                copyfile(label, target_base_path + '{}/{}/label.nrrd'.format(patient_id, tumor_id))
                check_copy = True
                continue
            else:
                copytree(os.path.dirname(dcmpath), target_tumor_parent_path + 'scans/')
                copyfile(label, target_base_path + '{}/{}/label.nrrd'.format(patient_id, tumor_id))
                check_copy = True
                continue
    # print('Done {} in {:<5.3f} seconds'.format(tumor_id, time.time()-st))
    if check_copy == False:
        print(tumor_id)
    return check_copy

## Run the specific task
Please run the type you need. <br>
### 1. normal pancreas: NP

In [6]:
st_tol = time.time()
path = brief_descrip_normal_path
cnt = 0
for label in tqdm(glob.glob(source_label_path + 'NP*.nrrd')):
    try:
        check_copy = move_labeldata(label, path, 'normal')
        cnt += 1 if check_copy else 0
    except:
        continue
print('Done cleaning {} data in {} seconds'.format(cnt, time.time()-st_tol))

100%|██████████| 10/10 [01:00<00:00,  6.07s/it]

Done cleaning 10 data in 60.748693227767944 seconds





### 2.  pancreas with tumor: PT

In [7]:
st_tol = time.time()
path = brief_descrip_tumor_path
cnt = 0
for label in tqdm(glob.glob(source_label_path + 'PT*.nrrd')):
    try:
        check_copy = move_labeldata(label, path, 'tumor')
        cnt += 1 if check_copy else 0
    except:
        continue
print('Done cleaning {} data in {} seconds'.format(cnt, time.time()-st_tol))

  9%|▊         | 4/47 [00:27<04:58,  6.95s/it]

Skip PT43 from black list!


 53%|█████▎    | 25/47 [02:40<02:21,  6.41s/it]

Skip PT3 from black list!


 79%|███████▊  | 37/47 [03:41<00:59,  6.00s/it]

Skip PT5 from black list!


100%|██████████| 47/47 [04:46<00:00,  6.10s/it]

Done cleaning 44 data in 286.6362235546112 seconds





### 3. thick cut pancreas: PC

In [29]:
st_tol = time.time()
cnt = 0
for label in tqdm(glob.glob(source_label_path + 'PC*.nrrd')):
    try:
        check_copy = move_labeldata_PC(label, 'tumor55')
        cnt += 1 if check_copy else 0
    except:
        continue
print('Done cleaning {} data in {} seconds'.format(cnt, time.time()-st_tol))

  9%|▉         | 2/22 [00:09<01:36,  4.80s/it]

Find different phase in  PC35


 77%|███████▋  | 17/22 [00:42<00:12,  2.50s/it]

Find different phase in  PC41


100%|██████████| 22/22 [00:56<00:00,  2.55s/it]

Done cleaning 0 data in 56.164583683013916 seconds





In [38]:
dcmfiles_test = glob.glob('/home/d/pancreas/raw_data/tumor55/000135/20160131/3/*.dcm')

In [41]:
time_list = []
for dcmfile in dcmfiles_test:
    time_list.append(str(dicom.read_file(dcmfile, force = True)[0x0008, 0x0032].value))

In [44]:
time_list

['171510.205892',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171510.205892',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017',
 '171427.112017']