In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import pydicom
import xgboost as xgb
from glob import glob
from sklearn import cross_validation
from keras.models import Model
from keras.applications.resnet50 import ResNet50, preprocess_input

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
seed=42

In [3]:
BASE_PATH = "/opt/data/liver_cancer"
TRAIN_PATH = os.path.join(BASE_PATH, "train_dataset")
TEST_PATH = os.path.join(BASE_PATH, "test_dataset")
TRAIN_LABEL_PATH = os.path.join(BASE_PATH, "train_label.csv")
SUBMIT_EXAMPLE_PATH = os.path.join(BASE_PATH, "submit_example.csv")
SUBMIT_PATH = os.path.join(BASE_PATH, "results", "submit.csv")

FEATURE_SAVE_PATH = os.path.join(BASE_PATH, "results", "resnet50_feature")

img_width, img_height, img_channel = 224, 224, 3

In [4]:
def read_ct_3d(ct_folder_path):
    slice_imgs = []
    for slice_name in sorted(os.listdir(ct_folder_path)):
            slice_folder_path = os.path.join(ct_folder_path, slice_name)
            slice_dicom = pydicom.read_file(slice_folder_path)
            slice_img = slice_dicom.pixel_array
            slice_img = cv2.resize(slice_img, (img_width, img_height))
            slice_imgs.append(slice_img)
            
    return np.stack(slice_imgs).astype(np.float32)

In [5]:
def get_data_batch(ct_3d_img):
    # combine 3 slices to make RGB images
    batch = []
    for i in range(0, ct_3d_img.shape[0]-3, 3):
        rgb_img = []
        for j in range(3):
            rgb_img.append(ct_3d_img[i+j])
            
        rgb_img = np.array(rgb_img)
        # channel first to channel last
        rgb_img = np.transpose(rgb_img, (1, 2, 0))
        batch.append(rgb_img)
        
    return np.array(batch)

In [6]:
def calc_features(folder_path):
    model = ResNet50(include_top=None, weights='imagenet')
    for ct_folder_name in tqdm(os.listdir(folder_path)):
        ct_folder_path = os.path.join(folder_path, ct_folder_name)
        
        ct_3d_img = read_ct_3d(ct_folder_path)
        batch = get_data_batch(ct_3d_img)
        batch = preprocess_input(batch)
        
        preds = model.predict(batch)
        # average pred for every patient
        preds = np.mean(preds, axis=0)
        preds= preds.reshape(-1)
        np.save(os.path.join(FEATURE_SAVE_PATH, ct_folder_name), preds)

In [7]:
calc_features(TRAIN_PATH)

100%|██████████| 3600/3600 [18:24<00:00,  3.26it/s]


In [153]:
calc_features(TEST_PATH)

100%|██████████| 3974/3974 [22:21<00:00,  2.96it/s]


In [8]:
def train_xgboost():
    df = pd.read_csv(TRAIN_LABEL_PATH)
    
    x = np.array([np.load(FEATURE_SAVE_PATH+"/%s.npy"%(name)) for name in df["id"].tolist()])
    y = df["ret"].as_matrix()
    print (x.shape, y.shape)
    
    train_X, val_X, train_y, val_y = cross_validation.train_test_split(x, y, random_state=seed, stratify=y, test_size=0.2)
    
    clf = xgb.XGBRegressor(max_depth=10,
                          n_estimators=1500,
                          min_child_weight=9,
                          learning_rate=0.05,
                          nthread=8,
                          subsample=0.80,
                          colsample_bylevel=0.80,
                          seed=seed)
    
    clf.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=True, eval_metric="logloss", early_stopping_rounds=50)
    
    return clf

In [9]:
def test_and_make_submission():
    df = pd.read_csv(SUBMIT_EXAMPLE_PATH)
    
    clf = train_xgboost()
    x = np.array([np.load(FEATURE_SAVE_PATH+"/%s.npy"%(name)) for name in df["id"].tolist()])
    preds = clf.predict(x)
    
    # make submission
    df["ret"] = [1 if pred > 0.5 else 0 for pred in preds]
    df.to_csv(SUBMIT_PATH, index=False)
    print(df.head())

In [10]:
test_and_make_submission()

                                     id  ret
0  0013EDC2-8D7A-4A41-AEB5-D3BB592306D2    1
1  0030CBD1-2472-42C4-8CE4-E01A4E8E2F09    1
2  0036DF08-EEEC-467C-8CF1-5A54E0B13CE8    1
3  003D2553-266F-47E3-A420-F5B8F95217A7    0
4  0072E2C1-C395-409B-8078-365DD5C0513E    0
(3600, 2048) (3600,)
[0]	validation_0-logloss:0.663743
Will train until validation_0-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.637379
[2]	validation_0-logloss:0.612236
[3]	validation_0-logloss:0.58747
[4]	validation_0-logloss:0.565178
[5]	validation_0-logloss:0.545206
[6]	validation_0-logloss:0.527159
[7]	validation_0-logloss:0.508771
[8]	validation_0-logloss:0.493235
[9]	validation_0-logloss:0.479213
[10]	validation_0-logloss:0.463816
[11]	validation_0-logloss:0.45074
[12]	validation_0-logloss:0.438447
[13]	validation_0-logloss:0.425956
[14]	validation_0-logloss:0.413383
[15]	validation_0-logloss:0.403099
[16]	validation_0-logloss:0.393063
[17]	validation_0-logloss:0.382723
[18]	validation_0-loglo

[222]	validation_0-logloss:0.133214
[223]	validation_0-logloss:0.133176
[224]	validation_0-logloss:0.133121
[225]	validation_0-logloss:0.133069
[226]	validation_0-logloss:0.133049
[227]	validation_0-logloss:0.13304
[228]	validation_0-logloss:0.133024
[229]	validation_0-logloss:0.133006
[230]	validation_0-logloss:0.13298
[231]	validation_0-logloss:0.132949
[232]	validation_0-logloss:0.132914
[233]	validation_0-logloss:0.132864
[234]	validation_0-logloss:0.132862
[235]	validation_0-logloss:0.132858
[236]	validation_0-logloss:0.132828
[237]	validation_0-logloss:0.132815
[238]	validation_0-logloss:0.13278
[239]	validation_0-logloss:0.132767
[240]	validation_0-logloss:0.132737
[241]	validation_0-logloss:0.13272
[242]	validation_0-logloss:0.132691
[243]	validation_0-logloss:0.132675
[244]	validation_0-logloss:0.13265
[245]	validation_0-logloss:0.132607
[246]	validation_0-logloss:0.132565
[247]	validation_0-logloss:0.132554
[248]	validation_0-logloss:0.132512
[249]	validation_0-logloss:0.1324

[451]	validation_0-logloss:0.13118
[452]	validation_0-logloss:0.131178
[453]	validation_0-logloss:0.131177
[454]	validation_0-logloss:0.131176
[455]	validation_0-logloss:0.131174
[456]	validation_0-logloss:0.131172
[457]	validation_0-logloss:0.131169
[458]	validation_0-logloss:0.131168
[459]	validation_0-logloss:0.131168
[460]	validation_0-logloss:0.131168
[461]	validation_0-logloss:0.131165
[462]	validation_0-logloss:0.131166
[463]	validation_0-logloss:0.131167
[464]	validation_0-logloss:0.131164
[465]	validation_0-logloss:0.131163
[466]	validation_0-logloss:0.131165
[467]	validation_0-logloss:0.131164
[468]	validation_0-logloss:0.131163
[469]	validation_0-logloss:0.131164
[470]	validation_0-logloss:0.131164
[471]	validation_0-logloss:0.131163
[472]	validation_0-logloss:0.131162
[473]	validation_0-logloss:0.131162
[474]	validation_0-logloss:0.131163
[475]	validation_0-logloss:0.13116
[476]	validation_0-logloss:0.131158
[477]	validation_0-logloss:0.131156
[478]	validation_0-logloss:0.1