In [5]:
import pandas as pd
import os
import numpy as np 
import pandas as pd 
from datetime import datetime
import time
import random
from tqdm import tqdm_notebook as tqdm # progress bar
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, FasterRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import cv2
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
from sklearn.model_selection import StratifiedKFold
from glob import glob
import numba
import re
from numba import jit
from PIL import Image
import gc
import warnings
warnings.filterwarnings('ignore')

In [6]:
def get_bbox(row):
    """Extract the numeric box groups from a row's space-separated ``label``.

    The label is consumed in groups of six tokens. The first two tokens of
    every group are ignored (in the sample rows of this notebook they are the
    class name and a ``1``); the other four are converted to ``float`` and
    stored as one box. A trailing group with fewer than six tokens is dropped.

    Returns:
        A list of 4-element float lists, one per complete group.
    """
    tokens = row.label.split(' ')
    collected = []
    current = []
    for position, token in enumerate(tokens):
        slot = position % 6
        if slot >= 2:
            current.append(float(token))
            # slot 5 closes the current group of six tokens
            if slot == 5:
                collected.append(current)
                current = []

    return collected
    

def scale_box(row, size=256):
    """Rescale a row's boxes to a ``size`` x ``size`` image.

    Elements 0 and 2 of every box are multiplied by ``size / row.dim1`` and
    elements 1 and 3 by ``size / row.dim0``, i.e. the boxes are treated as
    ``[x_min, y_min, x_max, y_max]`` with ``dim1`` as the horizontal extent.

    Args:
        row: record exposing ``row['class']``, ``row.dim0``, ``row.dim1`` and
            ``row.xyxy`` (an iterable of 4-element boxes).
        size: side length of the square target image. Defaults to 256, the
            value that used to be hard-coded.

    Returns:
        A list of ``[x_min, y_min, x_max, y_max]`` integer lists when
        ``row['class'] == 'opacity'``; ``None`` for every other class.
    """
    if row['class'] != 'opacity':
        return None

    scale_x = size/row.dim1
    scale_y = size/row.dim0

    def to_pixel(value, scale):
        # Round to 4 decimals first so float noise such as 46.99999999 does
        # not lose a pixel, then truncate towards zero like the original code.
        return int(np.round(value*scale, 4))

    scaled_boxes = []
    for box in row.xyxy:
        x_min = to_pixel(box[0], scale_x)
        y_min = to_pixel(box[1], scale_y)
        x_max = to_pixel(box[2], scale_x)
        y_max = to_pixel(box[3], scale_y)
        scaled_boxes.append([x_min, y_min, x_max, y_max])

    return scaled_boxes

# Image-level labels: `class` is the first token of the label string and
# `filename` is the id without its last six characters.
df = pd.read_csv('train_image_level.csv')
df['class'] = df['label'].map(lambda label: label.split(' ')[0])
df['filename'] = df['id'].map(lambda image_id: image_id[:-6])

# Per-image metadata (dim0/dim1 are used by scale_box to rescale the boxes).
meta = pd.read_csv('meta.csv')
meta.columns = ['filename', 'dim0', 'dim1', 'split']

df = pd.merge(df, meta, how='left', on='filename')

# First parse the raw boxes out of the label, then rescale them in place of
# the parsed values (rows whose class is not 'opacity' end up with None).
df['xyxy'] = df.apply(get_bbox, axis=1)
df['xyxy'] = df.apply(scale_box, axis=1)
# df.drop(columns=['split'], inplace=True)

df.head(3)

Unnamed: 0,id,boxes,label,StudyInstanceUID,class,filename,dim0,dim1,split,xyxy
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,opacity,000a312787f2,3488,4256,train,"[[47, 42, 109, 183], [135, 43, 200, 172]]"
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed,none,000c3a3f293f,2320,2832,train,
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,opacity,0012ff7358bc,2544,3056,train,"[[56, 19, 129, 120], [150, 40, 201, 161]]"


In [7]:
# opacity: filename -> scaled boxes for rows labelled 'opacity'
# none:    filenames of every other row
opacity = {}
none = []

for name, label_class, scaled_boxes in zip(df['filename'], df['class'], df['xyxy']):
    if label_class != 'opacity':
        none.append(name)
        continue
    opacity[name] = scaled_boxes

len(opacity), len(none)

(4294, 2040)

In [8]:
old_competition_df = pd.read_csv('rsna-pneumonia/stage_2_train_labels.csv')


def extract_box(row):
    """Convert a row's x / y / width / height into [x_min, y_min, x_max, y_max] scaled by 256/1024."""
    return [i*256/1024 for i in [row['x'], row['y'], row['x']+row['width'], row['y']+row['height']]]


# Collect the positive boxes in a separate dict and merge it afterwards.
# Appending straight into `opacity` made this cell non-idempotent: every
# re-run added each box again.
# NOTE(review): assumes patient ids never collide with the filenames already
# in `opacity` (the dataset class tells the two sources apart by '-' in the
# name) -- confirm.
rsna_opacity = {}
for index, row in old_competition_df[old_competition_df['Target'] == 1].iterrows():
    pid = row['patientId']
    rsna_opacity.setdefault(pid, []).append(extract_box(row))

opacity.update(rsna_opacity)

# Negative rows are intentionally not added ("want less negative samples").
# The disabled branch was:
#     else:
#         if none[-1] != pid:
#             none.append(pid)

len(opacity), len(none)

(10306, 2040)

In [9]:
# Shallow copy so later additions to `data` do not touch `opacity`.
data = dict(opacity)

# Disabled: adding the negative samples with no boxes.
#     for name in none:
#         data[name] = None

# 80/20 split with a fixed seed; both halves go back to plain name -> boxes dicts.
train_series, valid_series = train_test_split(pd.Series(data), train_size=0.8, random_state=42)
train = train_series.to_dict()
valid = valid_series.to_dict()

len(train), len(valid)

(8244, 2062)

In [None]:
class LungDataset(Dataset):
    """Detection dataset built from a ``name -> boxes`` mapping.

    ``boxes`` is a list of ``[x_min, y_min, x_max, y_max]`` boxes (passed to
    albumentations in ``pascal_voc`` format) or ``None`` for an image without
    boxes. Names containing ``'-'`` are read from the rsna-256 folder, all
    others from the siim-covid19 256px folder.

    Each item is ``(image / 255, target)`` where ``target`` holds the keys
    ``boxes``, ``labels``, ``image_id``, ``area`` and ``iscrowd``.
    """

    def __init__(self, data):
        super().__init__()
        # keys()/values() iterate in the same order and, unlike zip(*items),
        # also work for an empty mapping.
        self.all_names = tuple(data.keys())
        self.all_boxes = tuple(data.values())

    @staticmethod
    def _build_transform(with_boxes):
        """Return the augmentation pipeline, bbox-aware when ``with_boxes`` is true."""
        steps = [
            A.HorizontalFlip(p=.5),
            A.RandomGamma(p=1),
            A.ShiftScaleRotate(rotate_limit=10, p=.5),
            A.Cutout(p=.3),
            A.RandomBrightness(p=.5),
            ToTensorV2()
        ]
        if with_boxes:
            return A.Compose(steps, bbox_params=A.BboxParams(format='pascal_voc', label_fields=[]))
        return A.Compose(steps)

    @staticmethod
    def _make_target(bboxes, index):
        """Build the target dict for the boxes that remain after augmentation.

        ``bboxes`` may be empty; every per-box tensor is sized from it so that
        boxes, labels, area and iscrowd always have matching lengths.
        """
        # reshape keeps the (n, 4) layout even when no box is left
        box_array = np.asarray(bboxes, dtype=np.float64).reshape(-1, 4)

        assert np.all(box_array[:, 3] > box_array[:, 1]) and np.all(box_array[:, 2] > box_array[:, 0]), \
            'degenerate bounding box (zero or negative width/height)'

        n_boxes = len(box_array)
        areas = (box_array[:, 2]-box_array[:, 0])*(box_array[:, 3]-box_array[:, 1])

        return {"boxes": torch.as_tensor(box_array, dtype=torch.float32),
                "labels": torch.ones(n_boxes, dtype=torch.int64),
                "image_id": torch.tensor([index]),
                "area": torch.as_tensor(areas, dtype=torch.float32),
                "iscrowd": torch.zeros(n_boxes, dtype=torch.int64)}

    def __getitem__(self, index: int):

        name = self.all_names[index]
        boxes = self.all_boxes[index]

        if '-' in name:
            path = f'../input/rsna-256/{name}.png'
        else:
            path = f'../input/siim-covid19-resized-to-256px-png/train/{name}.png'

        # flag 0 = read as single-channel grayscale
        img = cv2.imread(path, 0)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising
            raise FileNotFoundError(f'could not read image: {path}')

        has_boxes = boxes is not None
        transform = self._build_transform(has_boxes)

        if has_boxes:
            sample = transform(image=img, bboxes=boxes)
            # Use the transformed boxes: augmentation can move or drop boxes,
            # so the input list must not be used to size labels/iscrowd.
            target = self._make_target(sample['bboxes'], index)
        else:
            sample = transform(image=img)
            target = self._make_target([], index)

        return sample['image']/255, target

    def __len__(self) -> int:
        return len(self.all_names)

In [None]:
def plot_box(img, boxes, ax=None): # box format: xyxy
    """Show ``img`` in grayscale with every box outlined in red.

    Args:
        img: image passed to ``imshow``.
        boxes: iterable of boxes indexed as ``[x_min, y_min, x_max, y_max]``.
        ax: axes to draw on; the current axes are used when omitted.
    """
    if ax is None:
        ax = plt.gca()
    for box in boxes:
        x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]
        outline = patches.Rectangle(
            (x_min, y_min),
            x_max-x_min,
            y_max-y_min,
            linewidth=1,
            edgecolor='r',
            facecolor='none',
        )
        ax.add_patch(outline)
    ax.imshow(img, cmap='gray')
    
train_dataset, valid_dataset = LungDataset(train), LungDataset(valid)

# Fetch one training sample and draw its boxes as a visual check.
image, target = train_dataset[31]

# the dataset returns the image with a leading channel axis; flatten it to 2-D for imshow
image = image.reshape(256, 256)
boxes = target['boxes'].tolist()

plot_box(img=image, boxes=boxes)

In [10]:
# Pretrained Faster R-CNN whose box predictor head is replaced by a new
# two-class FastRCNNPredictor.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 2 # opacity + none
roi_heads = model.roi_heads
in_features = roi_heads.box_predictor.cls_score.in_features
roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
# model.load_state_dict(torch.load('../input/siim-packages/weight/epoch4.pth'))

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Only parameters that require gradients are handed to the optimizer.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.Adam(params)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to C:\Users\82106/.cache\torch\hub\checkpoints\fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:51<00:00, 3.22MB/s] 


In [14]:
import os
import csv
import random
import pydicom
import numpy as np
import pandas as pd
from skimage import measure
from skimage.transform import resize

import tensorflow as tf
from tensorflow import keras

from matplotlib import pyplot as plt

In [11]:
import keras.preprocessing.image as KPImage
from PIL import Image
import pydicom
def read_dicom_image(in_path):
    """Read a DICOM file and return its pixel array divided by its maximum value.

    Args:
        in_path: path of the DICOM file.

    Returns:
        The pixel array scaled by its maximum, or an all-zero float array of
        the same shape when the maximum is 0 (avoids a division by zero that
        would fill the result with NaN).
    """
    # dcmread is the reader already used by the generator class in this
    # notebook; read_file is its deprecated alias.
    img_arr = pydicom.dcmread(in_path).pixel_array
    peak = img_arr.max()
    if peak == 0:
        return np.zeros(img_arr.shape, dtype=float)
    return img_arr/peak
    
class medical_pil():
    """Stand-in exposing ``open`` and ``fromarray`` like ``PIL.Image``, with DICOM support in ``open``."""

    fromarray = Image.fromarray

    @staticmethod
    def open(in_path):
        """Open ``in_path`` as a PIL image; paths containing '.dcm' are read as DICOM."""
        if '.dcm' not in in_path:
            return Image.open(in_path)
        unit_slice = read_dicom_image(in_path)
        # 8bit images are more friendly
        byte_slice = (255*unit_slice).clip(0, 255).astype(np.uint8)
        return Image.fromarray(byte_slice)
# Replace the `pil_image` attribute of keras.preprocessing.image with
# medical_pil (presumably so Keras' own image loading can open .dcm files
# through medical_pil.open -- confirm against the installed Keras version).
KPImage.pil_image = medical_pil

Using TensorFlow backend.


In [12]:
# load and shuffle filenames
folder = 'rsna-pneumonia/stage_2_train_images'
filenames = os.listdir(folder)
random.shuffle(filenames)
# split into train and validation filenames
n_valid_samples = 2560
train_filenames = filenames[n_valid_samples:20000]
valid_filenames = filenames[:n_valid_samples]
print('n train samples', len(train_filenames))
print('n valid samples', len(valid_filenames))
n_train_samples = len(filenames) - n_valid_samples

n train samples 17440
n valid samples 2560


In [15]:
# empty dictionary
pneumonia_locations = {}
# load table
with open(os.path.join('rsna-pneumonia/stage_2_train_labels.csv'), mode='r') as infile:
    # open reader
    reader = csv.reader(infile)
    # skip header
    next(reader, None)
    # loop through rows
    for rows in reader:
        # retrieve information
        filename = rows[0]
        location = rows[1:5]
        pneumonia = rows[5]
        # if row contains pneumonia add label to dictionary
        # which contains a list of pneumonia locations per filename
        if pneumonia == '1':
            # convert string to float to int
            location = [int(float(i)) for i in location]
            # save pneumonia location in dictionary
            if filename in pneumonia_locations:
                pneumonia_locations[filename].append(location)
            else:
                pneumonia_locations[filename] = [location]

In [16]:
class generator(keras.utils.Sequence):
    """Keras ``Sequence`` serving batches of resized DICOM images.

    In train mode a batch is ``(images, masks)``; in predict mode it is
    ``(images, filenames)``.

    Args:
        folder: directory that contains the DICOM files.
        filenames: file names (relative to ``folder``) to serve.
        pneumonia_locations: mapping from file name without extension to a
            list of ``[x, y, w, h]`` boxes; ``None`` means no file has boxes.
        batch_size: number of files per batch.
        image_size: side length images and masks are resized to.
        shuffle: shuffle the file order on construction and after each epoch.
        augment: flip image and mask left-right half of the time (train mode).
        predict: serve ``(images, filenames)`` instead of ``(images, masks)``.
    """

    def __init__(self, folder, filenames, pneumonia_locations=None, batch_size=32, image_size=256, shuffle=True, augment=False, predict=False):
        self.folder = folder
        # copy: on_epoch_end shuffles in place and must not reorder the caller's list
        self.filenames = list(filenames)
        self.pneumonia_locations = pneumonia_locations
        self.batch_size = batch_size
        self.image_size = image_size
        self.shuffle = shuffle
        self.augment = augment
        self.predict = predict
        self.on_epoch_end()

    def __load__(self, filename):
        """Load one file and return ``(image, mask)``, both with a trailing channel axis."""
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, filename)).pixel_array
        # create empty mask
        msk = np.zeros(img.shape)
        # get filename without extension
        filename = filename.split('.')[0]
        # Use the mapping given to this instance (the module-level
        # `pneumonia_locations` was read here before, ignoring the argument).
        locations = self.pneumonia_locations or {}
        # add 1's at the location of every pneumonia box of this image
        for x, y, w, h in locations.get(filename, ()):
            msk[y:y+h, x:x+w] = 1
        # if augment then horizontal flip half the time
        if self.augment and random.random() > 0.5:
            img = np.fliplr(img)
            msk = np.fliplr(msk)
        # resize both image and mask; the mask is re-binarised after interpolation
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        msk = resize(msk, (self.image_size, self.image_size), mode='reflect') > 0.45
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        msk = np.expand_dims(msk, -1)
        return img, msk

    def __loadpredict__(self, filename):
        """Load one file and return the resized image with a trailing channel axis."""
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, filename)).pixel_array
        # resize image
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        return img

    def __getitem__(self, index):
        """Return batch number ``index``."""
        # select batch
        filenames = self.filenames[index*self.batch_size:(index+1)*self.batch_size]
        # predict mode: return images and filenames
        if self.predict:
            # load files
            imgs = [self.__loadpredict__(filename) for filename in filenames]
            # create numpy batch
            imgs = np.array(imgs)
            return imgs, filenames
        # train mode: return images and masks
        # load files
        items = [self.__load__(filename) for filename in filenames]
        # unzip images and masks
        imgs, msks = zip(*items)
        # create numpy batch
        imgs = np.array(imgs)
        msks = np.array(msks)
        return imgs, msks

    def on_epoch_end(self):
        """Reshuffle the file order when shuffling is enabled."""
        if self.shuffle:
            random.shuffle(self.filenames)

    def __len__(self):
        """Number of batches: partial last batch included only in predict mode."""
        if self.predict:
            # return everything
            return int(np.ceil(len(self.filenames) / self.batch_size))
        # return full batches only
        return len(self.filenames) // self.batch_size

In [17]:
# create train and validation generators
folder = 'rsna-pneumonia/stage_2_train_images'
train_gen = generator(folder, train_filenames, pneumonia_locations, batch_size=16, image_size=256, shuffle=True, augment=True, predict=False)
valid_gen = generator(folder, valid_filenames, pneumonia_locations, batch_size=16, image_size=256, shuffle=False, predict=False)

In [23]:
# NOTE(review): `engine` is not an installed package (this cell failed with
# ModuleNotFoundError). The imported names presumably come from engine.py in
# torchvision's references/detection scripts -- copy that file and the helper
# modules it imports next to the notebook, then confirm.
from engine import train_one_epoch, evaluate

num_epochs = 1
batch_size = 16


def collate_detection_batch(batch):
    """Regroup a list of (image, target) samples into (images, targets) tuples.

    Targets hold a different number of boxes per image, so they cannot be
    stacked into one tensor the way the default collate function would.
    """
    return tuple(zip(*batch))


# The Keras generators yield numpy image/mask batches, which the torch
# training helpers cannot consume; feed the torch model from the LungDataset
# instances through DataLoaders instead.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_detection_batch)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_detection_batch)

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq=10)
    # keep one checkpoint per epoch
    torch.save(model.state_dict(), f'epoch{epoch}.pth')
    evaluate(model, valid_loader, device=device)

ModuleNotFoundError: No module named 'engine'