In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATASETS
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE CELL.

# import os
# import sys
# from tempfile import NamedTemporaryFile
# from urllib.request import urlopen
# from urllib.parse import unquote
# from urllib.error import HTTPError
# from zipfile import ZipFile

# CHUNK_SIZE = 40960
# DATASET_MAPPING = 'mayo-clinic-strip-ai:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F37333%2F3949526%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20220722%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20220722T161126Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8ff583574eb3fb741bcf7940bf2749a0067094b59e39715d58a03c2da2197856f4740913f1cec57cd110825dee7047ad7badad305eeaa91290ed915babae1278772a1ad0d41c1460a1fc505fa5e8708db38091ea870a470314516ba2b471a5f9fda0703b3d16eee7e0a24d4ddeae68981e9378ec6ab24ae3733187972934c7a4d2bdd42d4f0f6b753ff9477e5a72cfdf3b1906b1f03cda6fa46ff44c47acc93588303862416854d2a163a8c8a819237a92712ecfe3ff9630e45ae2d1a5ab9743d6265970d15483db3c18726d5d1be7e6cde1f2d99bd33b914181143256aa9ff92443b8d7bc0a29d76e5e2f00af9241151a660168a6d2c7706d6a0446c6e372cf'
# KAGGLE_INPUT_PATH='/home/kaggle/input'
# KAGGLE_INPUT_SYMLINK='/kaggle'

# os.makedirs(KAGGLE_INPUT_PATH, 777)
# os.symlink(KAGGLE_INPUT_PATH, os.path.join('..', 'input'), target_is_directory=True)
# os.makedirs(KAGGLE_INPUT_SYMLINK)
# os.symlink(KAGGLE_INPUT_PATH, os.path.join(KAGGLE_INPUT_SYMLINK, 'input'), target_is_directory=True)

# for dataset_mapping in DATASET_MAPPING.split(','):
#     directory, download_url_encoded = dataset_mapping.split(':')
#     download_url = unquote(download_url_encoded)
#     destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
#     try:
#         with urlopen(download_url) as zipfileres, NamedTemporaryFile() as tfile:
#             total_length = zipfileres.headers['content-length']
#             print(f'Downloading {directory}, {total_length} bytes zipped')
#             dl = 0
#             data = zipfileres.read(CHUNK_SIZE)
#             while len(data) > 0:
#                 dl += len(data)
#                 tfile.write(data)
#                 done = int(50 * dl / int(total_length))
#                 sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
#                 sys.stdout.flush()
#                 data = zipfileres.read(CHUNK_SIZE)
#             print(f'\nUnzipping {directory}')
#             with ZipFile(tfile) as zfile:
#                 zfile.extractall(destination_path)
#     except HTTPError as e:
#         print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
#         continue
#     except OSError as e:
#         print(f'Failed to load {download_url} to path {destination_path}')
#         continue
# print('Dataset import complete.')


Strip AI Kaggle

In [2]:
import time
import os
import glob
import gc

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

import pandas as pd
import cv2 as cv
import tifffile as tifi

import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import skimage
from skimage.filters import sobel
from skimage import segmentation
from skimage.measure import regionprops, regionprops_table

from scipy import ndimage as ndi

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier

## Disabling the benchmarking feature with torch.backends.cudnn.benchmark = False 
## causes cuDNN to deterministically select an algorithm, possibly at the cost of reduced performance.
## https://pytorch.org/docs/stable/notes/randomness.html
torch.backends.cudnn.benchmark = False

First we define the PyTorch model that we will use to extract the features.

In [3]:
class RCF(nn.Module):
    """A model for extracting Random Convolution Features (RCF) from input imagery."""
    def __init__(self, num_features=16, kernel_size=3, num_input_channels=3):
        super(RCF, self).__init__()
        # We create `num_features / 2` filters so require `num_features` to be divisible by 2
        assert num_features % 2 == 0, "Please enter an even number of features."
        # Applies a 2D convolution over an input image composed of several input planes.
        self.conv1 = nn.Conv2d(
            num_input_channels,
            num_features // 2,
            kernel_size=kernel_size,
            stride=1,
            padding=0,
            dilation=1,
            bias=True,
        )
        # Fills the input Tensor 'conv1.weight' with values drawn from the normal distribution
        nn.init.normal_(self.conv1.weight, mean=0.0, std=1.0) 
        # Fills the input Tensor 'conv1.bias' with the value 'val = -1'.
        nn.init.constant_(self.conv1.bias, -1.0)
    def forward(self, x):
        # The rectified linear activation function or ReLU for short is a piecewise linear function 
        # that will output the input directly if it is positive, otherwise, it will output zero.
        x1a = F.relu(self.conv1(x), inplace=True)
        # The below step is where we take the inverse which is appended later
        x1b = F.relu(-self.conv1(x), inplace=True)
        # Applies a 2D adaptive average pooling over an input signal composed of several input planes.
        x1a = F.adaptive_avg_pool2d(x1a, (1, 1)).squeeze()
        x1b = F.adaptive_avg_pool2d(x1b, (1, 1)).squeeze()
        if len(x1a.shape) == 1:  # case where we passed a single input
            return torch.cat((x1a, x1b), dim=0)
        elif len(x1a.shape) == 2:  # case where we passed a batch of > 1 inputs
            return torch.cat((x1a, x1b), dim=1)

In [4]:
torch.cuda.is_available()

True

In [5]:
print(torch.__version__)

1.11.0


Next, we initialize the model and pytorch components

In [6]:
num_features = 4096
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RCF(num_features).eval().to(device)

Read in metadata

In [7]:
train_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/train.csv')
test_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/test.csv')
other_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/other.csv')

List images for training and testing

In [8]:
train_path = "/kaggle/input/mayo-clinic-strip-ai/train/*.tif"
train_images = sorted(list(glob.glob(train_path)))
train_image_ids = [i.split("/")[-1].rstrip('.tif') for i in train_images]

test_path = "/kaggle/input/mayo-clinic-strip-ai/test/*.tif"
test_images = sorted(list(glob.glob(test_path)))
test_image_ids = [i.split("/")[-1].rstrip('.tif') for i in test_images]

In [9]:
test_images

['/kaggle/input/mayo-clinic-strip-ai/test/006388_0.tif',
 '/kaggle/input/mayo-clinic-strip-ai/test/008e5c_0.tif',
 '/kaggle/input/mayo-clinic-strip-ai/test/00c058_0.tif',
 '/kaggle/input/mayo-clinic-strip-ai/test/01adc5_0.tif']

Define helper functions

In [10]:
img_to_tensor = T.ToTensor()

def read_tiff(path):
    image = tifi.imread(path)
    filename = path.split('/')[-1].rstrip('.tif')
    return image, filename


def resize_image(image):
    re_sized_image = (
        cv.resize(
            image,
            (int(image.shape[1]/33),
             int(image.shape[0]/33)),
            interpolation=cv.INTER_LINEAR))
    return re_sized_image


def convert_image_grayscale(image):
    gray_image = cv.cvtColor(image, cv.COLOR_RGB2GRAY)
    return gray_image


def segment_images(resized_gray_img):
    elevation_map = sobel(resized_gray_img)
    markers = np.zeros_like(resized_gray_img)
    markers[resized_gray_img >= resized_gray_img.mean()] = 1
    markers[resized_gray_img < resized_gray_img.mean()] = 2
    segmented_img = segmentation.watershed(elevation_map, markers)
    filled_segments = ndi.binary_fill_holes(segmented_img - 1)
    labeled_segments, _ = ndi.label(filled_segments)
    return labeled_segments


def get_object_coordinates(labeled_segments):
    properties = ['area','bbox','convex_area','bbox_area', 'major_axis_length', 'minor_axis_length', 'eccentricity']
    df = pd.DataFrame(regionprops_table(labeled_segments, properties=properties))
    standard_scaler = StandardScaler()
    scaled_area = standard_scaler.fit_transform(df.area.values.reshape(-1,1))
    df['scaled_area'] = scaled_area
    df.sort_values(by="scaled_area", ascending=False, inplace=True)
    objects = df[df['scaled_area']>=.75]
    object_coordinates = [
        (int(row['bbox-0']),
         int(row['bbox-1']),
         int(row['bbox-2']),
         int(row['bbox-3']))
        for index, row in objects.iterrows()
    ]
    return object_coordinates


def crop_patch(coordinates, image):
    x1, y1, x2, y2 = coordinates
    if x2-x1<3:
        x1-=1
        x2+=1
    if y2-y1<3:
        y1-=1
        y2+=1
    cropped_image = image[x1:x2, y1:y2]
    return cropped_image


def compute_features(images):
    tic = time.time()
    j = 0
    white_pixels = [255,255,255]
    image_patches = []
    features = []
    filled_space = []
    for img in images: 
        image, filename = read_tiff(img)
        re_sized_image = resize_image(image)
        del image
        gc.collect()
        resized_gray_img = convert_image_grayscale(re_sized_image)
        labeled_segments = segment_images(resized_gray_img)
        object_coordinates = get_object_coordinates(labeled_segments)
        del labeled_segments, resized_gray_img
        gc.collect()
        for i in range(len(object_coordinates)):
            patch_name = str(filename)+"_"+str(i)
            coordinates = object_coordinates[i]
            cropped_image = crop_patch(coordinates, re_sized_image)
            
            white_space = np.count_nonzero(np.all(cropped_image==white_pixels,axis=2)) / cropped_image.size
            filled_space.append(1-white_space)

            tensor = img_to_tensor(cropped_image)
            tensor = tensor.to(device)
            with torch.no_grad():
                feats = model(tensor.unsqueeze(0)).cpu().numpy()
            features.append(feats)
            image_patches.append(patch_name)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            del tensor, feats, white_space, cropped_image, coordinates
            gc.collect()
        ten_percent = int(len(images)/10 + 1)
        if j % ten_percent == 0:
            print(f"{j}/{len(images)} -- {j / len(images) * 100:0.2f}% -- {time.time()-tic:0.2f} seconds")
            tic = time.time()
        j += 1
    arrays = [image_patches, filled_space]
    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=["image_patch", "filled_space"])
    data = pd.DataFrame(features, index=index).reset_index()
    return data

Open file, resize, convert to grayscale, segment, crop, and featurize. 

In [11]:
# %%time
# train_features = compute_features(train_images)
# train_features.to_csv("train_features.csv", index=False)
train_features = pd.read_csv('/kaggle/input/trainfeatures/train_features.csv')
train_features

Unnamed: 0,image_patch,filled_space,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_1,0.666667,10.309859,0.0,5.466567,0.0,9.844102,0.0,3.331920,0,...,0.0,2.561325,0.166650,0.123626,0.0,0.0,7.032601,0.0,0.417935,7.646232
1,006388_0_2,0.666667,10.309859,0.0,5.466567,0.0,9.844103,0.0,3.331920,0,...,0.0,2.561325,0.166650,0.123626,0.0,0.0,7.032600,0.0,0.417935,7.646231
2,006388_0_3,0.666667,10.309853,0.0,5.466567,0.0,9.844109,0.0,3.331920,0,...,0.0,2.561325,0.166650,0.123626,0.0,0.0,7.032601,0.0,0.417934,7.646232
3,006388_0_4,0.666667,10.309859,0.0,5.466566,0.0,9.844103,0.0,3.331920,0,...,0.0,2.561325,0.166650,0.123626,0.0,0.0,7.032601,0.0,0.417935,7.646232
4,006388_0_5,0.666667,10.309858,0.0,5.466566,0.0,9.844103,0.0,3.331920,0,...,0.0,2.561325,0.166650,0.123626,0.0,0.0,7.032601,0.0,0.417935,7.646232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4808,ffec5c_0_5,1.000000,10.338737,0.0,5.177093,0.0,9.630226,0.0,3.207168,0,...,0.0,2.648089,0.243233,0.233593,0.0,0.0,6.769437,0.0,0.401388,7.331738
4809,ffec5c_1_1,1.000000,10.338736,0.0,5.177093,0.0,9.630228,0.0,3.207168,0,...,0.0,2.648089,0.243233,0.233593,0.0,0.0,6.769436,0.0,0.401388,7.331740
4810,ffec5c_1_2,1.000000,10.338730,0.0,5.177093,0.0,9.630228,0.0,3.207167,0,...,0.0,2.648090,0.243233,0.233593,0.0,0.0,6.769433,0.0,0.401388,7.331740
4811,ffec5c_1_3,1.000000,10.338732,0.0,5.177093,0.0,9.630227,0.0,3.207167,0,...,0.0,2.648090,0.243233,0.233593,0.0,0.0,6.769435,0.0,0.401388,7.331739


In [12]:
%%time
test_features = compute_features(test_images)
test_features.to_csv("test_features.csv", index=False)
test_features

0/4 -- 0.00% -- 45.40 seconds
1/4 -- 25.00% -- 4.34 seconds
2/4 -- 50.00% -- 14.06 seconds
3/4 -- 75.00% -- 26.89 seconds
CPU times: user 55.8 s, sys: 10.8 s, total: 1min 6s
Wall time: 1min 31s


Unnamed: 0,image_patch,filled_space,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.865378,4.6e-05,1.625261,0.520059,2.4e-05,0.015904,0.0007784601,9.517697e-07,0.0,...,0.055721,3.91218,0.008376,3.787171,0.245836,4.07723,4.901089,1.458248,2.743868,0.004851704
1,006388_0_1,0.912732,0.000127,1.420024,0.395189,2.5e-05,0.027515,0.001239377,0.0,0.0,...,0.0849,3.533336,0.006669,3.304373,0.331617,3.769839,4.879319,1.155285,2.971848,0.005244206
2,006388_0_2,0.888242,0.000366,1.729684,0.623237,0.000138,0.041116,0.003099132,1.200579e-05,0.0,...,0.082427,4.067952,0.014079,4.021696,0.299761,4.139699,4.694666,1.734623,2.469399,0.01559918
3,006388_0_3,0.870811,0.000327,1.802264,0.626469,0.000127,0.035152,0.003034845,1.281178e-05,0.0,...,0.060352,4.181851,0.008685,4.126547,0.252618,4.259101,4.858738,1.725435,2.543947,0.01126874
4,006388_0_4,0.832417,3.4e-05,1.813169,0.612924,3.2e-05,0.013095,0.0006593181,1.259825e-05,0.0,...,0.048451,4.23996,0.006396,4.120693,0.195889,4.377765,5.166974,1.585698,2.78004,0.004042187
5,006388_0_5,0.848537,7.2e-05,1.729984,0.579994,3e-05,0.013957,0.000732093,1.047178e-05,0.0,...,0.05398,4.12098,0.008415,3.983861,0.213631,4.270829,5.098356,1.519718,2.794852,0.005736559
6,006388_0_6,0.832626,7e-06,1.845381,0.624762,0.0,0.014015,0.0003350424,0.0,0.0,...,0.042616,4.313166,0.00178,4.230864,0.20967,4.427025,5.105887,1.682397,2.675137,0.001073916
7,006388_0_7,0.849507,4.4e-05,1.783357,0.606139,0.0,0.021279,0.0008092185,0.0,0.0,...,0.058427,4.199319,0.004491,4.111626,0.25582,4.309389,4.989641,1.653043,2.645022,0.002467673
8,006388_0_8,0.881352,0.000756,1.802113,0.666631,0.000265,0.070303,0.005613439,4.787599e-07,0.0,...,0.118798,4.162895,0.017007,4.12265,0.372666,4.222496,4.720844,1.834904,2.442764,0.02178752
9,006388_0_9,0.928205,0.000857,1.688716,0.645618,0.000311,0.082582,0.006844416,1.066781e-05,0.0,...,0.150047,3.97872,0.029943,3.936707,0.439497,4.017703,4.476201,1.817058,2.348012,0.03508523


In [13]:
%%time
train = train_features.copy()

train[['patient_id', 'image_num', 'patch_num']] = train.image_patch.str.split("_", expand=True)

train["image_id"] = train[["patient_id", "image_num"]].apply("_".join, axis=1)
train.drop(['image_patch'], axis = 1, inplace = True)

var_cols = train.columns[1:-4].values.tolist()
train = (
    train
    .groupby(['image_id', 'patient_id'], as_index=False)
    .apply(lambda x: pd.Series([sum(x[v] * x.filled_space) / sum(x.filled_space) for v in var_cols]))
)
# df_2 = df.groupby(['image_id', 'patient_id'], as_index = False).mean()
train = train.set_index('image_id').join(train_meta.set_index('image_id')['label']).reset_index()
train 

CPU times: user 8min 36s, sys: 1.85 s, total: 8min 38s
Wall time: 8min 38s


Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,label
0,006388_0,006388,10.309857,0.0,5.466567,0.0,9.844104,0.0,3.331920,0.0,...,2.561325,0.166650,0.123626,0.0,0.000000,7.032602,0.0,0.417935,7.646232,CE
1,008e5c_0,008e5c,10.309859,0.0,5.466568,0.0,9.844103,0.0,3.331920,0.0,...,2.561325,0.166650,0.123626,0.0,0.000000,7.032601,0.0,0.417935,7.646231,CE
2,00c058_0,00c058,10.338733,0.0,5.177093,0.0,9.630228,0.0,3.207168,0.0,...,2.648089,0.243233,0.233593,0.0,0.000000,6.769436,0.0,0.401388,7.331739,LAA
3,01adc5_0,01adc5,9.868591,0.0,5.254456,0.0,9.421617,0.0,3.174802,0.0,...,2.499751,0.196685,0.124887,0.0,0.000006,6.792432,0.0,0.450187,7.395343,LAA
4,028989_0,028989,10.320169,0.0,5.363185,0.0,9.767721,0.0,3.287366,0.0,...,2.592313,0.194001,0.162900,0.0,0.000000,6.938611,0.0,0.412025,7.533912,LAA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,fe9645_0,fe9645,10.309859,0.0,5.466568,0.0,9.844104,0.0,3.331920,0.0,...,2.561325,0.166650,0.123626,0.0,0.000000,7.032602,0.0,0.417934,7.646232,CE
747,fe9bec_0,fe9bec,10.309858,0.0,5.466567,0.0,9.844104,0.0,3.331920,0.0,...,2.561325,0.166650,0.123626,0.0,0.000000,7.032602,0.0,0.417934,7.646233,LAA
748,ff14e0_0,ff14e0,7.607613,0.0,4.344944,0.0,7.507661,0.0,2.469641,0.0,...,2.099806,0.276658,0.131094,0.0,0.000000,5.870748,0.0,0.596268,6.426944,CE
749,ffec5c_0,ffec5c,10.338734,0.0,5.177093,0.0,9.630227,0.0,3.207168,0.0,...,2.648090,0.243233,0.233593,0.0,0.000000,6.769436,0.0,0.401388,7.331739,LAA


In [14]:
x_train = train.drop(['image_id', 'patient_id', 'label'], axis = 1)
y_train = train['label']
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,10.309857,0.0,5.466567,0.0,9.844104,0.0,3.331920,0.0,0.0,0.0,...,0.0,2.561325,0.166650,0.123626,0.0,0.000000,7.032602,0.0,0.417935,7.646232
1,10.309859,0.0,5.466568,0.0,9.844103,0.0,3.331920,0.0,0.0,0.0,...,0.0,2.561325,0.166650,0.123626,0.0,0.000000,7.032601,0.0,0.417935,7.646231
2,10.338733,0.0,5.177093,0.0,9.630228,0.0,3.207168,0.0,0.0,0.0,...,0.0,2.648089,0.243233,0.233593,0.0,0.000000,6.769436,0.0,0.401388,7.331739
3,9.868591,0.0,5.254456,0.0,9.421617,0.0,3.174802,0.0,0.0,0.0,...,0.0,2.499751,0.196685,0.124887,0.0,0.000006,6.792432,0.0,0.450187,7.395343
4,10.320169,0.0,5.363185,0.0,9.767721,0.0,3.287366,0.0,0.0,0.0,...,0.0,2.592313,0.194001,0.162900,0.0,0.000000,6.938611,0.0,0.412025,7.533912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,10.309859,0.0,5.466568,0.0,9.844104,0.0,3.331920,0.0,0.0,0.0,...,0.0,2.561325,0.166650,0.123626,0.0,0.000000,7.032602,0.0,0.417934,7.646232
747,10.309858,0.0,5.466567,0.0,9.844104,0.0,3.331920,0.0,0.0,0.0,...,0.0,2.561325,0.166650,0.123626,0.0,0.000000,7.032602,0.0,0.417934,7.646233
748,7.607613,0.0,4.344944,0.0,7.507661,0.0,2.469641,0.0,0.0,0.0,...,0.0,2.099806,0.276658,0.131094,0.0,0.000000,5.870748,0.0,0.596268,6.426944
749,10.338734,0.0,5.177093,0.0,9.630227,0.0,3.207168,0.0,0.0,0.0,...,0.0,2.648090,0.243233,0.233593,0.0,0.000000,6.769436,0.0,0.401388,7.331739


In [15]:
test = test_features.copy()

test[['patient_id', 'image_num', 'patch_num']] = test.image_patch.str.split("_", expand=True)

test["image_id"] = test[["patient_id", "image_num"]].apply("_".join, axis=1)
test.drop(['image_patch'], axis = 1, inplace = True)
var_cols = test.columns[1:-4].values.tolist()
test = (
    test
    .groupby(['image_id', 'patient_id'], as_index=False)
    .apply(lambda x: pd.Series([sum(x[v] * x.filled_space) / sum(x.filled_space) for v in var_cols]))
)
# df_2 = df.groupby(['image_id', 'patient_id'], as_index = False).mean()
# test = test.set_index('image_id').join(test_meta.set_index('image_id')['label']).reset_index()
test 

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0,006388,0.0003367411,1.753616,0.582558,0.000167,0.040142,0.003258,1.4e-05,0.0,...,0.083063,4.041311,0.012161,3.968688,0.312231,4.138859,4.806515,1.639701,2.580303,0.013128
1,008e5c_0,008e5c,0.0001213483,1.782409,0.550707,1.7e-05,0.050841,0.002873,5e-06,0.0,...,0.075474,4.151555,0.01256,4.075252,0.310811,4.250176,4.895897,1.681449,2.592818,0.011198
2,00c058_0,00c058,6.146293e-06,2.555711,0.713785,0.0,0.001368,1.3e-05,0.0,0.0,...,0.000399,4.486734,9e-06,4.526445,0.046569,4.671673,5.525951,1.701755,2.933773,1e-06
3,01adc5_0,01adc5,2.547615e-07,1.954395,0.609154,0.0,0.010442,0.000183,0.0,0.0,...,0.010613,4.511083,4.2e-05,4.50502,0.127882,4.564424,5.042263,1.871412,2.485925,0.000186


In [16]:
x_test = test.drop(['image_id', 'patient_id'], axis = 1)
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.0003367411,1.753616,0.582558,0.000167,0.040142,0.003258,1.4e-05,0.0,0.025609,3.391389,...,0.083063,4.041311,0.012161,3.968688,0.312231,4.138859,4.806515,1.639701,2.580303,0.013128
1,0.0001213483,1.782409,0.550707,1.7e-05,0.050841,0.002873,5e-06,0.0,0.020246,3.56388,...,0.075474,4.151555,0.01256,4.075252,0.310811,4.250176,4.895897,1.681449,2.592818,0.011198
2,6.146293e-06,2.555711,0.713785,0.0,0.001368,1.3e-05,0.0,0.0,0.000472,3.760329,...,0.000399,4.486734,9e-06,4.526445,0.046569,4.671673,5.525951,1.701755,2.933773,1e-06
3,2.547615e-07,1.954395,0.609154,0.0,0.010442,0.000183,0.0,0.0,0.00368,3.941889,...,0.010613,4.511083,4.2e-05,4.50502,0.127882,4.564424,5.042263,1.871412,2.485925,0.000186


In [17]:
rf_class = RandomForestClassifier(max_depth=50, random_state=42, n_estimators=3000)
rf_class.fit(x_train, y_train)

RandomForestClassifier(max_depth=50, n_estimators=3000, random_state=42)

In [18]:
rf_class.score(x_train, y_train)

1.0

In [19]:
train['prediction'] = rf_class.predict(x_train)
train[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_train)
train

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4090,4091,4092,4093,4094,4095,label,prediction,CE,LAA
0,006388_0,006388,10.309857,0.0,5.466567,0.0,9.844104,0.0,3.331920,0.0,...,0.0,0.000000,7.032602,0.0,0.417935,7.646232,CE,CE,0.890000,0.110000
1,008e5c_0,008e5c,10.309859,0.0,5.466568,0.0,9.844103,0.0,3.331920,0.0,...,0.0,0.000000,7.032601,0.0,0.417935,7.646231,CE,CE,0.898667,0.101333
2,00c058_0,00c058,10.338733,0.0,5.177093,0.0,9.630228,0.0,3.207168,0.0,...,0.0,0.000000,6.769436,0.0,0.401388,7.331739,LAA,LAA,0.247000,0.753000
3,01adc5_0,01adc5,9.868591,0.0,5.254456,0.0,9.421617,0.0,3.174802,0.0,...,0.0,0.000006,6.792432,0.0,0.450187,7.395343,LAA,LAA,0.135000,0.865000
4,028989_0,028989,10.320169,0.0,5.363185,0.0,9.767721,0.0,3.287366,0.0,...,0.0,0.000000,6.938611,0.0,0.412025,7.533912,LAA,LAA,0.258667,0.741333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,fe9645_0,fe9645,10.309859,0.0,5.466568,0.0,9.844104,0.0,3.331920,0.0,...,0.0,0.000000,7.032602,0.0,0.417934,7.646232,CE,CE,0.879333,0.120667
747,fe9bec_0,fe9bec,10.309858,0.0,5.466567,0.0,9.844104,0.0,3.331920,0.0,...,0.0,0.000000,7.032602,0.0,0.417934,7.646233,LAA,LAA,0.250333,0.749667
748,ff14e0_0,ff14e0,7.607613,0.0,4.344944,0.0,7.507661,0.0,2.469641,0.0,...,0.0,0.000000,5.870748,0.0,0.596268,6.426944,CE,CE,0.900333,0.099667
749,ffec5c_0,ffec5c,10.338734,0.0,5.177093,0.0,9.630227,0.0,3.207168,0.0,...,0.0,0.000000,6.769436,0.0,0.401388,7.331739,LAA,LAA,0.244333,0.755667


In [20]:
train.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()

Unnamed: 0,patient_id,CE,LAA
0,006388,0.890000,0.110000
1,008e5c,0.898667,0.101333
2,00c058,0.247000,0.753000
3,01adc5,0.135000,0.865000
4,028989,0.258667,0.741333
...,...,...,...
626,fe0cca,0.885000,0.115000
627,fe9645,0.879333,0.120667
628,fe9bec,0.250333,0.749667
629,ff14e0,0.900333,0.099667


In [21]:
test['prediction'] = rf_class.predict(x_test)
test[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_test)

In [22]:
submission = test.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()
submission

Unnamed: 0,patient_id,CE,LAA
0,006388,0.468667,0.531333
1,008e5c,0.471333,0.528667
2,00c058,0.469,0.531
3,01adc5,0.474667,0.525333


In [23]:
submission.to_csv("submission.csv", index = False)