Strip AI Kaggle

In [1]:
import time
import os
import glob
import gc

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

import pandas as pd
import cv2 as cv
import tifffile as tifi

import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import skimage
from skimage.filters import sobel
from skimage import segmentation
from skimage.measure import regionprops, regionprops_table

from scipy import ndimage as ndi

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier

# Reproducibility settings.
# Seed NumPy's global generator, then switch off cuDNN benchmarking so cuDNN
# selects its algorithm deterministically (possibly at some cost in speed), see
# https://pytorch.org/docs/stable/notes/randomness.html
# Finally seed PyTorch's generator.
np.random.seed(42)

torch.backends.cudnn.benchmark = False
torch.manual_seed(42)

<torch._C.Generator at 0x7fa4a8704d90>

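First, we define the PyTorch model used to extract the features. The class definition below is kept commented out for reference; the notebook loads a saved TorchScript model from disk instead.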

In [2]:
# class RCF(nn.Module):
#     """A model for extracting Random Convolution Features (RCF) from input imagery."""
#     def __init__(self, num_features=16, kernel_size=3, num_input_channels=3):
#         super(RCF, self).__init__()
#         # We create `num_features / 2` filters so require `num_features` to be divisible by 2
#         assert num_features % 2 == 0, "Please enter an even number of features."
#         # Applies a 2D convolution over an input image composed of several input planes.
#         self.conv1 = nn.Conv2d(
#             num_input_channels,
#             num_features // 2,
#             kernel_size=kernel_size,
#             stride=1,
#             padding=0,
#             dilation=1,
#             bias=True,
#         )
#         # Fills the input Tensor 'conv1.weight' with values drawn from the normal distribution
#         nn.init.normal_(self.conv1.weight, mean=0.0, std=1.0) 
#         # Fills the input Tensor 'conv1.bias' with the value 'val = -1'.
#         nn.init.constant_(self.conv1.bias, -1.0)
#     def forward(self, x):
#         # The rectified linear activation function or ReLU for short is a piecewise linear function 
#         # that will output the input directly if it is positive, otherwise, it will output zero.
#         x1a = F.relu(self.conv1(x), inplace=True)
#         # The below step is where we take the inverse which is appended later
#         x1b = F.relu(-self.conv1(x), inplace=True)
#         # Applies a 2D adaptive average pooling over an input signal composed of several input planes.
#         x1a = F.adaptive_avg_pool2d(x1a, (1, 1)).squeeze()
#         x1b = F.adaptive_avg_pool2d(x1b, (1, 1)).squeeze()
#         if len(x1a.shape) == 1:  # case where we passed a single input
#             return torch.cat((x1a, x1b), dim=0)
#         elif len(x1a.shape) == 2:  # case where we passed a batch of > 1 inputs
#             return torch.cat((x1a, x1b), dim=1)

In [3]:
# Check whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

True

In [4]:
# Record the PyTorch version the notebook was run with.
print(torch.__version__)

1.11.0


Next, we initialize the model and the PyTorch components.

In [5]:
# Length of the feature vector produced per patch.
num_features = 4096
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = RCF(num_features).eval().to(device)
# map_location remaps the tensors stored in the archive onto `device` while
# loading, so an archive saved from a GPU session can still be opened when the
# notebook runs on the CPU fallback.
model = torch.jit.load(
    '/kaggle/input/ptmodel/model_scripted.pt',
    map_location=device,
).eval().to(device)

In [6]:
# List every entry of the model's state_dict together with its tensor size.
print("Model's state_dict:")
for entry_name, entry_tensor in model.state_dict().items():
    print(entry_name, "\t", entry_tensor.size())

Model's state_dict:
conv1.weight 	 torch.Size([2048, 3, 3, 3])
conv1.bias 	 torch.Size([2048])


Read in metadata

In [7]:
# Load the three competition metadata tables in one pass.
train_meta, test_meta, other_meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mayo-clinic-strip-ai/train.csv',
        '/kaggle/input/mayo-clinic-strip-ai/test.csv',
        '/kaggle/input/mayo-clinic-strip-ai/other.csv',
    )
)

List images for training and testing

In [8]:
# Collect the image files of each split in a stable (sorted) order.
# The image id is the file name without its extension. str.rstrip('.tif') is
# not used for this because it strips any trailing '.', 't', 'i' or 'f'
# characters rather than the '.tif' suffix, which would truncate ids that end
# in one of those letters.
train_path = "/kaggle/input/mayo-clinic-strip-ai/train/*.tif"
train_images = sorted(glob.glob(train_path))
train_image_ids = [os.path.splitext(os.path.basename(path))[0] for path in train_images]

test_path = "/kaggle/input/mayo-clinic-strip-ai/test/*.tif"
test_images = sorted(glob.glob(test_path))
test_image_ids = [os.path.splitext(os.path.basename(path))[0] for path in test_images]

In [9]:
# Display the list of test image paths.
test_images

['/kaggle/input/mayo-clinic-strip-ai/test/006388_0.tif',
 '/kaggle/input/mayo-clinic-strip-ai/test/008e5c_0.tif',
 '/kaggle/input/mayo-clinic-strip-ai/test/00c058_0.tif',
 '/kaggle/input/mayo-clinic-strip-ai/test/01adc5_0.tif']

Define helper functions

In [10]:
# torchvision transform applied to each cropped patch to turn it into a tensor
# before it is passed to the model.
img_to_tensor = T.ToTensor()

def read_tiff(path):
    """Read a TIFF file and derive its identifier from the file name.

    Parameters
    ----------
    path : str
        Location of the image file.

    Returns
    -------
    tuple
        ``(image, filename)`` where ``image`` is whatever ``tifi.imread``
        returns for ``path`` and ``filename`` is the base name of ``path``
        without its extension.
    """
    image = tifi.imread(path)
    # str.rstrip('.tif') strips any trailing '.', 't', 'i' or 'f' characters,
    # not the suffix, so a stem ending in one of those letters was truncated.
    filename = os.path.splitext(os.path.basename(path))[0]
    return image, filename


def resize_image(image, factor=50):
    """Shrink an image by an integer-truncated factor along both axes.

    Parameters
    ----------
    image : array
        Image whose ``shape[0]`` is the height and ``shape[1]`` the width.
    factor : number, optional
        Divisor applied to both dimensions. Defaults to 50.

    Returns
    -------
    array
        The image resampled with bilinear interpolation.
    """
    # cv.resize expects the target size as (width, height). Each side is kept
    # at one pixel or more so that inputs smaller than `factor` do not ask
    # for a zero-sized output.
    target_width = max(1, int(image.shape[1] / factor))
    target_height = max(1, int(image.shape[0] / factor))
    return cv.resize(
        image,
        (target_width, target_height),
        interpolation=cv.INTER_LINEAR,
    )


def convert_image_grayscale(image):
    """Return the grayscale conversion of ``image``.

    The conversion uses OpenCV's RGB-to-gray code, so the input is treated as
    having its channels in RGB order.
    """
    return cv.cvtColor(image, cv.COLOR_RGB2GRAY)


def segment_images(resized_gray_img):
    """Label the connected regions that are darker than the image mean.

    Pixels at or above the mean intensity seed watershed marker 1 and pixels
    below it seed marker 2. The watershed runs on the Sobel edge map. After
    subtracting 1 the marker-2 regions become the foreground mask, whose holes
    are filled before connected components are labelled.

    Returns
    -------
    array
        Integer label image (0 where no component was found).
    """
    threshold = resized_gray_img.mean()
    seeds = np.zeros_like(resized_gray_img)
    seeds[resized_gray_img >= threshold] = 1
    seeds[resized_gray_img < threshold] = 2

    flooded = segmentation.watershed(sobel(resized_gray_img), seeds)
    foreground = ndi.binary_fill_holes(flooded - 1)
    # ndi.label returns (labels, count); only the label image is needed.
    return ndi.label(foreground)[0]


def get_object_coordinates(labeled_segments, min_scaled_area=0.75):
    """Return the bounding boxes of the largest labelled regions.

    Region areas are standardized (z-scored) and every region whose
    standardized area reaches ``min_scaled_area`` is kept, largest first.

    Parameters
    ----------
    labeled_segments : array
        Label image passed to ``regionprops_table``.
    min_scaled_area : float, optional
        Threshold on the standardized area. Defaults to 0.75.

    Returns
    -------
    list of tuple of int
        One ``(bbox-0, bbox-1, bbox-2, bbox-3)`` tuple per selected region.
        An empty list is returned when the label image contains no regions.
        When regions exist but none reaches the threshold (for example a
        single region, whose standardized area is 0), the largest region is
        returned so the image still contributes one patch.
    """
    # Only the area and bounding box are used, so no other property is computed.
    df = pd.DataFrame(regionprops_table(labeled_segments, properties=['area', 'bbox']))
    if df.empty:
        # StandardScaler cannot be fitted on zero samples.
        return []

    areas = df['area'].to_numpy().reshape(-1, 1)
    # fit_transform returns an (n, 1) array; flatten it into a plain column.
    df['scaled_area'] = np.ravel(StandardScaler().fit_transform(areas))
    df = df.sort_values(by='scaled_area', ascending=False)

    objects = df[df['scaled_area'] >= min_scaled_area]
    if objects.empty:
        objects = df.loc[[df['area'].idxmax()]]

    bbox_columns = ['bbox-0', 'bbox-1', 'bbox-2', 'bbox-3']
    return [
        tuple(int(value) for value in row)
        for row in objects[bbox_columns].itertuples(index=False, name=None)
    ]


def _pad_span(start, stop, limit):
    """Widen a span shorter than 3 by one per side, kept inside ``[0, limit]``."""
    if stop - start >= 3:
        return start, stop
    start -= 1
    stop += 1
    if start < 0:
        # A negative start would wrap around in a slice and give an empty
        # crop, so move the window right and keep its padded width.
        stop -= start
        start = 0
    if stop > limit:
        # Same idea at the far edge: move the window left as far as possible.
        start = max(0, start - (stop - limit))
        stop = limit
    return start, stop


def crop_patch(coordinates, image):
    """Cut the region described by ``coordinates`` out of ``image``.

    Parameters
    ----------
    coordinates : tuple of int
        ``(x1, y1, x2, y2)`` where ``x1:x2`` indexes axis 0 of ``image`` and
        ``y1:y2`` indexes axis 1.
    image : array
        Array with two or more dimensions.

    Returns
    -------
    array
        ``image[x1:x2, y1:y2]``. A side shorter than 3 is first widened by one
        element in each direction, presumably so the crop is large enough for
        the model's 3x3 kernel (TODO confirm). A widened side is shifted back
        inside the image instead of running past an edge.
    """
    x1, y1, x2, y2 = coordinates
    x1, x2 = _pad_span(x1, x2, image.shape[0])
    y1, y2 = _pad_span(y1, y2, image.shape[1])
    return image[x1:x2, y1:y2]


def compute_features(images):
    """Extract one feature vector per detected patch for a list of image files.

    For every path the image is read, downsized, converted to grayscale and
    segmented. Each selected bounding box is cropped and passed through
    ``model``.

    Parameters
    ----------
    images : list of str
        Paths of the image files to process.

    Returns
    -------
    pandas.DataFrame
        One row per patch with the columns ``image_patch``
        (``<filename>_<patch index>``), ``filled_space`` and one integer-named
        column per model output. When no patch is produced, an empty frame
        holding only the first two columns is returned.
    """
    tic = time.time()
    white_pixels = [255, 255, 255]
    image_patches = []
    features = []
    filled_space = []
    # Progress is reported roughly every 10% of the images.
    report_every = int(len(images) / 10 + 1)
    for j, img in enumerate(images):
        image, filename = read_tiff(img)
        re_sized_image = resize_image(image)
        resized_gray_img = convert_image_grayscale(re_sized_image)
        labeled_segments = segment_images(resized_gray_img)
        object_coordinates = get_object_coordinates(labeled_segments)
        del labeled_segments, re_sized_image, resized_gray_img
        gc.collect()
        for i, coordinates in enumerate(object_coordinates):
            # NOTE(review): the boxes are measured on the downsized image but
            # the crop is taken from the full-resolution `image` without
            # rescaling the coordinates. Confirm whether this is intended.
            cropped_image = crop_patch(coordinates, image)
            if cropped_image.size == 0:
                # An empty crop has no pixels to featurize and would make the
                # white-space ratio below divide by zero.
                continue

            # NOTE(review): `.size` counts every channel value, not pixels, so
            # for a 3-channel crop the white fraction is at most 1/3 and an
            # all-white patch gets filled_space = 2/3. Left unchanged because
            # the cached training features appear to use the same definition.
            white_space = np.count_nonzero(np.all(cropped_image == white_pixels, axis=2)) / cropped_image.size
            filled_space.append(1 - white_space)

            tensor = img_to_tensor(cropped_image).to(device)
            with torch.no_grad():
                feats = model(tensor.unsqueeze(0)).cpu().numpy()
            features.append(feats)
            image_patches.append(f"{filename}_{i}")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            del tensor, feats, white_space, cropped_image
            gc.collect()
        if j % report_every == 0:
            # The percentage uses the same 1-based count as the "k/n" prefix.
            print(f"{j+1}/{len(images)} -- {(j + 1) / len(images) * 100:0.2f}% -- {time.time()-tic:0.2f} seconds")
            tic = time.time()

    if not features:
        # Nothing to stack; return the identifying columns only.
        return pd.DataFrame(columns=["image_patch", "filled_space"])

    # One row per patch; the identifying columns are placed in front.
    data = pd.DataFrame(np.asarray(features).reshape(len(features), -1))
    data.insert(0, "filled_space", filled_space)
    data.insert(0, "image_patch", image_patches)
    return data

Open file, resize, convert to grayscale, segment, crop, and featurize. 

In [11]:
# %%time
# train_features = compute_features(train_images)
# train_features.to_csv("train_features.csv", index=False)
# The training features are read from a previously saved CSV instead of being
# recomputed by the commented-out call above.
# NOTE(review): patch ids in this file start at _1 whereas compute_features
# numbers patches from _0. Presumably the CSV was written by an earlier version
# of the function; confirm that train and test features come from the same code.
train_features = pd.read_csv('/kaggle/input/features/train_features.csv')
train_features

Unnamed: 0,image_patch,filled_space,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_1,0.716473,0.0,0,0.000923,4.164403,0.0,0.000056,0.000025,0.000245,...,0.000081,0.001384,6.379834,11.690228,1.788263,2.749364,0.333988,0.000366,0.0,6.015168
1,006388_0_2,0.666667,0.0,0,0.000000,4.179573,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,6.396448,11.714314,1.785543,2.756642,0.334701,0.000000,0.0,6.026847
2,006388_0_3,0.666667,0.0,0,0.000000,4.179573,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,6.396447,11.714314,1.785543,2.756641,0.334701,0.000000,0.0,6.026847
3,006388_0_4,0.666667,0.0,0,0.000000,4.179572,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,6.396448,11.714313,1.785544,2.756641,0.334701,0.000000,0.0,6.026847
4,006388_0_5,0.666667,0.0,0,0.000000,4.179572,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,6.396449,11.714313,1.785544,2.756641,0.334701,0.000000,0.0,6.026848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6047,ffec5c_0_6,1.000000,0.0,0,0.000000,4.016933,0.0,0.000000,0.000000,0.000000,...,0.000000,0.174519,6.497773,11.499941,1.590398,2.899749,0.382360,0.000000,0.0,5.847535
6048,ffec5c_1_1,1.000000,0.0,0,0.000000,4.016934,0.0,0.000000,0.000000,0.000000,...,0.000000,0.174519,6.497774,11.499943,1.590397,2.899751,0.382360,0.000000,0.0,5.847536
6049,ffec5c_1_2,1.000000,0.0,0,0.000000,4.016933,0.0,0.000000,0.000000,0.000000,...,0.000000,0.174519,6.497772,11.499938,1.590398,2.899750,0.382360,0.000000,0.0,5.847537
6050,ffec5c_1_3,1.000000,0.0,0,0.000000,4.016933,0.0,0.000000,0.000000,0.000000,...,0.000000,0.174519,6.497774,11.499942,1.590398,2.899749,0.382360,0.000000,0.0,5.847535


In [12]:
%%time
# Extract per-patch features for every test image, save a CSV copy and display
# the resulting frame.
test_features = compute_features(test_images)
test_features.to_csv("test_features.csv", index=False)
test_features

1/4 -- 0.00% -- 42.04 seconds
2/4 -- 25.00% -- 4.70 seconds
3/4 -- 50.00% -- 12.41 seconds
4/4 -- 75.00% -- 23.83 seconds
CPU times: user 47.2 s, sys: 9.62 s, total: 56.8 s
Wall time: 1min 23s


Unnamed: 0,image_patch,filled_space,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.716473,0.0,0.0,0.000923,4.164401,0.0,5.6e-05,2.5e-05,0.000245,...,8.1e-05,0.001383961,6.379837,11.690227,1.788264,2.749362,0.333988,0.000366,0.0,6.015168
1,006388_0_1,0.666667,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.396449,11.714314,1.785544,2.756641,0.334701,0.0,0.0,6.026847
2,006388_0_2,0.666667,0.0,0.0,0.0,4.179572,0.0,0.0,0.0,0.0,...,0.0,0.0,6.396448,11.714314,1.785543,2.756641,0.334701,0.0,0.0,6.026847
3,006388_0_3,0.666667,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.396449,11.714314,1.785544,2.756641,0.334701,0.0,0.0,6.026847
4,006388_0_4,0.666667,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.39645,11.714316,1.785544,2.756641,0.334701,0.0,0.0,6.026848
5,006388_0_5,0.791027,0.0,0.0,0.001843,4.145054,0.0,0.000537,1.9e-05,0.000719,...,9.1e-05,0.003623513,6.357746,11.656507,1.791141,2.73987,0.334149,0.001242,0.0,5.999403
6,006388_0_6,0.666667,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.39645,11.714317,1.785544,2.756641,0.334701,0.0,0.0,6.026847
7,006388_0_7,0.666667,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.39645,11.714316,1.785544,2.756641,0.334701,0.0,0.0,6.026847
8,006388_0_8,0.666667,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.39645,11.714315,1.785544,2.756641,0.334701,0.0,0.0,6.026846
9,006388_0_9,0.666667,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.39645,11.714316,1.785544,2.756641,0.334701,0.0,0.0,6.026847


In [13]:
%%time
train = train_features.copy()

train[['patient_id', 'image_num', 'patch_num']] = train.image_patch.str.split("_", expand=True)

train["image_id"] = train[["patient_id", "image_num"]].apply("_".join, axis=1)
train.drop(['image_patch'], axis = 1, inplace = True)

var_cols = train.columns[1:-4].values.tolist()
train = (
    train
    .groupby(['image_id', 'patient_id'], as_index=False)
    .apply(lambda x: pd.Series([sum(x[v] * x.filled_space) / sum(x.filled_space) for v in var_cols]))
)
# df_2 = df.groupby(['image_id', 'patient_id'], as_index = False).mean()
train = train.set_index('image_id').join(train_meta.set_index('image_id')['label']).reset_index()
train 

CPU times: user 8min 11s, sys: 652 ms, total: 8min 11s
Wall time: 8min 13s


Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,label
0,006388_0,006388,0.0,0.0,0.000426,4.171304,0.0,0.000037,0.000002,0.000113,...,0.000804,6.386687,11.700088,1.787105,2.752217,0.334645,0.00021,0.0,6.020204,CE
1,008e5c_0,008e5c,0.0,0.0,0.000000,4.179574,0.0,0.000000,0.000000,0.000000,...,0.000000,6.396450,11.714315,1.785544,2.756641,0.334701,0.00000,0.0,6.026847,CE
2,00c058_0,00c058,0.0,0.0,0.000000,4.016933,0.0,0.000000,0.000000,0.000000,...,0.174519,6.497773,11.499941,1.590398,2.899750,0.382360,0.00000,0.0,5.847535,LAA
3,01adc5_0,01adc5,0.0,0.0,0.000004,3.997488,0.0,0.000000,0.000000,0.000000,...,0.000058,6.175398,11.297177,1.761283,2.677611,0.366253,0.00000,0.0,5.842044,LAA
4,026c97_0,026c97,0.0,0.0,0.000000,4.179572,0.0,0.000000,0.000000,0.000000,...,0.000000,6.396447,11.714313,1.785544,2.756641,0.334701,0.00000,0.0,6.026847,CE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,fe9645_0,fe9645,0.0,0.0,0.000000,4.179580,0.0,0.000000,0.000000,0.000000,...,0.000000,6.396433,11.714296,1.785542,2.756626,0.334709,0.00000,0.0,6.026840,CE
750,fe9bec_0,fe9bec,0.0,0.0,0.000000,4.179573,0.0,0.000000,0.000000,0.000000,...,0.000000,6.396449,11.714315,1.785544,2.756641,0.334701,0.00000,0.0,6.026848,LAA
751,ff14e0_0,ff14e0,0.0,0.0,0.000000,3.172652,0.0,0.000000,0.000000,0.000000,...,0.000000,4.988273,9.402601,1.817459,2.166300,0.457224,0.00000,0.0,5.051295,CE
752,ffec5c_0,ffec5c,0.0,0.0,0.000000,4.016934,0.0,0.000000,0.000000,0.000000,...,0.174519,6.497773,11.499941,1.590397,2.899750,0.382360,0.00000,0.0,5.847535,LAA


In [14]:
# Separate the target labels from the feature columns used for fitting.
y_train = train['label']
x_train = train.drop(columns=['image_id', 'patient_id', 'label'])
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.0,0.0,0.000426,4.171304,0.0,0.000037,0.000002,0.000113,0.467086,0.0,...,0.000019,0.000804,6.386687,11.700088,1.787105,2.752217,0.334645,0.00021,0.0,6.020204
1,0.0,0.0,0.000000,4.179574,0.0,0.000000,0.000000,0.000000,0.465839,0.0,...,0.000000,0.000000,6.396450,11.714315,1.785544,2.756641,0.334701,0.00000,0.0,6.026847
2,0.0,0.0,0.000000,4.016933,0.0,0.000000,0.000000,0.000000,0.257315,0.0,...,0.000000,0.174519,6.497773,11.499941,1.590398,2.899750,0.382360,0.00000,0.0,5.847535
3,0.0,0.0,0.000004,3.997488,0.0,0.000000,0.000000,0.000000,0.409350,0.0,...,0.000000,0.000058,6.175398,11.297177,1.761283,2.677611,0.366253,0.00000,0.0,5.842044
4,0.0,0.0,0.000000,4.179572,0.0,0.000000,0.000000,0.000000,0.465839,0.0,...,0.000000,0.000000,6.396447,11.714313,1.785544,2.756641,0.334701,0.00000,0.0,6.026847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,0.0,0.0,0.000000,4.179580,0.0,0.000000,0.000000,0.000000,0.465834,0.0,...,0.000000,0.000000,6.396433,11.714296,1.785542,2.756626,0.334709,0.00000,0.0,6.026840
750,0.0,0.0,0.000000,4.179573,0.0,0.000000,0.000000,0.000000,0.465839,0.0,...,0.000000,0.000000,6.396449,11.714315,1.785544,2.756641,0.334701,0.00000,0.0,6.026848
751,0.0,0.0,0.000000,3.172652,0.0,0.000000,0.000000,0.000000,0.336312,0.0,...,0.000000,0.000000,4.988273,9.402601,1.817459,2.166300,0.457224,0.00000,0.0,5.051295
752,0.0,0.0,0.000000,4.016934,0.0,0.000000,0.000000,0.000000,0.257315,0.0,...,0.000000,0.174519,6.497773,11.499941,1.590397,2.899750,0.382360,0.00000,0.0,5.847535


In [15]:
# Collapse the per-patch test features into one row per image using the same
# filled_space-weighted mean as for the training data, computed with vectorized
# frame operations instead of a Python-level sum per feature column.

# image_patch has the form "<patient_id>_<image_num>_<patch_num>".
id_parts = test_features.image_patch.str.split("_", expand=True)
patient_id = id_parts[0].rename('patient_id')
image_id = (id_parts[0] + "_" + id_parts[1]).rename('image_id')

var_cols = [c for c in test_features.columns if c not in ('image_patch', 'filled_space')]
weights = test_features['filled_space']
group_keys = [image_id, patient_id]

weighted_sum = test_features[var_cols].mul(weights, axis=0).groupby(group_keys).sum()
weight_total = weights.groupby(group_keys).sum()
test = weighted_sum.div(weight_total, axis=0)
# Feature columns are named 0..n-1 so train and test share the same labels.
test.columns = range(len(var_cols))
test = test.reset_index()
test

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0,006388,0.0,0.0,0.000426,4.171304,0.0,3.7e-05,2e-06,0.000113,...,1.9e-05,0.000804,6.386688,11.70009,1.787106,2.752216,0.334646,0.00021,0.0,6.020204
1,008e5c_0,008e5c,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,...,0.0,0.0,6.396449,11.714316,1.785544,2.756641,0.334701,0.0,0.0,6.026847
2,00c058_0,00c058,0.0,0.0,0.0,4.016934,0.0,0.0,0.0,0.0,...,0.0,0.174519,6.497775,11.499943,1.590398,2.89975,0.38236,0.0,0.0,5.847537
3,01adc5_0,01adc5,0.0,0.0,4e-06,3.997488,0.0,0.0,0.0,0.0,...,0.0,5.8e-05,6.175398,11.297176,1.761283,2.677611,0.366253,0.0,0.0,5.842044


In [16]:
# Keep only the feature columns of the aggregated test frame.
x_test = test.drop(columns=['image_id', 'patient_id'])
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.0,0.0,0.000426,4.171304,0.0,3.7e-05,2e-06,0.000113,0.467086,0.0,...,1.9e-05,0.000804,6.386688,11.70009,1.787106,2.752216,0.334646,0.00021,0.0,6.020204
1,0.0,0.0,0.0,4.179573,0.0,0.0,0.0,0.0,0.465839,0.0,...,0.0,0.0,6.396449,11.714316,1.785544,2.756641,0.334701,0.0,0.0,6.026847
2,0.0,0.0,0.0,4.016934,0.0,0.0,0.0,0.0,0.257315,0.0,...,0.0,0.174519,6.497775,11.499943,1.590398,2.89975,0.38236,0.0,0.0,5.847537
3,0.0,0.0,4e-06,3.997488,0.0,0.0,0.0,0.0,0.409351,0.0,...,0.0,5.8e-05,6.175398,11.297176,1.761283,2.677611,0.366253,0.0,0.0,5.842044


In [17]:
# Fit a 3000-tree random forest (depth capped at 50, fixed seed) on the
# per-image training features.
rf_class = RandomForestClassifier(
    n_estimators=3000,
    max_depth=50,
    random_state=42,
).fit(x_train, y_train)
rf_class

RandomForestClassifier(max_depth=50, n_estimators=3000, random_state=42)

In [18]:
# Accuracy on the same data the forest was fitted on; this is not a held-out
# estimate of performance.
rf_class.score(x_train, y_train)

1.0

In [19]:
# Store the predicted label and the probability of each of the two classes
# next to the training rows.
first_class, second_class = rf_class.classes_[0], rf_class.classes_[1]
train['prediction'] = rf_class.predict(x_train)
train_probabilities = rf_class.predict_proba(x_train)
train[[first_class, second_class]] = train_probabilities
train

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4090,4091,4092,4093,4094,4095,label,prediction,CE,LAA
0,006388_0,006388,0.0,0.0,0.000426,4.171304,0.0,0.000037,0.000002,0.000113,...,1.787105,2.752217,0.334645,0.00021,0.0,6.020204,CE,CE,0.855333,0.144667
1,008e5c_0,008e5c,0.0,0.0,0.000000,4.179574,0.0,0.000000,0.000000,0.000000,...,1.785544,2.756641,0.334701,0.00000,0.0,6.026847,CE,CE,0.891667,0.108333
2,00c058_0,00c058,0.0,0.0,0.000000,4.016933,0.0,0.000000,0.000000,0.000000,...,1.590398,2.899750,0.382360,0.00000,0.0,5.847535,LAA,LAA,0.214667,0.785333
3,01adc5_0,01adc5,0.0,0.0,0.000004,3.997488,0.0,0.000000,0.000000,0.000000,...,1.761283,2.677611,0.366253,0.00000,0.0,5.842044,LAA,LAA,0.093667,0.906333
4,026c97_0,026c97,0.0,0.0,0.000000,4.179572,0.0,0.000000,0.000000,0.000000,...,1.785544,2.756641,0.334701,0.00000,0.0,6.026847,CE,CE,0.863667,0.136333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,fe9645_0,fe9645,0.0,0.0,0.000000,4.179580,0.0,0.000000,0.000000,0.000000,...,1.785542,2.756626,0.334709,0.00000,0.0,6.026840,CE,CE,0.899000,0.101000
750,fe9bec_0,fe9bec,0.0,0.0,0.000000,4.179573,0.0,0.000000,0.000000,0.000000,...,1.785544,2.756641,0.334701,0.00000,0.0,6.026848,LAA,LAA,0.264000,0.736000
751,ff14e0_0,ff14e0,0.0,0.0,0.000000,3.172652,0.0,0.000000,0.000000,0.000000,...,1.817459,2.166300,0.457224,0.00000,0.0,5.051295,CE,CE,0.845333,0.154667
752,ffec5c_0,ffec5c,0.0,0.0,0.000000,4.016934,0.0,0.000000,0.000000,0.000000,...,1.590397,2.899750,0.382360,0.00000,0.0,5.847535,LAA,LAA,0.216333,0.783667


In [20]:
# Average the per-image class probabilities within each patient.
train.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()

Unnamed: 0,patient_id,CE,LAA
0,006388,0.855333,0.144667
1,008e5c,0.891667,0.108333
2,00c058,0.214667,0.785333
3,01adc5,0.093667,0.906333
4,026c97,0.863667,0.136333
...,...,...,...
627,fe0cca,0.847333,0.152667
628,fe9645,0.899000,0.101000
629,fe9bec,0.264000,0.736000
630,ff14e0,0.845333,0.154667


In [21]:
# Store the predicted label and the probability of each of the two classes
# next to the test rows.
first_class, second_class = rf_class.classes_[0], rf_class.classes_[1]
test['prediction'] = rf_class.predict(x_test)
test_probabilities = rf_class.predict_proba(x_test)
test[[first_class, second_class]] = test_probabilities

In [22]:
# One row per patient: the mean of that patient's per-image class probabilities.
probability_columns = ['CE', 'LAA']
submission = (
    test
    .groupby('patient_id', as_index=False)[probability_columns]
    .mean()
)
submission

Unnamed: 0,patient_id,CE,LAA
0,006388,0.855667,0.144333
1,008e5c,0.537333,0.462667
2,00c058,0.723667,0.276333
3,01adc5,0.093667,0.906333


In [23]:
# Write the per-patient probabilities to the submission file, without the index.
submission.to_csv("submission.csv", index = False)