Strip AI Kaggle

In [1]:
!conda install /kaggle/input/how-to-use-pyvips-offline/*.tar.bz2
import time
import os
import glob
import gc
from tqdm.notebook import tqdm

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

import pandas as pd
import cv2 as cv
import tifffile as tifi
import pyvips

import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import skimage
from skimage.filters import sobel
from skimage import segmentation
from skimage.transform import resize
from skimage.measure import regionprops_table

from scipy import ndimage as ndi

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier

from PIL import Image
Image.MAX_IMAGE_PIXELS = 5_000_000_000

## Disabling the benchmarking feature with torch.backends.cudnn.benchmark = False 
## causes cuDNN to deterministically select an algorithm, possibly at the cost of reduced performance.
## https://pytorch.org/docs/stable/notes/randomness.html
torch.backends.cudnn.benchmark = False

np.random.seed(42)
torch.manual_seed(42)


Downloading and Extracting Packages
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
###########################################################

<torch._C.Generator at 0x7fd1ae5b6b70>

In [2]:
num_features = 4096
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.jit.load('/kaggle/input/ptmodel/model_scripted.pt').eval().to(device)

In [3]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

Model's state_dict:
conv1.weight 	 torch.Size([2048, 3, 3, 3])
conv1.bias 	 torch.Size([2048])


Read in metadata

In [4]:
train_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/train.csv')
test_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/test.csv')
other_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/other.csv')

In [5]:
input_file_type = ".tif"
output_file_type = ".png"

In [6]:
def prune_image_rows_cols(im, mask, thr=0.990):
    # delete empty columns
    for l in reversed(range(im.shape[1])):
        if (np.sum(mask[:, l]) / float(mask.shape[0])) > thr:
            im = np.delete(im, l, 1)
    # delete empty rows
    for l in reversed(range(im.shape[0])):
        if (np.sum(mask[l, :]) / float(mask.shape[1])) > thr:
            im = np.delete(im, l, 0)
    return im


def mask_median(im, val=255):
    masks = [None] * 3
    for c in range(3):
        masks[c] = im[..., c] >= np.median(im[:, :, c]) - 5
    mask = np.logical_and(*masks)
    im[mask, :] = val
    return im, mask

os.environ['VIPS_CONCURRENCY'] = '4'
os.environ['VIPS_DISC_THRESHOLD'] = '8gb'

def image_load_scale_norm(img_path, prune_thr=0.990, bg_val=255):
    img = Image.open(img_path)
    if (img.width * img.height) > 4_000_000_000:
        print(f"width: {img.width}, height: {img.height}, pixels: {img.width * img.height}")
        return None
    scale = min(img.height / 2e3, img.width / 2e3)
    if scale > 1:
        tmp_size = int(img.width / scale), int(img.height / scale)
        img.thumbnail(tmp_size, resample=Image.Resampling.BILINEAR, reducing_gap=2.0)
    img, mask = mask_median(np.array(img), val=bg_val)
    img = prune_image_rows_cols(img, mask, thr=prune_thr)
    img = Image.fromarray(img)
    scale = min(img.height / 1e3, img.width / 1e3)
    if scale > 1:
        img = img.resize((int(img.width / scale), int(img.height / scale)), Image.Resampling.LANCZOS)
    return img

In [7]:
# os.mkdir("../train/")
# for name in tqdm(train_meta["image_id"]):
#     img_path = os.path.join("/kaggle/input/mayo-clinic-strip-ai/", "train", f"{name}.tif")
#     img = image_load_scale_norm(img_path)
#     if not img:
#         continue
#     img.save(os.path.join("../train/", f"{name}.png"))
#     del img
#     gc.collect()

In [8]:
try:
    os.mkdir("../test/")
except:
    pass
for name in tqdm(test_meta["image_id"]):
    img_path = os.path.join("/kaggle/input/mayo-clinic-strip-ai/", "test", f"{name}{input_file_type}")
    img = image_load_scale_norm(img_path)
    if not img:
        continue
    img.save(os.path.join("../test/", f"{name}{output_file_type}"))
    del img
    gc.collect()

  0%|          | 0/4 [00:00<?, ?it/s]

List images for training and testing

In [9]:
train_path = f"../train/*{output_file_type}"
train_images = sorted(list(glob.glob(train_path)))

test_path = f"../test/*{output_file_type}"
test_images = sorted(list(glob.glob(test_path)))

In [10]:
test_images

['../test/006388_0.png',
 '../test/008e5c_0.png',
 '../test/00c058_0.png',
 '../test/01adc5_0.png']

In [11]:
# cv.resize(image, (512, 512))

Define helper functions

In [12]:
img_to_tensor = T.ToTensor()

def read_tiff(path):
    image = cv.imread(path)
    image = cv.resize(image, (512, 512))
#     image = tifi.imread(path)
    filename = path.split('/')[-1].rstrip(output_file_type)
    return image, filename


def convert_image_grayscale(image):
    gray_image = cv.cvtColor(image, cv.COLOR_RGB2GRAY)
    return gray_image


def segment_images(gray_image):
    elevation_map = sobel(gray_image)
    markers = np.zeros_like(gray_image)
    markers[gray_image >= gray_image.mean()] = 1
    markers[gray_image < gray_image.mean()] = 2
    segmented_img = segmentation.watershed(elevation_map, markers)
    filled_segments = ndi.binary_fill_holes(segmented_img - 1)
    labeled_segments, _ = ndi.label(filled_segments)
    return labeled_segments


def get_object_coordinates(labeled_segments):
    properties = ['area','bbox','convex_area','bbox_area', 'major_axis_length', 'minor_axis_length', 'eccentricity']
    df = pd.DataFrame(regionprops_table(labeled_segments, properties=properties))
    standard_scaler = StandardScaler()
    scaled_area = standard_scaler.fit_transform(df.area.values.reshape(-1,1))
    df['scaled_area'] = scaled_area
    df.sort_values(by="scaled_area", ascending=False, inplace=True)
    objects = df[df['scaled_area']>=.75]
    object_coordinates = [
        (int(row['bbox-0']),
         int(row['bbox-1']),
         int(row['bbox-2']),
         int(row['bbox-3']))
        for index, row in objects.iterrows()
    ]
    return object_coordinates


def crop_patch(coordinates, image):
    x1, y1, x2, y2 = coordinates
    if x2-x1<3:
        x1-=1
        x2+=1
    if y2-y1<3:
        y1-=1
        y2+=1
    cropped_image = image[x1:x2, y1:y2]
    return cropped_image


def compute_features(images):
    tic = time.time()
    j = 0
    image_patches = []
    features = []
    for img in images: 
        image, filename = read_tiff(img)
        gray_image = convert_image_grayscale(image)
        labeled_segments = segment_images(gray_image)
        object_coordinates = get_object_coordinates(labeled_segments)
        del labeled_segments, gray_image,
        gc.collect()
        for i in range(len(object_coordinates)):
            patch_name = str(filename)+"_"+str(i)
            coordinates = object_coordinates[i]
            cropped_image = crop_patch(coordinates, image)
            tensor = img_to_tensor(cropped_image)
            tensor = tensor.to(device)
            with torch.no_grad():
                feats = model(tensor.unsqueeze(0)).cpu().numpy()
            features.append(feats)
            image_patches.append(patch_name)
            del tensor, feats, cropped_image, coordinates, 
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        del image
        gc.collect()
        ten_percent = int(len(images)/10 + 1)
        if j % ten_percent == 0:
            print(f"{j+1}/{len(images)} -- {(j + 1)/ len(images) * 100:0.0f}% -- {time.time()-tic:0.2f} seconds")
            tic = time.time()
        j += 1
    data = pd.DataFrame(features, index=image_patches).rename_axis("image_patch").reset_index()
    return data

Open file, resize, convert to grayscale, segment, crop, and featurize. 

In [13]:
# %%time
# train_features = compute_features(train_images)
# train_features.to_csv("train_features.csv", index=False)
train_features = pd.read_csv('/kaggle/input/features/train_features.csv')
train_features

Unnamed: 0,image_patch,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.000000,0.0,0.019385,2.253018,0.000073,0.192554,0.012104,0.037518,0.416069,...,0.019421,0.435570,4.656342,8.400555,1.697219,2.191239,0.383049,0.408236,0.000166,4.314948
1,006388_0_1,0.000000,0.0,0.025386,1.596097,0.000040,0.397371,0.028033,0.049161,0.550028,...,0.019888,0.487013,4.049784,7.501255,1.797163,1.936562,0.339594,0.556038,0.000000,3.806289
2,006388_0_2,0.000000,0.0,0.018784,2.610573,0.000000,0.166781,0.014447,0.025850,0.442816,...,0.010741,0.373136,4.994141,9.067782,1.725049,2.293719,0.366494,0.345154,0.000000,4.639303
3,006388_0_3,0.000000,0.0,0.049976,2.341556,0.000000,0.085757,0.019666,0.035585,0.296108,...,0.073973,0.600606,4.786766,8.379460,1.526385,2.296116,0.515762,0.382223,0.000000,4.302649
4,006388_0_4,0.000015,0.0,0.045484,2.448952,0.000009,0.107040,0.019359,0.036026,0.338701,...,0.057756,0.545340,4.916690,8.694880,1.568748,2.326694,0.479577,0.353527,0.000000,4.437832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4712,ffec5c_0_5,0.000000,0.0,0.004207,3.236424,0.000000,0.156066,0.008093,0.042422,0.495847,...,0.000087,0.447931,5.911875,10.486738,1.640912,2.697766,0.372719,0.181917,0.000000,5.264678
4713,ffec5c_1_0,0.000049,0.0,0.005330,2.784270,0.000012,0.245274,0.020246,0.085401,0.500667,...,0.000314,0.621847,5.689856,9.956371,1.601170,2.666361,0.360110,0.337180,0.000000,4.913766
4714,ffec5c_1_1,0.000013,0.0,0.006190,2.392345,0.000000,0.298647,0.022026,0.111168,0.527831,...,0.000035,0.785588,5.474851,9.479002,1.574447,2.621035,0.356214,0.430806,0.000000,4.618607
4715,ffec5c_1_2,0.000000,0.0,0.008376,2.073377,0.000000,0.967895,0.117872,0.243376,1.018315,...,0.000083,0.482537,4.906369,9.228513,2.030420,2.241795,0.275119,0.546356,0.000000,4.473633


In [14]:
%%time
test_features = compute_features(test_images)
test_features

1/4 -- 25% -- 6.94 seconds
2/4 -- 50% -- 1.20 seconds
3/4 -- 75% -- 1.22 seconds
4/4 -- 100% -- 1.32 seconds
CPU times: user 6.18 s, sys: 1.09 s, total: 7.26 s
Wall time: 10.9 s


Unnamed: 0,image_patch,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.0,0.0,0.019385,2.253017,7.3e-05,0.192554,0.012104,0.037518,0.416069,...,0.019421,0.43557,4.656342,8.400556,1.697219,2.191239,0.383049,0.408236,0.000166,4.314948
1,006388_0_1,0.0,0.0,0.025386,1.596097,4e-05,0.397371,0.028033,0.049161,0.550028,...,0.019888,0.487013,4.049783,7.501255,1.797163,1.936562,0.339594,0.556038,0.0,3.80629
2,006388_0_2,0.0,0.0,0.018784,2.610573,0.0,0.166781,0.014447,0.02585,0.442816,...,0.010741,0.373136,4.99414,9.067782,1.725049,2.293719,0.366494,0.345154,0.0,4.639304
3,006388_0_3,0.0,0.0,0.049976,2.341556,0.0,0.085757,0.019666,0.035585,0.296108,...,0.073973,0.600606,4.786766,8.379459,1.526384,2.296116,0.515762,0.382223,0.0,4.302649
4,006388_0_4,1.522413e-05,0.0,0.045484,2.448952,9e-06,0.10704,0.019359,0.036026,0.338701,...,0.057756,0.54534,4.91669,8.694879,1.568748,2.326694,0.479577,0.353527,0.0,4.437832
5,006388_0_5,0.0,0.0,0.021543,2.530069,0.0,0.213915,0.021223,0.020475,0.474303,...,0.013891,0.351323,4.882712,8.940242,1.763741,2.239441,0.339301,0.36994,0.0,4.57505
6,006388_0_6,0.0,0.0,0.02362,2.624382,0.0,0.121616,0.007329,0.043394,0.422694,...,0.018882,0.3426,4.991112,9.061723,1.72066,2.290522,0.390859,0.311819,0.0,4.661378
7,006388_0_7,3.096315e-05,0.0,0.056527,2.758985,5.6e-05,0.030746,0.029765,0.048267,0.325616,...,0.072753,0.460236,5.1525,9.104372,1.573599,2.400178,0.527089,0.264908,0.000128,4.713399
8,006388_0_8,0.0,0.0,0.032097,2.480603,0.0,0.111072,0.010725,0.049434,0.398158,...,0.029305,0.408917,4.85503,8.775486,1.690908,2.251536,0.423647,0.347536,0.0,4.510461
9,006388_0_9,0.0,0.0,0.07173,2.333933,2e-05,0.026684,0.041973,0.052061,0.27863,...,0.104273,0.613587,4.775343,8.325586,1.492503,2.291229,0.584436,0.358865,0.0,4.294784


In [15]:
%%time
train = train_features.copy()

train[['patient_id', 'image_num', 'patch_num']] = train.image_patch.str.split("_", expand=True)

train["image_id"] = train[["patient_id", "image_num"]].apply("_".join, axis=1)
train.drop(['image_patch'], axis = 1, inplace = True)

train = train.groupby(['image_id', 'patient_id'], as_index = False).mean()
train = train.set_index('image_id').join(train_meta.set_index('image_id')['label']).reset_index()
train 

CPU times: user 380 ms, sys: 148 ms, total: 528 ms
Wall time: 542 ms


Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,label
0,006388_0,006388,4.618722e-06,0.0,0.036453,2.397817,1.988299e-05,0.145354,0.020463,0.039777,...,0.461833,4.806042,8.625134,1.655597,2.251724,0.434981,0.369825,0.000029,4.425609,CE
1,008e5c_0,008e5c,7.107735e-05,0.0,0.035892,2.445095,1.067070e-05,0.096187,0.048255,0.058684,...,0.464923,4.866971,8.685375,1.629415,2.285439,0.508525,0.301258,0.002738,4.469131,CE
2,00c058_0,00c058,1.248724e-04,0.0,0.003693,2.795539,2.761033e-07,0.123707,0.013082,0.050844,...,0.929136,5.950115,9.960373,1.374825,2.880029,0.395592,0.398464,0.000000,4.844008,LAA
3,01adc5_0,01adc5,0.000000e+00,0.0,0.025861,3.276166,0.000000e+00,0.010768,0.001989,0.018649,...,0.254886,5.659879,10.158729,1.657563,2.551898,0.414442,0.120910,0.000000,5.221131,LAA
4,026c97_0,026c97,2.789901e-07,0.0,0.001468,2.847660,0.000000e+00,0.506687,0.024998,0.053133,...,0.166051,5.305485,10.004539,1.966014,2.332554,0.265848,0.261237,0.000000,5.064417,CE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,fe9645_0,fe9645,2.894052e-04,0.0,0.007600,2.114387,0.000000e+00,0.197965,0.017547,0.055584,...,0.937057,5.248209,8.907648,1.489948,2.585466,0.313976,0.572974,0.000000,4.305548,CE
747,fe9bec_0,fe9bec,0.000000e+00,0.0,0.014926,2.342572,0.000000e+00,0.284864,0.013294,0.077263,...,0.631356,5.180415,9.182172,1.662335,2.444120,0.371340,0.407294,0.000000,4.549081,LAA
748,ff14e0_0,ff14e0,0.000000e+00,0.0,0.031349,3.350480,0.000000e+00,0.180316,0.014175,0.040476,...,0.268685,5.721702,10.268674,1.658110,2.576721,0.462200,0.133334,0.000000,5.274067,CE
749,ffec5c_0,ffec5c,0.000000e+00,0.0,0.002865,2.894617,0.000000e+00,0.458917,0.025147,0.083489,...,0.319462,5.515711,10.200382,1.885758,2.458991,0.303956,0.234496,0.000000,5.109684,LAA


In [16]:
x_train = train.drop(['image_id', 'patient_id', 'label'], axis = 1)
# x_train = x_train.drop(['image_num', 'patch_num', ], axis = 1)
y_train = train['label']
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,4.618722e-06,0.0,0.036453,2.397817,1.988299e-05,0.145354,0.020463,0.039777,0.394313,0.0,...,0.042088,0.461833,4.806042,8.625134,1.655597,2.251724,0.434981,0.369825,0.000029,4.425609
1,7.107735e-05,0.0,0.035892,2.445095,1.067070e-05,0.096187,0.048255,0.058684,0.391062,0.0,...,0.030391,0.464923,4.866971,8.685375,1.629415,2.285439,0.508525,0.301258,0.002738,4.469131
2,1.248724e-04,0.0,0.003693,2.795539,2.761033e-07,0.123707,0.013082,0.050844,0.358889,0.0,...,0.000104,0.929136,5.950115,9.960373,1.374825,2.880029,0.395592,0.398464,0.000000,4.844008
3,0.000000e+00,0.0,0.025861,3.276166,0.000000e+00,0.010768,0.001989,0.018649,0.335013,0.0,...,0.011377,0.254886,5.659879,10.158729,1.657563,2.551898,0.414442,0.120910,0.000000,5.221131
4,2.789901e-07,0.0,0.001468,2.847660,0.000000e+00,0.506687,0.024998,0.053133,0.764649,0.0,...,0.000170,0.166051,5.305485,10.004539,1.966014,2.332554,0.265848,0.261237,0.000000,5.064417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,2.894052e-04,0.0,0.007600,2.114387,0.000000e+00,0.197965,0.017547,0.055584,0.355837,0.0,...,0.000748,0.937057,5.248209,8.907648,1.489948,2.585466,0.313976,0.572974,0.000000,4.305548
747,0.000000e+00,0.0,0.014926,2.342572,0.000000e+00,0.284864,0.013294,0.077263,0.537262,0.0,...,0.002020,0.631356,5.180415,9.182172,1.662335,2.444120,0.371340,0.407294,0.000000,4.549081
748,0.000000e+00,0.0,0.031349,3.350480,0.000000e+00,0.180316,0.014175,0.040476,0.410915,0.0,...,0.060050,0.268685,5.721702,10.268674,1.658110,2.576721,0.462200,0.133334,0.000000,5.274067
749,0.000000e+00,0.0,0.002865,2.894617,0.000000e+00,0.458917,0.025147,0.083489,0.764623,0.0,...,0.000020,0.319462,5.515711,10.200382,1.885758,2.458991,0.303956,0.234496,0.000000,5.109684


In [17]:
test = test_features.copy()

test[['patient_id', 'image_num', 'patch_num']] = test.image_patch.str.split("_", expand=True)

test["image_id"] = test[["patient_id", "image_num"]].apply("_".join, axis=1)
test.drop(['image_patch'], axis = 1, inplace = True)

test = test.groupby(['image_id', 'patient_id'], as_index = False).mean()
test 

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0,006388,5e-06,0.0,0.036453,2.397817,1.9883e-05,0.145354,0.020463,0.039777,...,0.042088,0.461833,4.806042,8.625134,1.655597,2.251724,0.434981,0.369825,2.9e-05,4.42561
1,008e5c_0,008e5c,7.1e-05,0.0,0.035892,2.445095,1.06707e-05,0.096187,0.048255,0.058684,...,0.030391,0.464923,4.866972,8.685375,1.629415,2.285439,0.508525,0.301259,0.002738,4.469131
2,00c058_0,00c058,0.000125,0.0,0.003693,2.79554,2.761016e-07,0.123707,0.013082,0.050844,...,0.000104,0.929136,5.950115,9.960373,1.374825,2.880029,0.395592,0.398464,0.0,4.844008
3,01adc5_0,01adc5,0.0,0.0,0.025861,3.276165,0.0,0.010768,0.001989,0.018649,...,0.011377,0.254886,5.659879,10.158728,1.657563,2.551898,0.414442,0.12091,0.0,5.221131


In [18]:
x_test = test.drop(['image_id', 'patient_id'], axis = 1)
# x_test = x_test.drop(['image_num', 'patch_num', ], axis = 1)
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,5e-06,0.0,0.036453,2.397817,1.9883e-05,0.145354,0.020463,0.039777,0.394312,0.0,...,0.042088,0.461833,4.806042,8.625134,1.655597,2.251724,0.434981,0.369825,2.9e-05,4.42561
1,7.1e-05,0.0,0.035892,2.445095,1.06707e-05,0.096187,0.048255,0.058684,0.391062,0.0,...,0.030391,0.464923,4.866972,8.685375,1.629415,2.285439,0.508525,0.301259,0.002738,4.469131
2,0.000125,0.0,0.003693,2.79554,2.761016e-07,0.123707,0.013082,0.050844,0.358889,0.0,...,0.000104,0.929136,5.950115,9.960373,1.374825,2.880029,0.395592,0.398464,0.0,4.844008
3,0.0,0.0,0.025861,3.276165,0.0,0.010768,0.001989,0.018649,0.335013,0.0,...,0.011377,0.254886,5.659879,10.158728,1.657563,2.551898,0.414442,0.12091,0.0,5.221131


In [19]:
rf_class = RandomForestClassifier(max_depth=15, random_state=42, n_estimators=3000)
rf_class.fit(x_train.values, y_train.values)

RandomForestClassifier(max_depth=15, n_estimators=3000, random_state=42)

In [20]:
rf_class.score(x_train.values, y_train.values)

1.0

In [21]:
train['prediction'] = rf_class.predict(x_train.values)
train[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_train.values)
train

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4090,4091,4092,4093,4094,4095,label,prediction,CE,LAA
0,006388_0,006388,4.618722e-06,0.0,0.036453,2.397817,1.988299e-05,0.145354,0.020463,0.039777,...,1.655597,2.251724,0.434981,0.369825,0.000029,4.425609,CE,CE,0.848801,0.151199
1,008e5c_0,008e5c,7.107735e-05,0.0,0.035892,2.445095,1.067070e-05,0.096187,0.048255,0.058684,...,1.629415,2.285439,0.508525,0.301258,0.002738,4.469131,CE,CE,0.843656,0.156344
2,00c058_0,00c058,1.248724e-04,0.0,0.003693,2.795539,2.761033e-07,0.123707,0.013082,0.050844,...,1.374825,2.880029,0.395592,0.398464,0.000000,4.844008,LAA,LAA,0.333034,0.666966
3,01adc5_0,01adc5,0.000000e+00,0.0,0.025861,3.276166,0.000000e+00,0.010768,0.001989,0.018649,...,1.657563,2.551898,0.414442,0.120910,0.000000,5.221131,LAA,LAA,0.262211,0.737789
4,026c97_0,026c97,2.789901e-07,0.0,0.001468,2.847660,0.000000e+00,0.506687,0.024998,0.053133,...,1.966014,2.332554,0.265848,0.261237,0.000000,5.064417,CE,CE,0.895852,0.104148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,fe9645_0,fe9645,2.894052e-04,0.0,0.007600,2.114387,0.000000e+00,0.197965,0.017547,0.055584,...,1.489948,2.585466,0.313976,0.572974,0.000000,4.305548,CE,CE,0.956328,0.043672
747,fe9bec_0,fe9bec,0.000000e+00,0.0,0.014926,2.342572,0.000000e+00,0.284864,0.013294,0.077263,...,1.662335,2.444120,0.371340,0.407294,0.000000,4.549081,LAA,LAA,0.328712,0.671288
748,ff14e0_0,ff14e0,0.000000e+00,0.0,0.031349,3.350480,0.000000e+00,0.180316,0.014175,0.040476,...,1.658110,2.576721,0.462200,0.133334,0.000000,5.274067,CE,CE,0.862942,0.137058
749,ffec5c_0,ffec5c,0.000000e+00,0.0,0.002865,2.894617,0.000000e+00,0.458917,0.025147,0.083489,...,1.885758,2.458991,0.303956,0.234496,0.000000,5.109684,LAA,LAA,0.287520,0.712480


In [22]:
train.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()

Unnamed: 0,patient_id,CE,LAA
0,006388,0.848801,0.151199
1,008e5c,0.843656,0.156344
2,00c058,0.333034,0.666966
3,01adc5,0.262211,0.737789
4,026c97,0.895852,0.104148
...,...,...,...
625,fe0cca,0.906967,0.093033
626,fe9645,0.956328,0.043672
627,fe9bec,0.328712,0.671288
628,ff14e0,0.862942,0.137058


In [23]:
# test['prediction'] = rf_class.predict(x_test)
test[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_test.values)

In [24]:
submission = test.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()
submission

Unnamed: 0,patient_id,CE,LAA
0,006388,0.848801,0.151199
1,008e5c,0.843656,0.156344
2,00c058,0.333034,0.666966
3,01adc5,0.262211,0.737789


In [25]:
submission.to_csv("submission.csv", index = False)