Strip AI Kaggle

In [1]:
import time
import os
import glob
import gc
from tqdm.notebook import tqdm

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

import pandas as pd
import cv2 as cv
import tifffile as tifi

import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import skimage
from skimage.filters import sobel
from skimage import segmentation
from skimage.transform import resize
from skimage.measure import regionprops_table

from scipy import ndimage as ndi

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier

# from PIL import Image
# Image.MAX_IMAGE_PIXELS = None

## Disabling the benchmarking feature with torch.backends.cudnn.benchmark = False 
## causes cuDNN to deterministically select an algorithm, possibly at the cost of reduced performance.
## https://pytorch.org/docs/stable/notes/randomness.html
torch.backends.cudnn.benchmark = False

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7fd5d0567d90>

In [2]:
num_features = 4096
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.jit.load('/kaggle/input/ptmodel/model_scripted.pt').eval().to(device)

In [3]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

Model's state_dict:
conv1.weight 	 torch.Size([2048, 3, 3, 3])
conv1.bias 	 torch.Size([2048])


Read in metadata

In [4]:
train_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/train.csv')
test_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/test.csv')
other_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/other.csv')

In [5]:
# try:
#     os.mkdir("../train/")
# except:
#     pass
# for i in tqdm(range(train_meta.shape[0])):
#     img_id = train_meta.iloc[i].image_id
#     image = tifi.imread('/kaggle/input/mayo-clinic-strip-ai/train/' + img_id + ".tif")
#     image = cv.resize(
#         image,
#         (int(image.shape[1]/250),
#          int(image.shape[0]/250)),
#         interpolation=cv.INTER_LINEAR)
#     cv.imwrite(f"../train/{img_id}.tif", image)
#     del image
#     gc.collect()

In [6]:
try:
    os.mkdir("../test/")
except:
    pass
for i in tqdm(range(test_meta.shape[0])):
    img_id = test_meta.iloc[i].image_id
    image = tifi.imread('/kaggle/input/mayo-clinic-strip-ai/test/' + img_id + ".tif")
    image = cv.resize(
        image,
        (int(image.shape[1]/250),
         int(image.shape[0]/250)),
        interpolation=cv.INTER_LINEAR)
    cv.imwrite(f"../test/{img_id}.tif", image)
    del image
    gc.collect()

  0%|          | 0/4 [00:00<?, ?it/s]

List images for training and testing

In [7]:
test_path = "../test/*.tif"
test_images = sorted(list(glob.glob(test_path)))

train_path = "../train/*.tif"
train_images = sorted(list(glob.glob(train_path)))

In [8]:
test_images

['../test/006388_0.tif',
 '../test/008e5c_0.tif',
 '../test/00c058_0.tif',
 '../test/01adc5_0.tif']

Define helper functions

In [9]:
img_to_tensor = T.ToTensor()

def read_tiff(path):
    image = tifi.imread(path)
    filename = path.split('/')[-1].rstrip('.tif')
    return image, filename


def convert_image_grayscale(image):
    gray_image = cv.cvtColor(image, cv.COLOR_RGB2GRAY)
    return gray_image


def segment_images(gray_image):
    elevation_map = sobel(gray_image)
    markers = np.zeros_like(gray_image)
    markers[gray_image >= gray_image.mean()] = 1
    markers[gray_image < gray_image.mean()] = 2
    segmented_img = segmentation.watershed(elevation_map, markers)
    filled_segments = ndi.binary_fill_holes(segmented_img - 1)
    labeled_segments, _ = ndi.label(filled_segments)
    return labeled_segments


def get_object_coordinates(labeled_segments):
    properties = ['area','bbox','convex_area','bbox_area', 'major_axis_length', 'minor_axis_length', 'eccentricity']
    df = pd.DataFrame(regionprops_table(labeled_segments, properties=properties))
    standard_scaler = StandardScaler()
    scaled_area = standard_scaler.fit_transform(df.area.values.reshape(-1,1))
    df['scaled_area'] = scaled_area
    df.sort_values(by="scaled_area", ascending=False, inplace=True)
    objects = df[df['scaled_area']>=.75]
    object_coordinates = [
        (int(row['bbox-0']),
         int(row['bbox-1']),
         int(row['bbox-2']),
         int(row['bbox-3']))
        for index, row in objects.iterrows()
    ]
    return object_coordinates


def crop_patch(coordinates, image):
    x1, y1, x2, y2 = coordinates
    if x2-x1<3:
        x1-=1
        x2+=1
    if y2-y1<3:
        y1-=1
        y2+=1
    cropped_image = image[x1:x2, y1:y2]
    return cropped_image


def compute_features(images):
    tic = time.time()
    j = 0
    image_patches = []
    features = []
    for img in images: 
        image, filename = read_tiff(img)
        gray_image = convert_image_grayscale(image)
        labeled_segments = segment_images(gray_image)
        object_coordinates = get_object_coordinates(labeled_segments)
        del labeled_segments, gray_image,
        gc.collect()
        for i in range(len(object_coordinates)):
            patch_name = str(filename)+"_"+str(i)
            coordinates = object_coordinates[i]
            cropped_image = crop_patch(coordinates, image)
            tensor = img_to_tensor(cropped_image)
            tensor = tensor.to(device)
            with torch.no_grad():
                feats = model(tensor.unsqueeze(0)).cpu().numpy()
            features.append(feats)
            image_patches.append(patch_name)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            del tensor, feats, cropped_image, coordinates, #white_space
            gc.collect()
        ten_percent = int(len(images)/10 + 1)
        if j % ten_percent == 0:
            print(f"{j+1}/{len(images)} -- {(j + 1)/ len(images) * 100:0.0f}% -- {time.time()-tic:0.2f} seconds")
            tic = time.time()
        j += 1
    data = pd.DataFrame(features, index=image_patches).rename_axis("image_patch").reset_index()
    return data

Open file, resize, convert to grayscale, segment, crop, and featurize. 

In [10]:
%%time
# train_features = compute_features(train_images)
# train_features.to_csv("train_features.csv", index=False)
train_features = pd.read_csv('/kaggle/input/trainfeats/train_features.csv')
train_features

CPU times: user 3.38 s, sys: 105 ms, total: 3.48 s
Wall time: 4.3 s


Unnamed: 0,image_patch,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.000248,0.0,0.110258,2.198422,0.000804,0.279341,0.109784,0.202421,0.560119,...,0.080221,0.584028,4.526144,8.130636,1.722932,2.150329,0.599880,0.466460,0.000102,4.188247
1,006388_0_1,0.000394,0.0,0.154251,2.733671,0.000184,0.196087,0.094712,0.204779,0.530461,...,0.109292,0.530724,5.051057,9.085207,1.715696,2.342292,0.652843,0.342316,0.000000,4.682967
2,006388_0_2,0.000128,0.0,0.103395,2.763662,0.000071,0.195958,0.096897,0.137703,0.532422,...,0.060015,0.484796,5.152316,9.337633,1.713066,2.366928,0.561089,0.341431,0.000239,4.738822
3,006388_0_3,0.000000,0.0,0.136092,1.582298,0.000281,0.538647,0.196772,0.245372,0.721442,...,0.075668,0.678573,3.907870,7.268937,1.828165,1.881065,0.637901,0.617389,0.000000,3.681116
4,006388_0_4,0.000000,0.0,0.244032,2.370870,0.001541,0.191293,0.144248,0.222606,0.474398,...,0.212331,0.798530,4.715309,8.266925,1.584070,2.271144,0.826989,0.456892,0.000519,4.215331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3163,ff14e0_0_1,0.000000,0.0,0.041953,2.516607,0.000193,0.450089,0.075902,0.200447,0.723054,...,0.159439,0.485702,4.850527,8.822890,1.729967,2.257169,0.638653,0.343420,0.000000,4.528939
3164,ffec5c_0_0,0.000000,0.0,0.039182,2.746331,0.000537,0.392228,0.099847,0.248202,0.760376,...,0.004352,0.530238,5.498745,9.922201,1.782396,2.516367,0.549443,0.357594,0.000000,4.910604
3165,ffec5c_0_1,0.000000,0.0,0.031853,2.780383,0.000000,0.447877,0.101686,0.252695,0.821034,...,0.004920,0.477313,5.474993,9.993552,1.838873,2.477563,0.510918,0.337992,0.000000,4.961650
3166,ffec5c_1_0,0.000000,0.0,0.033335,2.609250,0.000244,0.251769,0.087242,0.223369,0.579014,...,0.004705,0.851035,5.665803,9.691852,1.537329,2.698247,0.564132,0.423614,0.000000,4.711995


In [11]:
%%time
test_features = compute_features(test_images)
# test_features.to_csv("test_features.csv", index=False)
test_features

1/4 -- 25% -- 6.47 seconds
2/4 -- 50% -- 0.19 seconds
3/4 -- 75% -- 0.40 seconds
4/4 -- 100% -- 0.44 seconds
CPU times: user 3 s, sys: 601 ms, total: 3.6 s
Wall time: 7.67 s


Unnamed: 0,image_patch,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.000248,0.0,0.110258,2.198422,0.000804,0.279341,0.109784,0.202421,0.560119,...,0.080221,0.584028,4.526144,8.130636,1.722932,2.150329,0.59988,0.46646,0.000102,4.188247
1,006388_0_1,0.000394,0.0,0.154251,2.73367,0.000184,0.196087,0.094712,0.204779,0.530461,...,0.109292,0.530724,5.051057,9.085207,1.715696,2.342292,0.652843,0.342316,0.0,4.682967
2,006388_0_2,0.000128,0.0,0.103395,2.763662,7.1e-05,0.195958,0.096897,0.137703,0.532422,...,0.060015,0.484796,5.152316,9.337633,1.713066,2.366928,0.561089,0.341431,0.000239,4.738822
3,006388_0_3,0.0,0.0,0.136092,1.582298,0.000281,0.538647,0.196772,0.245372,0.721442,...,0.075668,0.678573,3.90787,7.268937,1.828165,1.881065,0.637902,0.617388,0.0,3.681116
4,006388_0_4,0.0,0.0,0.244032,2.37087,0.001541,0.191293,0.144248,0.222606,0.474397,...,0.212331,0.79853,4.715309,8.266925,1.58407,2.271144,0.826989,0.456892,0.000519,4.215331
5,006388_0_5,0.0,0.0,0.10566,2.457043,6e-05,0.279115,0.119823,0.129323,0.595479,...,0.057842,0.505673,4.786747,8.802071,1.776293,2.195262,0.526529,0.413551,4.4e-05,4.455239
6,006388_0_6,0.000235,0.0,0.118764,2.856206,0.0,0.150204,0.07439,0.177728,0.475778,...,0.077211,0.529943,5.269718,9.363044,1.647931,2.455883,0.627423,0.328836,0.0,4.779446
7,006388_0_7,0.0,0.0,0.195542,2.535527,0.000305,0.216535,0.105376,0.263131,0.567272,...,0.133579,0.59093,4.773409,8.636659,1.764083,2.208145,0.684833,0.391678,0.0,4.463066
8,008e5c_0_0,0.0,0.0,0.236526,2.238184,0.0,0.19302,0.234313,0.244101,0.602687,...,0.194537,0.811747,4.710178,8.226409,1.602295,2.205938,0.820323,0.445312,0.0,4.195854
9,00c058_0_0,0.0,0.0,0.030049,2.875057,0.0,0.159377,0.048302,0.170514,0.444435,...,0.004067,0.819257,5.932816,10.020407,1.446501,2.838813,0.572221,0.358068,0.0,4.906837


In [12]:
%%time
train = train_features.copy()

train[['patient_id', 'image_num', 'patch_num']] = train.image_patch.str.split("_", expand=True)

train["image_id"] = train[["patient_id", "image_num"]].apply("_".join, axis=1)
train.drop(['image_patch'], axis = 1, inplace = True)

# var_cols = train.columns[1:-4].values.tolist()
# train = (
#     train
#     .groupby(['image_id', 'patient_id'], as_index=False)
#     .apply(lambda x: pd.Series([sum(x[v] * x.filled_space) / sum(x.filled_space) for v in var_cols]))
# )
train = train.groupby(['image_id', 'patient_id'], as_index = False).mean()
train = train.set_index('image_id').join(train_meta.set_index('image_id')['label']).reset_index()
train 

CPU times: user 207 ms, sys: 133 ms, total: 340 ms
Wall time: 346 ms


Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,label
0,006388_0,006388,0.000126,0.0,0.145999,2.437212,0.000406,0.255898,0.117750,0.197883,...,0.587900,4.772821,8.611389,1.719029,2.233881,0.639686,0.419819,0.000113,4.400529,CE
1,008e5c_0,008e5c,0.000000,0.0,0.236526,2.238183,0.000000,0.193020,0.234313,0.244101,...,0.811747,4.710178,8.226409,1.602295,2.205938,0.820322,0.445312,0.000000,4.195854,CE
2,00c058_0,00c058,0.000000,0.0,0.030837,2.764057,0.000123,0.137638,0.053070,0.174138,...,0.969535,5.956170,9.885099,1.362541,2.892566,0.598086,0.401214,0.000000,4.780872,LAA
3,01adc5_0,01adc5,0.000000,0.0,0.082319,3.168609,0.000000,0.053731,0.028355,0.083681,...,0.334535,5.539375,9.924203,1.655366,2.507355,0.527840,0.151800,0.000000,5.108660,LAA
4,028989_0,028989,0.000041,0.0,0.025937,2.772342,0.000245,0.484572,0.105693,0.165291,...,0.369978,5.317365,9.743801,1.841314,2.417328,0.411426,0.366623,0.001593,4.867456,LAA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,fe9645_0,fe9645,0.001371,0.0,0.077364,2.416725,0.000346,0.276947,0.121376,0.261887,...,0.952234,5.349768,9.098815,1.597000,2.564734,0.607171,0.496975,0.000000,4.479399,CE
719,fe9bec_0,fe9bec,0.000195,0.0,0.098755,2.401913,0.000165,0.413335,0.136314,0.262059,...,0.715042,5.143785,9.249087,1.751719,2.400231,0.641748,0.449056,0.000000,4.563587,LAA
720,ff14e0_0,ff14e0,0.000000,0.0,0.040056,2.478461,0.000096,0.471794,0.082098,0.210908,...,0.489889,4.827977,8.806522,1.747714,2.250147,0.646491,0.353988,0.000000,4.513768,CE
721,ffec5c_0,ffec5c,0.000000,0.0,0.035518,2.763357,0.000269,0.420052,0.100767,0.250449,...,0.503775,5.486869,9.957876,1.810634,2.496965,0.530180,0.347793,0.000000,4.936127,LAA


In [13]:
x_train = train.drop(['image_id', 'patient_id', 'label'], axis = 1)
y_train = train['label']
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.000126,0.0,0.145999,2.437212,0.000406,0.255898,0.117750,0.197883,0.557171,0.0,...,0.100770,0.587900,4.772821,8.611389,1.719029,2.233881,0.639686,0.419819,0.000113,4.400529
1,0.000000,0.0,0.236526,2.238183,0.000000,0.193020,0.234313,0.244101,0.602687,0.0,...,0.194537,0.811747,4.710178,8.226409,1.602295,2.205938,0.820322,0.445312,0.000000,4.195854
2,0.000000,0.0,0.030837,2.764057,0.000123,0.137638,0.053070,0.174138,0.392809,0.0,...,0.003638,0.969535,5.956170,9.885099,1.362541,2.892566,0.598086,0.401214,0.000000,4.780872
3,0.000000,0.0,0.082319,3.168609,0.000000,0.053731,0.028355,0.083681,0.401513,0.0,...,0.031514,0.334535,5.539375,9.924203,1.655366,2.507355,0.527840,0.151800,0.000000,5.108660
4,0.000041,0.0,0.025937,2.772342,0.000245,0.484572,0.105693,0.165291,0.708903,0.0,...,0.008245,0.369978,5.317365,9.743801,1.841314,2.417328,0.411426,0.366623,0.001593,4.867456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,0.001371,0.0,0.077364,2.416725,0.000346,0.276947,0.121376,0.261887,0.563697,0.0,...,0.007911,0.952234,5.349768,9.098815,1.597000,2.564734,0.607171,0.496975,0.000000,4.479399
719,0.000195,0.0,0.098755,2.401913,0.000165,0.413335,0.136314,0.262059,0.800512,0.0,...,0.023548,0.715042,5.143785,9.249087,1.751719,2.400231,0.641748,0.449056,0.000000,4.563587
720,0.000000,0.0,0.040056,2.478461,0.000096,0.471794,0.082098,0.210908,0.752493,0.0,...,0.151489,0.489889,4.827977,8.806522,1.747714,2.250147,0.646491,0.353988,0.000000,4.513768
721,0.000000,0.0,0.035518,2.763357,0.000269,0.420052,0.100767,0.250449,0.790705,0.0,...,0.004636,0.503775,5.486869,9.957876,1.810634,2.496965,0.530180,0.347793,0.000000,4.936127


In [14]:
test = test_features.copy()

test[['patient_id', 'image_num', 'patch_num']] = test.image_patch.str.split("_", expand=True)

test["image_id"] = test[["patient_id", "image_num"]].apply("_".join, axis=1)
test.drop(['image_patch'], axis = 1, inplace = True)
# var_cols = test.columns[1:-4].values.tolist()
# test = (
#     test
#     .groupby(['image_id', 'patient_id'], as_index=False)
#     .apply(lambda x: pd.Series([sum(x[v] * x.filled_space) / sum(x.filled_space) for v in var_cols]))
# )
test = test.groupby(['image_id', 'patient_id'], as_index = False).mean()
test 

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0,006388,0.000126,0.0,0.145999,2.437212,0.000406,0.255898,0.11775,0.197883,...,0.10077,0.5879,4.772821,8.611389,1.719029,2.233881,0.639686,0.419819,0.000113,4.400529
1,008e5c_0,008e5c,0.0,0.0,0.236526,2.238184,0.0,0.19302,0.234313,0.244101,...,0.194537,0.811747,4.710178,8.226409,1.602295,2.205938,0.820323,0.445312,0.0,4.195854
2,00c058_0,00c058,0.0,0.0,0.030837,2.764057,0.000123,0.137638,0.05307,0.174138,...,0.003638,0.969535,5.956171,9.885098,1.362541,2.892566,0.598086,0.401214,0.0,4.780872
3,01adc5_0,01adc5,0.0,0.0,0.082319,3.168609,0.0,0.053731,0.028355,0.083681,...,0.031514,0.334535,5.539375,9.924203,1.655366,2.507355,0.52784,0.1518,0.0,5.10866


In [15]:
x_test = test.drop(['image_id', 'patient_id'], axis = 1)
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.000126,0.0,0.145999,2.437212,0.000406,0.255898,0.11775,0.197883,0.557171,0.0,...,0.10077,0.5879,4.772821,8.611389,1.719029,2.233881,0.639686,0.419819,0.000113,4.400529
1,0.0,0.0,0.236526,2.238184,0.0,0.19302,0.234313,0.244101,0.602687,0.0,...,0.194537,0.811747,4.710178,8.226409,1.602295,2.205938,0.820323,0.445312,0.0,4.195854
2,0.0,0.0,0.030837,2.764057,0.000123,0.137638,0.05307,0.174138,0.392809,0.0,...,0.003638,0.969535,5.956171,9.885098,1.362541,2.892566,0.598086,0.401214,0.0,4.780872
3,0.0,0.0,0.082319,3.168609,0.0,0.053731,0.028355,0.083681,0.401513,0.0,...,0.031514,0.334535,5.539375,9.924203,1.655366,2.507355,0.52784,0.1518,0.0,5.10866


In [16]:
rf_class = RandomForestClassifier(max_depth=15, random_state=42, n_estimators=3000)
rf_class.fit(x_train, y_train)

RandomForestClassifier(max_depth=15, n_estimators=3000, random_state=42)

In [17]:
rf_class.score(x_train, y_train)

1.0

In [18]:
train['prediction'] = rf_class.predict(x_train)
train[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_train)
train

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4090,4091,4092,4093,4094,4095,label,prediction,CE,LAA
0,006388_0,006388,0.000126,0.0,0.145999,2.437212,0.000406,0.255898,0.117750,0.197883,...,1.719029,2.233881,0.639686,0.419819,0.000113,4.400529,CE,CE,0.827959,0.172041
1,008e5c_0,008e5c,0.000000,0.0,0.236526,2.238183,0.000000,0.193020,0.234313,0.244101,...,1.602295,2.205938,0.820322,0.445312,0.000000,4.195854,CE,CE,0.830693,0.169307
2,00c058_0,00c058,0.000000,0.0,0.030837,2.764057,0.000123,0.137638,0.053070,0.174138,...,1.362541,2.892566,0.598086,0.401214,0.000000,4.780872,LAA,LAA,0.325645,0.674355
3,01adc5_0,01adc5,0.000000,0.0,0.082319,3.168609,0.000000,0.053731,0.028355,0.083681,...,1.655366,2.507355,0.527840,0.151800,0.000000,5.108660,LAA,LAA,0.257524,0.742476
4,028989_0,028989,0.000041,0.0,0.025937,2.772342,0.000245,0.484572,0.105693,0.165291,...,1.841314,2.417328,0.411426,0.366623,0.001593,4.867456,LAA,LAA,0.182890,0.817110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,fe9645_0,fe9645,0.001371,0.0,0.077364,2.416725,0.000346,0.276947,0.121376,0.261887,...,1.597000,2.564734,0.607171,0.496975,0.000000,4.479399,CE,CE,0.903285,0.096715
719,fe9bec_0,fe9bec,0.000195,0.0,0.098755,2.401913,0.000165,0.413335,0.136314,0.262059,...,1.751719,2.400231,0.641748,0.449056,0.000000,4.563587,LAA,LAA,0.381631,0.618369
720,ff14e0_0,ff14e0,0.000000,0.0,0.040056,2.478461,0.000096,0.471794,0.082098,0.210908,...,1.747714,2.250147,0.646491,0.353988,0.000000,4.513768,CE,CE,0.841655,0.158345
721,ffec5c_0,ffec5c,0.000000,0.0,0.035518,2.763357,0.000269,0.420052,0.100767,0.250449,...,1.810634,2.496965,0.530180,0.347793,0.000000,4.936127,LAA,LAA,0.289649,0.710351


In [19]:
train.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()

Unnamed: 0,patient_id,CE,LAA
0,006388,0.827959,0.172041
1,008e5c,0.830693,0.169307
2,00c058,0.325645,0.674355
3,01adc5,0.257524,0.742476
4,028989,0.182890,0.817110
...,...,...,...
604,fe0cca,0.893626,0.106374
605,fe9645,0.903285,0.096715
606,fe9bec,0.381631,0.618369
607,ff14e0,0.841655,0.158345


In [20]:
# test['prediction'] = rf_class.predict(x_test)
test[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_test)

  "X does not have valid feature names, but"


In [21]:
submission = test.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()
submission

Unnamed: 0,patient_id,CE,LAA
0,006388,0.827959,0.172041
1,008e5c,0.830693,0.169307
2,00c058,0.325645,0.674355
3,01adc5,0.257524,0.742476


In [22]:
submission.to_csv("submission.csv", index = False)