Strip AI Kaggle

In [1]:
!conda install /kaggle/input/how-to-use-pyvips-offline/*.tar.bz2
import time
import os
import glob
import gc
from tqdm.notebook import tqdm

import numpy as np
# import matplotlib.pyplot as plt
# from matplotlib.patches import Rectangle

import pandas as pd
import cv2 as cv
# import tifffile as tifi
import pyvips

import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader

# import skimage
from skimage.filters import sobel
from skimage import segmentation
from skimage.measure import regionprops_table
# from skimage.transform import resize

from scipy import ndimage as ndi

from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier

from PIL import Image


Downloading and Extracting Packages
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
###########################################################

In [2]:
Image.MAX_IMAGE_PIXELS = 5_000_000_000


os.environ['VIPS_CONCURRENCY'] = '2'
os.environ['VIPS_DISC_THRESHOLD'] = '2gb'

## Disabling the benchmarking feature with torch.backends.cudnn.benchmark = False 
## causes cuDNN to deterministically select an algorithm, possibly at the cost of reduced performance.
## https://pytorch.org/docs/stable/notes/randomness.html
torch.backends.cudnn.benchmark = False

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f6d4be1cfb0>

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.jit.load('/kaggle/input/ptmodel/model_scripted.pt').eval().to(device)

In [4]:
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

Model's state_dict:
conv1.weight 	 torch.Size([2048, 3, 3, 3])
conv1.bias 	 torch.Size([2048])


Read in metadata

In [5]:
train_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/train.csv')
test_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/test.csv')
# other_meta = pd.read_csv('/kaggle/input/mayo-clinic-strip-ai/other.csv')

In [6]:
input_file_type = ".tif"
output_file_type = ".png"

In [7]:
def prune_image_rows_cols(im, mask, thr=0.990):
    # delete empty columns
    for l in reversed(range(im.shape[1])):
        if (np.sum(mask[:, l]) / float(mask.shape[0])) > thr:
            im = np.delete(im, l, 1)
    # delete empty rows
    for l in reversed(range(im.shape[0])):
        if (np.sum(mask[l, :]) / float(mask.shape[1])) > thr:
            im = np.delete(im, l, 0)
    return im


def mask_median(im, val=255):
    masks = [None] * 3
    for c in range(3):
        masks[c] = im[..., c] >= np.median(im[:, :, c]) - 5
    mask = np.logical_and(*masks)
    im[mask, :] = val
    return im, mask


def image_load_scale_norm(img_path, prune_thr=0.990, bg_val=255):
    img = Image.open(img_path)
    if (img.width * img.height) > 4_000_000_000:
        print(f"width: {img.width}, height: {img.height}, pixels: {img.width * img.height}")
        return None
    scale = min(img.height / 2e3, img.width / 2e3)
    if scale > 1:
        tmp_size = int(img.width / scale), int(img.height / scale)
        img.thumbnail(tmp_size, resample=Image.Resampling.BILINEAR, reducing_gap=2.0)
    img, mask = mask_median(np.array(img), val=bg_val)
    img = prune_image_rows_cols(img, mask, thr=prune_thr)
    img = Image.fromarray(img)
    scale = min(img.height / 1.5e3, img.width / 1.5e3)
    if scale > 1:
        img = img.resize((int(img.width / scale), int(img.height / scale)), Image.Resampling.LANCZOS)
    return img

In [8]:
gc.collect()

21

In [9]:
# import psutil
# print(f'thread count per core: {psutil.cpu_count() // psutil.cpu_count(logical=False)}')

In [10]:
# os.cpu_count()

In [11]:
# try:
#     os.mkdir("../train/")
# except:
#     pass
# for name in tqdm(train_meta["image_id"]):
#     img_path = os.path.join("/kaggle/input/mayo-clinic-strip-ai/", "train", f"{name}.tif")
#     img = image_load_scale_norm(img_path)
#     if not img:
#         continue
#     img.save(os.path.join("../train/", f"{name}.png"))
#     del img
#     gc.collect()

In [12]:
try:
    os.mkdir("../test/")
except:
    pass
for name in tqdm(test_meta["image_id"]):
    img_path = os.path.join("/kaggle/input/mayo-clinic-strip-ai/", "test", f"{name}{input_file_type}")
    img = image_load_scale_norm(img_path)
    if not img:
        continue
    img.save(os.path.join("../test/", f"{name}{output_file_type}"))
    
    del img
    gc.collect()

  0%|          | 0/4 [00:00<?, ?it/s]

List images for training and testing

In [13]:
# train_path = f"../train/*{output_file_type}"
# train_images = sorted(list(glob.glob(train_path)))

test_path = f"../test/*{output_file_type}"
test_images = sorted(list(glob.glob(test_path)))

In [14]:
test_images

['../test/006388_0.png',
 '../test/008e5c_0.png',
 '../test/00c058_0.png',
 '../test/01adc5_0.png']

Define helper functions

In [15]:
img_to_tensor = T.ToTensor()

def read_tiff(path):
    image = cv.imread(path)
#     image = cv.resize(image, (512, 512))
    image = cv.resize(
        image,
        (int(image.shape[1]/2),
         int(image.shape[0]/2)),
        interpolation=cv.INTER_LINEAR)
#     image = tifi.imread(path)
    filename = path.split('/')[-1].rstrip(output_file_type)
    return image, filename


def convert_image_grayscale(image):
    gray_image = cv.cvtColor(image, cv.COLOR_RGB2GRAY)
    return gray_image


def segment_images(gray_image):
    elevation_map = sobel(gray_image)
    markers = np.zeros_like(gray_image)
    markers[gray_image >= gray_image.mean()] = 1
    markers[gray_image < gray_image.mean()] = 2
    segmented_img = segmentation.watershed(elevation_map, markers)
    filled_segments = ndi.binary_fill_holes(segmented_img - 1)
    labeled_segments, _ = ndi.label(filled_segments)
    return labeled_segments


def get_object_coordinates(labeled_segments):
    properties = ['area','bbox','convex_area','bbox_area', 'major_axis_length', 'minor_axis_length', 'eccentricity']
    df = pd.DataFrame(regionprops_table(labeled_segments, properties=properties))
    standard_scaler = StandardScaler()
    scaled_area = standard_scaler.fit_transform(df.area.values.reshape(-1,1))
    df['scaled_area'] = scaled_area
    df.sort_values(by="scaled_area", ascending=False, inplace=True)
    objects = df[df['scaled_area']>=.75]
    object_coordinates = [
        (int(row['bbox-0']),
         int(row['bbox-1']),
         int(row['bbox-2']),
         int(row['bbox-3']))
        for index, row in objects.iterrows()
    ]
    return object_coordinates


def crop_patch(coordinates, image):
    x1, y1, x2, y2 = coordinates
    if x2-x1<3:
        x1-=1
        x2+=1
    if y2-y1<3:
        y1-=1
        y2+=1
    cropped_image = image[x1:x2, y1:y2]
    return cropped_image


def compute_features(images):
    tic = time.time()
    j = 0
    image_patches = []
    features = []
    for img in images: 
        image, filename = read_tiff(img)
        gray_image = convert_image_grayscale(image)
        labeled_segments = segment_images(gray_image)
        object_coordinates = get_object_coordinates(labeled_segments)
        del labeled_segments, gray_image,
        gc.collect()
        for i in range(len(object_coordinates)):
            patch_name = str(filename)+"_"+str(i)
            coordinates = object_coordinates[i]
            cropped_image = crop_patch(coordinates, image)
            tensor = img_to_tensor(cropped_image)
            tensor = tensor.to(device)
            with torch.no_grad():
                feats = model(tensor.unsqueeze(0)).cpu().numpy()
            features.append(feats)
            image_patches.append(patch_name)
            del tensor, feats, cropped_image, coordinates, 
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        del image
        gc.collect()
        ten_percent = int(len(images)/10 + 1)
        if j % ten_percent == 0:
            print(f"{j+1}/{len(images)} -- {(j + 1)/ len(images) * 100:0.0f}% -- {time.time()-tic:0.2f} seconds")
            tic = time.time()
        j += 1
    data = pd.DataFrame(features, index=image_patches).rename_axis("image_patch").reset_index()
    return data

Open file, resize, convert to grayscale, segment, crop, and featurize. 

In [16]:
%%time
test_features = compute_features(test_images)
test_features

1/4 -- 25% -- 7.86 seconds
2/4 -- 50% -- 3.80 seconds
3/4 -- 75% -- 2.51 seconds
4/4 -- 100% -- 3.60 seconds
CPU times: user 12.3 s, sys: 2.1 s, total: 14.4 s
Wall time: 18 s


Unnamed: 0,image_patch,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.0,0.0,0.01306,2.245439,3.988785e-05,0.20297,0.006093,0.025319,0.407906,...,0.017293,0.415835,4.662181,8.41445,1.695699,2.193261,0.362635,0.402304,0.000491,4.317948
1,006388_0_1,0.0,0.0,0.015732,1.620958,0.0,0.396124,0.010507,0.033804,0.521409,...,0.013992,0.443559,4.088168,7.562249,1.797567,1.952971,0.307509,0.533557,0.0,3.849162
2,006388_0_2,0.0,0.0,0.033365,2.345252,5.012605e-06,0.089919,0.010526,0.023128,0.279102,...,0.062498,0.558681,4.797596,8.39073,1.528323,2.300081,0.483073,0.36743,2e-05,4.321018
3,006388_0_3,0.0,0.0,0.031471,2.462707,5.212107e-05,0.109201,0.009266,0.024815,0.320656,...,0.050875,0.501123,4.930061,8.711038,1.573786,2.331865,0.452037,0.335962,8.8e-05,4.463724
4,006388_0_4,0.0,0.0,0.01249,2.51823,9.260812e-06,0.225117,0.007197,0.014624,0.459538,...,0.009476,0.319903,4.873011,8.928917,1.767658,2.2363,0.318347,0.363585,1.4e-05,4.572942
5,006388_0_5,0.0,0.0,0.013017,2.393435,1.478133e-05,0.253819,0.007854,0.016466,0.471613,...,0.01041,0.32595,4.746562,8.716383,1.774029,2.186634,0.322344,0.383421,0.000208,4.469705
6,006388_0_6,0.0,0.0,0.0141,2.614562,0.0,0.128737,0.001298,0.026104,0.406531,...,0.012676,0.316202,4.997925,9.074899,1.71858,2.294684,0.360253,0.302455,0.0,4.667105
7,006388_0_7,0.0,0.0,0.040506,2.78851,9.731694e-05,0.032125,0.019506,0.037313,0.310408,...,0.061091,0.429638,5.175624,9.158613,1.576453,2.407576,0.502565,0.254936,0.000345,4.73439
8,006388_0_8,0.0,0.0,0.0186,2.480067,0.0,0.117615,0.002793,0.030041,0.379319,...,0.020371,0.37153,4.86606,8.792385,1.690632,2.257365,0.386282,0.33434,0.0,4.525209
9,006388_0_9,0.0,0.0,0.046438,2.368853,3.596147e-05,0.025753,0.020235,0.035274,0.245271,...,0.079095,0.556484,4.800591,8.377696,1.498906,2.299137,0.539729,0.336893,0.000661,4.335384


In [17]:
# %%time
# train_features = compute_features(train_images)
# train_features.to_csv("train_features.csv", index=False)
train_features = pd.read_csv('/kaggle/input/feats/train_features.csv')
train_features

Unnamed: 0,image_patch,0,1,2,3,4,5,6,7,8,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0_0,0.0,0.0,0.013060,2.245439,0.000040,0.202970,0.006093,0.025319,0.407906,...,0.017293,0.415835,4.662180,8.414451,1.695699,2.193261,0.362635,0.402304,0.000491,4.317947
1,006388_0_1,0.0,0.0,0.015732,1.620958,0.000000,0.396125,0.010507,0.033804,0.521409,...,0.013992,0.443559,4.088168,7.562249,1.797567,1.952970,0.307509,0.533557,0.000000,3.849162
2,006388_0_2,0.0,0.0,0.033365,2.345252,0.000005,0.089919,0.010526,0.023128,0.279102,...,0.062498,0.558681,4.797597,8.390731,1.528323,2.300082,0.483073,0.367430,0.000020,4.321017
3,006388_0_3,0.0,0.0,0.031471,2.462707,0.000052,0.109201,0.009266,0.024815,0.320656,...,0.050875,0.501124,4.930061,8.711040,1.573786,2.331865,0.452037,0.335962,0.000088,4.463723
4,006388_0_4,0.0,0.0,0.012490,2.518230,0.000009,0.225117,0.007197,0.014624,0.459538,...,0.009476,0.319903,4.873011,8.928919,1.767658,2.236300,0.318347,0.363585,0.000014,4.572942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5591,ffec5c_0_1,0.0,0.0,0.001273,2.857883,0.000000,0.491656,0.023960,0.088444,0.782730,...,0.000003,0.302055,5.480465,10.178615,1.910748,2.437001,0.281002,0.236201,0.000000,5.098561
5592,ffec5c_0_2,0.0,0.0,0.001657,2.414711,0.000000,0.785797,0.039193,0.109864,0.995253,...,0.000002,0.283439,5.092682,9.725646,2.063051,2.251297,0.231093,0.322368,0.000000,4.832124
5593,ffec5c_0_3,0.0,0.0,0.000994,2.736229,0.000000,0.754235,0.039039,0.103169,1.033458,...,0.000000,0.176278,5.237646,10.155512,2.136811,2.252338,0.234799,0.222215,0.000000,5.102664
5594,ffec5c_0_4,0.0,0.0,0.001763,3.352959,0.000000,0.176315,0.006623,0.036746,0.520186,...,0.000024,0.348425,5.947475,10.676533,1.702690,2.676346,0.342224,0.138996,0.000000,5.383549


In [18]:
%%time
train = train_features.copy()

train[['patient_id', 'image_num', 'patch_num']] = train.image_patch.str.split("_", expand=True)

train["image_id"] = train[["patient_id", "image_num"]].apply("_".join, axis=1)
train.drop(['image_patch'], axis = 1, inplace = True)

train = train.groupby(['image_id', 'patient_id'], as_index = False).mean()
train = train.set_index('image_id').join(train_meta.set_index('image_id')['label']).reset_index()
train 

CPU times: user 326 ms, sys: 95.9 ms, total: 422 ms
Wall time: 432 ms


Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,label
0,006388_0,006388,0.000000e+00,0.0,0.021866,2.390043,2.119518e-05,0.154833,0.008107,0.025757,...,0.422080,4.802592,8.626056,1.661629,2.250546,0.397803,0.364016,1.522583e-04,4.430144,CE
1,008e5c_0,008e5c,1.794035e-05,0.0,0.030822,2.458230,4.786523e-06,0.107301,0.035050,0.056751,...,0.452022,4.874500,8.700004,1.630854,2.287936,0.490246,0.291413,4.132000e-03,4.477591,CE
2,00c058_0,00c058,1.709266e-05,0.0,0.002218,2.720416,5.585053e-07,0.106625,0.007297,0.040451,...,1.031262,5.991431,9.866537,1.280591,2.942377,0.401952,0.413746,0.000000e+00,4.779036,LAA
3,01adc5_0,01adc5,0.000000e+00,0.0,0.017667,3.073491,0.000000e+00,0.006163,0.001494,0.010432,...,0.325122,5.503816,9.813467,1.618732,2.514609,0.415948,0.158016,0.000000e+00,5.038567,LAA
4,026c97_0,026c97,2.211611e-06,0.0,0.000843,2.853460,0.000000e+00,0.508755,0.024754,0.048316,...,0.167548,5.308777,10.009887,1.965559,2.333815,0.260252,0.260401,0.000000e+00,5.067579,CE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,fe0cca_0,fe0cca,1.827305e-04,0.0,0.001655,1.731407,1.514421e-06,0.310363,0.015953,0.080167,...,1.174313,5.141151,8.461844,1.380898,2.626466,0.320656,0.702806,0.000000e+00,3.992285,CE
746,fe9645_0,fe9645,1.603541e-04,0.0,0.006402,2.114866,4.790860e-07,0.198895,0.011594,0.057217,...,0.928629,5.252378,8.916938,1.490786,2.586366,0.308180,0.572668,7.446160e-07,4.311665,CE
747,fe9bec_0,fe9bec,1.544009e-07,0.0,0.010472,2.335320,0.000000e+00,0.327248,0.011559,0.068265,...,0.587178,5.144247,9.181820,1.697040,2.416201,0.342910,0.403784,0.000000e+00,4.557538,LAA
748,ff14e0_0,ff14e0,2.339020e-06,0.0,0.026983,3.355279,6.037785e-06,0.185815,0.011996,0.037214,...,0.263508,5.729667,10.281391,1.656879,2.579572,0.453477,0.130879,0.000000e+00,5.280942,CE


In [19]:
x_train = train.drop(['image_id', 'patient_id', 'label'], axis = 1)
# x_train = x_train.drop(['image_num', 'patch_num', ], axis = 1)
y_train = train['label']
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.000000e+00,0.0,0.021866,2.390043,2.119518e-05,0.154833,0.008107,0.025757,0.378169,0.000000e+00,...,0.029595,0.422080,4.802592,8.626056,1.661629,2.250546,0.397803,0.364016,1.522583e-04,4.430144
1,1.794035e-05,0.0,0.030822,2.458230,4.786523e-06,0.107301,0.035050,0.056751,0.361009,8.433866e-08,...,0.023929,0.452022,4.874500,8.700004,1.630854,2.287936,0.490246,0.291413,4.132000e-03,4.477591
2,1.709266e-05,0.0,0.002218,2.720416,5.585053e-07,0.106625,0.007297,0.040451,0.313243,0.000000e+00,...,0.000162,1.031262,5.991431,9.866537,1.280591,2.942377,0.401952,0.413746,0.000000e+00,4.779036
3,0.000000e+00,0.0,0.017667,3.073491,0.000000e+00,0.006163,0.001494,0.010432,0.290542,0.000000e+00,...,0.009996,0.325122,5.503816,9.813467,1.618732,2.514609,0.415948,0.158016,0.000000e+00,5.038567
4,2.211611e-06,0.0,0.000843,2.853460,0.000000e+00,0.508755,0.024754,0.048316,0.761925,0.000000e+00,...,0.000110,0.167548,5.308777,10.009887,1.965559,2.333815,0.260252,0.260401,0.000000e+00,5.067579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,1.827305e-04,0.0,0.001655,1.731407,1.514421e-06,0.310363,0.015953,0.080167,0.378549,0.000000e+00,...,0.000143,1.174313,5.141151,8.461844,1.380898,2.626466,0.320656,0.702806,0.000000e+00,3.992285
746,1.603541e-04,0.0,0.006402,2.114866,4.790860e-07,0.198895,0.011594,0.057217,0.343712,0.000000e+00,...,0.000674,0.928629,5.252378,8.916938,1.490786,2.586366,0.308180,0.572668,7.446160e-07,4.311665
747,1.544009e-07,0.0,0.010472,2.335320,0.000000e+00,0.327248,0.011559,0.068265,0.564786,0.000000e+00,...,0.001754,0.587178,5.144247,9.181820,1.697040,2.416201,0.342910,0.403784,0.000000e+00,4.557538
748,2.339020e-06,0.0,0.026983,3.355279,6.037785e-06,0.185815,0.011996,0.037214,0.410360,0.000000e+00,...,0.071605,0.263508,5.729667,10.281391,1.656879,2.579572,0.453477,0.130879,0.000000e+00,5.280942


In [20]:
test = test_features.copy()

test[['patient_id', 'image_num', 'patch_num']] = test.image_patch.str.split("_", expand=True)

test["image_id"] = test[["patient_id", "image_num"]].apply("_".join, axis=1)
test.drop(['image_patch'], axis = 1, inplace = True)

test = test.groupby(['image_id', 'patient_id'], as_index = False).mean()
test 

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,006388_0,006388,0.0,0.0,0.021866,2.390043,2.119517e-05,0.154833,0.008107,0.025757,...,0.029595,0.42208,4.802592,8.626056,1.661629,2.250546,0.397803,0.364016,0.000152,4.430145
1,008e5c_0,008e5c,1.8e-05,0.0,0.030822,2.458229,4.786522e-06,0.107301,0.03505,0.056751,...,0.023929,0.452022,4.8745,8.700008,1.630855,2.287938,0.490246,0.291413,0.004132,4.477592
2,00c058_0,00c058,1.7e-05,0.0,0.002218,2.720416,5.585061e-07,0.106625,0.007297,0.040451,...,0.000162,1.031262,5.991432,9.866537,1.280591,2.942377,0.401952,0.413746,0.0,4.779036
3,01adc5_0,01adc5,0.0,0.0,0.017667,3.07349,0.0,0.006163,0.001494,0.010432,...,0.009996,0.325122,5.503817,9.813467,1.618732,2.514609,0.415948,0.158016,0.0,5.038567


In [21]:
x_test = test.drop(['image_id', 'patient_id'], axis = 1)
# x_test = x_test.drop(['image_num', 'patch_num', ], axis = 1)
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.0,0.0,0.021866,2.390043,2.119517e-05,0.154833,0.008107,0.025757,0.378169,0.0,...,0.029595,0.42208,4.802592,8.626056,1.661629,2.250546,0.397803,0.364016,0.000152,4.430145
1,1.8e-05,0.0,0.030822,2.458229,4.786522e-06,0.107301,0.03505,0.056751,0.361009,8.433866e-08,...,0.023929,0.452022,4.8745,8.700008,1.630855,2.287938,0.490246,0.291413,0.004132,4.477592
2,1.7e-05,0.0,0.002218,2.720416,5.585061e-07,0.106625,0.007297,0.040451,0.313244,0.0,...,0.000162,1.031262,5.991432,9.866537,1.280591,2.942377,0.401952,0.413746,0.0,4.779036
3,0.0,0.0,0.017667,3.07349,0.0,0.006163,0.001494,0.010432,0.290542,0.0,...,0.009996,0.325122,5.503817,9.813467,1.618732,2.514609,0.415948,0.158016,0.0,5.038567


In [22]:
rf_class = RandomForestClassifier(max_depth=15, random_state=42, n_estimators=3000)
rf_class.fit(x_train.values, y_train.values)

RandomForestClassifier(max_depth=15, n_estimators=3000, random_state=42)

In [23]:
rf_class.score(x_train.values, y_train.values)

1.0

In [24]:
train['prediction'] = rf_class.predict(x_train.values)
train[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_train.values)
train

Unnamed: 0,image_id,patient_id,0,1,2,3,4,5,6,7,...,4090,4091,4092,4093,4094,4095,label,prediction,CE,LAA
0,006388_0,006388,0.000000e+00,0.0,0.021866,2.390043,2.119518e-05,0.154833,0.008107,0.025757,...,1.661629,2.250546,0.397803,0.364016,1.522583e-04,4.430144,CE,CE,0.831931,0.168069
1,008e5c_0,008e5c,1.794035e-05,0.0,0.030822,2.458230,4.786523e-06,0.107301,0.035050,0.056751,...,1.630854,2.287936,0.490246,0.291413,4.132000e-03,4.477591,CE,CE,0.838879,0.161121
2,00c058_0,00c058,1.709266e-05,0.0,0.002218,2.720416,5.585053e-07,0.106625,0.007297,0.040451,...,1.280591,2.942377,0.401952,0.413746,0.000000e+00,4.779036,LAA,LAA,0.331039,0.668961
3,01adc5_0,01adc5,0.000000e+00,0.0,0.017667,3.073491,0.000000e+00,0.006163,0.001494,0.010432,...,1.618732,2.514609,0.415948,0.158016,0.000000e+00,5.038567,LAA,LAA,0.173427,0.826573
4,026c97_0,026c97,2.211611e-06,0.0,0.000843,2.853460,0.000000e+00,0.508755,0.024754,0.048316,...,1.965559,2.333815,0.260252,0.260401,0.000000e+00,5.067579,CE,CE,0.895661,0.104339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,fe0cca_0,fe0cca,1.827305e-04,0.0,0.001655,1.731407,1.514421e-06,0.310363,0.015953,0.080167,...,1.380898,2.626466,0.320656,0.702806,0.000000e+00,3.992285,CE,CE,0.913121,0.086879
746,fe9645_0,fe9645,1.603541e-04,0.0,0.006402,2.114866,4.790860e-07,0.198895,0.011594,0.057217,...,1.490786,2.586366,0.308180,0.572668,7.446160e-07,4.311665,CE,CE,0.951049,0.048951
747,fe9bec_0,fe9bec,1.544009e-07,0.0,0.010472,2.335320,0.000000e+00,0.327248,0.011559,0.068265,...,1.697040,2.416201,0.342910,0.403784,0.000000e+00,4.557538,LAA,LAA,0.335807,0.664193
748,ff14e0_0,ff14e0,2.339020e-06,0.0,0.026983,3.355279,6.037785e-06,0.185815,0.011996,0.037214,...,1.656879,2.579572,0.453477,0.130879,0.000000e+00,5.280942,CE,CE,0.862718,0.137282


In [25]:
train.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()

Unnamed: 0,patient_id,CE,LAA
0,006388,0.831931,0.168069
1,008e5c,0.838879,0.161121
2,00c058,0.331039,0.668961
3,01adc5,0.173427,0.826573
4,026c97,0.895661,0.104339
...,...,...,...
625,fe0cca,0.913121,0.086879
626,fe9645,0.951049,0.048951
627,fe9bec,0.335807,0.664193
628,ff14e0,0.862718,0.137282


In [26]:
# test['prediction'] = rf_class.predict(x_test)
test[[rf_class.classes_[0], rf_class.classes_[1]]] = rf_class.predict_proba(x_test.values)

In [27]:
submission = test.groupby(['patient_id'], as_index = False)[['CE', 'LAA']].mean()
submission

Unnamed: 0,patient_id,CE,LAA
0,006388,0.831931,0.168069
1,008e5c,0.838879,0.161121
2,00c058,0.331039,0.668961
3,01adc5,0.173427,0.826573


In [28]:
submission.to_csv("submission.csv", index = False)