### Data Splitting

This notebook splits the whole dataset into train and test sets and then selects a validation set.

Importing packages

In [None]:
import pandas as pd
import numpy as np
import shutil
import os
import tqdm
import glob
import tifffile
import matplotlib.pyplot as plt

In [None]:
# Root directory of the dataset. Joining with an empty component guarantees a
# trailing path separator, so the plain string concatenations below (and in the
# later cells that build paths from data_basepath) yield real paths instead of
# gluing the file name directly onto the directory name.
data_basepath = os.path.join('38829_81799_A_2_4_lung', '')
metadata_csv = data_basepath + 'metadata.csv'
data_with_rle = data_basepath + 'complete_dataset_rle.csv'

In [None]:
# Read the metadata table first, then the RLE annotation table.
metadata, rle_data_csv = (
    pd.read_csv(csv_path) for csv_path in (metadata_csv, data_with_rle)
)
# Give the RLE table fixed column names regardless of the header in the file.
rle_data_csv.columns = ["filename", "rle", "organ"]

In [None]:
# Number of rows in the RLE annotation table.
rle_data_csv.shape[0]

In [None]:
# Number of rows in the metadata table.
metadata.shape[0]

Filtering data for training

In [None]:
# Rows tagged "public" form the training pool; rows tagged "private" or
# "hubmap" form the test pool.
train_data = metadata.loc[metadata['data_type'].eq("public")]
test_data = metadata.loc[metadata['data_type'].isin(["private", "hubmap"])]

In [None]:
# Display the rows of the metadata whose data_type is "public".
train_data

In [None]:
# Filenames from the RLE table with every literal ".tif" removed, computed once
# and shared by both filters. regex=False makes ".tif" a literal string; under
# the regex default of older pandas releases the "." would match any character.
annotated_names = rle_data_csv['filename'].str.replace(".tif", "", regex=False)

# Keep only the rows that have an RLE annotation. .copy() produces independent
# frames, so adding columns to them later does not raise SettingWithCopyWarning.
train_data_filtered = train_data[train_data['filename'].isin(annotated_names)].copy()
test_data_filtered = test_data[test_data['filename'].isin(annotated_names)].copy()

In [None]:
def get_rle(x):
    """Return the RLE value stored in ``rle_data_csv`` for filename ``x``.

    x: filename without the ".tif" suffix; it is compared against the
       "filename" column of ``rle_data_csv`` after removing the literal
       substring ".tif" from that column.
    Returns the "rle" value of the first matching row.
    Raises IndexError if no row matches ``x``.
    """
    # regex=False: treat ".tif" literally instead of as a pattern in which "."
    # matches any character (the default in older pandas releases).
    names = rle_data_csv["filename"].str.replace(".tif", "", regex=False)
    matches = rle_data_csv.loc[names == x, 'rle'].values
    if len(matches) == 0:
        # Same exception type as indexing an empty array, but with a message
        # that says which file is missing.
        raise IndexError(f"no RLE entry found for filename {x!r}")
    return matches[0]

In [None]:
# Attach the RLE annotation of every image as a new "rle" column.
# .assign() builds a new frame rather than writing into a frame that was
# produced by boolean filtering, which avoids SettingWithCopyWarning and keeps
# the cell safe to re-run. get_rle is passed directly; no lambda is needed.
train_data_filtered = train_data_filtered.assign(
    rle=train_data_filtered['filename'].apply(get_rle)
)
test_data_filtered = test_data_filtered.assign(
    rle=test_data_filtered['filename'].apply(get_rle)
)

In [None]:
# Create the output directory that the CSVs below are written into. It is
# derived from data_basepath (rather than a hard-coded absolute path) so the
# directory that is created is the one that is written to, and exist_ok=True
# lets the cell be re-run without raising FileExistsError.
os.makedirs(data_basepath + 'gftu_dataset', exist_ok=True)

# Persist the train / test tables without the pandas index.
train_data_filtered[["filename","rle","patient_id","tissue_name"]].to_csv(data_basepath + 'gftu_dataset/train.csv', index=False)
test_data_filtered[["filename","rle","patient_id","tissue_name"]].to_csv(data_basepath + 'gftu_dataset/test.csv', index=False)

In [None]:
# Organ sub-directories, in the order in which their files are concatenated.
separate_organ_dirs = ['lung', 'kidney', 'spleen', 'largeintestine', 'prostate']

# Every image file under gftu_dataset_separate/<organ>/image/.
images_paths = [
    found
    for organ_dir in separate_organ_dirs
    for found in glob.glob(data_basepath + 'gftu_dataset_separate/' + organ_dir + '/image/*tif')
]

# Every mask file under gftu_dataset_separate/<organ>/mask/.
masks_path = [
    found
    for organ_dir in separate_organ_dirs
    for found in glob.glob(data_basepath + 'gftu_dataset_separate/' + organ_dir + '/mask/*tif')
]

In [None]:
# Report how many image and mask files were found, one per line.
print(
    f"Number of images = {len(images_paths)}",
    f"Number of masks = {len(masks_path)}",
    sep="\n",
)

In [None]:
# Destination directories for the copied images and masks of each split.
train_data_path = data_basepath + 'gftu_dataset/train/'
train_mask_path = data_basepath + 'gftu_dataset/train_mask/'
test_data_path = data_basepath + 'gftu_dataset/test/'
test_mask_path = data_basepath + 'gftu_dataset/test_mask/'

# makedirs(..., exist_ok=True) also creates a missing parent directory and does
# not fail when the cell is re-run, unlike a plain os.mkdir.
for split_dir in (train_data_path, train_mask_path, test_data_path, test_mask_path):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
# Build the membership sets once; the original rebuilt a list from the column
# and scanned it linearly for every single image.
train_filenames = set(train_data_filtered['filename'])
test_filenames = set(test_data_filtered['filename'])

for image_path in tqdm.tqdm(images_paths):
    image_dir, tif_name = os.path.split(image_path)
    # Name without its final extension, as matched against the "filename" column.
    filename = os.path.splitext(tif_name)[0]
    # The mask has the same file name in the sibling "mask" directory. Only the
    # directory component is swapped: a blanket str.replace("image", "mask")
    # would also rewrite "image" anywhere else in the path or the file name.
    mask_path = os.path.join(os.path.dirname(image_dir), "mask", tif_name)

    if filename in train_filenames:
        shutil.copyfile(image_path, train_data_path + tif_name)
        shutil.copyfile(mask_path, train_mask_path + tif_name)

    if filename in test_filenames:
        shutil.copyfile(image_path, test_data_path + tif_name)
        shutil.copyfile(mask_path, test_mask_path + tif_name)
    
    
    
    
    

In [None]:
# Number of entries copied into the train image directory.
len(os.listdir(f"{data_basepath}gftu_dataset/train/"))

In [None]:
# Number of entries copied into the test image directory.
len(os.listdir(f"{data_basepath}gftu_dataset/test/"))

In [None]:
# Number of entries copied into the train mask directory.
len(os.listdir(f"{data_basepath}gftu_dataset/train_mask/"))

In [None]:
# Number of entries copied into the test mask directory.
len(os.listdir(f"{data_basepath}gftu_dataset/test_mask/"))

Checking if rle2mask is working fine

In [None]:
def rle2mask(rle, shape):
    '''
    Decode a run-length-encoded mask into a 2-D binary array.

    rle: run-length as string formatted "start length start length ...";
         starts are 1-based positions in the column-major flattened mask.
         A missing value (None, or any float such as NaN) decodes to an
         all-background mask.
    shape: (height, width) of array to return
    Returns numpy uint8 array <- 1(mask), 0(background)
    Raises ValueError if the string does not consist of start/length pairs.
    '''
    # isinstance (instead of an exact type comparison) also catches float
    # subclasses such as numpy.float64, so a NaN of that type is handled too.
    if rle is None or isinstance(rle, float):
        rle = ""
    tokens = np.asarray(rle.split(), dtype=int)
    if tokens.size % 2:
        # An odd token count would otherwise fail obscurely or, for some
        # lengths, broadcast silently into a wrong mask.
        raise ValueError("RLE must contain an even number of values (start/length pairs)")
    starts = tokens[0::2] - 1  # convert 1-based starts to 0-based offsets
    ends = starts + tokens[1::2]
    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    return img.reshape(shape, order='F')  # Needed to align to RLE direction

In [None]:
# Directories from which the sample image and its mask are read below.
# NOTE(review): masks_path was bound earlier in this notebook to the list of
# globbed mask files; from here on it is a directory string, so re-running the
# earlier "Number of masks" cell afterwards would report the string's length.
data_path = data_basepath + 'gftu_dataset/train/'
masks_path = data_basepath + 'gftu_dataset/train_mask/'

In [None]:
# Positional row of the training table used for the visual sanity check.
idx = 150

# Re-index the training table from 0 once, then read the three fields of row idx.
train_reindexed = train_data_filtered.reset_index(drop=True)
im_id, rle, organ = (
    train_reindexed.loc[idx, [column]].values[0]
    for column in ("filename", "rle", "tissue_name")
)

In [None]:
# Filename of the selected sample row.
im_id

In [None]:
# tissue_name of the selected sample row.
organ

In [None]:
# Read the sample image and its stored mask; both files share the sample's name.
image = tifffile.imread(f"{data_path}{im_id}.tif")
mask = tifffile.imread(f"{masks_path}{im_id}.tif")

In [None]:
# Default figure size for the plots below.
plt.rcParams.update({"figure.figsize": [10, 10]})

In [None]:
# Show the sample image read from the train directory.
plt.imshow(image)

In [None]:
# Array shape of the loaded image; its first two entries are passed to rle2mask
# as the (height, width) of the mask to decode.
shape = image.shape

In [None]:
# Show the mask read from the train_mask directory.
plt.imshow(mask)

In [None]:
# Decode the sample's RLE using the first two dimensions of the image shape.
mask_enc = rle2mask(rle, shape[:2])

In [None]:
# Show the mask decoded from the RLE, for visual comparison with the stored mask.
plt.imshow(mask_enc)

#### Data stats

In [None]:
# tissue_name values for which per-split image counts are printed below.
organs = ["lung","kidney","prostate","largeintestine","spleen"]

In [None]:
# Per-organ row count of the training table.
for org in organs:
    n_train_rows = int((train_data_filtered['tissue_name'] == org).sum())
    print(f"Number of {org} images in training set is {n_train_rows}")

In [None]:
# Per-organ row count of the test table.
for org in organs:
    n_test_rows = int((test_data_filtered['tissue_name'] == org).sum())
    print(f"Number of {org} images in test set is {n_test_rows}")

Checking validation set


In [None]:
import pandas as pd


In [None]:
# Reload the training table from the CSV written earlier in this notebook.
training_data_filtered = pd.read_csv(f"{data_basepath}gftu_dataset/train.csv")

Kidney patient ids

In [None]:
# Number of kidney training rows per patient_id.
training_data_filtered.loc[
    training_data_filtered['tissue_name'] == "kidney", 'patient_id'
].value_counts()

lung patient ids

In [None]:
# Number of lung training rows per patient_id.
training_data_filtered.loc[
    training_data_filtered['tissue_name'] == "lung", 'patient_id'
].value_counts()

spleen patient ids

In [None]:
# Number of spleen training rows per patient_id.
training_data_filtered.loc[
    training_data_filtered['tissue_name'] == "spleen", 'patient_id'
].value_counts()

prostate patient ids

In [None]:
# Number of prostate training rows per patient_id.
training_data_filtered.loc[
    training_data_filtered['tissue_name'] == "prostate", 'patient_id'
].value_counts()

large intestine patient ids

In [None]:
# Number of large-intestine training rows per patient_id.
training_data_filtered.loc[
    training_data_filtered['tissue_name'] == "largeintestine", 'patient_id'
].value_counts()

In [None]:
# Values held out for validation, each wrapped in a one-element list.
# NOTE(review): presumably patient_id values (the cells above tabulate
# patient_id per organ) - confirm.
# Previous selection, kept for reference:
# [[4510.0],[1960.0],[3181.0],[2098.0],[1511.0],[1787.0],[2208.0],[2222.0],[443.0],[1943.0]]
validation_set = [[2184.0],[443.0],[1678.0],[2208.0],[1787.0],[96.0],[2932.0],[3497.0],[2040.0],[4510.0]]

In [None]:
# Display the selected validation values.
validation_set