# This code splits the data into training, validation and test sets, then applies augmentation to each crop to balance the classes in the datasets

## Import Libraries

In [4]:
import pandas as pd
import numpy as np
from glob import glob
import os
import json
from PIL import Image
import albumentations as A
import random
import os
from os.path import exists
import glob
import json
import shutil

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Count the objects of each class in the JSON files and create a stratified split so that train, test and val have approximately the same class distribution

In [5]:
def get_count(l):
    """
    Count the annotated plaques of each known type in one annotation JSON file.

    The file is expected to hold a JSON object that maps a tile id to a list
    of annotated regions. A region is counted when it is a mapping whose
    "label" entry is itself a mapping with a "name" equal to one of the known
    plaque types. Anything else (missing or null label, label without a name,
    unknown plaque name, null tile entry) is skipped instead of raising.

    Args:
        l (str): Path to the JSON file.

    Returns:
        dict: A dictionary with counts of each plaque type. The keys are:
              'Cored', 'Diffuse', 'Coarse-Grained', 'CAA'. Types that never
              occur in the file keep a count of 0.
    """
    # JSON text is UTF-8 by specification, so do not rely on the locale default
    with open(l, encoding="utf-8") as f:
        data = json.load(f)

    # Initialize the dictionary with known plaque types
    plaque_dict = {'Cored': 0, 'Diffuse': 0, 'Coarse-Grained': 0, 'CAA': 0}

    # The tile ids themselves are not needed, only the region lists
    for regions in data.values():
        for region in regions or ():
            if not isinstance(region, dict):
                continue
            label = region.get("label")
            if not isinstance(label, dict):
                continue
            plaque_type = label.get("name")
            # Only string names can match a known type; this also keeps
            # unhashable values away from the dictionary lookup
            if isinstance(plaque_type, str) and plaque_type in plaque_dict:
                plaque_dict[plaque_type] += 1

    return plaque_dict

In [9]:
# The annotation JSON files are read from two separate folders
path1 = "/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/amy-def-mfg-jsons"
path2 = "/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/amy-def-mfg-test-jsons"

In [10]:
# Create one dataframe per folder, with a "json_path" column listing its *.json files
# NOTE(review): test_df is read from path1 and train_df from path2, whose folder name contains "test" — confirm the two are not swapped
test_df = pd.DataFrame({"json_path":glob.glob(os.path.join(path1,"*.json"))})
train_df = pd.DataFrame({"json_path":glob.glob(os.path.join(path2,"*.json"))})

In [11]:
# Get the class-wise count of objects in each JSON file.
# Every file is parsed once and its counts are spread over the four class
# columns, instead of re-reading the same file once per class.
plaque_classes = ["Cored", "Diffuse", "Coarse-Grained", "CAA"]
for df in (test_df, train_df):
    per_file_counts = pd.DataFrame(
        [get_count(json_path) for json_path in df["json_path"]],
        index=df.index,
        columns=plaque_classes,
    )
    # Add the columns one by one so they keep the original column order
    for cls in plaque_classes:
        df[cls] = per_file_counts[cls]

In [12]:
# Derive the slide name of every row: the last path component of json_path,
# cut at its first dot.
def _slide_name(json_path):
    return json_path.split("/")[-1].split(".")[0]


for frame in (test_df, train_df):
    frame["file_name"] = frame["json_path"].apply(_slide_name)

# Every row of train_df belongs to the train split (used when combining later)
train_df["flag"] = "train"

In [13]:
# Reuse the test slides of the first inter-rater study: every entry name found
# in the existing test folder is flagged "test", all other rows become "val".
test_files = [
    entry.split("/")[-1]
    for entry in glob.glob(
        os.path.join("/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/Datasets/amyb_wsi/test", "*")
    )
]


def _test_or_val(slide_name):
    if slide_name in test_files:
        return "test"
    return "val"


test_df["flag"] = test_df["file_name"].apply(_test_or_val)

In [14]:
# Stack the train rows on top of the test/val rows; ignore_index renumbers the rows of the result
combined_df = pd.concat([train_df,test_df], ignore_index=True)

In [16]:
# Preview the first two rows of the combined dataframe
combined_df.head(2)

Unnamed: 0,json_path,Cored,Diffuse,Coarse-Grained,CAA,file_name,flag
0,/gladstone/finkbeiner/steve/work/data/npsad_da...,3,6,22,0,XE14-004_1_AmyB_1,train
1,/gladstone/finkbeiner/steve/work/data/npsad_da...,5,13,9,0,XE17-010_1_AmyB_1,train


In [17]:
# Per-split totals: validation holds far more objects than train (the number of objects varies between whole slide images)
# NOTE: sum() also concatenates the string columns (json_path, file_name); only the four class columns are meaningful here
combined_df.groupby(["flag"]).sum()

Unnamed: 0_level_0,json_path,Cored,Diffuse,Coarse-Grained,CAA,file_name
flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
test,/gladstone/finkbeiner/steve/work/data/npsad_da...,26,85,22,15,XE18-066_1_AmyB_1XE19-037_1_AmyB_1XE11-039_1_A...
train,/gladstone/finkbeiner/steve/work/data/npsad_da...,95,130,105,34,XE14-004_1_AmyB_1XE17-010_1_AmyB_1XE17-030_1_A...
val,/gladstone/finkbeiner/steve/work/data/npsad_da...,207,339,98,45,XE07-047_1_AmyB_1XE07-064_1_AmyB_1XE17-004_1_A...


In [19]:
## Move these slides to the train split to get a better class distribution
train_list = ["XE07-064_1_AmyB_1","XE17-039_1_AmyB_1","XE10-045_1_AmyB_1","XE12-016_1_AmyB_1","XE15-022_1_AmyB_1","XE15-039_1_AmyB_1"]
# np.where sets the flag of the listed slides to "train" and keeps the current flag of every other row
combined_df["flag"]=np.where(combined_df["file_name"].isin(train_list), "train",combined_df["flag"] )

In [21]:
# Re-check the per-split totals after re-flagging
combined_df.groupby(["flag"]).sum() # The distribution looks good

Unnamed: 0_level_0,json_path,Cored,Diffuse,Coarse-Grained,CAA,file_name
flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
test,/gladstone/finkbeiner/steve/work/data/npsad_da...,26,85,22,15,XE18-066_1_AmyB_1XE19-037_1_AmyB_1XE11-039_1_A...
train,/gladstone/finkbeiner/steve/work/data/npsad_da...,183,258,153,64,XE14-004_1_AmyB_1XE17-010_1_AmyB_1XE17-030_1_A...
val,/gladstone/finkbeiner/steve/work/data/npsad_da...,119,211,50,15,XE07-047_1_AmyB_1XE17-004_1_AmyB_1XE07-048_1_A...


## Create Folders Train/Val/Test, and copy files based on flag to respective folder (train, test and val)

### Creating train folder

In [23]:
# Gather the crops of every slide flagged "train": the files of each slide's
# "image" folder go to <TARGET_PATH>/images and those of its "mask" folder to
# <TARGET_PATH>/labels.
# NOTE: all slides share the same two target folders, so files with the same
# name coming from different slides overwrite each other.
DATASET_PATH = "/home/mahirwar/Desktop/Monika/npsad_data/vivek/Datasets/amyb_wsi_v2"
TARGET_PATH = '/home/mahirwar/Desktop/Monika/npsad_data/vivek/Datasets/amyb_wsi_v2/train'
target_folders = ["images", "labels"]
origin_folders = ["image", "mask"]
# slide (patient) folders carrying the train flag
imgs = combined_df.loc[combined_df["flag"] == "train", "file_name"].values
for slide in imgs:
    if slide == ".DS_Store":
        continue
    for origin_folder, target_folder in zip(origin_folders, target_folders):
        src_dir = os.path.join(DATASET_PATH, slide, origin_folder)
        dst_dir = os.path.join(TARGET_PATH, target_folder)
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        # Copy every entry of the slide's folder into the shared target folder
        for entry in os.listdir(src_dir):
            shutil.copyfile(os.path.join(src_dir, entry), os.path.join(dst_dir, entry))
        print("Files are copied successfully")

Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully


### Creating validation folder

In [49]:
# Gather the crops of every slide flagged "val": the files of each slide's
# "image" folder go to <TARGET_PATH>/images and those of its "mask" folder to
# <TARGET_PATH>/labels.
# NOTE: all slides share the same two target folders, so files with the same
# name coming from different slides overwrite each other.
DATASET_PATH = "/home/mahirwar/Desktop/Monika/npsad_data/vivek/Datasets/amyb_wsi_v2"
TARGET_PATH = '/home/mahirwar/Desktop/Monika/npsad_data/vivek/Datasets/amyb_wsi_v2/val'
target_folders = ["images", "labels"]
origin_folders = ["image", "mask"]
# slide (patient) folders carrying the val flag
imgs = combined_df.loc[combined_df["flag"] == "val", "file_name"].values
for slide in imgs:
    if slide == ".DS_Store":
        continue
    for origin_folder, target_folder in zip(origin_folders, target_folders):
        src_dir = os.path.join(DATASET_PATH, slide, origin_folder)
        dst_dir = os.path.join(TARGET_PATH, target_folder)
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        # Copy every entry of the slide's folder into the shared target folder
        for entry in os.listdir(src_dir):
            shutil.copyfile(os.path.join(src_dir, entry), os.path.join(dst_dir, entry))
        print("Files are copied successfully")

Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully
Files are copied successfully


## Data Augmentation: Augment each crop using a multiplier, which is determined based on the underrepresentation of class objects in the training/validation set

In [24]:
## Keep the crop paths of the cored, diffuse, coarse-grained and CAA classes in one list per class.
## This is an approximation: a crop is added to every class whose color code occurs in its mask,
## so e.g. the cored list may also contain crops with diffuse plaque objects.

all_images = glob.glob(os.path.join("/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/Datasets/amyb_wsi_v2/train/images/",'*.png')) #/amyb_wsi_v2/val/images/ for validation
all_masks = glob.glob(os.path.join("/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/Datasets/amyb_wsi_v2/train/labels/",'*.png'))

# Images and masks are paired by their position after sorting
# NOTE(review): this assumes both folders hold the same number of files in matching sort order — confirm
all_images.sort()
all_masks.sort()

cored_images, cored_masks = [], []
diffuse_images, diffuse_masks = [], []
cg_images, cg_masks = [], []
caa_images, caa_masks = [], []

# Mask color code of each class -> (image list, mask list) of that class
class_lists = {
    50: (cored_images, cored_masks),       # cored
    100: (diffuse_images, diffuse_masks),  # diffuse
    150: (cg_images, cg_masks),            # coarse-grained
    200: (caa_images, caa_masks),          # caa
}

for image, mask in zip(all_images, all_masks):
    with Image.open(mask) as mask_file:
        found = np.unique(np.array(mask_file))
    for color_code, (class_images, class_masks) in class_lists.items():
        if color_code in found:
            class_images.append(image)
            class_masks.append(mask)

In [25]:
# Total number of crops found in the images folder
len(all_images)

491

In [27]:
# The augmentation multiplier is decided based on these per-class crop counts (cored, diffuse, coarse-grained, caa).
# For example, this case will use a ratio of 1.5:1:1.2:3 for augmentation
len(cored_images),  len(diffuse_images), len(cg_images), len(caa_images)

(166, 202, 178, 72)

In [28]:
# The selected transforms are designed to make minimal changes to the crops.
# upsample_dataset calls this pipeline as transforms(image=..., mask=...), so an image
# and its mask are passed through it together; p is the probability given to each transform.
# NOTE(review): blur_limit=1 is below the kernel size of 3 that the albumentations docs give as the minimum for A.Blur — confirm it behaves as intended in the installed version
transforms  = A.Compose([A.VerticalFlip(p=0.5),
                            A.HorizontalFlip(p=0.5),
                            A.Blur(blur_limit=1,p=0.2),
                            A.OpticalDistortion(p=0.25),
                            A.HueSaturationValue(hue_shift_limit=2, sat_shift_limit=3, val_shift_limit=2, p=0.25),
                            A.RandomRotate90(p=0.5),
                            A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.25)
                        ])


def get_randimages_dataug(total_imgs, image_filenames, label_filenames):
    """
    Draw image/label filename pairs, with replacement, for data augmentation.

    ``image_filenames[k]`` and ``label_filenames[k]`` must belong to the same
    crop. The pairs are ordered by image filename, and draw number ``i`` picks
    its pair with a generator seeded with ``i``, so the same inputs always give
    the same selection. Because the draws are independent, a crop can be
    selected more than once.

    The pairs are sorted together, so an image always stays with its own
    label, the caller's lists are left untouched, and the state of the global
    ``random`` module is not modified.

    Args:
        total_imgs (int): The total number of image-label pairs to return.
        image_filenames (list of str): A list of file paths for the image files.
        label_filenames (list of str): A list of file paths for the label files,
            in the same order as ``image_filenames``.

    Returns:
        list: A list containing two lists:
            - A list of randomly selected image filenames.
            - A list of randomly selected label filenames, corresponding to the image filenames.

    Raises:
        ValueError: If the two lists do not have the same length.
        IndexError: If pairs are requested but the lists are empty.
    """
    if len(image_filenames) != len(label_filenames):
        raise ValueError(
            "image_filenames and label_filenames must have the same length "
            "(got %d and %d)" % (len(image_filenames), len(label_filenames))
        )

    # Sorting the pairs (not each list on its own) gives a consistent order
    # without ever separating an image from its label
    pairs = sorted(zip(image_filenames, label_filenames))
    if total_imgs > 0 and not pairs:
        raise IndexError("Cannot choose from an empty sequence")

    # Initialize lists to store random image and label filenames
    random_image_file = []
    random_label_file = []

    for i in range(total_imgs):
        # A private generator seeded with i yields the same index that
        # random.seed(i) followed by random.choice(...) would pick
        index = random.Random(i).randrange(len(pairs))
        image_file, label_file = pairs[index]
        random_image_file.append(image_file)
        random_label_file.append(label_file)
    return [random_image_file, random_label_file]

def _ensure_augmented_dir(base_dir, folder_name):
    """Create ``base_dir/folder_name`` if it is missing, report it, and return the path."""
    target_dir = os.path.join(base_dir, folder_name)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        print("Augmented Directory '%s' created" % target_dir)
    return target_dir


def upsample_dataset(dataset_base_dir, random_img_filenames, rand_label_filenames, variations, transforms, dest_img_folder_name, dest_label_folder_name):
    """
    Write augmented copies of image/mask pairs below ``dataset_base_dir``.

    Each image is loaded as RGB and its mask as a palette ('P') image. The pair
    is passed ``variations`` times through ``transforms`` and every result is
    saved as ``A_<pair index>_<variation index>_<original file name>`` in the
    destination image / label folder. Both folders are created when missing.

    Args:
        dataset_base_dir (str): The base directory below which the augmented images and labels are written.
        random_img_filenames (list of str): A list of file paths for the image files.
        rand_label_filenames (list of str): A list of file paths for the label files, in the same order as the images.
        variations (int): The number of augmented variations to generate per image-label pair. This represents the augmentation multiplier.
        transforms (callable): A pipeline called as ``transforms(image=..., mask=...)`` that returns a mapping with "image" and "mask" entries.
        dest_img_folder_name (str): The name of the folder to save the augmented images.
        dest_label_folder_name (str): The name of the folder to save the augmented labels.

    Returns:
        tuple: A tuple containing two lists:
            - A list of file paths for the augmented images.
            - A list of file paths for the augmented masks, in the same order.

    Raises:
        ValueError: If the image and label lists do not have the same length.
    """
    # Check the pairing up front so a mismatch is reported before any file is written
    if len(random_img_filenames) != len(rand_label_filenames):
        raise ValueError(
            "random_img_filenames and rand_label_filenames must have the same length "
            "(got %d and %d)" % (len(random_img_filenames), len(rand_label_filenames))
        )

    # Seeds Python's global random module before augmenting
    # NOTE(review): presumably meant to make the augmentations reproducible — confirm the installed albumentations version draws from this generator
    random.seed(500)

    # Create directories for augmented images and labels if they don't exist
    aug_img_dir = _ensure_augmented_dir(dataset_base_dir, dest_img_folder_name)
    aug_mask_dir = _ensure_augmented_dir(dataset_base_dir, dest_label_folder_name)

    print(os.listdir(dataset_base_dir))
    print("\nData Augmentation in Progress ...")

    aug_img_files = []
    aug_mask_files = []

    for i, (img_path, mask_path) in enumerate(zip(random_img_filenames, rand_label_filenames)):
        # Load the image and mask; the arrays are built inside the with-blocks
        # so the pixel data is read before the files are closed
        with Image.open(img_path) as img_file:
            img = np.array(img_file.convert("RGB"))
        with Image.open(mask_path) as mask_file:
            mask = np.array(mask_file.convert('P'))

        img_name = os.path.basename(img_path)
        mask_name = os.path.basename(mask_path)

        for j in range(variations):
            # One call transforms the image and its mask together
            transformed = transforms(image=img, mask=mask)

            # Prefix A_<pair index>_<variation index>_ keeps every output name unique within this call
            prefix = "A_" + str(i) + "_" + str(j) + "_"

            new_img_file = os.path.join(aug_img_dir, prefix + img_name)
            Image.fromarray(transformed["image"]).save(new_img_file)
            aug_img_files.append(new_img_file)

            new_mask_file = os.path.join(aug_mask_dir, prefix + mask_name)
            Image.fromarray(transformed["mask"]).save(new_mask_file)
            aug_mask_files.append(new_mask_file)
    return aug_img_files, aug_mask_files

In [29]:
# Draw aug_value image/mask pairs of one class and write 5 augmented variations of each pair.
# For another class pass its lists instead (e.g. diffuse_images, diffuse_masks) and decide aug_value based on the class counts.
aug_value = 50
rand_image_filenames, rand_label_filenames = get_randimages_dataug(aug_value, cored_images, cored_masks)
# The augmented files are written to the "images" and "labels" folders below this directory
dataset_base_dir = "/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/Datasets/amyb_wsi_v2/test_augmentations"
aug_img_files, aug_mask_files = upsample_dataset(dataset_base_dir, rand_image_filenames, rand_label_filenames, 5, transforms, "images", "labels")

['labels', '.DS_Store', 'images']

Data Augmentation in Progress ...
