In [1]:
import pandas as pd
import numpy as np
import random
import os
import torch
from sklearn.model_selection import train_test_split
import ast
import utility_functions as uf

# default setting

In [2]:
patches_size = 512
target_gene = "NPM1"
split_size = 500
level = 0
# left_proportion = 0.6
# shrink_proportion = 0.15
slide_path = "/home/exon_storage1/aml_slide/"
info_file_path = "/home/weber50432/AML_image_processing/code_use_csv/changeSlideName.csv"

# Original data generation process

In [3]:
slide_list = uf.get_slides_list_number(f"{slide_path}ROI_level0_pixel512/")
target = uf.get_targets_list(target_gene,slide_list,info_file_path)
X = np.array(slide_list)
y = np.array(target)
# 將數據集按比例 6:2:2 分為訓練集、驗證集和測試集
# 找到標籤為1的樣本索引
positive_indices = np.where(y == 1)[0]

# 使用 train_test_split 函数分割樣本
# 首先將標籤為1的樣本分成訓練集、驗證集和測試集
train_pos, val_pos_test = train_test_split(positive_indices, test_size=0.4)
val_pos, test_pos = train_test_split(val_pos_test, test_size=0.5)

# 接下來將標籤為0的樣本分成訓練集、驗證集和測試集
train_neg, val_neg_test, y_train, y_val_test = train_test_split(np.where(y == 0)[0], y[np.where(y == 0)[0]], test_size=0.4)
val_neg, test_neg, y_val, y_test = train_test_split(val_neg_test, y_val_test, test_size=0.5)

# 將訓練集、驗證集和測試集的索引合併起來
train_indices = sorted(np.concatenate((train_pos, train_neg)))
val_indices = sorted(np.concatenate((val_pos, val_neg)))
test_indices = sorted(np.concatenate((test_pos, test_neg)))

# 根據索引提取對應的數據和標籤
X_train = X[train_indices].tolist()
X_val = X[val_indices].tolist()
X_test = X[test_indices].tolist()
y_train = y[train_indices].tolist()
y_val = y[val_indices].tolist()
y_test = y[test_indices].tolist()

# 計算各個集合的樣本數量
print(f"訓練集樣本數量：{len(X_train)}")
print(f"positive target：{y_train.count(1)}")
print(f"驗證集樣本數量：{len(X_val)}")
print(f"positive target：{y_val.count(1)}")
print(f"測試集樣本樣量：{len(X_test)}")
print(f"positive target：{y_test.count(1)}")

訓練集樣本數量：343
positive target：60
驗證集樣本數量：115
positive target：20
測試集樣本樣量：116
positive target：21


In [10]:
target_gene_rename = target_gene.split(" ")[0]+"_patch"
output_path ="/home/weber50432/AML_image_processing/lib/{}".format(target_gene_rename)
# check the output path is exist or not
if not os.path.exists(output_path):
    os.makedirs(output_path)
# save the data
train_output = {
      "slides": uf.make_paths_list("/staging/biology/b08611005/ROI_level0_pixel512/",X_train),
      "grid": uf.get_patches_grid(slide_path+"ROI_level0_pixel512/",X_train,patch_num),
      "targets": y_train,
      "mult": patches_size/224,
      "level": level,
  }
torch.save(train_output, "{}/{}_train_data.pt".format(output_path,target_gene_rename))

slide A9 is processing...
slide A12 is processing...
slide A22 is processing...
slide A26 is processing...
slide A60 is processing...
slide A99 is processing...
slide A102 is processing...
slide A103 is processing...
slide A104 is processing...
slide A105 is processing...
slide A106 is processing...
slide A108 is processing...
slide A121 is processing...
slide A124 is processing...
slide A129 is processing...
slide A131 is processing...
slide A135 is processing...
slide A136 is processing...
slide A146 is processing...
slide A147 is processing...
slide A151 is processing...
slide A152 is processing...
slide A153 is processing...
slide A154 is processing...
slide A156 is processing...
slide A159 is processing...
slide A160 is processing...
slide A161 is processing...
slide A162 is processing...
slide A167 is processing...
slide A169 is processing...
slide A171 is processing...
slide A173 is processing...
slide A177 is processing...
slide A180 is processing...
slide A197 is processing...

# upsampling training data 

## count the total patches number of each WSIs, and save it as a csv file

In [4]:
slide_list = uf.get_slides_list_number(f"{slide_path}ROI_level0_pixel512_norm/")
target_list = uf.get_targets_list(target_gene,slide_list,info_file_path)
#get the patches number of each slide, and save it as a list
df = pd.DataFrame({'Slide': [],"Target":[] ,'patches_number': []})
for index,slide_num in enumerate(slide_list):
    patch_sum = len(os.listdir(f"{slide_path}ROI_level0_pixel512_norm/A{slide_num}"))
    df.loc[index] = [slide_num,target_list[index],patch_sum]
#count the positive and negative slide number
print(f"positive target: {df['Target'].sum()}, negative target: {len(df)-df['Target'].sum()}")
df.to_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene}_norm_slide_patch_num.csv",index=False)

positive target: 101, negative target: 473


## version 0 : only upsample positive slides before spliting data

In [47]:
# find the slides of targets, and show the number of patches
slide_list = uf.get_slides_list_number(f"{slide_path}ROI_level0_pixel512/")
target_list = uf.get_targets_list(target_gene,slide_list,info_file_path)
#get the patches number of each slide, and save it as a list
df = pd.DataFrame({'Slide': [],"Target":[] ,'patches': []})
for index,slide_num in enumerate(slide_list):
  target = target_list[index]
  patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
  # if target is positive, split the patches number of the slide and add it to slide_list
  if target == 1:
    # Shuffle the original list randomly
    random.shuffle(patch_list)
    #split the patches list into  sublists
    sublists = [patch_list[i:i+split_size] for i in range(0, len(patch_list)-len(patch_list)%split_size, split_size)]
    # store the sublists into the dataframe
    for j, sublist in enumerate(sublists):
      df.loc[df.shape[0]] = [f"A{slide_num}_{j+1}",target,sublist]
  else:
    if split_size < len(patch_list):
      sublist = random.sample(patch_list,split_size)
    else:
      sublist = patch_list
      print(f"slide A{slide_num} only has {len(patch_list)} patches")
    df.loc[df.shape[0]] = [f"A{slide_num}",target,sublist]
# show the length of the dataframe
print(len(df))
#save the dataframe as a csv file
df.to_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene}_slide_target_patch_num.csv",index=False)
type(df["patches"][0])

  return array(a, dtype, copy=False, order=order)


slide A299 only has 305 patches
slide A308 only has 483 patches
slide A352 only has 386 patches
slide A426 only has 380 patches
slide A446 only has 444 patches
slide A665 only has 214 patches
slide A731 only has 223 patches
slide A732 only has 267 patches
slide A763 only has 382 patches
slide A764 only has 304 patches
slide A942 only has 442 patches
slide A1053 only has 477 patches
slide A1058 only has 451 patches
slide A1083 only has 149 patches
slide A1102 only has 364 patches
slide A1108 only has 424 patches
slide A1121 only has 299 patches
slide A1140 only has 351 patches
slide A1142 only has 288 patches
slide A1307 only has 320 patches
949


list

In [48]:
target_gene_rename = target_gene.split(" ")[0]+"_patch_500_balanced"
output_path =f"/home/weber50432/AML_image_processing/lib/{target_gene_rename}"
# df = pd.read_csv(f"{output_path}/{target_gene}_slide_target_patch_num.csv")
# split the dataframe into train, val, test
df_train, df_val = train_test_split(df, test_size=0.3, random_state=100)
df_val, df_test = train_test_split(df_val, test_size=0.67, random_state=100)
# show the number of positive and negative target in each dataframe
print(f"train positive target: {df_train['Target'].sum()}, negative target: {len(df_train)-df_train['Target'].sum()}")
print(f"val positive target: {df_val['Target'].sum()}, negative target: {len(df_val)-df_val['Target'].sum()}")
print(f"test positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
# show the proportion of total data number in each dataframe in percentage
print(f"train: {len(df_train)/len(df)*100:.2f}%, val: {len(df_val)/len(df)*100:.2f}%, test: {len(df_test)/len(df)*100:.2f}%")
if not os.path.exists(output_path):
    os.makedirs(output_path)
# save the data
train_output = {
      "slides": df_train['Slide'].tolist(),
      "grid": df_train['patches'].tolist(),
      "targets": df_train['Target'].tolist(),
      "mult": patches_size/224,
      "level": level,
  }
torch.save(train_output, "{}/{}_train_data.pt".format(output_path,target_gene_rename))
val_output = {
        "slides": df_val['Slide'].tolist(),
        "grid": df_val['patches'].tolist(),
        "targets": df_val['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(val_output, "{}/{}_val_data.pt".format(output_path,target_gene_rename))
test_output = {
        "slides": df_test['Slide'].tolist(),
        "grid": df_test['patches'].tolist(),
        "targets": df_test['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(test_output, "{}/{}_test_data.pt".format(output_path,target_gene_rename))

train positive target: 337, negative target: 327
val positive target: 42, negative target: 52
test positive target: 99, negative target: 92
train: 69.97%, val: 9.91%, test: 20.13%
<class 'list'>


## version 1 : upsample positive and negative slides before spliting data

In [2]:
# find the slides of targets, and show the number of patches
split_size = 500
target_gene = "NPM1"
slide_path = "/home/exon_storage1/aml_slide/"
info_file_path = "/home/weber50432/AML_image_processing/code_use_csv/changeSlideName.csv"
slide_list = uf.get_slides_list_number(f"{slide_path}ROI_level0_pixel512/")
target_list = uf.get_targets_list(target_gene,slide_list,info_file_path)
#get the patches number of each slide, and save it as a list
df = pd.DataFrame({'Slide': [],"Target":[] ,'patches': []})
for index,slide_num in enumerate(slide_list):
  target = target_list[index]
  patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
  # if patch number is larger than the desired size, split the patches number of the slide and add it to slide_list
  if split_size < len(patch_list):
    # Shuffle the original list randomly
    random.shuffle(patch_list)
    #split the patches list into  sublists
    sublists = [patch_list[i:i+split_size] for i in range(0, len(patch_list)-len(patch_list)%split_size, split_size)]
    # store the sublists into the dataframe
    for j, sublist in enumerate(sublists):
      df.loc[df.shape[0]] = [f"A{slide_num}_{j+1}",target,sublist]
  else:
    sublist = patch_list
    print(f"slide A{slide_num} only has {len(patch_list)} patches")
  df.loc[df.shape[0]] = [f"A{slide_num}",target,sublist]
# show positive and negative target number
print(f"positive target: {df['Target'].sum()}, negative target: {len(df)-df['Target'].sum()}")
#save the dataframe as a csv file
df.to_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene}_slide_target_patch_num.csv",index=False)

  return array(a, dtype, copy=False, order=order)


slide A299 only has 305 patches
slide A308 only has 483 patches
slide A352 only has 386 patches
slide A365 only has 213 patches
slide A426 only has 380 patches
slide A446 only has 444 patches
slide A581 only has 145 patches
slide A660 only has 495 patches
slide A665 only has 214 patches
slide A724 only has 459 patches
slide A731 only has 223 patches
slide A732 only has 267 patches
slide A763 only has 382 patches
slide A764 only has 304 patches
slide A942 only has 442 patches
slide A1053 only has 477 patches
slide A1058 only has 451 patches
slide A1065 only has 494 patches
slide A1083 only has 149 patches
slide A1102 only has 364 patches
slide A1108 only has 424 patches
slide A1121 only has 299 patches
slide A1140 only has 351 patches
slide A1142 only has 288 patches
slide A1175 only has 453 patches
slide A1307 only has 320 patches
positive target: 579, negative target: 3157


list

In [16]:
target_gene = "NPM1"
patches_size = 512
level = 0
target_gene_rename = target_gene.split(" ")[0]+"_patch_500_upsampled"
output_path = f"/home/weber50432/AML_image_processing/lib/{target_gene_rename}"
df = pd.read_csv(f"{output_path}/{target_gene}_slide_target_patch_num.csv")
df['patches'] = df['patches'].apply(ast.literal_eval)
df_train, df_val = train_test_split(df, test_size=0.3, random_state=100)
df_val, df_test = train_test_split(df_val, test_size=0.67, random_state=100)
# show the number of positive and negative target in each dataframe
print(
    f"train positive target: {df_train['Target'].sum()}, negative target: {len(df_train)-df_train['Target'].sum()}")
print(
    f"val positive target: {df_val['Target'].sum()}, negative target: {len(df_val)-df_val['Target'].sum()}")
print(
    f"test positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
# show the proportion of total data number in each dataframe in percentage
print(f"train: {len(df_train)/len(df)*100:.2f}%, val: {len(df_val)/len(df)*100:.2f}%, test: {len(df_test)/len(df)*100:.2f}%")
if not os.path.exists(output_path):
    os.makedirs(output_path)
# save the data
train_output = {
    "slides": df_train['Slide'].tolist(),
    "grid": df_train['patches'].tolist(),
    "targets": df_train['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(
    train_output, "{}/{}_train_data.pt".format(output_path, target_gene_rename))
val_output = {
    "slides": df_val['Slide'].tolist(),
    "grid": df_val['patches'].tolist(),
    "targets": df_val['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(val_output, "{}/{}_val_data.pt".format(output_path, target_gene_rename))
test_output = {
    "slides": df_test['Slide'].tolist(),
    "grid": df_test['patches'].tolist(),
    "targets": df_test['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(
    test_output, "{}/{}_test_data.pt".format(output_path, target_gene_rename))

train positive target: 406, negative target: 2209
val positive target: 57, negative target: 312
test positive target: 116, negative target: 636
train: 69.99%, val: 9.88%, test: 20.13%


## version 2 : split the data first, and then only upsample positive slides in training data set

In [51]:
target_gene_rename = target_gene.split(" ")[0]+"_patch_500_training_data_augmentation"
output_path =f"/home/weber50432/AML_image_processing/lib/{target_gene_rename}"
if not os.path.exists(output_path):
    os.makedirs(output_path)
df = pd.read_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene}_slide_patch_num.csv")
df_train, df_val = train_test_split(df, test_size=0.3, random_state=1000)
df_val, df_test = train_test_split(df_val, test_size=0.67, random_state=1000)
# show the number of positive and negative target in each dataframe
print(f"train positive target: {df_train['Target'].sum()}, negative target: {len(df_train)-df_train['Target'].sum()}")
print(f"val positive target: {df_val['Target'].sum()}, negative target: {len(df_val)-df_val['Target'].sum()}")
print(f"test positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
# show the proportion of total data number in each dataframe in percentage
print(f"train: {len(df_train)/len(df)*100:.2f}%, val: {len(df_val)/len(df)*100:.2f}%, test: {len(df_test)/len(df)*100:.2f}%")


train positive target: 69, negative target: 331
val positive target: 11, negative target: 45
test positive target: 21, negative target: 95
train: 69.93%, val: 9.79%, test: 20.28%


In [48]:
# training data upsampling
# df_train.drop_duplicates(subset=['Slide'], keep='first', inplace=True)
df_temp = pd.DataFrame({'Slide': [], "Target": [], 'patches': []})
for index, row in df_train.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
    # if target is positive, split the patches number of the slide and add it to slide_list
    if target == 1:
        if split_size < len(patch_list):
            # Shuffle the original list randomly
            random.shuffle(patch_list)
            # split the patches list into  sublists
            sublists = [patch_list[i:i+split_size]
                        for i in range(0, len(patch_list)-len(patch_list) % split_size, split_size)]
            # store the sublists into the dataframe
            for j, sublist in enumerate(sublists):
                df_temp.loc[df_temp.shape[0]] = [
                    f"A{slide_num}_{j+1}", target, sublist]
        else:
            df_temp.loc[df_temp.shape[0]] = [
                f"A{slide_num}", target, patch_list]
    else:
        if split_size < len(patch_list):
            sublist = random.sample(patch_list, split_size)
        else:
            sublist = patch_list
        df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}", target, sublist]
df_train = df_temp
print(
    f"training: positive target: {df_train['Target'].sum()}, negative target: {len(df_train)-df_train['Target'].sum()}")
# save the data as a .pt file
train_output = {
    "slides": df_train['Slide'].tolist(),
    "grid": df_train['patches'].tolist(),
    "targets": df_train['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(train_output, "{}/{}_train_data.pt".format(output_path, target_gene))

  return array(a, dtype, copy=False, order=order)


training: positive target: 296, negative target: 331


In [49]:
# validation data, test data without upsampling
df_temp = pd.DataFrame({'Slide': [],"Target":[] ,'patches': []})
for index,row in df_val.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
    if split_size < len(patch_list):
        sublist = random.sample(patch_list,split_size)
    else:
        sublist = patch_list
    df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}",target,sublist]
df_val = df_temp
print(f"validation: positive target: {df_val['Target'].sum()}, negative target: {len(df_val)-df_val['Target'].sum()}")
val_output = {
        "slides": df_val['Slide'].tolist(),
        "grid": df_val['patches'].tolist(),
        "targets": df_val['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(val_output, "{}/{}_val_data.pt".format(output_path,target_gene))


  return array(a, dtype, copy=False, order=order)


validation: positive target: 11, negative target: 45


In [50]:
# test data without upsampling
df_temp = pd.DataFrame({'Slide': [],"Target":[] ,'patches': []})
for index,row in df_test.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
    if split_size < len(patch_list):
        sublist = random.sample(patch_list,split_size)
    else:
        sublist = patch_list
    df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}",target,sublist]
df_test = df_temp
print(f"test: positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
test_output = {
        "slides": df_test['Slide'].tolist(),
        "grid": df_test['patches'].tolist(),
        "targets": df_test['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(test_output, "{}/{}_test_data.pt".format(output_path,target_gene))

  return array(a, dtype, copy=False, order=order)


test: positive target: 21, negative target: 95


In [53]:
# test data with upsampling 
df_temp = pd.DataFrame({'Slide': [],"Target":[] ,'patches': []})
for index,row in df_test.iterrows():
  slide_num = row['Slide']
  target = row['Target']
  patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
  # if target is positive, split the patches number of the slide and add it to slide_list
  if target == 1:
      if split_size < len(patch_list):
      # Shuffle the original list randomly
          random.shuffle(patch_list)
          #split the patches list into  sublists
          sublists = [patch_list[i:i+split_size] for i in range(0, len(patch_list)-len(patch_list)%split_size, split_size)]
          # store the sublists into the dataframe
          for j, sublist in enumerate(sublists):
              df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}_{j+1}",target,sublist]
      else:
          df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}",target,patch_list]
  else:
      if split_size < len(patch_list):
          sublist = random.sample(patch_list,split_size)
      else:
          sublist = patch_list
      df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}",target,sublist]
df_test_augment = df_temp
print(f"test: positive target: {df_test_augment['Target'].sum()}, negative target: {len(df_test_augment)-df_test_augment['Target'].sum()}")
test_output = {
        "slides": df_test_augment['Slide'].tolist(),
        "grid": df_test_augment['patches'].tolist(),
        "targets": df_test_augment['Target'].tolist(),
        "mult": patches_size/224,
        "level": level
    }
torch.save(test_output, "{}/{}_test_augment_data.pt".format(output_path,target_gene))

  return array(a, dtype, copy=False, order=order)


test: positive target: 132, negative target: 95


## version 3 : split the data first, and then upsample both positive and negative slides in train, val dataset 

In [6]:
target_gene_rename = target_gene.split(" ")[0]
output_path =f"/home/weber50432/AML_image_processing/lib/{target_gene_rename}_patch_500_upsampled_V3"
if not os.path.exists(output_path):
    os.makedirs(output_path)
df = pd.read_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene}_slide_patch_num.csv")
df_train, df_val = train_test_split(df, test_size=0.3, random_state=1000)
df_val, df_test = train_test_split(df_val, test_size=0.67, random_state=1000)
# show the number of positive and negative target in each dataframe
print(f"train positive target: {df_train['Target'].sum()}, negative target: {len(df_train)-df_train['Target'].sum()}")
print(f"val positive target: {df_val['Target'].sum()}, negative target: {len(df_val)-df_val['Target'].sum()}")
print(f"test positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
# show the proportion of total data number in each dataframe in percentage
print(f"train: {len(df_train)/len(df)*100:.2f}%, val: {len(df_val)/len(df)*100:.2f}%, test: {len(df_test)/len(df)*100:.2f}%")

train positive target: 69, negative target: 331
val positive target: 11, negative target: 45
test positive target: 21, negative target: 95
train: 69.93%, val: 9.79%, test: 20.28%


In [4]:
# training data upsampling
# df_train.drop_duplicates(subset=['Slide'], keep='first', inplace=True)
df_out = pd.DataFrame({'Slide': [], "Target": [], 'patches': []})
for index, row in df_train.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
    # split the patches number of the slide and add it to slide_list
    if split_size < len(patch_list):
        # Shuffle the original list randomly
        random.shuffle(patch_list)
        # split the patches list into  sublists
        sublists = [patch_list[i:i+split_size]
                    for i in range(0, len(patch_list)-len(patch_list) % split_size, split_size)]
        # store the sublists into the dataframe
        for j, sublist in enumerate(sublists):
            df_out.loc[df_out.shape[0]] = [
                f"A{slide_num}_{j+1}", target, sublist]
    else:
        df_out.loc[df_out.shape[0]] = [
            f"A{slide_num}", target, patch_list]
print(
    f"training: positive target: {df_out['Target'].sum()}, negative target: {len(df_out)-df_out['Target'].sum()}")
# save the data as a .pt file
train_output = {
    "slides": df_out['Slide'].tolist(),
    "grid": df_out['patches'].tolist(),
    "targets": df_out['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(train_output, "{}/{}_train_data.pt".format(output_path, target_gene))

  return array(a, dtype, copy=False, order=order)


training: positive target: 296, negative target: 1885


In [7]:
# validation data upsampling
df_out = pd.DataFrame({'Slide': [], "Target": [], 'patches': []})
for index, row in df_val.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
    # split the patches number of the slide and add it to slide_list
    if split_size < len(patch_list):
        # Shuffle the original list randomly
        random.shuffle(patch_list)
        # split the patches list into  sublists
        sublists = [patch_list[i:i+split_size]
                    for i in range(0, len(patch_list)-len(patch_list) % split_size, split_size)]
        # store the sublists into the dataframe
        for j, sublist in enumerate(sublists):
            df_out.loc[df_out.shape[0]] = [
                f"A{slide_num}_{j+1}", target, sublist]
    else:
        df_out.loc[df_out.shape[0]] = [
            f"A{slide_num}", target, patch_list]
print(f"validation: positive target: {df_out['Target'].sum()}, negative target: {len(df_out)-df_out['Target'].sum()}")
val_output = {
    "slides": df_out['Slide'].tolist(),
    "grid": df_out['patches'].tolist(),
    "targets": df_out['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(val_output, "{}/{}_val_data.pt".format(output_path, target_gene))

  return array(a, dtype, copy=False, order=order)


validation: positive target: 56, negative target: 324


In [8]:
# test data without upsampling
df_temp = pd.DataFrame({'Slide': [],"Target":[] ,'patches': []})
for index,row in df_test.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}")
    if split_size < len(patch_list):
        sublist = random.sample(patch_list,split_size)
    else:
        sublist = patch_list
    df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}",target,sublist]
df_test = df_temp
print(f"test: positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
test_output = {
        "slides": df_test['Slide'].tolist(),
        "grid": df_test['patches'].tolist(),
        "targets": df_test['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(test_output, "{}/{}_test_data.pt".format(output_path,target_gene))

  return array(a, dtype, copy=False, order=order)


test: positive target: 21, negative target: 95


## version 4 : normailzation before version 3

In [5]:
target_gene_rename = target_gene.split(" ")[0]
output_path =f"/home/weber50432/AML_image_processing/lib/{target_gene_rename}_patch_500_upsampled_V4"
if not os.path.exists(output_path):
    os.makedirs(output_path)
df = pd.read_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene}_norm_slide_patch_num.csv")
df_train, df_val = train_test_split(df, test_size=0.3, random_state=1000)
df_val, df_test = train_test_split(df_val, test_size=0.67, random_state=1000)
# show the number of positive and negative target in each dataframe
print(f"train positive target: {df_train['Target'].sum()}, negative target: {len(df_train)-df_train['Target'].sum()}")
print(f"val positive target: {df_val['Target'].sum()}, negative target: {len(df_val)-df_val['Target'].sum()}")
print(f"test positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
# show the proportion of total data number in each dataframe in percentage
print(f"train: {len(df_train)/len(df)*100:.2f}%, val: {len(df_val)/len(df)*100:.2f}%, test: {len(df_test)/len(df)*100:.2f}%")

train positive target: 62, negative target: 339
val positive target: 13, negative target: 44
test positive target: 26, negative target: 90
train: 69.86%, val: 9.93%, test: 20.21%


In [6]:
# training data upsampling
# df_train.drop_duplicates(subset=['Slide'], keep='first', inplace=True)
df_out = pd.DataFrame({'Slide': [], "Target": [], 'patches': []})
for index, row in df_train.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512_norm/A{slide_num}")
    # split the patches number of the slide and add it to slide_list
    if split_size < len(patch_list):
        # Shuffle the original list randomly
        random.shuffle(patch_list)
        # split the patches list into  sublists
        sublists = [patch_list[i:i+split_size]
                    for i in range(0, len(patch_list)-len(patch_list) % split_size, split_size)]
        # store the sublists into the dataframe
        for j, sublist in enumerate(sublists):
            df_out.loc[df_out.shape[0]] = [
                f"A{slide_num}_{j+1}", target, sublist]
    else:
        df_out.loc[df_out.shape[0]] = [
            f"A{slide_num}", target, patch_list]
print(
    f"training: positive target: {df_out['Target'].sum()}, negative target: {len(df_out)-df_out['Target'].sum()}")
# save the data as a .pt file
train_output = {
    "slides": df_out['Slide'].tolist(),
    "grid": df_out['patches'].tolist(),
    "targets": df_out['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(train_output, "{}/{}_train_data.pt".format(output_path, target_gene))

  return array(a, dtype, copy=False, order=order)


training: positive target: 264, negative target: 1934


In [7]:
# validation data upsampling
df_out = pd.DataFrame({'Slide': [], "Target": [], 'patches': []})
for index, row in df_val.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512_norm/A{slide_num}")
    # split the patches number of the slide and add it to slide_list
    if split_size < len(patch_list):
        # Shuffle the original list randomly
        random.shuffle(patch_list)
        # split the patches list into  sublists
        sublists = [patch_list[i:i+split_size]
                    for i in range(0, len(patch_list)-len(patch_list) % split_size, split_size)]
        # store the sublists into the dataframe
        for j, sublist in enumerate(sublists):
            df_out.loc[df_out.shape[0]] = [
                f"A{slide_num}_{j+1}", target, sublist]
    else:
        df_out.loc[df_out.shape[0]] = [
            f"A{slide_num}", target, patch_list]
print(f"validation: positive target: {df_out['Target'].sum()}, negative target: {len(df_out)-df_out['Target'].sum()}")
val_output = {
    "slides": df_out['Slide'].tolist(),
    "grid": df_out['patches'].tolist(),
    "targets": df_out['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(val_output, "{}/{}_val_data.pt".format(output_path, target_gene))

  return array(a, dtype, copy=False, order=order)


validation: positive target: 72, negative target: 273


In [8]:
# test data without upsampling
df_temp = pd.DataFrame({'Slide': [],"Target":[] ,'patches': []})
for index,row in df_test.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    patch_list = os.listdir(f"{slide_path}ROI_level0_pixel512_norm/A{slide_num}")
    if split_size < len(patch_list):
        sublist = random.sample(patch_list,split_size)
    else:
        sublist = patch_list
    df_temp.loc[df_temp.shape[0]] = [f"A{slide_num}",target,sublist]
df_test = df_temp
print(f"test: positive target: {df_test['Target'].sum()}, negative target: {len(df_test)-df_test['Target'].sum()}")
test_output = {
        "slides": df_test['Slide'].tolist(),
        "grid": df_test['patches'].tolist(),
        "targets": df_test['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(test_output, "{}/{}_test_data.pt".format(output_path,target_gene))

  return array(a, dtype, copy=False, order=order)


test: positive target: 26, negative target: 90


# down sampling base on the WSIs