In [1]:
import pandas as pd
import numpy as np
import random
import os
import torch
from sklearn.model_selection import train_test_split
import ast
import utility_functions as uf

In [13]:
# ---- Experiment configuration -------------------------------------------
# Gene name handed to uf.get_targets_list; the text before the first space
# is used as the prefix of every file this notebook reads or writes.
target_gene = "NPM1"
target_gene_split = target_gene.partition(" ")[0]

# Sub-folders of each slide that are scanned for single-cell images.
cell_type_list = [
    'Neutrophil',
    'Metamyelocyte',
    'Myelocyte',
    'Promyelocyte',
    'Blast',
    'Lymphocyte',
    'Monocyte',
    'Eosinophil',
    'Basophil',
]

# patches_size / 224 is stored as "mult" in the saved .pt files.
patches_size = 64
# Upper bound on the number of images kept per saved slide entry.
split_size = 2000
# Stored unchanged under the "level" key of the saved .pt files.
level = 0

# NOTE(review): absolute, machine-specific locations — consider making these configurable.
slide_path = "/home/exon_storage1/aml_slide/"
info_file_path = "/home/weber50432/AML_image_processing/code_use_csv/changeSlideName.csv"
output_path =f"/home/weber50432/AML_image_processing/lib/{target_gene_split}_cell_test"

In [5]:
# Count, for every slide, how many single-cell images exist across all cell
# types in cell_type_list, and store the per-slide totals as a CSV file.
slide_list = uf.get_slides_list_number(f"{slide_path}single_cell_image/")
target_list = uf.get_targets_list(target_gene, slide_list, info_file_path)

# Build all rows first and create the DataFrame once: growing a frame with
# df.loc[...] = ... copies it on every insert and previously rewrote the same
# row once per cell-type directory.
cell_count_records = []
for index, slide_num in enumerate(slide_list):
    slide_dir = f"{slide_path}single_cell_image/A{slide_num}"
    # Only cell types that actually have a folder for this slide contribute.
    existing_cell_dirs = [
        cell_path
        for cell_path in (f"{slide_dir}/{target_cell}" for target_cell in cell_type_list)
        if os.path.exists(cell_path)
    ]
    if not existing_cell_dirs:
        # Unchanged behaviour: a slide without any cell-type folder gets no row.
        continue
    cell_num = sum(len(os.listdir(cell_path)) for cell_path in existing_cell_dirs)
    cell_count_records.append(
        {'Slide': slide_num, "Target": target_list[index], 'cell_number': cell_num})

# columns= keeps the header stable even when no slide produced a row.
df = pd.DataFrame(cell_count_records, columns=['Slide', "Target", 'cell_number'])
# count the positive and negative slide number
# print(f"positive target: {df['Target'].sum()}, negative target: {len(df)-df['Target'].sum()}")
df.to_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene_split}_select_cell_num.csv",index=False)

In [20]:
# Create the output folder and split the slide table into train / validation /
# test subsets (70 % / ~10 % / ~20 % of the slides).
# exist_ok=True avoids the check-then-create race of "if not exists: makedirs".
os.makedirs(output_path, exist_ok=True)
# NOTE(review): this reads "<gene>_Myelocyte_num.csv", whereas the counting
# cell of this notebook writes "<gene>_select_cell_num.csv" — confirm which
# table the split is supposed to be based on.
df = pd.read_csv(f"/home/weber50432/AML_image_processing/lib/{target_gene_split}_Myelocyte_num.csv")
# # take 10% of the data as data
# df = df.sample(frac=0.1, random_state=111)
# First cut: 70 % train, 30 % held out.
df_train, df_holdout = train_test_split(df, test_size=0.3, random_state=111)
# Second cut: 67 % of the held-out slides become test, the rest validation.
df_val, df_test = train_test_split(df_holdout, test_size=0.67, random_state=111)
# The three subsets must partition the table exactly.
assert len(df_train) + len(df_val) + len(df_test) == len(df)
# show the number of positive and negative target in each dataframe
for split_name, split_df in (("train", df_train), ("val", df_val), ("test", df_test)):
    positive_count = split_df['Target'].sum()
    print(f"{split_name} positive target: {positive_count}, negative target: {len(split_df)-positive_count}")
# show the proportion of total data number in each dataframe in percentage
print(f"train: {len(df_train)/len(df)*100:.2f}%, val: {len(df_val)/len(df)*100:.2f}%, test: {len(df_test)/len(df)*100:.2f}%")

train positive target: 70, negative target: 330
val positive target: 11, negative target: 45
test positive target: 20, negative target: 96
train: 69.93%, val: 9.79%, test: 20.28%


In [21]:
# training data upsampling
# Positive slides (Target == 1) with more than split_size images are cut into
# several entries of exactly split_size images each ("A<slide>_<k>"), which
# multiplies the number of positive entries. Every other slide yields a single
# entry holding at most split_size images.
sampling_seed = 111  # same value as the random_state used for the data split
# Private generator: re-running this cell reproduces the same selection and
# does not disturb the global `random` state.
rng = random.Random(sampling_seed)

train_records = []
for _, row in df_train.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    # Collect "<cell type>/<file name>" for every image of every cell type
    # that has a folder for this slide.
    patch_list = []
    for target_cell in cell_type_list:
        cell_dir = f"{slide_path}single_cell_image/A{slide_num}/{target_cell}"
        if os.path.exists(cell_dir):
            # os.listdir order is arbitrary; sorting makes the seeded
            # shuffle / sample below reproducible.
            patch_list.extend(
                f"{target_cell}/{patches}" for patches in sorted(os.listdir(cell_dir)))
    if target == 1 and split_size < len(patch_list):
        # Shuffle the original list randomly
        rng.shuffle(patch_list)
        # Only complete chunks are kept; the trailing
        # len(patch_list) % split_size images are dropped.
        full_chunks_end = len(patch_list) - len(patch_list) % split_size
        for j, start in enumerate(range(0, full_chunks_end, split_size)):
            train_records.append({
                'Slide': f"A{slide_num}_{j+1}",
                "Target": target,
                'patches': patch_list[start:start+split_size],
            })
    else:
        # Negative slide, or positive slide that fits into one entry.
        if split_size < len(patch_list):
            sublist = rng.sample(patch_list, split_size)
        else:
            sublist = patch_list
        train_records.append(
            {'Slide': f"A{slide_num}", "Target": target, 'patches': sublist})

# Build the frame once instead of enlarging it row by row with .loc, which
# copies the frame on every insert.
df_train_output = pd.DataFrame(train_records, columns=['Slide', "Target", 'patches'])
print(
    f"training: positive target: {df_train_output['Target'].sum()}, negative target: {len(df_train_output)-df_train_output['Target'].sum()}")
# save the data as a .pt file
train_output = {
    "slides": df_train_output['Slide'].tolist(),
    "grid": df_train_output['patches'].tolist(),
    "targets": df_train_output['Target'].tolist(),
    "mult": patches_size/224,
    "level": level,
}
torch.save(train_output, "{}/{}_train_data.pt".format(output_path, target_gene_split))

  return array(a, dtype, copy=False, order=order)


training: positive target: 323, negative target: 330


In [14]:
# validation data without upsampling
# Every validation slide yields one entry with at most split_size images.
sampling_seed = 111  # same value as the random_state used for the data split
# Private generator so that re-running the cell reproduces the same sample.
rng = random.Random(sampling_seed)

val_records = []
for _, row in df_val.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    # NOTE(review): only the "Myelocyte" folder is read here and the entries are
    # bare file names, whereas the training cell walks every cell type in
    # cell_type_list and stores "<cell type>/<file name>" — confirm this
    # difference is intended.
    # Sorted because os.listdir order is arbitrary (keeps the sample reproducible).
    patch_list = sorted(os.listdir(f"{slide_path}single_cell_image/A{slide_num}/Myelocyte"))
    if split_size < len(patch_list):
        sublist = rng.sample(patch_list, split_size)
    else:
        sublist = patch_list
    val_records.append({'Slide': f"A{slide_num}", "Target": target, 'patches': sublist})

# The result goes into its own frame: overwriting df_val made this cell fail
# on a second run, because the "A" prefix would be applied to already
# prefixed slide names.
df_val_output = pd.DataFrame(val_records, columns=['Slide', "Target", 'patches'])
print(f"validation: positive target: {df_val_output['Target'].sum()}, negative target: {len(df_val_output)-df_val_output['Target'].sum()}")
val_output = {
        "slides": df_val_output['Slide'].tolist(),
        "grid": df_val_output['patches'].tolist(),
        "targets": df_val_output['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(val_output, "{}/{}_val_data.pt".format(output_path,target_gene_split))

validation: positive target: 1, negative target: 4


  return array(a, dtype, copy=False, order=order)


In [15]:
# test data without upsampling
# Every test slide yields one entry with at most split_size images.
sampling_seed = 111  # same value as the random_state used for the data split
# Private generator so that re-running the cell reproduces the same sample.
rng = random.Random(sampling_seed)

test_records = []
for _, row in df_test.iterrows():
    slide_num = row['Slide']
    target = row['Target']
    # NOTE(review): this lists "ROI_level0_pixel512/A<slide>", while the training
    # and validation cells read from "single_cell_image/A<slide>/..." — confirm
    # that the test set is meant to come from a different folder.
    # Sorted because os.listdir order is arbitrary (keeps the sample reproducible).
    patch_list = sorted(os.listdir(f"{slide_path}ROI_level0_pixel512/A{slide_num}"))
    if split_size < len(patch_list):
        sublist = rng.sample(patch_list, split_size)
    else:
        sublist = patch_list
    test_records.append({'Slide': f"A{slide_num}", "Target": target, 'patches': sublist})

# The result goes into its own frame: overwriting df_test made this cell fail
# on a second run, because the "A" prefix would be applied to already
# prefixed slide names.
df_test_output = pd.DataFrame(test_records, columns=['Slide', "Target", 'patches'])
print(f"test: positive target: {df_test_output['Target'].sum()}, negative target: {len(df_test_output)-df_test_output['Target'].sum()}")
test_output = {
        "slides": df_test_output['Slide'].tolist(),
        "grid": df_test_output['patches'].tolist(),
        "targets": df_test_output['Target'].tolist(),
        "mult": patches_size/224,
        "level": level,
    }
torch.save(test_output, "{}/{}_test_data.pt".format(output_path,target_gene_split))

test: positive target: 1, negative target: 12


  return array(a, dtype, copy=False, order=order)
