In [1]:
import pandas as pd
import numpy as np
import flowio
import warnings 
from tqdm import tqdm

def std(channel_data):
    # Calculate the mean
    mean = sum(channel_data) / len(channel_data)
    
    # Calculate the squared differences from the mean
    squared_diff = [(x - mean) ** 2 for x in channel_data]
    
    # Calculate the variance
    variance = sum(squared_diff) / len(channel_data)
    
    # Calculate the standard deviation (square root of the variance)
    std_dev = variance ** 0.5
    
    # Standardize the data
    STD = [(x - mean) / std_dev for x in channel_data]
    #STD = channel_data
    #STD = np.median(channel_data)
    return [STD]

def extract_features(patient_number):
    # given a patient number, open all relevant files and extract desired features. Return a single array of extracted features. 
    #NOTE: this is not the only way to set up your data matrix. Feel free to change this up. 

    feature_array = [patient_number] # start with the patient number in the first column so you can link labels. Remove prior to training/testing
    for sample in range(1,9):# 8 samples per patient      
        file_number = (patient_number-1)*8+sample # calculate the file number given the patient and sample numbers
        file_number_4d = '{:04d}'.format(file_number) # 4-digit file number (ex: 0007 instead of 7)

        # NOTE: CHANGE THIS TO THE CORRECT FILE PATH ON YOUR COMPUTER
        file_path = '/Users/gaberustia/Desktop/EGR410/Project_1_AML/FlowCytometry_files_1/' + file_number_4d + '.FCS'

        try:
            # Load the FCS file
            fcs_file_multi = flowio.read_multiple_data_sets(file_path,ignore_offset_error=True)
            # NOTE: there are 2 data sets in each file. The first is the real data, the second is control data (e.g. saline flush)
            # if you don't want to look at the second dataset, uncomment the following line: 
            del fcs_file_multi[-1] # this deletes the control portion of the file

            for ff in fcs_file_multi: 
                # Access metadata of the current data set
                metadata = ff.text
                # print("\nMetadata:", metadata)

                header = ff.header
                # print('\nHeader:', header)

                analysis = ff.analysis
                # print('\nAnalysis:', analysis)

                channel_count = ff.channel_count
                # print('\nChannel count:', channel_count)

                channels = ff.channels
                # print('\nChannels:', channels)

                event_count = ff.event_count
                # print('\nEvent count:', event_count)

                # Access data of the current data set
                data = ff.events
                # print('\nData length: ',len(data))

                for cc in range(0,channel_count):
                    channel_name = 'p'+str(cc+1)+'n'

                    channel_data = data[cc*event_count:(cc+1)*event_count-1]

                    my_feature = std(channel_data)

                    feature_array.append(my_feature) # assuming 'my_feature' is a single value, this will add 56 features (7 channels * 8 vials) for each patient 
                    # NOTE: if my_feature is an array instead of a single value, you will need to change up the structure on how you build your dataset for it to work properly. 

                    # It might be a good idea to look at some graphs...

                    # # Visualize channel distribution with a histogram 
                    # plt.hist(channel_data, bins=50, color='blue', alpha=0.7)
                    # plt.title(f'Histogram of {channel_name}')
                    # plt.xlabel('Fluorescence Intensity')
                    # plt.ylabel('Frequency')
                    # plt.show()

        except Exception as e:
            print('\n\nerror with file '+file_number_4d)
            print(e)

    return feature_array

warnings.simplefilter("ignore", category=UserWarning) # the .fcs file format throws a warning about data offsets. safe to ignore. alternative: 1 warning message every time you open a .fcs file (terminal now has 2500 warning messages)


# load patient numbers for training and testing 
training_patients_df = pd.read_csv('/Users/gaberustia/Desktop/EGR410/Project_1_AML/training_patients.csv')
testing_patients_df = pd.read_csv('/Users/gaberustia/Desktop/EGR410/Project_1_AML/testing_patients.csv')

training_set = [] # create placeholder
patients_train = training_patients_df['Patient Number'].to_list()
for patient in tqdm(patients_train, desc="Creating training set", unit="patients"): # 181 patients total
    training_set.append(extract_features(patient)) # create array of arrays (matrix)
df_train = pd.DataFrame(training_set) # NOTE: this doesn't have any column names. I suggest you keep track of which features are which. 
df_train.to_csv('training_set.csv',index=False)

testing_set = [] # create placeholder
patients_test = testing_patients_df['Patient Number'].to_list()
for patient in tqdm(patients_test, desc="Creating testing set", unit="patients"): # 178 patients total
    testing_set.append(extract_features(patient)) # create array of arrays (matrix)
df_test = pd.DataFrame(testing_set)
df_test.to_csv('testing_set.csv',index=False)


Creating training set: 100%|██████████| 181/181 [01:57<00:00,  1.54patients/s]
Creating testing set: 100%|██████████| 178/178 [02:02<00:00,  1.46patients/s]


In [2]:
import statistics as st

def mean(channel_data):
    med = st.median(channel_data)
    return [med]

def extract_features2(patient_number):
    # given a patient number, open all relevant files and extract desired features. Return a single array of extracted features. 
    #NOTE: this is not the only way to set up your data matrix. Feel free to change this up. 

    feature_array = [patient_number] # start with the patient number in the first column so you can link labels. Remove prior to training/testing
    for sample in range(1,9):# 8 samples per patient      
        file_number = (patient_number-1)*8+sample # calculate the file number given the patient and sample numbers
        file_number_4d = '{:04d}'.format(file_number) # 4-digit file number (ex: 0007 instead of 7)

        # NOTE: CHANGE THIS TO THE CORRECT FILE PATH ON YOUR COMPUTER
        file_path = '/Users/gaberustia/Desktop/EGR410/Project_1_AML/FlowCytometry_files_1/' + file_number_4d + '.FCS'

        try:
            # Load the FCS file
            fcs_file_multi = flowio.read_multiple_data_sets(file_path,ignore_offset_error=True)
            # NOTE: there are 2 data sets in each file. The first is the real data, the second is control data (e.g. saline flush)
            # if you don't want to look at the second dataset, uncomment the following line: 
            del fcs_file_multi[-1] # this deletes the control portion of the file

            for ff in fcs_file_multi: 
                # Access metadata of the current data set
                metadata = ff.text
                # print("\nMetadata:", metadata)

                header = ff.header
                # print('\nHeader:', header)

                analysis = ff.analysis
                # print('\nAnalysis:', analysis)

                channel_count = ff.channel_count
                # print('\nChannel count:', channel_count)

                channels = ff.channels
                # print('\nChannels:', channels)

                event_count = ff.event_count
                # print('\nEvent count:', event_count)

                # Access data of the current data set
                data = ff.events
                # print('\nData length: ',len(data))

                for cc in range(0,channel_count):
                    channel_name = 'p'+str(cc+1)+'n'

                    channel_data = data[cc*event_count:(cc+1)*event_count-1]

                    my_feature = mean(channel_data)

                    feature_array.append(my_feature) # assuming 'my_feature' is a single value, this will add 56 features (7 channels * 8 vials) for each patient 
                    # NOTE: if my_feature is an array instead of a single value, you will need to change up the structure on how you build your dataset for it to work properly. 

                    # It might be a good idea to look at some graphs...

                    # # Visualize channel distribution with a histogram 
                    # plt.hist(channel_data, bins=50, color='blue', alpha=0.7)
                    # plt.title(f'Histogram of {channel_name}')
                    # plt.xlabel('Fluorescence Intensity')
                    # plt.ylabel('Frequency')
                    # plt.show()

        except Exception as e:
            print('\n\nerror with file '+file_number_4d)
            print(e)

    return feature_array

warnings.simplefilter("ignore", category=UserWarning) 

training_med = [] # create placeholder
patients_train = training_patients_df['Patient Number'].to_list()
for patient in tqdm(patients_train, desc="Creating training set", unit="patients"): # 181 patients total
    training_med.append(extract_features2(patient)) # create array of arrays (matrix)
df_train_med = pd.DataFrame(training_med) # NOTE: this doesn't have any column names. I suggest you keep track of which features are which. 


testing_med = [] # create placeholder
patients_test = testing_patients_df['Patient Number'].to_list()
for patient in tqdm(patients_test, desc="Creating testing set", unit="patients"): # 178 patients total
    testing_med.append(extract_features2(patient)) # create array of arrays (matrix)
df_test_med = pd.DataFrame(testing_med)


Creating training set: 100%|██████████| 181/181 [01:44<00:00,  1.74patients/s]
Creating testing set: 100%|██████████| 178/178 [01:41<00:00,  1.75patients/s]


## TRAINING

In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

def gate2(cell_num, fsc, ssc, FSC_thres, SSC_thres):
    new_cell_num = []
    for i in range(cell_num):  # Iterate over indices instead of directly over cell_num
        if fsc[i] > FSC_thres and ssc[i] < SSC_thres:
            new_cell_num.append(i)  # Append index of the cell, not the cell itself
    return new_cell_num

def gate1(cell_num, cd45, ssc, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL):
    new_cell_num2 = []
    for i in range(cell_num):  # iterate over indices
        if cd45[i] > CD45_thresL and cd45[i] < CD45_thresH and ssc[i] < SSC_thresH and ssc[i] > SSC_thresL:
            new_cell_num2.append(i)
    return new_cell_num2



In [4]:
FSC_thres = -0.748
SSC_thres = 1000.5 
SSC_thresH = -0.4
SSC_thresL = -1.2 #-1.2
CD45_thresH = 1.75
CD45_thresL = 0.32

""" BEST 
F1: 2.5
CD45:  0.32:1.75
SSC: -1.2:-0.4 """

#rf = 0.8918918918918919
#svc = 0.9459459459459459
#dc = 0.918918918918919

#CD45: 0.3:1.7
#SS: -1.2:-0.4

' BEST \nF1: 2.5\nCD45:  0.32:1.75\nSSC: -1.2:-0.4 '

In [5]:
df = pd.DataFrame(training_patients_df)

# Find cell positions where the Diagnosis is 'aml'
aml_positions = df.index[df['Diagnosis'] == 'aml'].tolist()

# Find cell positions where the Diagnosis is 'normal'
normal_positions = df.index[df['Diagnosis'] == 'normal'].tolist()

# Store the positions in an array
train_dia = {
    'aml': aml_positions,
    'normal': normal_positions
}

print(aml_positions)

[3, 18, 29, 32, 44, 49, 55, 57, 66, 83, 89, 101, 107, 114, 118, 128, 134, 141, 142, 149, 155, 169, 173]


In [6]:
num_of_cells = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 1]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cells.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cells)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 20823, 29999, 26898, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 28640, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13951, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 26986, 29999, 13960, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 12839, 29999, 29999, 29999, 27499, 29999, 29999, 29999, 29999, 17577, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 49999, 29999, 29999, 29999, 29999, 29999, 28016, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 17251, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999,

## TRAINING GATES

In [7]:
gate_1 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,1]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,2]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,5]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cells[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_1.append(gate10001)
#print(gate_1[1])

gate_2 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,1]).flatten()
    ssc2 = np.array(df_train.iloc[i,2]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_1[i]]
    gate_ssc = [ssc2[loc] for loc in gate_1[i]]

    gate_num_cell = len(gate_1)

    gate20002 = gate2(len(gate_1[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_2.append(gate20002)

blast_ratio1 = []
for i in range(len(df_train)):
    ratio = len(gate_2[i])/(num_of_cells[i])
    blast_ratio1.append(ratio)

print(blast_ratio1)


[0.11667055568518951, 0.04603486782892763, 0.062035401180039335, 0.07976932564418814, 0.06650221674055802, 0.08663622120737358, 0.086802893429781, 0.07286909563652122, 0.07066902230074336, 0.06866895563185439, 0.08416947231574386, 0.060668688956298546, 0.07763592119737324, 0.027949863132113527, 0.10397013233774459, 0.1259573202468585, 0.06460215340511351, 0.08093603120104004, 0.07460248674955831, 0.08883629454315144, 0.07766925564185473, 0.05280176005866862, 0.08123604120137339, 0.13707123570785693, 0.08540284676155872, 0.07633587786259542, 0.08126937564585486, 0.07133571119037302, 0.09063635454515151, 0.09979050279329608, 0.07163572119070635, 0.06813560452015067, 0.07790259675322511, 0.07643588119603986, 0.0731024367478916, 0.07710257008566952, 0.05616853895129838, 0.14100470015667188, 0.047101570052335076, 0.07536917897263243, 0.08000266675555852, 0.10670355678522618, 0.10083669455648521, 0.05903530117670589, 0.0806393806895563, 0.0532351078369279, 0.08960298676622554, 0.090269675655

In [8]:
num_of_cells2 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 9]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cells2.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cells2)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 23930, 29999, 27458, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 23132, 29999, 29999, 29999, 29999, 29033, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13418, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14462, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 27737, 29999, 29999, 29999, 29999, 17631, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 49999, 29999, 29999, 29999, 29999, 29999, 27841, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 16787, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999,

In [9]:
gate_01 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,8]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,9]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,12]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cells2[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_01.append(gate10001)

gate_02 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,8]).flatten()
    ssc2 = np.array(df_train.iloc[i,9]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_01[i]]
    gate_ssc = [ssc2[loc] for loc in gate_01[i]]

    gate_num_cell = len(gate_01)

    gate20002 = gate2(len(gate_01[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_02.append(gate20002)

blast_ratio2 = []
for i in range(len(df_train)):
    ratio = len(gate_02[i])/(num_of_cells2[i])
    blast_ratio2.append(ratio)

print(blast_ratio2)

[0.05026834227807594, 0.05090169672322411, 0.06943564785492849, 0.06476882562752091, 0.05063502116737224, 0.06993566452215073, 0.04826827560918697, 0.060168672289076304, 0.056135204506816895, 0.048168272275742524, 0.05620187339577986, 0.047234907830261005, 0.06610220340678023, 0.04437944003343084, 0.019067302243408114, 0.05025857673537767, 0.06876895896529885, 0.0692356411880396, 0.04773492449748325, 0.04950165005500183, 0.06840228007600253, 0.06643554785159506, 0.07763592119737324, 0.06996899896663222, 0.058879474321286526, 0.06040201340044668, 0.0567018900630021, 0.06283542784759492, 0.04846828227607587, 0.07625805118313643, 0.04466815560518684, 0.05590186339544651, 0.06103536784559485, 0.05403513450448348, 0.07216907230241008, 0.06573552451748392, 0.05390179672655755, 0.03816793893129771, 0.04333477782592753, 0.0655688522950765, 0.05310177005900197, 0.07243574785826194, 0.05653521784059469, 0.051935064502150075, 0.02161275898047399, 0.054001800060002, 0.059701990066335545, 0.0458681

In [10]:
num_of_cells3 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 15]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cells3.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cells3)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24187, 29999, 28759, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 23958, 29999, 29999, 29999, 29999, 29405, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13261, 29999, 29999, 29999, 29999, 29999, 29482, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14133, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 27958, 29999, 29999, 29999, 29999, 17386, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 49999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 17514, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999,

In [11]:
gate_001 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,15]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,16]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,19]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cells3[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_001.append(gate10001)

gate_002 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,15]).flatten()
    ssc2 = np.array(df_train.iloc[i,16]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_001[i]]

    gate_num_cell = len(gate_001)

    gate20002 = gate2(len(gate_001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_002.append(gate20002)

blast_ratio3 = []
for i in range(len(df_train)):
    ratio = len(gate_002[i])/(num_of_cells3[i])
    blast_ratio3.append(ratio)

print(blast_ratio3)

[0.07660255341844728, 0.05093503116770559, 0.06010200340011334, 0.07240241341378045, 0.04930164338811294, 0.06996899896663222, 0.07403580119337311, 0.07260242008066936, 0.06243541451381713, 0.056601886729557654, 0.07076902563418781, 0.04676822560752025, 0.06460215340511351, 0.0455203208335056, 0.07970265675522517, 0.05473069300045203, 0.06040201340044668, 0.07076902563418781, 0.05730191006366879, 0.0690356345211507, 0.06413547118237274, 0.05990199673322444, 0.06436881229374312, 0.07466915563852128, 0.04286668336255113, 0.06596886562885429, 0.06690223007433581, 0.06936897896596553, 0.06956898563285444, 0.0651589865669104, 0.06370212340411346, 0.05716857228574286, 0.05356845228174272, 0.040868028934297806, 0.058735291176372544, 0.06970232341078036, 0.04953498449948331, 0.0804026800893363, 0.038967965598853294, 0.07220240674689156, 0.059435314510483686, 0.0966365545518184, 0.06676889229640988, 0.05096836561218707, 0.040494683658849256, 0.0395346511550385, 0.07343578119270643, 0.0831027700

In [12]:
num_of_cellsx = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 22]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cellsx.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cellsx)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 18239, 29999, 23010, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 21101, 29999, 29999, 29999, 29999, 27909, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29629, 29999, 29999, 29999, 29999, 29999, 13413, 29999, 29999, 29999, 29999, 29999, 25143, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 16209, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 25379, 29999, 29999, 29999, 29999, 17683, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 49999, 29999, 29999, 29999, 29999, 29999, 24126, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 17036, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999,

In [13]:
gate_x01 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,22]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,23]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,26]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cellsx[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_x01.append(gate10001)

gate_x02 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,22]).flatten()
    ssc2 = np.array(df_train.iloc[i,23]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_x01[i]]
    gate_ssc = [ssc2[loc] for loc in gate_x01[i]]

    gate_num_cell = len(gate_x01)

    gate20002 = gate2(len(gate_x01[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_x02.append(gate20002)

blast_ratiox3 = []
for i in range(len(df_train)):
    ratio = len(gate_x02[i])/(num_of_cellsx[i])
    blast_ratiox3.append(ratio)

print(blast_ratiox3)

[0.04190139671322377, 0.05326844228140938, 0.04140138004600153, 0.04856828560952032, 0.03203440114670489, 0.04393479782659422, 0.03263442114737158, 0.05170172339077969, 0.048501616720557354, 0.03843461448714957, 0.041501383379445984, 0.03920130671022368, 0.043968132271075704, 0.05060584461867427, 0.01766725557518584, 0.055193394176445025, 0.04890163005433514, 0.054435147838261276, 0.10690356345211507, 0.03190106336877896, 0.04183472782426081, 0.061902063402113405, 0.0426347544918164, 0.03656788559618654, 0.034785081275768924, 0.028300943364778827, 0.03950131671055702, 0.051001700056668556, 0.028667622254075136, 0.045576695689562505, 0.027334244474815826, 0.03640121337377913, 0.05963532117737258, 0.045068168938964634, 0.03750125004166806, 0.03440114670489016, 0.04250141671389046, 0.03796793226440881, 0.0376995511154612, 0.05616853895129838, 0.035767858928630956, 0.0306676889229641, 0.04036801226707557, 0.039167972265742194, 0.08804890777603817, 0.04230141004700157, 0.0414680489349645, 0

In [14]:
num_of_cells4 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 29]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cells4.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cells4)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24873, 29999, 27704, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 23079, 29999, 29999, 29999, 29999, 27418, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13492, 29999, 29999, 29999, 29999, 29999, 29850, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 15273, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 26954, 29999, 29999, 29999, 29999, 17448, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 49999, 29999, 29999, 29999, 29999, 29999, 29939, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 17402, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999,

In [15]:
gate_101 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,29]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,30]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,33]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cells4[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_101.append(gate10001)

gate_202 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,29]).flatten()
    ssc2 = np.array(df_train.iloc[i,30]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_101[i]]
    gate_ssc = [ssc2[loc] for loc in gate_101[i]]

    gate_num_cell = len(gate_101)

    gate20002 = gate2(len(gate_101[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_202.append(gate20002)

blast_ratio4 = []
for i in range(len(df_train)):
    ratio = len(gate_202[i])/(num_of_cells4[i])
    blast_ratio4.append(ratio)

print(blast_ratio4)
# 5 = [33]
# 6 = [40]
# 8 = [54]

[0.07186906230207674, 0.033101103370112336, 0.057835261175372514, 0.06603553451781725, 0.08300276675889197, 0.11110370345678189, 0.0698356611887063, 0.0700023334111137, 0.04903496783226108, 0.04796826560885363, 0.048501616720557354, 0.05546851561718724, 0.05756858561952065, 0.058416757126201105, 0.08926964232141071, 0.04389257868899798, 0.04286809560318677, 0.07536917897263243, 0.05820194006466882, 0.06193539784659489, 0.05966865562185406, 0.05453515117170572, 0.09793659788659621, 0.05123504116803893, 0.027947484726374625, 0.06250208340278009, 0.07363578785959532, 0.08463615453848461, 0.08263608786959566, 0.03380990590123277, 0.05543518117270576, 0.053635121170705694, 0.03970132337744591, 0.05220174005800193, 0.05180172672422414, 0.09330311010367012, 0.05233507783592786, 0.09366978899296644, 0.04423480782692756, 0.05480182672755759, 0.0763692123070769, 0.11813727124237475, 0.06593553118437281, 0.041934731157705255, 0.03646605395790098, 0.04953498449948331, 0.0698023267442248, 0.0727690

In [16]:
num_of_cells5 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 36]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cells5.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cells5)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 21262, 29999, 26099, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 21429, 29999, 29999, 29999, 29999, 28117, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13915, 29999, 29999, 29999, 29999, 29999, 25316, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 26774, 29999, 16530, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 26609, 29999, 29999, 29999, 29999, 16931, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 49999, 29999, 29999, 29999, 29999, 29999, 27637, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 18791, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999,

In [17]:
gate_1001 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,36]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,37]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,40]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cells5[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_1001.append(gate10001)

gate_2002 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,36]).flatten()
    ssc2 = np.array(df_train.iloc[i,37]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_1001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_1001[i]]

    gate_num_cell = len(gate_1001)

    gate20002 = gate2(len(gate_1001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_2002.append(gate20002)

blast_ratio5 = []
for i in range(len(df_train)):
    ratio = len(gate_2002[i])/(num_of_cells5[i])
    blast_ratio5.append(ratio)

print(blast_ratio5)
# 5 = [33]
# 6 = [40]
# 8 = [54]

[0.055201840061335376, 0.039001300043334776, 0.040234674489149636, 0.0733024434147805, 0.06070202340078003, 0.05726857561918731, 0.07740258008600287, 0.0671022367412247, 0.0486016200540018, 0.06836894563152104, 0.06370212340411346, 0.033434447814927166, 0.04870162338744625, 0.0792493650644342, 0.08350278342611421, 0.0842561017663512, 0.047401580052668424, 0.045234841161372046, 0.047068235607853594, 0.0692356411880396, 0.06043534784492816, 0.05343511450381679, 0.03263442114737158, 0.032801093369778996, 0.05730552055625554, 0.04716823894129804, 0.0626687556251875, 0.05420180672689089, 0.09166972232407747, 0.04833374826617349, 0.061435381179372646, 0.0503350111670389, 0.023367445581519385, 0.07446914897163238, 0.04333477782592753, 0.06456881896063202, 0.04703490116337211, 0.09140304676822561, 0.03983466115537185, 0.05383512783759459, 0.054901830061002035, 0.09010300343344778, 0.05310177005900197, 0.042568085602853425, 0.02500898311174991, 0.048801626720890695, 0.0634687822927431, 0.044034

In [53]:
num_of_cellsx6 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 43]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cellsx6.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cellsx6)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19372, 29999, 27561, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 22424, 29999, 29999, 29999, 29999, 29105, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13583, 29999, 29999, 29999, 29999, 29999, 24992, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 27486, 29999, 15157, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 20790, 29999, 29999, 29999, 29999, 15461, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 49999, 29999, 29999, 29999, 29999, 29999, 27488, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 17530, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999,

In [19]:
gate_1xx1 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,43]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,44]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,47]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cellsx6[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_1xx1.append(gate10001)

gate_2xx2 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,43]).flatten()
    ssc2 = np.array(df_train.iloc[i,44]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_1xx1[i]]
    gate_ssc = [ssc2[loc] for loc in gate_1xx1[i]]

    gate_num_cell = len(gate_1xx1)

    gate20002 = gate2(len(gate_1xx1[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_2xx2.append(gate20002)

blast_ratiox5 = []
for i in range(len(df_train)):
    ratio = len(gate_2xx2[i])/(num_of_cellsx6[i])
    blast_ratiox5.append(ratio)

print(blast_ratiox5)


[0.058768625620854026, 0.05213507116903897, 0.06253541784726158, 0.059435314510483686, 0.05530184339477982, 0.046301543384779494, 0.05560185339511317, 0.06416880562685423, 0.0599686656221874, 0.05093503116770559, 0.07643588119603986, 0.04886829560985366, 0.07043568118937298, 0.04651042742102003, 0.07323577452581753, 0.07169551177388339, 0.06613553785126171, 0.06753558451948398, 0.07660255341844728, 0.07276909230307677, 0.06283542784759492, 0.054068468948964964, 0.07410247008233607, 0.053468448948298275, 0.05141812343917231, 0.05266842228074269, 0.06430214340478016, 0.06696889896329877, 0.0682689422980766, 0.030578938326747983, 0.05680189339644655, 0.06630221007366913, 0.07446914897163238, 0.05790193006433548, 0.06933564452148405, 0.04703490116337211, 0.04963498783292777, 0.06760225340844694, 0.0412680422680756, 0.06960232007733591, 0.051468382279409315, 0.06603553451781725, 0.05560185339511317, 0.040834694489816324, 0.09784289185010675, 0.0524684156138538, 0.08086936231207707, 0.072169

In [20]:
num_of_cells6 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_train)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_train.iloc[i, 50]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_of_cells6.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_of_cells6)

[29999, 29999, 29999, 29999, 29999, 9999, 29999, 9999, 9999, 9999, 29999, 29999, 9999, 29060, 9999, 29999, 9999, 29999, 29999, 9999, 29999, 29999, 9999, 29999, 8165, 29999, 9999, 29999, 29999, 10228, 29999, 29999, 9999, 9999, 10751, 9999, 24063, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 9999, 9999, 9999, 29999, 29849, 29999, 22067, 29999, 29999, 9999, 10623, 9999, 29999, 9999, 11054, 29999, 29999, 9999, 9999, 29999, 9999, 9999, 29999, 29999, 29999, 21759, 9999, 9999, 9999, 9999, 29999, 25471, 10283, 9999, 29999, 20607, 9999, 9999, 18684, 29999, 29999, 9999, 10506, 10963, 29999, 9999, 9999, 29999, 7490, 29999, 9999, 9999, 11775, 9999, 29999, 9999, 9999, 29999, 9999, 29999, 29999, 29999, 9999, 29999, 13439, 29999, 9999, 18017, 9999, 29999, 10280, 29999, 14283, 29999, 12187, 29999, 10099, 29999, 9999, 29999, 29999, 10320, 9999, 10492, 29999, 9999, 29999, 29999, 29999, 10048, 9999, 29999, 29999, 29999, 10094, 29999, 9999, 9999, 10264, 22468, 29999, 29999, 9999, 29999, 29999, 9999, 29

In [54]:
gate_111 = []
for i in range(len(df_train)):
    fsc1 = np.array(df_train.iloc[i,50]).flatten()  # Remove [i] index
    ssc1 = np.array(df_train.iloc[i,51]).flatten()  # Remove [i] index
    cd451 = np.array(df_train.iloc[i,54]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_of_cells6[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_111.append(gate10001)

gate_222 = []
for i in range(len(df_train)):
    fsc2 = np.array(df_train.iloc[i,50]).flatten()
    ssc2 = np.array(df_train.iloc[i,51]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_111[i]]
    gate_ssc = [ssc2[loc] for loc in gate_111[i]]

    gate_num_cell = len(gate_111)

    gate20002 = gate2(len(gate_111[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_222.append(gate20002)

blast_ratio6 = []
for i in range(len(df_train)):
    ratio = len(gate_222[i])/(num_of_cells6[i])
    blast_ratio6.append(ratio)

print(blast_ratio6)
# 5 = [33]
# 6 = [40]
# 8 = [54]

[0.05953531784392813, 0.03770125670855695, 0.05636854561818727, 0.06763558785292843, 0.04440148004933498, 0.0241024102410241, 0.032134404480149335, 0.027002700270027002, 0.030203020302030203, 0.017201720172017203, 0.03796793226440881, 0.03803460115337178, 0.026302630263026303, 0.016207845836200965, 0.08400840084008401, 0.06606886896229874, 0.036103610361036105, 0.05390179672655755, 0.04386812893763126, 0.022902290229022904, 0.05656855228507617, 0.04000133337777926, 0.010501050105010502, 0.06440214673822461, 0.023147581139007962, 0.054068468948964964, 0.0159015901590159, 0.03903463448781626, 0.052701756725224175, 0.0825185764567853, 0.04876829227640921, 0.04266808893629788, 0.012801280128012802, 0.0241024102410241, 0.006883080643661055, 0.0166016601660166, 0.03362008062170137, 0.04450148338277943, 0.04160138671289043, 0.025402540254025403, 0.042734757825260844, 0.059235307843594785, 0.05353511783726124, 0.03640121337377913, 0.028902890289028902, 0.018801880188018802, 0.01280128012801280

In [22]:
FSC_channel = [1, 8, 15, 22, 29, 36, 43, 50]
SSC_channel = [2, 9, 16, 23, 30, 37, 44, 51]
FITC_channel = [3, 10, 17, 24, 31, 38, 45, 52]
PE_channel = [4, 11, 18, 25, 32, 39, 46, 53]
CD45_channel = [5, 12, 19, 26, 33, 40, 47, 54]
PC5_channel = [6, 13, 20, 27, 34, 41, 48, 55]
PC7_channel = [7, 14, 21, 28, 35, 42, 49, 56]

# OTHER TRAINING FEATURES

In [23]:
#print(training_patients_df)

training_results = training_patients_df.copy()

training_results['Diagnosis'] = training_results['Diagnosis'].map({'aml': 1, 'normal': 0})

#print(training_results)

In [24]:
def calculate_ratio(df_train_med, CD45, FITC):
    CD45_med2 = []
    FITC_med2 = []

    [CD45_med2.extend(sublist) for sublist in np.array(df_train_med.iloc[:, FSC_channel[CD45]])]
    [FITC_med2.extend(sublist) for sublist in np.array(df_train_med.iloc[:, SSC_channel[FITC]])]

    CD45_med2 = np.array(CD45_med2)
    FITC_med2 = np.array(FITC_med2)

    ratio2 = CD45_med2  #/ FITC_med2
    ratio2 = ratio2.tolist()
    
    return ratio2

ratio1 = calculate_ratio(df_train_med, 0, 0)
ratio2 = calculate_ratio(df_train_med, 1, 1)
ratio3 = calculate_ratio(df_train_med, 2, 2)
ratio4 = calculate_ratio(df_train_med, 3, 3)
ratio5 = calculate_ratio(df_train_med, 4, 4)
ratio6 = calculate_ratio(df_train_med, 5, 5)
ratio7 = calculate_ratio(df_train_med, 6, 6)
ratio8 = calculate_ratio(df_train_med, 7, 7)

In [60]:
def calculate_ratio2(df_train_med, CD45, channel):
    CD45_med2 = []

    [CD45_med2.extend(sublist) for sublist in np.array(df_train_med.iloc[:, channel[CD45]])]

    CD45_med2 = np.array(CD45_med2)

    ratio2 = CD45_med2  
    ratio2 = ratio2.tolist()
    
    return ratio2

num = 5

fsc_6 =calculate_ratio2(df_train_med, num, FSC_channel)
ssc_6 = calculate_ratio2(df_train_med, num, SSC_channel)
pe_6 = calculate_ratio2(df_train_med, num, PE_channel)
pc5_6 = calculate_ratio2(df_train_med, num, PC5_channel)
pc7_6 = calculate_ratio2(df_train_med, num, PC7_channel)
fitc_6 = calculate_ratio2(df_train_med, num, FITC_channel)

# MACHINE LEARNING

In [550]:
from sklearn import metrics
from sklearn.decomposition import PCA 

blast_data = { 'blast_ratio1': blast_ratio1,
            'blast_ratio2': blast_ratio2,
            'blast_ratio3': blast_ratiox3,
            'blast_ratio4': blast_ratio3,
            'blast_ratio5':blast_ratio4, 
            'blast_ratio6': blast_ratio5,
            'blast_ratio7': blast_ratiox5,
            'blast_ratio8':blast_ratio6, 

            'ratio1':ratio1, 
            'ratio2':ratio2,
            'ratio3':ratio3,
            'ratio4':ratio4,
            'ratio5':ratio5,
            'ratio6':ratio6,
            'ratio7':ratio7,
            'ratio8':ratio8, 

            'fsc_6': fsc_6,
            'ssc_6': ssc_6, 
            'pe_6': pe_6, 
            'pc5_6': pc5_6,
            'pc7_6': pc7_6,
            'fitc_6':fitc_6

            
            } # please ignore my stupid naming conventions

# use 1 3 7 
# 22 features 

# Convert dictionary to DataFrame
X = pd.DataFrame(blast_data)
y = training_results['Diagnosis']

pca_num = 10

pca = PCA(n_components=pca_num)
X2D = pca.fit_transform(X)

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X2D, y, test_size=0.25, random_state=42,stratify=y) #

# 1 2 3 4 7

""" # Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) """
#  pca: 13


' # Create Decision Tree classifer object\nclf = DecisionTreeClassifier()\n\n# Train Decision Tree Classifer\nclf = clf.fit(X_train,y_train)\n\n#Predict the response for test dataset\ny_pred = clf.predict(X_test)\n\n# Model Accuracy, how often is the classifier correct?\nprint("Accuracy:",metrics.accuracy_score(y_test, y_pred)) '

In [551]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

voting_clf = VotingClassifier(
        estimators=[
            #('lr', LogisticRegression(random_state=42)),
            #('rf', RandomForestClassifier(random_state=42)),
            #('svc', SVC(random_state=42)), 
            #('dc', DecisionTreeClassifier(random_state=42, max_depth=2,)),
            ('bd_rf', BaggingClassifier(RandomForestClassifier(),random_state=42)),
            ('bd_dc', BaggingClassifier(DecisionTreeClassifier(),random_state=42)),
            ('bd_et',BaggingClassifier(ExtraTreeClassifier(), random_state=42)),
] )

voting_clf.fit(X_train, y_train)

In [552]:
for name, clf in voting_clf.named_estimators_.items(): 
    print(name, "=", clf.score(X_test, y_test))

bd_rf = 0.9565217391304348
bd_dc = 0.9565217391304348
bd_et = 0.9347826086956522


In [553]:
voting_clf.score(X_test, y_test)

0.9565217391304348

In [554]:
def f1_score(y_true, y_pred):
    """
    Compute the F1 score.
    
    Parameters:
        y_true (array-like): Ground truth (correct) target values.
        y_pred (array-like): Estimated targets as returned by a classifier.
        
    Returns:
        float: F1 score.
    """
    # Calculate True Positives, False Positives, and False Negatives
    tp = sum((true == 1 and pred == 1) for true, pred in zip(y_true, y_pred))
    fp = sum((true == 0 and pred == 1) for true, pred in zip(y_true, y_pred))
    fn = sum((true == 1 and pred == 0) for true, pred in zip(y_true, y_pred))
    
    # Calculate Precision and Recall
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    # Calculate F1 score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return f1


In [555]:
voting_clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
y_vc = voting_clf.predict(X_test)


f1 = f1_score(y, y_vc)
print(f1)
print(voting_clf.score(X_test, y_test))

0.1818181818181818
0.9565217391304348


# TEST

In [556]:
#[3, 18, 29, 32, 44, 49, 55, 57, 66, 83, 89, 101, 107, 114, 118, 128, 134, 141, 142, 149, 155, 169, 173]


## GATING FOR TEST

In [557]:
num_cells = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 1]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19999, 29999, 29999, 11007, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 11737, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24157, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 9589, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24815, 25403, 29999, 9999, 29999, 29999, 29999, 20823, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13797, 29999, 29999, 29999, 27155, 29999, 29999, 11101, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 2999

In [558]:
gate_t1 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,1]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,2]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,5]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t1.append(gate10001)
#print(gate_1[1])

gate_t2 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,1]).flatten()
    ssc2 = np.array(df_test.iloc[i,2]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t1[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t1[i]]

    gate_num_cell = len(gate_t1)

    gate20002 = gate2(len(gate_t1[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t2.append(gate20002)

blast_t1 = []
for i in range(len(df_test)):
    ratio = len(gate_t2[i])/(num_of_cells[i])
    blast_t1.append(ratio)

print(blast_t1)

[0.07416913897129904, 0.06390213007100237, 0.07630254341811393, 0.06783559451981733, 0.09410313677122571, 0.061102036734557816, 0.0667355578519284, 0.1058701956731891, 0.07200240008000267, 0.07100236674555818, 0.0997033234441148, 0.08566952231741058, 0.09113637121237375, 0.1330259808865197, 0.09266975565852195, 0.09640121942151833, 0.07103570119003967, 0.06580219340644688, 0.06336877895929864, 0.05816860562018734, 0.08220274009133638, 0.0852695089836328, 0.08363612120404014, 0.06716890563018767, 0.06696889896329877, 0.06193539784659489, 0.05730191006366879, 0.052868428947631586, 0.05513517117237241, 0.07241620111731843, 0.07150238341278042, 0.07893596453215107, 0.06386879562652088, 0.05396846561552052, 0.05806860228674289, 0.07413580452681756, 0.06930231007700256, 0.06433547784926164, 0.08770292343078102, 0.0634354478482616, 0.05590186339544651, 0.0883362778759292, 0.07696923230774359, 0.0889029634321144, 0.16070532578309799, 0.06843561452048402, 0.10657021900730024, 0.0412013733791126

In [559]:
num_cells2 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 8]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells2.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells2)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19999, 29999, 29999, 11181, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 12216, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 23259, 29999, 29999, 29999, 29999, 29999, 25577, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 15584, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24766, 26287, 29999, 9999, 29999, 29999, 29999, 22568, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 25827, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14000, 29999, 29999, 29999, 28269, 29999, 29999, 11164, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29

In [560]:
gate_t01 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,8]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,9]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,12]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells2[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t01.append(gate10001)
#print(gate_1[1])

gate_t02 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,8]).flatten()
    ssc2 = np.array(df_test.iloc[i,9]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t01[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t01[i]]

    gate_num_cell = len(gate_t01)

    gate20002 = gate2(len(gate_t01[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t02.append(gate20002)

blast_t2 = []
for i in range(len(df_test)):
    ratio = len(gate_t02[i])/(num_cells2[i])
    blast_t2.append(ratio)

print(blast_t2)

[0.06163538784626154, 0.052235074502483415, 0.08823627454248475, 0.05486849561652055, 0.0746358211940398, 0.04690156338544618, 0.057835261175372514, 0.05030167672255742, 0.05926864228807627, 0.0698023267442248, 0.04886829560985366, 0.061568718957298575, 0.08510283676122538, 0.04386812893763126, 0.040101336711223706, 0.06786892896429882, 0.0557351911730391, 0.06380212673755792, 0.05930197673255775, 0.04003466782226074, 0.08470282342744759, 0.06233541118037268, 0.06296876562552085, 0.054901830061002035, 0.032801093369778996, 0.06115305765288265, 0.05350178339277976, 0.0493683122770759, 0.10544674000536625, 0.05890196339877996, 0.06296876562552085, 0.049768325610853696, 0.0783026100870029, 0.06433547784926164, 0.059235307843594785, 0.0648021600720024, 0.053335111170372346, 0.041301376712557084, 0.05583519450648355, 0.055535184506150206, 0.05090169672322411, 0.07496916563885463, 0.051735057835261174, 0.055035167838927965, 0.07866928897629921, 0.06683556118537284, 0.06393546451548385, 0.070

In [561]:
num_cells3 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 15]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells3.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells3)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19999, 29999, 29999, 11258, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 11684, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24545, 29999, 29999, 29999, 29999, 29999, 24587, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 15617, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24448, 27175, 29999, 9999, 29999, 29999, 29999, 20001, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24380, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14021, 29999, 29999, 29999, 28654, 29999, 29999, 11138, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29

In [562]:
gate_t001 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,15]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,16]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,19]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells3[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t001.append(gate10001)
#print(gate_1[1])

gate_t002 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,15]).flatten()
    ssc2 = np.array(df_test.iloc[i,16]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t001[i]]

    gate_num_cell = len(gate_t001)

    gate20002 = gate2(len(gate_t001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t002.append(gate20002)

blast_t3 = []
for i in range(len(df_test)):
    ratio = len(gate_t002[i])/(num_cells3[i])
    blast_t3.append(ratio)

print(blast_t3)

[0.07186906230207674, 0.04813493783126104, 0.06446881562718758, 0.05140171339044635, 0.08070269008966965, 0.049134971165705525, 0.06196873229107637, 0.08796959898663288, 0.061268708956965234, 0.05990199673322444, 0.06563552118403947, 0.06566885562852096, 0.04896829894329811, 0.06493549784992833, 0.07263575452515084, 0.05063502116737224, 0.0596019867328911, 0.05340178005933531, 0.061402046734891164, 0.04966832227740925, 0.0634687822927431, 0.07320244008133604, 0.053035101170039, 0.053601786726224204, 0.06416880562685423, 0.08260413020651032, 0.04613487116237208, 0.041168038934631154, 0.06955054183691597, 0.0599686656221874, 0.05840194673155772, 0.053801793393113105, 0.07293576452548418, 0.05063502116737224, 0.054568485616187205, 0.05966865562185406, 0.05990199673322444, 0.05046834894496483, 0.07106903563452115, 0.05793526450881696, 0.04780159338644621, 0.07013567118903963, 0.05653521784059469, 0.05140171339044635, 0.06076869228974299, 0.0677022567418914, 0.06026867562252075, 0.056605660

In [563]:
num_cells4 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 23]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells4.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells4)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19999, 29999, 29999, 11245, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 11874, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 22951, 29999, 29999, 29999, 29999, 29999, 24599, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14678, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 22795, 27103, 29999, 9999, 29999, 29999, 29999, 15512, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 26142, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 13849, 29999, 29999, 29999, 23535, 29999, 29999, 11250, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29

In [564]:
gate_t0001 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,22]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,23]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,26]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells4[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t0001.append(gate10001)
#print(gate_1[1])

gate_t0002 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,22]).flatten()
    ssc2 = np.array(df_test.iloc[i,23]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t0001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t0001[i]]

    gate_num_cell = len(gate_t0001)

    gate20002 = gate2(len(gate_t0001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t0002.append(gate20002)

blast_t4 = []
for i in range(len(df_test)):
    ratio = len(gate_t0002[i])/(num_cells4[i])
    blast_t4.append(ratio)

print(blast_t4)

[0.042134737824594155, 0.046334877829260976, 0.04813493783126104, 0.04970165672189073, 0.0711690389679656, 0.04736824560818694, 0.047434914497149906, 0.024867495583186107, 0.03130104336811227, 0.049268308943631454, 0.05313510450348345, 0.03843461448714957, 0.08946964898829961, 0.020067335577852594, 0.06886896229874329, 0.0331677722590753, 0.028367612253741792, 0.05403513450448348, 0.044001466715557186, 0.02833427780926031, 0.043668122270742356, 0.04390146338211274, 0.0524684156138538, 0.02850095003166772, 0.03580119337311244, 0.04595229761488075, 0.0370345678189273, 0.039601320044001465, 0.07879057358826146, 0.038067935597853264, 0.04993499783326111, 0.039334644488149606, 0.07210240341344712, 0.05156838561285376, 0.057235241174705825, 0.04443481449381646, 0.03020100670022334, 0.03080102670089003, 0.0430014333811127, 0.04920164005466849, 0.03533451115037168, 0.04653488449614987, 0.02686756225207507, 0.0717357245241508, 0.052701756725224175, 0.06056868562285409, 0.054401813393779794, 0.0

In [565]:
num_cells5 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 29]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells5.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells5)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 18264, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19999, 29999, 29999, 10988, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 12046, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 23478, 29999, 29999, 29999, 29999, 29999, 24203, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 15941, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24990, 28773, 29999, 9999, 29999, 29999, 29999, 20793, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 21502, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14055, 29999, 29999, 29999, 27318, 29999, 29999, 11274, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29

In [566]:
gate_t00001 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,29]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,30]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,33]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells5[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t00001.append(gate10001)
#print(gate_1[1])

gate_t00002 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,29]).flatten()
    ssc2 = np.array(df_test.iloc[i,30]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t00001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t00001[i]]

    gate_num_cell = len(gate_t00001)

    gate20002 = gate2(len(gate_t00001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t00002.append(gate20002)

blast_t5 = []
for i in range(len(df_test)):
    ratio = len(gate_t00002[i])/(num_cells5[i])
    blast_t5.append(ratio)

print(blast_t5)

[0.06026867562252075, 0.045834861162038734, 0.05276842561418714, 0.05850195006500217, 0.06610220340678023, 0.0873695789859662, 0.0441681389379646, 0.09343644788159605, 0.07336911230374346, 0.08300276675889197, 0.10100336677889263, 0.05523517450581686, 0.10616513359614542, 0.08823627454248475, 0.08646954898496617, 0.055035167838927965, 0.08113603786792893, 0.051935064502150075, 0.06360212007066902, 0.03863462115403847, 0.07050235007833594, 0.0565018833961132, 0.06443548118270609, 0.07393579785992867, 0.09146971565718857, 0.041802090104505224, 0.03890129670989033, 0.04283476115870529, 0.04659628685839097, 0.07416913897129904, 0.052068402280076004, 0.04823494116470549, 0.0698023267442248, 0.049268308943631454, 0.05373512450415014, 0.05620187339577986, 0.07320244008133604, 0.07546918230607687, 0.08156938564618821, 0.06960232007733591, 0.06620220674022467, 0.06180206006866896, 0.06656888562952099, 0.04346811560385346, 0.0995033167772259, 0.060802026734224475, 0.05726857561918731, 0.05580558

In [567]:
num_cells6 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 36]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells6.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells6)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 18384, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19999, 29999, 29999, 10888, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 11795, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24218, 29999, 29999, 29999, 29999, 29999, 26833, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14247, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 25213, 27834, 29999, 9999, 29999, 29999, 29999, 21491, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24410, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14208, 29999, 29999, 29999, 29898, 29999, 29999, 11458, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29

In [568]:
gate_t00001 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,36]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,37]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,40]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells6[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t00001.append(gate10001)
#print(gate_1[1])

gate_t00002 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,36]).flatten()
    ssc2 = np.array(df_test.iloc[i,37]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t00001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t00001[i]]

    gate_num_cell = len(gate_t00001)

    gate20002 = gate2(len(gate_t00001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t00002.append(gate20002)

blast_t6 = []
for i in range(len(df_test)):
    ratio = len(gate_t00002[i])/(num_cells6[i])
    blast_t6.append(ratio)

print(blast_t6)

[0.03720124004133471, 0.05313510450348345, 0.043368112270409015, 0.055368512283742795, 0.042434747824927496, 0.06660222007400246, 0.05056835227840928, 0.1037701256708557, 0.05123504116803893, 0.05063502116737224, 0.06173539117970599, 0.05620187339577986, 0.04862924281984334, 0.0621020700690023, 0.05830194339811327, 0.05063502116737224, 0.04736824560818694, 0.04830161005366845, 0.06620220674022467, 0.044001466715557186, 0.05103503450115004, 0.07063568785626187, 0.0358678622620754, 0.056601886729557654, 0.0727024234141138, 0.06860343017150858, 0.06123537451248375, 0.05080169338977966, 0.01588905216752388, 0.041334711157038566, 0.05213507116903897, 0.06510217007233575, 0.06123537451248375, 0.049434981166038866, 0.04076802560085336, 0.05220174005800193, 0.04786826227540918, 0.0630687689589653, 0.05726857561918731, 0.043634787826260873, 0.06196873229107637, 0.053468448948298275, 0.05693523117437248, 0.03736791226374212, 0.058335277842594756, 0.048334944498149936, 0.0495683189439648, 0.08120

In [569]:
num_cells7 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 43]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells7.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells7)

[29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 18117, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 19999, 29999, 29999, 12072, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 11820, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 24776, 29999, 29999, 29999, 29999, 29999, 27458, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 15795, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 23870, 27733, 29999, 9999, 29999, 29999, 29999, 20691, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 25073, 29999, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 14408, 29999, 29999, 29999, 27907, 29999, 29999, 12309, 9999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 29

In [570]:
gate_t000001 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,43]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,44]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,47]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells7[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t000001.append(gate10001)
#print(gate_1[1])

gate_t000002 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,43]).flatten()
    ssc2 = np.array(df_test.iloc[i,44]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t000001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t000001[i]]

    gate_num_cell = len(gate_t000001)

    gate20002 = gate2(len(gate_t000001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t000002.append(gate20002)

blast_t7 = []
for i in range(len(df_test)):
    ratio = len(gate_t000002[i])/(num_cells7[i])
    blast_t7.append(ratio)

print(blast_t7)

[0.07433581119370646, 0.05883529450981699, 0.04656821894063135, 0.05646854895163172, 0.09056968565618854, 0.05040168005600187, 0.057535251175039166, 0.08303610120337344, 0.062168738957965264, 0.062035401180039335, 0.06946898229940998, 0.08256941898063269, 0.10095490423359275, 0.06120204006800227, 0.06166872229074302, 0.0603686789559652, 0.05790193006433548, 0.0675022500750025, 0.06200206673555785, 0.05546851561718724, 0.06413547118237274, 0.0789026300876696, 0.07470249008300277, 0.05603520117337245, 0.05020167338911297, 0.08570428521426071, 0.04553485116170539, 0.035901196706556886, 0.08316766070245195, 0.06506883562785426, 0.06263542118070603, 0.06913563785459516, 0.07216907230241008, 0.04496816560552019, 0.05153505116837228, 0.05273509116970566, 0.05913530451015034, 0.05413513783792793, 0.08563618787292909, 0.05950198339944665, 0.03970132337744591, 0.07916930564352145, 0.05743524784159472, 0.049134971165705525, 0.06626887562918764, 0.06860228674289143, 0.06276875895863196, 0.05410541

In [571]:
num_cells8 = []  # Initialize an empty list to store the number of cells for each patient

# Iterate over each row (patient) in df_train
for i in range(len(df_test)):
    # Get the number of cells for the current patient (assuming it's in the second column)
    cells_count = len(np.array(df_test.iloc[i, 50]).flatten())
    
    # Append the number of cells to the num_of_cells list
    num_cells8.append(cells_count)

# Now, num_of_cells contains the number of cells for each patient
    
print(num_cells8)

[29999, 9999, 29999, 29999, 9999, 29999, 9999, 29999, 9999, 29999, 29999, 29999, 9999, 9999, 29999, 10569, 12584, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 10495, 9999, 29999, 29999, 21887, 9999, 29999, 29999, 9999, 9999, 16055, 18175, 29999, 29999, 29999, 29999, 9999, 29567, 20863, 29999, 29999, 17151, 9999, 29999, 9999, 9999, 29999, 10694, 29999, 9999, 29999, 29999, 29999, 29999, 29999, 10367, 9999, 29999, 29999, 29999, 9999, 29999, 9999, 29999, 16672, 9999, 29999, 9999, 29999, 29999, 9999, 14207, 29999, 9999, 9157, 29999, 9999, 8699, 9999, 29999, 29999, 9999, 20351, 29999, 29999, 29999, 10555, 29999, 9999, 9999, 29999, 29999, 10320, 29999, 29999, 29999, 9999, 9999, 29999, 9999, 29999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 29999, 29999, 29999, 29999, 29999, 18815, 29999, 9999, 10623, 29999, 29999, 29999, 29999, 29999, 29999, 29999, 9999, 9999, 29999, 29999, 9999, 9999, 29999, 29999, 29999, 29999, 9999, 29999, 8905, 29999, 12543, 29999, 29999, 10361, 29999, 29999, 115

In [572]:
gate_t0000001 = []
for i in range(len(df_test)):
    fsc1 = np.array(df_test.iloc[i,50]).flatten()  # Remove [i] index
    ssc1 = np.array(df_test.iloc[i,51]).flatten()  # Remove [i] index
    cd451 = np.array(df_test.iloc[i,54]).flatten()  # Remove [i] index
    
    gate10001 = gate1(num_cells8[i], cd451, ssc1, SSC_thresH, SSC_thresL, CD45_thresH, CD45_thresL)  # Remove [i] index
    gate_t0000001.append(gate10001)
#print(gate_1[1])

gate_t0000002 = []
for i in range(len(df_test)):
    fsc2 = np.array(df_test.iloc[i,50]).flatten()
    ssc2 = np.array(df_test.iloc[i,51]).flatten()

    gate_fsc = [fsc2[loc] for loc in gate_t0000001[i]]
    gate_ssc = [ssc2[loc] for loc in gate_t0000001[i]]

    gate_num_cell = len(gate_t0000001)

    gate20002 = gate2(len(gate_t0000001[i]),gate_fsc,gate_ssc,FSC_thres,SSC_thres)
    gate_t0000002.append(gate20002)

blast_t8 = []
for i in range(len(df_test)):
    ratio = len(gate_t0000002[i])/(num_cells8[i])
    blast_t8.append(ratio)

print(blast_t8)

[0.049268308943631454, 0.019801980198019802, 0.06280209340311343, 0.04270142338077936, 0.0242024202420242, 0.05063502116737224, 0.0368036803680368, 0.07726924230807694, 0.020002000200020003, 0.04890163005433514, 0.0524684156138538, 0.04606820227340911, 0.041504150415041505, 0.0204020402040204, 0.051735057835261174, 0.004163118554262465, 0.040050858232676415, 0.05603520117337245, 0.041801393379779325, 0.04203473449114971, 0.0611353711790393, 0.03940131337711257, 0.06890229674322478, 0.04530151005033501, 0.0070007000700070005, 0.11376846117198666, 0.0167016701670167, 0.03686789559651989, 0.05810193673122437, 0.03234796911408599, 0.030103010301030103, 0.04983499449981666, 0.05696856561885396, 0.020202020202020204, 0.0222022202220222, 0.04515727187791965, 0.009408528198074279, 0.042768092269742326, 0.05180172672422414, 0.04746824894163139, 0.03813460448681623, 0.036103610361036105, 0.0018263604694422836, 0.05924363706082538, 0.05180172672422414, 0.047101570052335076, 0.10139350475190952, 0

## OTHER FEATURES FOR TEST

In [573]:
test_results = testing_patients_df.copy()

test_results['Diagnosis'] = test_results['Diagnosis'].map({'aml': 1, 'normal': 0})

In [574]:
ratio1_t = calculate_ratio(df_test_med, 0, 0)
ratio2_t = calculate_ratio(df_test_med, 1, 1)
ratio3_t = calculate_ratio(df_test_med, 2, 2)
ratio4_t = calculate_ratio(df_test_med, 3, 3)
ratio5_t = calculate_ratio(df_test_med, 4, 4)
ratio6_t = calculate_ratio(df_test_med, 5, 5)
ratio7_t = calculate_ratio(df_test_med, 6, 6)
ratio8_t = calculate_ratio(df_test_med, 7, 7)

In [575]:
num = 5

fsc_6t =calculate_ratio2(df_test_med, num, FSC_channel)
ssc_6t = calculate_ratio2(df_test_med, num, SSC_channel)
pe_6t = calculate_ratio2(df_test_med, num, PE_channel)
pc5_6t = calculate_ratio2(df_test_med, num, PC5_channel)
pc7_6t = calculate_ratio2(df_test_med, num, PC7_channel)
fitc_6t = calculate_ratio2(df_test_med, num, FITC_channel)


# MACHINE LEARNING FOR TEST

In [576]:
from sklearn import metrics
from sklearn.decomposition import PCA 

blast_test = {'blast_ratio1': blast_t1, 
              'blast_ratio2': blast_t2,
              'blast_ratio3': blast_t3, 
              'blast_ratio4': blast_t4,
              'blast_ratio5':blast_t5,
              'blast_ratio6': blast_t6,
              'blast_ratio7': blast_t7,
              'blast_ratio8':blast_t8,

              'ratio1_t':ratio1_t,
              'ratio2_t':ratio2_t,
              'ratio3_t':ratio3_t,
              'ratio4_t':ratio4_t,
              'ratio5_t':ratio5_t,
              'ratio6_t':ratio6_t,
              'ratio7_t':ratio7_t,
              'ratio8_t':ratio8_t,

              'fsc_6t':fsc_6t,
              'ssc_6t':ssc_6t,
              'pe_6t':pe_6t,
              'pc5_6t':pc5_6t,
              'pc7_6t':pc7_6t,
              'fitc_6t':fitc_6t

              } 

Xt = pd.DataFrame(blast_test)

pca = PCA(n_components= pca_num)
blast_pca = pca.fit_transform(Xt)

test_predict = voting_clf.predict(blast_pca)

def convert_binary_to_text(binary_array):
    text_array = []
    for bit in binary_array:
        if bit == 1:
            text_array.append("aml")
        elif bit == 0:
            text_array.append("normal")
        else:
            raise ValueError("Input array should only contain 0s and 1s")
    return text_array

text_predict = convert_binary_to_text(test_predict)
# print(text_predict)

# Assuming y_test_predict is a numpy array
# test_predict = [np.array(df_test.iloc[:, 0]).flatten().transpose()] +  [voting_clf.predict(blast_pca).transpose()]
# print(test_predict)

patient_nums = df_test.iloc[:,0]
diagnosis = text_predict # voting_clf.predict(text_predict).transpose()

# df_predict = pd.DataFrame(test_predict)
df_predict = pd.DataFrame({'Patient Number': patient_nums, 'Diagnosis': diagnosis})
df_predict.to_csv('Gabe_Rustia_predictions3.csv',index=False)

print(df_predict)

# Count the number of 1s
count_ones = np.count_nonzero(test_predict == 1)

print("Number of 1s in the array:", count_ones)

     Patient Number Diagnosis
0                 4    normal
1                 5       aml
2                 6    normal
3                 8    normal
4                 9    normal
..              ...       ...
173             345    normal
174             352    normal
175             354    normal
176             358    normal
177             359    normal

[178 rows x 2 columns]
Number of 1s in the array: 15


# Figures for Data

In [577]:
""" import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Sample data (assuming df_train, FSC_channel, and SSC_channel are defined)
pa1 = 169
pa2 = 4

# Assuming df_train, FSC_channel, and SSC_channel are defined

fig, axs = plt.subplots(2, 4, figsize=(16, 10))

# Overlay scatter plots for pa1 and pa2 for each subplot

# Loop through each subplot
for i in range(2):
    for j in range(4):
        # Scatter plot for vial (i*4 + j + 1)
        sns.scatterplot(x=np.array(df_train.iloc[pa1, CD45_channel[i*4 + j]]).flatten(),
                        y=np.array(df_train.iloc[pa1, SSC_channel[i*4 + j]]).flatten(),
                        ax=axs[i, j], color='blue', alpha=0.5, s=0.75)
        sns.scatterplot(x=np.array(df_train.iloc[pa2, CD45_channel[i*4 + j]]).flatten(),
                        y=np.array(df_train.iloc[pa2, SSC_channel[i*4 + j]]).flatten(),
                        ax=axs[i, j], color='red', alpha=0.5, s=0.75)
        axs[i, j].set_title('CD45 v SS Scatter plot vial {}'.format(i*4 + j + 1))

# Set x and y axis ticks
for ax in axs.flat:
    ax.set_xticks(np.arange(-1.5, 2.25, 0.5))
    ax.set_yticks(np.arange(-1.5, 2.25, 0.5))

plt.tight_layout()
plt.show()
 """

" import matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\n\n\n# Sample data (assuming df_train, FSC_channel, and SSC_channel are defined)\npa1 = 169\npa2 = 4\n\n# Assuming df_train, FSC_channel, and SSC_channel are defined\n\nfig, axs = plt.subplots(2, 4, figsize=(16, 10))\n\n# Overlay scatter plots for pa1 and pa2 for each subplot\n\n# Loop through each subplot\nfor i in range(2):\n    for j in range(4):\n        # Scatter plot for vial (i*4 + j + 1)\n        sns.scatterplot(x=np.array(df_train.iloc[pa1, CD45_channel[i*4 + j]]).flatten(),\n                        y=np.array(df_train.iloc[pa1, SSC_channel[i*4 + j]]).flatten(),\n                        ax=axs[i, j], color='blue', alpha=0.5, s=0.75)\n        sns.scatterplot(x=np.array(df_train.iloc[pa2, CD45_channel[i*4 + j]]).flatten(),\n                        y=np.array(df_train.iloc[pa2, SSC_channel[i*4 + j]]).flatten(),\n                        ax=axs[i, j], color='red', alpha=0.5, s=0.75)\n        axs[i, j].set_

In [578]:
""" import seaborn as sns
import matplotlib.pyplot as plt

# Plot the first scatter plot


ra = ratio6
sns.scatterplot(x=df_train_med.iloc[:,0], y= ra, color='red', s=10)
sns.scatterplot(x=[df_train_med.iloc[i,0] for i in aml_positions], y=[ra[i] for i in aml_positions], color='blue', s=20)

plt.tight_layout()
plt.show()

# ratio: 6 """

" import seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Plot the first scatter plot\n\n\nra = ratio6\nsns.scatterplot(x=df_train_med.iloc[:,0], y= ra, color='red', s=10)\nsns.scatterplot(x=[df_train_med.iloc[i,0] for i in aml_positions], y=[ra[i] for i in aml_positions], color='blue', s=20)\n\nplt.tight_layout()\nplt.show()\n\n# ratio: 6 "