# Initial preprocessing

## Validate ground truth (DQC1)

**Evidence**	<br>
Events selected by cell type from the class matrices were compared with the events exported from manual gating of the same cell type.<br>



In [1]:
# import required libraries
import pathlib
import os
import glob
from assets.functions import open_fcs, importFCS_compensate, \
    preprocess_raw_data
import pandas as pd
import numpy as np

In [2]:
# locate the class matrices (generated with Mathematica) and the FCS files they belong to
# Paths are built from their components so they resolve on every operating system
# (on Windows they are identical to the former backslash literals).
# The earlier "data\\fcs_data" assignments were overwritten before use and are dropped.
csvDir = pathlib.Path("data", "data_classified_by_expert1", "class_matrices_all_classes")
fcsDir = pathlib.Path("data", "data_classified_by_expert1")
EXT_CSV = "*.csv"
EXT_FCS = "*.fcs"

# class matrix file names, collected from csvDir and all of its sub-directories
csvFileNames = [file
                for folder, _subdirs, _files in os.walk(csvDir)
                for file in glob.glob(os.path.join(folder, EXT_CSV))]

# FCS file names below fcsDir; files whose base name contains one of the subset tags
# (the per-cell-type exports from manual gating) are skipped
exclude_strings = ['Lympho', 'BP', 'NKP', 'TP', 'T4P', 'T8P']
fcsFileNames = [file
                for folder, _subdirs, _files in os.walk(fcsDir)
                for file in glob.glob(os.path.join(folder, EXT_FCS))
                if not any(ex_string in os.path.basename(file) for ex_string in exclude_strings)]

# NOTE(review): later cells pair csvFileNames[i] with fcsFileNames[i] by position, while
# os.walk/glob return entries in file-system order - confirm that both listings line up.

In [3]:
# load FCS files from manual gating
# each "_celltype_" FCS file contains only the subset as determined through manual gating in BD FACSDiva v8.0.2
fcsSubDir = pathlib.Path("data", "data_classified_by_expert1")
EXT = "*.fcs"


def collect_gated_fcs(root, markers, pattern="*.fcs"):
    """Group the files found below `root` by the marker contained in their path.

    Parameters
    ----------
    root : str or path-like
        Directory that is searched recursively (a missing directory yields no files).
    markers : iterable of str
        Substrings such as '_Lympho_'; a file is assigned to a marker when the marker
        occurs anywhere in the file's path as returned by glob.
    pattern : str
        Glob pattern applied inside every visited directory.

    Returns
    -------
    dict
        Maps each marker to the list of matching file paths, in discovery order.
    """
    # walk the tree once; every marker list is a filter of the same listing
    found = [file
             for folder, _subdirs, _files in os.walk(root)
             for file in glob.glob(os.path.join(folder, pattern))]
    return {marker: [file for file in found if marker in file] for marker in markers}


gatedFcsByMarker = collect_gated_fcs(
    fcsSubDir, ['_Lympho_', '_BP_', '_NKP_', '_TP_', '_T4P_', '_T8P_'], EXT)

LymphofcsFileNames = gatedFcsByMarker['_Lympho_']
BPfcsFileNames = gatedFcsByMarker['_BP_']
NKPfcsFileNames = gatedFcsByMarker['_NKP_']
TPfcsFileNames = gatedFcsByMarker['_TP_']
T4PfcsFileNames = gatedFcsByMarker['_T4P_']
T8PfcsFileNames = gatedFcsByMarker['_T8P_']

In [4]:
# define cell subsets of interest
cell_subsets_of_interst = ['Lympho', 'BP', 'NKP', 'TP', 'T4P', 'T8P']
# manually gated FCS file names, one list per subset, in the same order as the subsets above
fcsSubFileNames = [LymphofcsFileNames, BPfcsFileNames, NKPfcsFileNames, TPfcsFileNames,
                   T4PfcsFileNames, T8PfcsFileNames]

# number of samples that are validated per subset
nFilesToCheck = 100
# column order applied to the gated exports so that the column at index 10
# (the "Time" column, per the original note) comes first as in the original FCS files
timeFirstOrder = [10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# select cells in the FCS file based on the condition in the class matrix (e.g. Lympho==1)
# and check that the selection equals the "_celltype_" FCS file from BD FACSDiva v8.0.2
for subset_name, gated_files in zip(cell_subsets_of_interst, fcsSubFileNames):
    print(subset_name)
    for i in range(nFilesToCheck):
        gated_events = open_fcs(gated_files[i])[:, timeFirstOrder]
        is_subset = (pd.read_csv(csvFileNames[i])[subset_name] == 1).to_numpy()
        selected_events = open_fcs(fcsFileNames[i])[is_subset]
        # np.array_equal also reports a differing number of events as a mismatch;
        # an element-wise `==` would fail or broadcast on arrays of different shape
        if not np.array_equal(gated_events, selected_events):
            print('Error in class matrix detected', 'in', subset_name,
                  ' #file ', i+1)
        else:
            print('checked #file:', i+1, end="\r")
    print()

Lympho
checked #file: 100
BP
checked #file: 100
NKP
checked #file: 100
TP
checked #file: 100
T4P
checked #file: 100
T8P
checked #file: 100


**Conclusion**	<br>
The cell types stored in the class matrices align with the expert's manual gating results (ground truth).

## Data preprocessing for subsequent model implementation and data quality assurance

In [5]:
# correct for fluorescence overlap

# fluorescent channels of the spillover matrix, in matrix order
# (copied by hand from the metadata, see also the header in fcsparser)
spilloverColNames = ['FITC-A', 'PE-A', 'PerCP-A', 'PE-Cy7-A',
                     'APC-A', 'APC-H7-A', 'Pacific Blue-A', 'AmCyan-A']

# spillover coefficients, one row of 8 values per line
# (copied by hand from the metadata, see also the header in fcsparser)
spilloverRows = [
    [1, 0.14499559729967626, 0.030818902260052756, 0.002935133548576454,
     0.00025242166432390677, 0, 0, 0.013892965463261886],
    [0.017747527114277597, 1, 0.2512006615565677, 0.01936961292579762,
     0.0004649979391065989, 0.00022497163272324362, 0, 0],
    [0, 0, 1, 0.0782845473110962,
     0.08909214832730754, 0.015777378258149535, 0, 0],
    [0.0014724232592962347, 0.010032443224357373, 0.03608684801597224, 1,
     0.0007297228679969534, 0.09257233272762264, 0, 0],
    [0, 0, 0.00822608423900849, 0.0009264873033695553,
     1, 0.06805489109035323, 0, 0],
    [0.00019294268176635863, 0.0001929426817663584, 0.0005788280452990768, 0.0143742297915937,
     0.03086311021734536, 1, 0, 0],
    [0, 0, 0, 0.00027757216876387914,
     0.0008354928209373731, 0.001008512177880296, 1, 0.2970022205773495],
    [0.11682892906815066, 0.028685674547983413, 0.007823365785813646, 0.0015646731571627294,
     0, 0, 0.08240611961057033, 1],
]

# flat vector of all 64 coefficients in row-major order
spilloverList = np.array([value for row in spilloverRows for value in row])

# 8 x 8 spillover matrix (8 fluorescence markers)
spilloverMatrix = spilloverList.reshape((8, 8))

In [6]:
# read the FCS files and compensate the fluorescence data with the spillover matrix
nFiles = 100
dataset = []

for file_index in range(nFiles):
    header_loop, data_loop = importFCS_compensate(fcsFileNames[file_index], spilloverMatrix)
    # the first column is the channel 'Time'; only the remaining columns are kept
    events_without_time = data_loop[:, 1:]
    dataset.append(events_without_time)

In [7]:
# read the class matrices that store the event labels, one .csv file per FCS file
labelset = []
for csv_index in range(nFiles):
    labels_loop = pd.read_csv(csvFileNames[csv_index])
    # the first column (event_number) is redundant with the row number, so it is dropped
    labels_without_event_number = labels_loop.iloc[:, 1:]
    labelset.append(labels_without_event_number)

In [8]:
# save all compensated events of the FCS files and their validated cell type information as pickle

# every data array needs its label frame; fail loudly instead of silently dropping files
if len(dataset) != len(labelset):
    raise ValueError(
        f"dataset holds {len(dataset)} files but labelset holds {len(labelset)}")

# iterate over what was actually loaded instead of a hard-coded file count
raw_data_preprocessed_list = []
for i, (events, labels) in enumerate(zip(dataset, labelset)):
    raw_data_preprocessed_list.append(preprocess_raw_data(events, labels, i))
# presumably preprocess_raw_data returns one pandas object per file - TODO confirm
raw_data_preprocessed = pd.concat(raw_data_preprocessed_list)

# path built from components so it resolves on every operating system
# (on Windows it is identical to the former backslash literal)
outputPath = pathlib.Path("data", "processed_data", "raw_data_preprocessed.pkl")
# create the target directory on a fresh checkout so that to_pickle cannot fail on it
outputPath.parent.mkdir(parents=True, exist_ok=True)
raw_data_preprocessed.to_pickle(outputPath)