# Prepare data for model quality assurance

In [1]:
import pandas as pd
import pathlib
import numpy as np
from assets.hierarchy import get_hierarchy_lookup
from assets.functions import gate_for_patient

In [2]:
# Load the per-event dataset that already contains the model predictions.
# The path is written with forward slashes: pathlib accepts them on Windows too,
# whereas backslashes are only a separator on Windows and would be treated as
# part of a single file name on POSIX systems.
data_model = pd.read_pickle(
    pathlib.Path('data/processed_data/#1_aicontrol_dataset_pred.pkl')
    )

In [3]:
# Turn the row index levels into regular columns; their labels are inserted on
# column level 1 so they are kept when the top header level is removed below.
data_model = data_model.reset_index(drop=False, col_level=1)
# Remove the outer column level so every column is addressed by a single name
data_model = data_model.droplevel(level=0, axis=1)

In [4]:
# Display the flattened table (single-level column names) as a sanity check
data_model

Unnamed: 0,file,event,FSC-A,SSC-A,FITC-A,PE-A,PerCP-A,PE-Cy7-A,APC-A,APC-H7-A,...,TBNK Sum,TP,valid_Q_file,used_for_training_Q_file,pred_Lympho,pred_BP,pred_NKP,pred_TP,pred_T4P,pred_T8P
0,001,0,50573.101562,9443.280273,-192.361191,49.568016,127.291069,-25.279428,-207.001297,49765.726562,...,1,1,1,0,1,0,0,1,0,0
1,001,1,60721.648438,15401.190430,50677.765625,11363.137695,607.533691,947.504639,-62.260956,21956.218750,...,0,0,1,0,0,0,0,0,0,0
2,001,2,38507.070312,11470.410156,102.436935,13033.757812,683.279968,4808.329102,-151.480698,41288.160156,...,1,0,1,0,1,0,1,0,0,0
3,001,3,88148.398438,88019.195312,99.228241,836.311340,-139.178345,2764.074951,67.033386,11105.057617,...,0,0,1,0,0,0,0,0,0,0
4,001,4,71808.062500,86550.664062,128.604401,1799.095459,123.587227,5954.442383,97.762772,14570.107422,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6887771,100,76983,73832.546875,74114.460938,163.966949,1048.523315,309.503815,10503.620117,473.652069,11203.947266,...,0,0,1,0,0,0,0,0,0,0
6887772,100,76984,56489.968750,44520.421875,61.773682,560.338135,-157.014450,6751.536133,240.275589,8255.482422,...,0,0,1,0,0,0,0,0,0,0
6887773,100,76985,63175.050781,67636.171875,79.222885,752.837341,251.232635,12247.950195,54.703140,10784.350586,...,0,0,1,0,0,0,0,0,0,0
6887774,100,76986,56315.968750,24900.330078,5775.441406,8930.164062,-378.949402,1057.262939,382.759033,29675.646484,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# Training data: every event flagged as used for training
is_training_event = data_model['used_for_training_Q_file'] == 1
data_train = data_model.loc[is_training_event]
# Patient IDs that contributed at least one training event; these patients
# must not be used for quality assurance
patients_train = data_train['file'].unique()
# Test data: all events of patients that never appear in the training set
from_training_patient = data_model['file'].isin(patients_train)
data_test = data_model[~from_training_patient]

In [6]:
# Persist the training and the test partition as pickle files
for partition, target in (
    (data_train, r'data/processed_data/dataset_pred_train.pkl'),
    (data_test, r'data/processed_data/dataset_pred_test.pkl'),
):
    partition.to_pickle(pathlib.Path(target))

In [7]:
# Persist the complete dataset in its reformatted (flattened) layout
complete_dataset_path = pathlib.Path(r'data/processed_data/dataset_pred_complete.pkl')
data_model.to_pickle(complete_dataset_path)

In [8]:
# Persist the IDs of all patients that contributed training events
patient_ids_train = pd.Series(patients_train)
patient_ids_train.to_pickle(pathlib.Path(r'data/processed_data/patient_ids_train.pkl'))

## Sum events per sample

In [9]:
# Cell types considered by the model
celltypes_ddm = [
    'Lympho',
    'BP',
    'NKP',
    'TP',
    'T4P',
    'T8P',
]

# Matching column names that hold the model's predicted events
celltypes_pred = ['pred_' + name for name in celltypes_ddm]

In [10]:
# Columns to be summed per sample: the cell-type columns followed by the prediction columns
sum_cols = [*celltypes_ddm, *celltypes_pred]

In [11]:
# Group the per-event rows by sample ('file') once and reuse that grouping for
# both aggregations, instead of slicing and grouping the full table twice.
events_by_sample = data_model.loc[:, ['file'] + sum_cols].groupby('file')

# Per-sample sums of every column listed in sum_cols
event_sum_per_sample = events_by_sample.sum()

# Number of event rows per sample
event_sum_per_sample.loc[:, 'total'] = events_by_sample.size()

event_sum_per_sample

Unnamed: 0_level_0,Lympho,BP,NKP,TP,T4P,T8P,pred_Lympho,pred_BP,pred_NKP,pred_TP,pred_T4P,pred_T8P,total
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
001,18566,2798,6567,9242,5719,2919,18537,2788,6060,9219,5684,2919,49450
002,18647,2620,3062,12725,8924,2656,18475,2597,3015,12714,8921,2734,61105
003,19040,1849,1665,15422,7985,5678,18992,1834,1618,15404,7969,5670,74653
004,19217,2465,3792,12789,7466,4244,19267,2470,3781,12798,7468,4246,60175
005,17963,1607,2376,14355,10509,3241,18099,1618,1940,14383,10478,3248,97513
...,...,...,...,...,...,...,...,...,...,...,...,...,...
096,17477,3483,882,12775,8996,3058,17273,3337,824,12718,8823,3037,87710
097,19276,1495,2627,15041,8235,5146,18684,1394,2156,14916,8034,5129,56295
098,18543,2220,3489,12619,8667,3243,18195,2145,3300,12529,8500,3237,61959
099,17494,2102,5213,9880,5658,3771,17316,2039,5072,9869,5607,3769,50206


In [12]:
# Persist the per-sample sums (cell-type, prediction and total columns)
event_sum_path = pathlib.Path(r'data/processed_data/event_sum_per_sample_complete.pkl')
event_sum_per_sample.to_pickle(event_sum_path)

In [13]:
# Columns written to the model summary: the prediction columns plus the
# per-sample event total. A new list is built instead of appending to
# celltypes_pred, so re-running this cell does not add 'total' a second time
# (which would select the column twice).
model_summed_cols = celltypes_pred + ['total']
event_sum_per_sample[model_summed_cols].to_pickle(pathlib.Path(r'data/processed_data/model_summed.pkl'))

## Determine manual linear gating

In [14]:
# Marker feature names
marker_names = [
    'FSC-A', 'SSC-A', 'APC-A', 'APC-H7-A', 'AmCyan-A',
    'FITC-A', 'PE-A', 'PE-Cy7-A', 'Pacific Blue-A', 'PerCP-A',
]

# Ground truth feature names of all cell types
gt_names_all = [
    'BP', 'CD4', 'CD4x38', 'CD4xDR', 'CD4xDRx38',
    'CD8', 'CD8x38', 'CD8xDR', 'CD8xDRx38', 'Lympho',
    'Manja rufen!', 'NKP', 'NKTP', 'NKx38', 'NKxDR', 'NKxDRx38',
    'Q1-1', 'Q1-2', 'Q1-3', 'Q1-5', 'Q1-6', 'Q1-7', 'Q1-9',
    'Q2',
    'Q3', 'Q3-1', 'Q3-2', 'Q3-4', 'Q3-5', 'Q3-6', 'Q3-7', 'Q3-9',
    'Q4-1', 'Q4-2', 'Q4-3', 'Q4-4', 'Q4-5', 'Q4-6', 'Q4-7', 'Q4-9',
    'T48NT', 'T48PT', 'T4P', 'T8P',
    'TBNK Sum', 'TP',
]

# Ground truth feature names of the selected cell types, i.e. the cell types
# that are also predicted by the DDM
gt_names_selected = ['Lympho', 'T4P', 'T8P', 'BP', 'TP', 'NKP']

# Feature names of the corresponding predicted cell types
pred_names_selected = [f'pred_{celltype}' for celltype in gt_names_selected]

In [15]:
hierarchy_lookup = get_hierarchy_lookup()

# Names of cell types determined in gating plots: the index labels of all
# lookup rows whose 'marker_x' entry is not missing
has_marker_x = hierarchy_lookup['marker_x'].notna()
gating_celltype_names = list(hierarchy_lookup.loc[has_marker_x].index.values)

# Take out the entries that are excluded here, in the given order.
# list.remove raises ValueError if a name is not present.
for excluded_name in ('Lympho', 'Lympho_alternative', 'NKxDRx38', 'CD4xDRx38', 'CD8xDRx38'):
    gating_celltype_names.remove(excluded_name)

In [16]:
# Add column 'Not_TP': the logical complement of 'TP', stored as uint8
is_tp = data_model.loc[:, 'TP'].astype(bool)
data_model.loc[:, 'Not_TP'] = np.logical_not(is_tp).astype(np.uint8)

In [17]:
# Determine the x and y axis positions of the linear gates: gate_for_patient is
# called once per patient ID and the per-patient results are stacked into one frame
gating_df = pd.concat(
    [
        gate_for_patient(patient_id, data_model)
        for patient_id in data_model['file'].unique()
    ]
)

In [18]:
# Display the concatenated per-patient gating results
gating_df

Unnamed: 0,BP_AmCyan-A_q12,BP_AmCyan-A_q34,BP_APC-A_q13,BP_APC-A_q24,TP_AmCyan-A_q12,TP_AmCyan-A_q34,TP_APC-A_q13,TP_APC-A_q24,T48PT_Pacific Blue-A_q12,T48PT_Pacific Blue-A_q34,...,NKTP_PE-Cy7-A_q13,NKTP_PE-Cy7-A_q24,CD4_AmCyan-A_q12,CD4_AmCyan-A_q34,CD4_Pacific Blue-A_q13,CD4_Pacific Blue-A_q24,CD8_AmCyan-A_q12,CD8_AmCyan-A_q34,CD8_PerCP-A_q13,CD8_PerCP-A_q24
001,461.408005,460.324051,1546.691833,1546.691833,461.408005,460.324051,1546.691833,1546.691833,233.502617,233.502617,...,1073.911011,1078.053223,487.012085,498.808182,211.248085,211.248085,504.310776,487.861938,990.332001,3825.442261
002,465.573700,461.550323,1549.481140,1549.481140,465.573700,461.550323,1549.481140,1549.481140,186.389404,186.389404,...,11729.672852,1558.832581,590.344879,594.977478,312.575684,312.575684,490.223831,490.223831,1201.658386,6921.157471
003,451.195740,451.195740,1802.897827,1802.897827,451.195740,451.195740,1802.897827,1802.897827,200.654549,200.654549,...,2032.824158,760.268158,587.791046,591.941376,303.369293,303.369293,471.380859,479.999893,1184.574951,3082.143188
004,462.813980,462.813980,1613.322815,1613.322815,462.813980,462.813980,1613.322815,1613.322815,212.288345,212.288345,...,2025.792664,1355.437561,596.772156,596.772156,212.790939,212.790939,484.688721,492.254013,1183.739258,2999.500122
005,462.004196,462.004196,1568.070068,1568.070068,462.004196,462.004196,1568.070068,1568.070068,195.807983,195.807983,...,2566.633301,1253.903687,402.203644,402.203644,206.456284,206.456284,482.189163,487.244675,1170.567078,3372.401001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
096,468.626633,467.077530,1980.721741,1980.721741,468.626633,467.077530,1980.721741,1980.721741,213.936401,213.936401,...,2906.513062,1249.622498,460.437057,457.869614,200.150414,200.150414,427.757172,432.122986,1040.649231,3051.995850
097,465.489029,465.489029,2349.541382,2349.541382,465.489029,465.489029,2349.541382,2349.541382,203.944809,203.944809,...,1305.977539,1294.955444,465.489029,465.489029,144.499069,144.499069,440.895248,440.895248,1013.062683,2920.687866
098,462.685913,467.453308,1914.418884,1914.418884,462.685913,467.453308,1914.418884,1914.418884,232.355072,232.355072,...,3014.272339,1272.894592,477.395081,477.395081,155.943855,155.943855,371.646011,371.646011,825.394165,2151.710449
099,472.083267,466.991913,2254.812256,2254.812256,472.083267,466.991913,2254.812256,2254.812256,203.208473,203.208473,...,3049.025513,1068.188843,481.194122,484.080368,169.896675,169.896675,423.866104,423.866104,932.293732,3027.066162


In [19]:
# Persist the gating results
gating_path = pathlib.Path(r'data/processed_data/manual_gating_expert1.pkl')
gating_df.to_pickle(gating_path)