# AML Diagnostic Subtype Predictor

## Load data

In [1]:
import pandas as pd

input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'

# Discovery / validation frames, ordered by index.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df_discovery = pd.read_pickle(input_path + 'df_discovery.pkl')
df_discovery = df_discovery.sort_index()

df_validation = pd.read_pickle(input_path + 'df_validation.pkl')
df_validation = df_validation.sort_index()

# Clinical annotations for each cohort; the first CSV column becomes the index.
discovery_clinical_data = pd.read_csv(
    input_path + 'discovery_clinical_data.csv',
    index_col=0,
    low_memory=False,
)

validation_clinical_data = pd.read_csv(
    input_path + 'validation_clinical_data.csv',
    index_col=0,
    low_memory=False,
)

# 5-dimensional PaCMAP output table (first CSV column becomes the index).
df = pd.read_csv(
    output_path + 'pacmap_output/pacmap_5d_output_acute_leukemia.csv',
    index_col=0,
)

In [10]:
# Preview the loaded PaCMAP table: the five embedding coordinates plus per-sample annotation columns.
df

Unnamed: 0,index,PaCMAP 1,PaCMAP 2,PaCMAP 3,PaCMAP 4,PaCMAP 5,Clinical Trial,Sample Type,Patient_ID,ELN AML 2022 Diagnosis,Train Test,Batch,Hematopoietic Lineage
1,0031efba-f564-4fff-bd7b-2b97f37218c1_noid,-5.859570,-11.129180,6.436264,7.430994,10.586304,AAML0531,Bone Marrow Normal,PASDKZ,Otherwise-Normal Control,Discovery (train) Samples,GDC_TARGET-AML,Otherwise-Normal (Control)
2,0037ec75-bb9e-4dbb-a2d9-de1f9bfd2362_noid,13.544900,-14.129482,-1.171913,-14.600304,-12.464355,AAML0531,Primary Blood Derived Cancer - Peripheral Blood,PASDKZ,AML with other rare recurring translocations,Discovery (train) Samples,GDC_TARGET-AML,Acute myeloid leukemia (AML)
7,00bb876d-e77c-4479-ab39-0c4d56d08077_noid,-0.367067,-11.666441,9.906540,11.882847,10.278313,AAML03P1,Bone Marrow Normal,PAMYMA,Otherwise-Normal Control,Discovery (train) Samples,GDC_TARGET-AML,Otherwise-Normal (Control)
14,02b94d3d-2b6f-4c7a-9554-8e6256ddc791_noid,12.756259,6.961876,21.689531,-1.026731,22.311605,AAML0531,Primary Blood Derived Cancer - Bone Marrow,PASSLT,AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement,Discovery (train) Samples,GDC_TARGET-AML,Acute myeloid leukemia (AML)
15,02d84607-0150-48ac-b9dc-f857e5cabf1c_noid,-13.957327,-13.374085,2.331018,25.993230,-13.191642,AAML0531,Recurrent Blood Derived Cancer - Bone Marrow,PAPXWI,AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q...,Discovery (train) Samples,GDC_TARGET-AML,Acute myeloid leukemia (AML)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,9426021149_R03C02,13.412078,-10.819320,-1.390031,-17.948680,-12.771581,AML08,Diagnosis,AML08_SP_8,AML with t(6;9)(p23;q34.1)/DEK::NUP214,Validation (test) Samples,St Jude Children's,Acute myeloid leukemia (AML)
196,9426021149_R04C01,-12.519629,10.018703,19.824512,-4.732997,-18.703844,AML08,Diagnosis,AML08_SP_120,AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement,Validation (test) Samples,St Jude Children's,Acute myeloid leukemia (AML)
197,9426021149_R04C02,1.061571,22.337622,-12.730034,-7.005747,14.116555,AML08,Diagnosis,AML08_SP_48,AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1,Validation (test) Samples,St Jude Children's,Acute myeloid leukemia (AML)
198,9426021149_R05C01,1.179837,21.893232,-12.804467,-7.287878,14.025510,AML08,Diagnosis,AML08_SP_240,AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1,Validation (test) Samples,St Jude Children's,Acute myeloid leukemia (AML)


## Preprocess data

### Exclude categories with <10 samples

In [2]:
# Diagnosis categories removed from the analysis: mixed phenotype,
# Down-syndrome-associated leukaemia, and t(9;22)(q34.1;q11.2)/BCR::ABL1.
excluded_diagnoses = [
    'Mixed phenotype acute leukemia T/myeloid',
    'Myeloid leukaemia associated with Down syndrome',
    'AML with t(9;22)(q34.1;q11.2)/BCR::ABL1',
]
is_excluded = df['ELN AML 2022 Diagnosis'].isin(excluded_diagnoses)
df = df.loc[~is_excluded]

# Remaining samples per diagnosis category.
df['ELN AML 2022 Diagnosis'].value_counts()

ELN AML 2022 Diagnosis
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement                           360
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11             201
AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1                                198
AML with other rare recurring translocations                              172
AML with mutated NPM1                                                     172
MDS-related or secondary myeloid neoplasms                                148
Otherwise-Normal Control                                                  121
AML with in-frame bZIP mutated CEBPA                                       69
APL with t(15;17)(q24.1;q21.2)/PML::RARA                                   31
AML with t(6;9)(p23;q34.1)/DEK::NUP214                                     27
AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement     10
Name: count, dtype: int64

### Exclude samples with missing labels

In [3]:
# Keep only the rows that carry an ELN AML 2022 Diagnosis label.
df = df.dropna(subset=['ELN AML 2022 Diagnosis'])

### Define X and y

In [4]:

# Features are the five PaCMAP coordinates; the target is the diagnosis label.
feature_columns = ['PaCMAP 1', 'PaCMAP 2', 'PaCMAP 3', 'PaCMAP 4', 'PaCMAP 5']
X = df[feature_columns].to_numpy()  # shape (n_samples, 5)
y = df['ELN AML 2022 Diagnosis'].to_numpy()  # shape (n_samples,), string labels

# Row masks derived from the `Train Test` column, computed once each.
is_train = (df['Train Test'] == 'Discovery (train) Samples').to_numpy()
is_test = (df['Train Test'] == 'Validation (test) Samples').to_numpy()

# Discovery samples form the training set, validation samples the test set.
X_train, y_train = X[is_train], y[is_train]
X_test, y_test = X[is_test], y[is_test]

## Introductory analysis with SPPL

- __SPPL__: Sum-Product Probabilistic Language

- __Github__: [https://github.com/probsys/sppl](https://github.com/probsys/sppl)

- __Paper__: [SPPL: Probabilistic Programming with Fast Exact Symbolic Inference](https://arxiv.org/abs/2010.03485)

- __Intro on SPNs__: [Visualizing and understanding Sum-Product Networks](https://link.springer.com/article/10.1007/s10994-018-5760-y)

In [6]:
# Load the SPPL IPython extension; presumably this registers the `%%sppl` and
# `%sppl_get_namespace` magics used in the following cells -- confirm.
%load_ext sppl.magics

In [None]:
%%sppl model

# SPPL program compiled under the name `model`: a generative population model
# followed by a deterministic decision rule. It is self-contained and does not
# reference the PaCMAP data (df, X, y) prepared earlier in this notebook.
# NOTE(review): this looks like a generic SPPL demo (hiring/fairness style)
# rather than an AML analysis -- confirm the source and cite it.

# Population model.
# `~=` declares a random variable drawn from the distribution on its right.
sex ~= choice({'female': .3307, 'male': .6693})
if (sex == 'female'):
    # NOTE(review): `scale` is conventionally a standard deviation; values this
    # large look like variances -- TODO confirm against the source example.
    capital_gain ~= norm(loc=568.4105, scale=24248365.5428)
    # age and relationship depend on which side of the threshold capital_gain falls.
    if capital_gain < 7298.0000:
        age ~= norm(loc=38.4208, scale=184.9151)
        relationship ~= choice({
            '0': .0491, '1': .1556, '2': .4012,
            '3': .2589, '4': .0294, '5': .1058
        })
    else:
        age ~= norm(loc=38.8125, scale=193.4918)
        relationship ~= choice({
            '0': .0416, '1': .1667, '2': .4583,
            '3': .2292, '4': .0166, '5': .0876
        })
else:
    # Same structure for sex == 'male', with its own parameters and threshold.
    capital_gain ~= norm(loc=1329.3700, scale=69327473.1006)
    if capital_gain < 5178.0000:
        age ~= norm(loc=38.6361, scale=187.2435)
        relationship ~= choice({
            '0': .0497, '1': .1545, '2': .4021,
            '3': .2590, '4': .0294, '5': .1053
        })
    else:
        age ~= norm(loc=38.2668, scale=187.2747)
        relationship ~= choice({
            '0': .0417, '1': .1624, '2': .3976,
            '3': .2606, '4': .0356, '5': .1021
        })

# Restrict the population to the event age > 18.
condition(age > 18)

# Decision model.
# `t` is a point mass at 1 or 0, chosen by relationship category and a
# category-specific capital_gain threshold. t == 0 is what the follow-up
# query names "no hire" (see p_female_given_no_hire).
if relationship == '1':
    if capital_gain < 5095.5:
        t ~= atomic(loc=1)
    else:
        t ~= atomic(loc=0)
elif relationship == '2':
    if capital_gain < 4718.5:
        t ~= atomic(loc=1)
    else:
        t ~= atomic(loc=0)
elif relationship == '3':
    if capital_gain < 5095.5:
        t ~= atomic(loc=1)
    else:
        t ~= atomic(loc=0)
elif relationship == '4':
    if capital_gain < 8296:
        t ~= atomic(loc=1)
    else:
        t ~= atomic(loc=0)
elif relationship == '5':
    # Category '5' always yields t = 1, regardless of capital_gain.
    t ~= atomic(loc=1)
else:
    # Remaining category: relationship == '0'.
    if capital_gain < 4668.5:
        t ~= atomic(loc=1)
    else:
        t ~= atomic(loc=0)

# Fetch the namespace produced by the SPPL program named `model` and pull out
# the compiled model object.
# NOTE(review): the next line is IPython magic syntax, so these statements
# presumably belong in a regular Python cell, separate from `%%sppl model` -- confirm.
n = %sppl_get_namespace model
model = n.model

# Condition the model on the decision variable t taking a value in {0}.
model_c1 = model.condition(n.t << {0})

# Probability of sex == 'female' before and after conditioning on t in {0}.
p_female_prior = model.prob(n.sex << {'female'})
p_female_given_no_hire = model_c1.prob(n.sex << {'female'})
# Percent relative change of the conditional probability versus the prior.
print(100 * (p_female_given_no_hire / p_female_prior - 1))

## Watermark

In [None]:
# Load the `watermark` IPython extension (programmatic form of `%load_ext watermark`).
# This only loads the extension; it does not print the watermark itself.
get_ipython().run_line_magic('load_ext', 'watermark')

In [None]:
# print watermark with all packages used in this notebook
# NOTE(review): this cell contains no code, yet a watermark report is recorded
# as its output -- the `%watermark ...` invocation appears to be missing and
# should be restored. The recorded package list (sklearn, matplotlib, seaborn,
# lightgbm) also differs from what the visible cells use (pandas, sppl) -- confirm.


Author: Francisco_Marchi@Lamba_Lab_UF

Python implementation: CPython
Python version       : 3.8.16
IPython version      : 8.12.2

numpy     : 1.24.3
pandas    : 2.0.2
sklearn   : 1.2.2
matplotlib: 3.7.1
seaborn   : 0.12.2
lightgbm  : 3.3.5

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.15.90.1-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 20
Architecture: 64bit

