In [1]:
output_path = '../Data/Intermediate_Files/'

# Import functions to clean up clinical data
from FM_Functions.Clinical_Data_CleanUp import *

# Call functions to merge, index and clean clinical data files
labels_0531         = clean_cog       (merge_index_0531())
labels_1031         = clean_cog       (merge_index_1031())
labels_aml05        = clean_aml05     (merge_index_aml05())
labels_beataml      = clean_beataml   (merge_index_beataml())
labels_amltcga      = clean_amltcga   (merge_index_amltcga())
labels_nordic_all   = clean_nordic_all(merge_index_nordic_all())
labels_mds_taml     = clean_mds_taml  (merge_index_mds_taml())
labels_all_graal    = clean_all_graal (merge_index_all_graal())
labels_target_all   = clean_target_all(merge_index_target_all())

## COG trials

### ELN2022

In [2]:
def classify_controls(normal_samples):
    mapping = {
        'Bone Marrow Normal': 'Otherwise-Normal Control',
        'Blood Derived Normal': 'Otherwise-Normal Control'}
    
    for key, value in mapping.items():
        if key in normal_samples:
            return value


def classify_fusion(gene_fusion):
    mapping = {
    'RUNX1-RUNX1T1': 'AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1',
    'CBFB-MYH11':    'AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11',
    'KMT2A':         'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'add(11)(q23)':  'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'MLL':           'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'PML-RARA':      'APL with t(15;17)(q24.1;q21.2)/PML::RARA',
    'DEK-NUP214':    'AML with t(6;9)(p23;q34.1)/DEK::NUP214',
    'MECOM':         'AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement',
    'NPM1-MLF1':     'AML with other rare recurring translocations',
    'PRDM16-RPN1':   'AML with other rare recurring translocations',
    'RBM15-MRTFA':   'AML with other rare recurring translocations',
    'RBM15-MKL1':    'AML with other rare recurring translocations',
    'NUP98':         'AML with other rare recurring translocations',
    'ETV6-MNX1':     'AML with other rare recurring translocations',
    'KAT6A-CREBBP':  'AML with other rare recurring translocations',
    'PICALM-MLLT10': 'AML with other rare recurring translocations',
    'FUS-ERG':       'AML with other rare recurring translocations',
    'RUNX1-CBFA2T3': 'AML with other rare recurring translocations',
    'CBFA2T3-GLIS2': 'AML with other rare recurring translocations',
    'BCR-ABL1':       'AML with t(9;22)(q34.1;q11.2)/BCR::ABL1'}

    # Other uncharacterized abdnormalities present in the dataset but not in guidelines
    
    # 'CBFA2T3-GLIS3': 'AML with other rare recurring translocations',
    # 'PSIP1-NUP214':  'AML with other rare recurring translocations',
    # 'XPO1-TNRC18':   'AML with other rare recurring translocations', 
    # 'HNRNPH1-ERG':   'AML with other rare recurring translocations',
    # 'NIPBL-HOXB9':   'AML with other rare recurring translocations', 
    # 'SET-NUP214':    'AML with other rare recurring translocations', 
    # 'FLI1-IFIT2':    'AML with other rare recurring translocations', 
    # 'TCF4-ZEB2':     'AML with other rare recurring translocations',
    # 'MBTD1-ZMYND11': 'AML with other rare recurring translocations', 
    # 'FOSB-KLF6':     'AML with other rare recurring translocations', 
    # 'SFPQ-ZFP36L2':  'AML with other rare recurring translocations', 
    # 'RUNX1-LINC00478':'AML with other rare recurring translocations',
    # 'RUNX1-EVX1':     'AML with other rare recurring translocations',  
    # 'PSPC1-ZFP36L1':  'AML with other rare recurring translocations', 
    # 'EWSR1-FEV':      'AML with other rare recurring translocations',
    # 'STAG2-AFF2':     'AML with other rare recurring translocations', 
    # 'MYB-GATA1':      'AML with other rare recurring translocations', 
    # 'CBFA2T3-GLIS3':  'AML with other rare recurring translocations',
    # 'RUNX1-ZFPM2':    'AML with other rare recurring translocations', 
    # 'RUNX1-CBFA2T2':  'AML with other rare recurring translocations',
    # 'PIM3-BRD1':      'AML with other rare recurring translocations',
    # 'KAT6A-EP300':    'AML with other rare recurring translocations',
    # 'DOT1L-RPS15':    'AML with other rare recurring translocations',
    # 'FUS-FEV':        'AML with other rare recurring translocations',
    # 'KAT6A-NCOA2':    'AML with other rare recurring translocations',
    # 'JARID2-PTP4A1':  'AML with other rare recurring translocations',
    # 'FUS-FLI1':       'AML with other rare recurring translocations',     
    
    for key, value in mapping.items():
        if key in gene_fusion:
            return value

def classify_cebpa(cebpa_mutation):
    mapping = {
        'Yes': 'AML with in-frame bZIP mutated CEBPA'}
    
    for key, value in mapping.items():
        if key in cebpa_mutation:
            return value

def classify_npm(npm_mutation):
    mapping = {
        'Yes': 'AML with mutated NPM1',
    }

    for key, value in mapping.items():
        if key in npm_mutation:
            return value
        
def classify_annotated_diagnosis(diagnosis):
    mapping = {
        'mutated NPM1': 'AML with mutated NPM1',
        'mutated CEBPA': 'AML with in-frame bZIP mutated CEBPA',
        'myelodysplasia-related changes': 'MDS or MDS-related myeloid neoplasms'
        }
    
    for key, value in mapping.items():
        if key in diagnosis:
            return value

def process_labels(df):
    df['ELN 2022_Controls'] = df['Sample Type'].astype(str).apply(classify_controls)
    df['ELN 2022_Gene Fusion'] = df['Gene Fusion'].astype(str).apply(classify_fusion)
    df['ELN 2022_CEBPA'] = df['CEBPA mutation'].astype(str).apply(classify_cebpa)
    df['ELN 2022_NPM1'] = df['NPM mutation'].astype(str).apply(classify_npm)
    df['ELN 2022_Comment'] = df['Comment'].astype(str).apply(classify_annotated_diagnosis)

    df['ELN 2022 Combined Diagnoses'] = df[['ELN 2022_Controls','ELN 2022_Gene Fusion', 'ELN 2022_CEBPA', 'ELN 2022_NPM1', 'ELN 2022_Comment']]\
        .apply(lambda x: ','.join(filter(lambda i: i is not None and i==i, x)), axis=1)

    # Replace empty strings with NaN
    df['ELN 2022 Combined Diagnoses'] = df['ELN 2022 Combined Diagnoses'].replace('', np.nan)

    # Create `ELN 2022 Final Diagnosis` column by splitting `Combined Diagnosis` by comma and taking the first element
    df['ELN AML 2022 Diagnosis'] = df['ELN 2022 Combined Diagnoses'].str.split(',').str[0]

    # Drop columns created except for `ELN 2022 Final Diagnosis` and `Combined Diagnosis` columns
    df = df.drop(['ELN 2022_Controls','ELN 2022_Gene Fusion', 'ELN 2022_CEBPA', 'ELN 2022_NPM1', 'ELN 2022_Comment'], axis=1)
        
    return df

# Process labels
labels_1031 = process_labels(labels_1031)
labels_0531 = process_labels(labels_0531)

# Combine `['ELN 2022 Diagnosis'].value_counts()` from both datasets
labels_1031['ELN AML 2022 Diagnosis'].value_counts().add(labels_0531['ELN AML 2022 Diagnosis'].value_counts(), fill_value=0).sort_values(ascending=False).astype(int)

ELN AML 2022 Diagnosis
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement                           416
AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1                                245
AML with other rare recurring translocations                              214
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11             199
Otherwise-Normal Control                                                  181
AML with mutated NPM1                                                     132
AML with in-frame bZIP mutated CEBPA                                       85
MDS or MDS-related myeloid neoplasms                                       39
AML with t(6;9)(p23;q34.1)/DEK::NUP214                                     31
AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement      6
APL with t(15;17)(q24.1;q21.2)/PML::RARA                                    1
Name: count, dtype: int64

### WHO2021

In [3]:
def classify_controls(normal_samples):
    mapping = {
        'Bone Marrow Normal': 'Otherwise-Normal Control',
        'Blood Derived Normal': 'Otherwise-Normal Control'}
    
    for key, value in mapping.items():
        if key in normal_samples:
            return value


def classify_fusion(gene_fusion):
    mapping = {
    'RUNX1-RUNX1T1': 'AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1',
    'CBFB-MYH11':    'AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11',
    'KMT2A':         'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'add(11)(q23)':  'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'MLL':           'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'PML-RARA':      'APL with t(15;17)(q24.1;q21.2)/PML::RARA',
    'DEK-NUP214':    'AML with t(6;9)(p23;q34.1)/DEK::NUP214',
    'MECOM':         'AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement',
    'ETV6':          'AML with ETV6 fusion',
    'NPM1':          'AML with mutated NPM1',
    'RBM15-MKL1':    'AML with t(1;22)(p13.3;q13.1); RBM15::MKL1',
    'NUP98':         'AML with NUP98-fusion',
    'KAT6A-CREBBP':  'AML with t(8;16)(p11.2;p13.3); KAT6A::CREBBP',
    'FUS-ERG':       'AML with t(16;21)(p11;q22); FUS::ERG',
    'CBFA2T3-GLIS2': 'AML with CBFA2T3::GLIS2 (inv(16)(p13q24))',
    'BCR-ABL1':       'AML with t(9;22)(q34.1;q11.2)/BCR::ABL1'}

    # Other uncharacterized abdnormalities present in the dataset but not in guidelines
    # 'RUNX1-CBFA2T3': 'AML with other rare recurring translocations',
    # 'PRDM16-RPN1':   'AML with other rare recurring translocations',
    # 'PICALM-MLLT10': 'AML with other rare recurring translocations',
    # 'RBM15-MRTFA':   'AML with other rare recurring translocations',
    # 'CBFA2T3-GLIS3': 'AML with other rare recurring translocations',
    # 'PSIP1-NUP214':  'AML with other rare recurring translocations',
    # 'XPO1-TNRC18':   'AML with other rare recurring translocations', 
    # 'HNRNPH1-ERG':   'AML with other rare recurring translocations',
    # 'NIPBL-HOXB9':   'AML with other rare recurring translocations', 
    # 'SET-NUP214':    'AML with other rare recurring translocations', 
    # 'FLI1-IFIT2':    'AML with other rare recurring translocations', 
    # 'TCF4-ZEB2':     'AML with other rare recurring translocations',
    # 'MBTD1-ZMYND11': 'AML with other rare recurring translocations', 
    # 'FOSB-KLF6':     'AML with other rare recurring translocations', 
    # 'SFPQ-ZFP36L2':  'AML with other rare recurring translocations', 
    # 'RUNX1-LINC00478':'AML with other rare recurring translocations',
    # 'RUNX1-EVX1':     'AML with other rare recurring translocations',  
    # 'PSPC1-ZFP36L1':  'AML with other rare recurring translocations', 
    # 'EWSR1-FEV':      'AML with other rare recurring translocations',
    # 'STAG2-AFF2':     'AML with other rare recurring translocations', 
    # 'MYB-GATA1':      'AML with other rare recurring translocations', 
    # 'CBFA2T3-GLIS3':  'AML with other rare recurring translocations',
    # 'RUNX1-ZFPM2':    'AML with other rare recurring translocations', 
    # 'RUNX1-CBFA2T2':  'AML with other rare recurring translocations',
    # 'PIM3-BRD1':      'AML with other rare recurring translocations',
    # 'KAT6A-EP300':    'AML with other rare recurring translocations',
    # 'DOT1L-RPS15':    'AML with other rare recurring translocations',
    # 'FUS-FEV':        'AML with other rare recurring translocations',
    # 'KAT6A-NCOA2':    'AML with other rare recurring translocations',
    # 'JARID2-PTP4A1':  'AML with other rare recurring translocations',
    # 'FUS-FLI1':       'AML with other rare recurring translocations',     
    
    for key, value in mapping.items():
        if key in gene_fusion:
            return value

def classify_cebpa(cebpa_mutation):
    mapping = {
        'Yes': 'AML with bZIP mutated CEBPA'}
    
    for key, value in mapping.items():
        if key in cebpa_mutation:
            return value

def classify_npm(npm_mutation):
    mapping = {
        'Yes': 'AML with mutated NPM1',
    }

    for key, value in mapping.items():
        if key in npm_mutation:
            return value
        
def classify_annotated_diagnosis(diagnosis):
    mapping = {
        'mutated NPM1': 'AML with mutated NPM1',
        'mutated CEBPA': 'AML with bZIP mutated CEBPA',
        'myelodysplasia-related changes': 'MDS or MDS-related myeloid neoplasms'
        }
    
    for key, value in mapping.items():
        if key in diagnosis:
            return value

def process_labels(df):
    df['WHO 2021_Controls'] = df['Sample Type'].astype(str).apply(classify_controls)
    df['WHO 2021_Gene Fusion'] = df['Gene Fusion'].astype(str).apply(classify_fusion)
    df['WHO 2021_CEBPA'] = df['CEBPA mutation'].astype(str).apply(classify_cebpa)
    df['WHO 2021_NPM1'] = df['NPM mutation'].astype(str).apply(classify_npm)
    df['WHO 2021_Comment'] = df['Comment'].astype(str).apply(classify_annotated_diagnosis)

    df['WHO 2021 Combined Diagnoses'] = df[['WHO 2021_Controls','WHO 2021_Gene Fusion', 'WHO 2021_CEBPA', 'WHO 2021_NPM1', 'WHO 2021_Comment']]\
        .apply(lambda x: ','.join(filter(lambda i: i is not None and i==i, x)), axis=1)

    # Replace empty strings with NaN
    df['WHO 2021 Combined Diagnoses'] = df['WHO 2021 Combined Diagnoses'].replace('', np.nan)

    # Create `WHO 2021 Final Diagnosis` column by splitting `Combined Diagnosis` by comma and taking the first element
    df['WHO AML 2021 Diagnosis'] = df['WHO 2021 Combined Diagnoses'].str.split(',').str[0]

    # Drop columns created except for `WHO 2021 Final Diagnosis` and `Combined Diagnosis` columns
    df = df.drop(['WHO 2021_Controls','WHO 2021_Gene Fusion', 'WHO 2021_CEBPA', 'WHO 2021_NPM1', 'WHO 2021_Comment'], axis=1)
        
    return df

# Process labels
labels_1031 = process_labels(labels_1031)
labels_0531 = process_labels(labels_0531)

# Combine `['WHO 2021 Diagnosis'].value_counts()` from both datasets
labels_1031['WHO AML 2021 Diagnosis'].value_counts().add(labels_0531['WHO AML 2021 Diagnosis'].value_counts(), fill_value=0).sort_values(ascending=False).astype(int)


WHO AML 2021 Diagnosis
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement                           416
AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1                                245
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11             199
Otherwise-Normal Control                                                  181
AML with mutated NPM1                                                     142
AML with NUP98-fusion                                                     123
AML with bZIP mutated CEBPA                                                85
AML with CBFA2T3::GLIS2 (inv(16)(p13q24))                                  39
MDS or MDS-related myeloid neoplasms                                       37
AML with t(6;9)(p23;q34.1)/DEK::NUP214                                     31
AML with ETV6 fusion                                                       25
AML with t(1;22)(p13.3;q13.1); RBM15::MKL1                                 13
AML with t(8;16)(p11.2;p13.3); KAT6A::CRE

## AML05

### ELN2022

In [4]:
def classify_fusion_aml05(gene_fusion):
    mapping = {
    'KMT2A':         'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'NPM1':          'AML with mutated NPM1',
    'NUP98':         'AML with other rare recurring translocations'}
    for key, value in mapping.items():
        if key in gene_fusion:
            return value
        
# Rename `Other genetic alterations` column to `Gene Fusion`
labels_aml05 = labels_aml05.rename(columns={'Other genetic alterations': 'Gene Fusion'})

labels_aml05['ELN AML 2022 Diagnosis'] = labels_aml05['Gene Fusion'].astype(str).apply(classify_fusion_aml05)

labels_aml05['ELN AML 2022 Diagnosis'].value_counts()

ELN AML 2022 Diagnosis
AML with mutated NPM1                              4
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement    3
AML with other rare recurring translocations       2
Name: count, dtype: int64

### WHO2021

In [5]:
def classify_fusion_aml05(gene_fusion):
    mapping = {
    'KMT2A':         'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'NPM1':          'AML with mutated NPM1',
    'NUP98':         'AML with NUP98-fusion'}
    for key, value in mapping.items():
        if key in gene_fusion:
            return value
        
# Rename `Other genetic alterations` column to `Gene Fusion`
labels_aml05 = labels_aml05.rename(columns={'Other genetic alterations': 'Gene Fusion'})

labels_aml05['WHO AML 2021 Diagnosis'] = labels_aml05['Gene Fusion'].astype(str).apply(classify_fusion_aml05)

labels_aml05['WHO AML 2021 Diagnosis'].value_counts()

WHO AML 2021 Diagnosis
AML with mutated NPM1                              4
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement    3
AML with NUP98-fusion                              2
Name: count, dtype: int64

## TCGA

In [6]:
labels_amltcga[['Molecular Classification']].value_counts()

Molecular Classification                 
Normal Karyotype                             85
Complex Cytogenetics                         24
PML-RARA                                     18
Intermediate Risk Cytogenetic Abnormality    17
CBFB-MYH11                                   12
Poor Risk Cytogenetic Abnormality            10
MLL translocation, poor risk                  9
RUNX1-RUNX1T1                                 7
N.D.                                          4
BCR-ABL1                                      3
MLL translocation, t(9;11)                    2
NUP98 Translocation                           2
NUP98 translocation                           1
Name: count, dtype: int64

### ELN2022

In [7]:
def classify_annotated_diagnosis_amltcga(gene_fusion):
    mapping = {
    'PML-RARA':         'APL with t(15;17)(q24.1;q21.2)/PML::RARA',
    'CBFB-MYH11':       'AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11',
    'RUNX1-RUNX1T1':    'AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1',
    'MLL':              'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'BCR-ABL1':         'AML with t(9;22)(q34.1;q11.2)/BCR::ABL1',
    'NUP98':            'AML with other rare recurring translocations'}
    for key, value in mapping.items():
        if key in gene_fusion:
            return value
        
# Rename `Other genetic alterations` column to `Gene Fusion`

labels_amltcga['ELN AML 2022 Diagnosis'] = labels_amltcga['Molecular Classification']\
    .astype(str).apply(classify_annotated_diagnosis_amltcga)


labels_amltcga['ELN AML 2022 Diagnosis'].value_counts(dropna=False)

ELN AML 2022 Diagnosis
None                                                             140
APL with t(15;17)(q24.1;q21.2)/PML::RARA                          18
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11     12
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement                   11
AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1                         7
AML with other rare recurring translocations                       3
AML with t(9;22)(q34.1;q11.2)/BCR::ABL1                            3
Name: count, dtype: int64

### WHO2021

In [8]:
def classify_annotated_diagnosis_amltcga(gene_fusion):
    mapping = {
    'PML-RARA':         'APL with t(15;17)(q24.1;q21.2)/PML::RARA',
    'CBFB-MYH11':       'AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11',
    'RUNX1-RUNX1T1':    'AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1',
    'MLL':              'AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement',
    'NUP98':            'AML with NUP98-fusion'}
    for key, value in mapping.items():
        if key in gene_fusion:
            return value
        
# Rename `Other genetic alterations` column to `Gene Fusion`

labels_amltcga['WHO AML 2021 Diagnosis'] = labels_amltcga['Molecular Classification']\
    .astype(str).apply(classify_annotated_diagnosis_amltcga)

labels_amltcga['WHO AML 2021 Diagnosis'].value_counts(dropna=False)

WHO AML 2021 Diagnosis
None                                                             143
APL with t(15;17)(q24.1;q21.2)/PML::RARA                          18
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11     12
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement                   11
AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1                         7
AML with NUP98-fusion                                              3
Name: count, dtype: int64

## BeatAML

In [9]:
labels_beataml["SpecificDxAtAcquisition"].value_counts(dropna=False)

SpecificDxAtAcquisition
NaN                                                               75
AML with mutated NPM1                                             72
AML with myelodysplasia-related changes                           37
Acute myeloid leukaemia, NOS                                      24
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22); CBFB-MYH11     17
AML with mutated CEBPA                                            16
Therapy-related myeloid neoplasms                                 15
Acute myelomonocytic leukaemia                                    13
Acute promyelocytic leukaemia with t(15;17)(q22;q12); PML-RARA    12
AML with t(9;11)(p22;q23); MLLT3-MLL                               8
AML with t(8;21)(q22;q22); RUNX1-RUNX1T1                           7
Acute monoblastic and monocytic leukaemia                          7
AML with inv(3)(q21q26.2) or t(3;3)(q21;q26.2); RPN1-EVI1          5
AML without maturation                                             2
AML with m

### ELN2022

In [10]:
def classify_annotated_diagnosis_beataml(gene_fusion):
    mapping = {
        "AML with mutated NPM1"                                         : "AML with mutated NPM1",
        "AML with myelodysplasia-related changes"                       : "MDS or MDS-related myeloid neoplasms",
        "AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22); CBFB-MYH11" : "AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11",
        "AML with mutated CEBPA"                                        : "AML with in-frame bZIP mutated CEBPA",
        "Therapy-related myeloid neoplasms"                             : "Therapy-related or secondary myeloid neoplasms",
        "PML-RARA"                                                      : "APL with t(15;17)(q24.1;q21.2)/PML::RARA",
        "AML with t(9;11)(p22;q23); MLLT3-MLL"                          : "AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement",
        "AML with t(8;21)(q22;q22.1); RUNX1-RUNX1T1"                    : "AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1",
        "AML with inv(3)(q21q26.2) or t(3;3)(q21;q26.2); RPN1-EVI1"     : "AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement",
        "Mixed phenotype acute leukaemia, T/myeloid"                    : "Mixed phenotype acute leukemia T/myeloid",
        "Myeloid leukaemia associated with Down syndrome"               : "Myeloid leukaemia associated with Down syndrome",
    }
    for key, value in mapping.items():
        if key in gene_fusion:
            return value


# Rename `Other genetic alterations` column to `Gene Fusion`

labels_beataml["ELN AML 2022 Diagnosis"] = (
    labels_beataml["SpecificDxAtAcquisition"]
    .astype(str)
    .apply(classify_annotated_diagnosis_beataml)
)

labels_beataml["ELN AML 2022 Diagnosis"].value_counts()

ELN AML 2022 Diagnosis
AML with mutated NPM1                                                     72
MDS or MDS-related myeloid neoplasms                                      37
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11             17
AML with in-frame bZIP mutated CEBPA                                      16
Therapy-related or secondary myeloid neoplasms                            15
APL with t(15;17)(q24.1;q21.2)/PML::RARA                                  12
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement                            8
AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement     5
Mixed phenotype acute leukemia T/myeloid                                   1
Myeloid leukaemia associated with Down syndrome                            1
Name: count, dtype: int64

### WHO2021

In [11]:
def classify_annotated_diagnosis_beataml(gene_fusion):
    mapping = {
        "AML with mutated NPM1"                                         : "AML with mutated NPM1",
        "AML with myelodysplasia-related changes"                       : "MDS or MDS-related myeloid neoplasms",
        "AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22); CBFB-MYH11" : "AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11",
        "AML with mutated CEBPA"                                        : "AML with bZIP mutated CEBPA",
        "Therapy-related myeloid neoplasms"                             : "Therapy-related or secondary myeloid neoplasms",
        "PML-RARA"                                                      : "APL with t(15;17)(q24.1;q21.2)/PML::RARA",
        "AML with t(9;11)(p22;q23); MLLT3-MLL"                          : "AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement",
        "AML with t(8;21)(q22;q22.1); RUNX1-RUNX1T1"                    : "AML with t(8;21)(q22;q22.1)/RUNX1::RUNX1T1",
        "AML with inv(3)(q21q26.2) or t(3;3)(q21;q26.2); RPN1-EVI1"     : "AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement",
        "Mixed phenotype acute leukaemia, T/myeloid"                    : "Mixed phenotype acute leukemia T/myeloid",
        "Myeloid leukaemia associated with Down syndrome"               : "Myeloid leukemia associated with Down syndrome",
    }
    for key, value in mapping.items():
        if key in gene_fusion:
            return value


# Rename `Other genetic alterations` column to `Gene Fusion`

labels_beataml["WHO AML 2021 Diagnosis"] = (
    labels_beataml["SpecificDxAtAcquisition"]
    .astype(str)
    .apply(classify_annotated_diagnosis_beataml)
)

labels_beataml["WHO AML 2021 Diagnosis"].value_counts()

WHO AML 2021 Diagnosis
AML with mutated NPM1                                                     72
MDS or MDS-related myeloid neoplasms                                      37
AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22)/CBFB::MYH11             17
AML with bZIP mutated CEBPA                                               16
Therapy-related or secondary myeloid neoplasms                            15
APL with t(15;17)(q24.1;q21.2)/PML::RARA                                  12
AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement                            8
AML with inv(3)(q21.3q26.2) or t(3;3)(q21.3;q26.2)/MECOM-rearrangement     5
Mixed phenotype acute leukemia T/myeloid                                   1
Myeloid leukemia associated with Down syndrome                             1
Name: count, dtype: int64

## MDS tAML

### ELN2022 and WHO2021

In [12]:
def classify_controls_mds_taml(normal_samples):
    mapping = {
        'CTR': 'Otherwise-Normal Control'}
    
    for key, value in mapping.items():
        if key in normal_samples:
            return value

def classify_annotated_diagnosis_mds_taml(gene_fusion):
    mapping = {
        "MDS": "MDS or MDS-related myeloid neoplasms",
        "AML": "Therapy-related or secondary myeloid neoplasms",
    }
    for key, value in mapping.items():
        if key in gene_fusion:
            return value


def process_labels_mds_taml(df):
    df['ELN22_Controls'] = df['Patient_ID'].astype(str).apply(classify_controls_mds_taml)
    df["ELN22_Diagnosis"] = (df["Cytological category group"].astype(str).apply(classify_annotated_diagnosis_mds_taml))

    df['ELN22 Combined Diagnoses'] = df[['ELN22_Controls','ELN22_Diagnosis']]\
        .apply(lambda x: ','.join(filter(lambda i: i is not None and i==i, x)), axis=1)
        # Replace empty strings with NaN
    df['ELN22 Combined Diagnoses'] = df['ELN22 Combined Diagnoses'].replace('', np.nan)

    # Create `ELN 2022 Diagnosis` column by splitting `Combined Diagnosis` by comma and taking the first element
    df['ELN AML 2022 Diagnosis'] = df['ELN22 Combined Diagnoses'].str.split(',').str[0]

    # Create `WHO 2021 Diagnosis` column by splitting `Combined Diagnosis` by comma and taking the first element
    df['WHO AML 2021 Diagnosis'] = df['ELN22 Combined Diagnoses'].str.split(',').str[0]

    # Drop columns created except for `ELN 2022 Diagnosis` and `Combined Diagnosis` columns
    df = df.drop(['ELN22_Controls','ELN22_Diagnosis'], axis=1)
        
    return df

labels_mds_taml = process_labels_mds_taml(labels_mds_taml)

labels_mds_taml['ELN AML 2022 Diagnosis'].value_counts(dropna=False)


ELN AML 2022 Diagnosis
MDS or MDS-related myeloid neoplasms              108
Therapy-related or secondary myeloid neoplasms     48
Otherwise-Normal Control                           10
Name: count, dtype: int64

## TARGET ALL

In [13]:
labels_target_all['WHO ALAL Classification'].value_counts(dropna=False)

WHO ALAL Classification
NaN            91
T/M            24
B/M            19
MLL             6
NOS (T/B/M)     1
Name: count, dtype: int64

### ELN2022 and WHO2021

In [14]:
def classify_controls_target_all(normal_samples):
    mapping = {
        'Bone Marrow Normal': 'Otherwise-Normal Control'}
    
    for key, value in mapping.items():
        if key in normal_samples:
            return value

def classify_annotated_diagnosis_target_all(gene_fusion):
    mapping = {
        "T/M": "Mixed phenotype acute leukemia T/myeloid",
        'B/M': 'Mixed phenotype acute leukemia B/myeloid',
        'MLL': 'Mixed phenotype acute leukemia with t(v;11q23.3)/KMT2A-rearranged'
    }
    for key, value in mapping.items():
        if key in gene_fusion:
            return value


def process_labels_target_all(df):
    df['ELN22_Controls'] = df['Sample Type'].astype(str).apply(classify_controls_target_all)
    df["ELN22_Diagnosis"] = (df["WHO ALAL Classification"].astype(str).apply(classify_annotated_diagnosis_target_all))

    df['ELN22 Combined Diagnoses'] = df[['ELN22_Controls','ELN22_Diagnosis']]\
        .apply(lambda x: ','.join(filter(lambda i: i is not None and i==i, x)), axis=1)
        # Replace empty strings with NaN
    df['ELN22 Combined Diagnoses'] = df['ELN22 Combined Diagnoses'].replace('', np.nan)

    # Create `WHO 2021 Diagnosis` column by splitting `Combined Diagnosis` by comma and taking the first element
    df['WHO ALL 2021 Diagnosis'] = df['ELN22 Combined Diagnoses'].str.split(',').str[0]

    # Drop columns created except for `ELN22 Final Diagnosis` and `Combined Diagnosis` columns
    df = df.drop(['ELN22_Controls','ELN22_Diagnosis'], axis=1)
        
    return df

labels_target_all = process_labels_target_all(labels_target_all)

labels_target_all['WHO ALL 2021 Diagnosis'].value_counts(dropna=False)

WHO ALL 2021 Diagnosis
NaN                                                                  84
Mixed phenotype acute leukemia T/myeloid                             24
Mixed phenotype acute leukemia B/myeloid                             20
Otherwise-Normal Control                                              7
Mixed phenotype acute leukemia with t(v;11q23.3)/KMT2A-rearranged     6
Name: count, dtype: int64

## Nordic Peds ALL

### ELN2022 and WHO2021

In [15]:
labels_nordic_all['Immunophenotype_Subtype'].value_counts(dropna=False)

Immunophenotype_Subtype
BCP-ALL HeH              187
NaN                      180
BCP-ALL t(12;21)         163
BCP-ALL undefined        105
T-ALL T-ALL              101
BCP-ALL non-recurrent    100
BCP-ALL 11q23/MLL         28
BCP-ALL t(1;19)           23
BCP-ALL dic(9;20)         20
BCP-ALL t(9;22)           19
BCP-ALL iAMP21            10
BCP-ALL <45chr             5
BCP-ALL >67chr             3
Name: count, dtype: int64

In [16]:
def classify_controls(normal_samples):
    mapping = {
        'Bone Marrow Normal': 'Otherwise-Normal Control',
        'Peripheral Blood Normal': 'Otherwise-Normal Control'}
    
    for key, value in mapping.items():
        if key in normal_samples:
            return value


def classify_fusion(immunophenotype_subtype):
    mapping = {
    'BCP-ALL HeH'           :'B-ALL with hyperdiploidy, high',
    'BCP-ALL t(12;21)'      :'B-ALL with t(12;21)(p13.2;q22.1); ETV6::RUNX1',         
    'BCP-ALL undefined'     :'B-ALL NOS',     
    'T-ALL T-ALL'           :'T-ALL NOS',
    'BCP-ALL non-recurrent' :'B-ALL NOS', 
    'BCP-ALL 11q23/MLL'     :'B-ALL with t(v;11q23.3); KMT2A-rearranged',       
    'BCP-ALL t(1;19)'       :'B-ALL with t(1;19)(q23;p13.3); TCF3::PBX1',       
    'BCP-ALL dic(9;20)'     :'B-ALL dic(9;20)',       
    'BCP-ALL t(9;22)'       :'B-ALL with t(9;22)(q34.1;q11.2); BCR::ABL1',       
    'BCP-ALL iAMP21'        :'B-ALL with iAMP21',        
    'BCP-ALL <45chr'        :'B-ALL with hypodiploidy',        
    'BCP-ALL >67chr'        :'B-ALL with hyperdiploidy'}

    # This criteria above needs to be revised if we choose to include ALL samples in the analysis 
    
    for key, value in mapping.items():
        if key in immunophenotype_subtype:
            return value
        
def process_labels(df):
    df['WHO 2021_Controls'] = df['Sample Type'].astype(str).apply(classify_controls)
    df['WHO 2021_Immunophenotype_Subtype'] = df['Immunophenotype_Subtype'].astype(str).apply(classify_fusion)
    
    df['WHO 2021 Combined Diagnoses'] = df[['WHO 2021_Controls','WHO 2021_Immunophenotype_Subtype']]\
        .apply(lambda x: ','.join(filter(lambda i: i is not None and i==i, x)), axis=1)

    # Replace empty strings with NaN
    df['WHO 2021 Combined Diagnoses'] = df['WHO 2021 Combined Diagnoses'].replace('', np.nan)

    # Create `WHO 2021 Final Diagnosis` column by splitting `Combined Diagnosis` by comma and taking the first element
    df['WHO ALL 2021 Diagnosis'] = df['WHO 2021 Combined Diagnoses'].str.split(',').str[0]

    # Drop columns created except for `WHO 2021 Final Diagnosis` and `Combined Diagnosis` columns
    df = df.drop(['WHO 2021_Controls','WHO 2021_Immunophenotype_Subtype'], axis=1)
        
    return df

# Process labels
labels_nordic_all = process_labels(labels_nordic_all)


# Combine `['WHO 2021 Diagnosis'].value_counts()` from both datasets
labels_nordic_all['WHO ALL 2021 Diagnosis'].value_counts(dropna=False)

WHO ALL 2021 Diagnosis
B-ALL NOS                                        205
B-ALL with hyperdiploidy                         190
B-ALL with t(12;21)(p13.2;q22.1); ETV6::RUNX1    163
T-ALL NOS                                        101
NaN                                               94
Otherwise-Normal Control                          86
B-ALL with t(v;11q23.3); KMT2A-rearranged         28
B-ALL with t(1;19)(q23;p13.3); TCF3::PBX1         23
B-ALL dic(9;20)                                   20
B-ALL with t(9;22)(q34.1;q11.2); BCR::ABL1        19
B-ALL with iAMP21                                 10
B-ALL with hypodiploidy                            5
Name: count, dtype: int64

## ALL French GRAAL

In [17]:
labels_all_graal['Diagnosis'].value_counts(dropna=False)

Diagnosis
T-ALL                       143
Otherwise-Normal Control     12
Name: count, dtype: int64

### ELN2022 and WHO2021

In [18]:
def classify_all_graal(diagnosis):
    mapping = {
    'Otherwise-Normal Control':'Otherwise-Normal Control',
    'T-ALL':'T-ALL NOS'}
    for key, value in mapping.items():
        if key in diagnosis:
            return value
        
labels_all_graal['WHO ALL 2021 Diagnosis'] = labels_all_graal['Diagnosis'].astype(str).apply(classify_all_graal)

labels_all_graal['WHO ALL 2021 Diagnosis'].value_counts(dropna=False)


WHO ALL 2021 Diagnosis
T-ALL NOS                   143
Otherwise-Normal Control     12
Name: count, dtype: int64

## Merge clinical data

In [19]:

# Combine all clinical data labels into one dataframe
labels_combined = pd.concat([labels_aml05, labels_beataml,
                        labels_0531, labels_amltcga, labels_1031,
                        labels_nordic_all, labels_mds_taml,
                        labels_all_graal,labels_target_all], axis=0, join='outer')

# Read df
df = pd.read_pickle(output_path + 'df_batch_corrected.pkl')

# Remove samples that are not in the methyl dataset
df_labels = labels_combined.loc[labels_combined.index.isin(df.index)].sort_index()

## Create new columns

In [20]:
df_labels['Age (years)'].isna().sum()

1342

In [24]:
def categorize_age(age):
    if pd.isnull(age):
        return np.nan
    elif age < 5:
        return '0-5'
    elif age < 13:
        return '5-13'
    elif age < 39:
        return '13-39'
    elif age < 60:
        return '39-60'
    else:
        return '60+'

# Convert 'Age (years)' to numeric, errors='coerce' will turn non-numeric data to NaN
df_labels['Age (years)'] = pd.to_numeric(df_labels['Age (years)'], errors='coerce')

# Then apply your function
df_labels['Age (group years)'] = df_labels['Age (years)'].apply(categorize_age)



def classify_main_disease(subtype):
    mapping = {
    'AML':                              'Acute myeloid leukemia (AML)',
    'ALL':                              'Acute lymphoblastic leukemia (ALL)',
    'MDS':                              'Myelodysplastic syndrome (MDS or MDS-like)',
    'Mixed phenotype acute leukemia':  'Mixed phenotype acute leukemia (MPAL)',
    'APL':                              'Acute promyelocytic leukemia (APL)',
    'Otherwise-Normal Control':         'Otherwise-Normal (Control)'}

    for key, value in mapping.items():
        if key in subtype:
            return value
        

def process_labels(df):
    df['WHO_AML'] = df['WHO AML 2021 Diagnosis'].astype(str).apply(classify_main_disease)
    df["WHO_ALL"] = (df["WHO ALL 2021 Diagnosis"].astype(str).apply(classify_main_disease))

    df['WHO 2021 Diagnosis'] = df[['WHO_AML','WHO_ALL']]\
        .apply(lambda x: ','.join(filter(lambda i: i is not None and i==i, x)), axis=1)
        # Replace empty strings with NaN
    df['WHO 2021 Diagnosis'] = df['WHO 2021 Diagnosis'].replace('', np.nan)

    # Create `WHO 2021 Diagnosis` column by splitting `Combined Diagnosis` by comma and taking the first element
    # df['WHO ALL 2021 Diagnosis'] = df['WHO 2021 Diagnosis'].str.split(',').str[0]

    # Drop columns created except for `WHO Final Diagnosis` and `Combined Diagnosis` columns
    df = df.drop(['WHO_AML','WHO_ALL'], axis=1)
        
    return df

# Process labels
df_labels = process_labels(df_labels)


df_labels['WHO 2021 Diagnosis'].value_counts(dropna=False)

WHO 2021 Diagnosis
Acute myeloid leukemia (AML)                  1107
Acute lymphoblastic leukemia (ALL)             905
NaN                                            802
Otherwise-Normal (Control)                     267
Myelodysplastic syndrome (MDS or MDS-like)     167
Mixed phenotype acute leukemia (MPAL)           51
Acute promyelocytic leukemia (APL)              31
Name: count, dtype: int64

In [25]:
df_labels['Age (group years)'].value_counts(dropna=False)

Age (group years)
NaN      1342
13-39     658
5-13      482
0-5       480
60+       203
39-60     165
Name: count, dtype: int64

## Save Data

In [26]:

# Save the clinical data labels
df_labels.to_csv(output_path + 'clinical_data.csv')

print('The clinical data has been indexed and cleaned.\n\
Exclusion of samples may be applied later depending on the analysis.')

The clinical data has been indexed and cleaned.
Exclusion of samples may be applied later depending on the analysis.
