# Reclassification to Current Clinical Guidelines

```{contents}
```

Current thinking:

14 WHO subtypes of AML
+1 otherwise-normal control

Open question: we still need to decide how to handle MDS-like and secondary neoplasms.


## Guidelines

### WHO 2022 AML

In [1]:
import pandas as pd
import itables

from itables import show

# Load the WHO 2022 AML classification table from the project data folder
df = pd.read_csv('../data/who2022_aml_classification.csv')

# Render it as an interactive table (same function as itables.show)
show(df)


Subtype,Frequency in paediatric AML,Morphology,Immunophenotype,Common co-occurring genetic aberrations,Prognosis
Loading... (need help?),,,,,


`````{admonition} Source
:class: tip
Hasle H, Meshinchi S, Fogelstrand L, Alaggio R, et al. Acute myeloid leukaemias (AMLs) with defining genetic abnormalities. In: WHO Classification of Tumours Editorial Board. Paediatric tumours [Internet]. Lyon (France): International Agency for Research on Cancer; 2022 [cited 2024 Jan 1]. (WHO classification of tumours series, 5th ed.; vol. 7). Available from: https://tumourclassification.iarc.who.int/chapters/44.
`````

## Load and process discovery clinical data

In [2]:
# Make the project-level `source` package importable, then pull in the cleanup helpers
import sys
sys.path.append('../')
from source.clinical_data_cleanup_functions import *

# Merge, index and clean the clinical data files of each cohort
labels_0531 = clean_cog(merge_index_0531())
labels_1031 = clean_cog(merge_index_1031())
labels_aml05 = clean_aml05(merge_index_aml05())
labels_beataml = clean_beataml(merge_index_beataml())
labels_amltcga = clean_amltcga(merge_index_amltcga())
labels_nordic_all = clean_nordic_all(merge_index_nordic_all())
labels_mds_taml = clean_mds_taml(merge_index_mds_taml())
labels_all_graal = clean_all_graal(merge_index_all_graal())
labels_target_all = clean_target_all(merge_index_target_all())

# Stack every cohort into one dataframe, keeping the union of their columns
cohort_labels = [
    labels_aml05, labels_beataml, labels_0531, labels_amltcga, labels_1031,
    labels_nordic_all, labels_mds_taml, labels_all_graal, labels_target_all,
]
labels_combined = pd.concat(cohort_labels, axis=0, join='outer')

# Output location is (re)defined here so the cell can be run on its own
# mount = '../../../'
mount = '/mnt/d/'
output_path = f'{mount}MethylScore/Intermediate_Files/'

# Load the methylation dataframe
df = pd.read_pickle(output_path + '3308samples_333059cpgs_withbatchcorrection_bvalues.pkl')

# Keep only the clinical rows whose index is present in the methylation data
in_methyl_data = labels_combined.index.isin(df.index)
df_labels = labels_combined.loc[in_methyl_data].sort_index()

print('The clinical data has been indexed and cleaned.\n'
      'Exclusion of samples may be applied depending on the analysis.')

The clinical data has been indexed and cleaned.
Exclusion of samples may be applied depending on the analysis.


## Reclassification strategy for WHO 2022

### Functions

In [3]:
def classify_controls(normal_samples):
    """Label a sample type as a control when it names a normal tissue source.

    Returns 'Otherwise-Normal Control' if `normal_samples` contains
    'Bone Marrow Normal' or 'Blood Derived Normal'; otherwise returns None.
    """
    control_sample_types = ('Bone Marrow Normal', 'Blood Derived Normal')
    if any(sample_type in normal_samples for sample_type in control_sample_types):
        return 'Otherwise-Normal Control'
    return None

def classify_fusion(gene_fusion):
    """Map a gene-fusion annotation to a WHO 2022 diagnosis label.

    The rules are tried in the order they are listed and the first one whose
    key occurs in `gene_fusion` wins. Keys are matched as plain substrings,
    except the legacy alias 'MLL' (see below).

    Parameters
    ----------
    gene_fusion : str
        Fusion annotation, e.g. 'RUNX1-RUNX1T1' or 'KMT2A-MLLT3'.

    Returns
    -------
    str or None
        The WHO 2022 label of the first matching rule, or None if no rule
        matches.
    """
    import re  # local import so the function is self-contained in the notebook

    mapping = {
    'RUNX1-RUNX1T1': 'AML with t(8;21); RUNX1::RUNX1T1',
    'CBFB-MYH11':    'AML with inv(16); t(16;16); CBFB::MYH11',
    'KMT2A':         'AML with t(v;11q23); KMT2A-r',
    'MLL':           'AML with t(v;11q23); KMT2A-r',
    'PML-RARA':      'APL with t(15;17); PML::RARA',
    'DEK-NUP214':    'AML with t(6;9); DEK::NUP214',
    'MECOM':         'AML with inv(3); t(3;3); MECOM-r',
    'ETV6':          'AML with ETV6 fusion',
    'NPM1':          'AML with mutated NPM1',
    'RBM15-MKL1':    'AML with t(1;22); RBM15::MKL1',
    'NUP98':         'AML with NUP98-fusion',
    'KAT6A-CREBBP':  'AML with t(8;16); KAT6A::CREBBP',
    'FUS-ERG':       'AML with t(16;21); FUS::ERG',
    'CBFA2T3-GLIS2': 'AML with CBFA2T3::GLIS2',
    'BCR-ABL1':       'AML with t(9;22); BCR::ABL1',

    # Other uncharacterized abnormalities present in the dataset but not specifically in guidelines

    # 'RUNX1-CBFA2T3': 'AML NOS',
    # 'PRDM16-RPN1':   'AML NOS',
    # 'RBM15-MRTFA':   'AML with t(1;22); RBM15::MKL1',
    # 'CBFA2T3-GLIS3': 'AML with CBFA2T3::GLIS2',
    # 'PSIP1-NUP214':  'AML with t(6;9); DEK::NUP214',
    # 'XPO1-TNRC18':   'AML NOS',
    # 'HNRNPH1-ERG':   'AML with t(16;21); FUS::ERG',
    # 'NIPBL-HOXB9':   'AML NOS',
    # 'SET-NUP214':    'AML with t(6;9); DEK::NUP214',
    # 'FLI1-IFIT2':    'AML NOS',
    # 'TCF4-ZEB2':     'AML NOS',
    # 'MBTD1-ZMYND11': 'AML NOS',
    # 'FOSB-KLF6':     'AML NOS',
    # 'SFPQ-ZFP36L2':  'AML NOS',
    # 'RUNX1-LINC00478':'AML NOS',
    # 'RUNX1-EVX1':     'AML NOS',
    # 'PSPC1-ZFP36L1':  'AML NOS',
    # 'EWSR1-FEV':      'AML NOS',
    # 'STAG2-AFF2':     'AML NOS',
    # 'MYB-GATA1':      'AML NOS',
    # 'RUNX1-ZFPM2':    'AML NOS',
    # 'RUNX1-CBFA2T2':  'AML NOS',
    # 'PIM3-BRD1':      'AML NOS',
    'KAT6A-EP300':    'AML with t(8;16); KAT6A::CREBBP',
    # 'DOT1L-RPS15':    'AML NOS',
    # 'FUS-FEV':        'AML with t(16;21); FUS::ERG',
    'KAT6A-NCOA2':    'AML with t(8;16); KAT6A::CREBBP',
    # 'JARID2-PTP4A1':  'AML NOS',
    # 'FUS-FLI1':       'AML with t(16;21); FUS::ERG',
    }

    # 'MLL' is the legacy symbol for KMT2A, but it is also the prefix of other
    # gene symbols (MLLT1, MLLT3, MLLT10, MLL2, ...). A plain substring test
    # would label e.g. 'PICALM-MLLT10' as KMT2A-rearranged, so the alias only
    # counts when it is not followed by another uppercase letter or digit.
    mll_alias = re.compile(r'MLL(?![A-Z0-9])')

    for key, value in mapping.items():
        if key == 'MLL' and isinstance(gene_fusion, str):
            matched = mll_alias.search(gene_fusion) is not None
        else:
            matched = key in gene_fusion
        if matched:
            return value
    return None

def classify_cebpa(cebpa_mutation):
    """Translate a CEBPA mutation flag into a WHO 2022 label.

    Returns 'AML with mutated CEBPA' when `cebpa_mutation` contains 'Yes' or
    'Positive'; otherwise returns None.
    """
    positive_flags = ('Yes', 'Positive')
    for flag in positive_flags:
        if flag in cebpa_mutation:
            return 'AML with mutated CEBPA'
    return None

def classify_npm(npm_mutation):
    """Translate an NPM1 mutation flag into a WHO 2022 label.

    Returns 'AML with mutated NPM1' when `npm_mutation` contains 'Yes' or
    'Positive'; otherwise returns None.
    """
    npm1_label = 'AML with mutated NPM1'
    label_by_flag = dict.fromkeys(('Yes', 'Positive'), npm1_label)
    return next(
        (label for flag, label in label_by_flag.items() if flag in npm_mutation),
        None,
    )

def classify_karyotype(structural_variation):
    """Map a karyotype string to a WHO 2022 label.

    Every karyotype rule is currently commented out, so the rule table is
    empty and this function returns None for any input.
    """
    karyotype_rules = {
        # 't(8;16)': 'AML with t(8;16); KAT6A::CREBBP',
        # 't(16;21)': 'AML with t(16;21); FUS::ERG',
        # 't(6;9)': 'AML with t(6;9); DEK::NUP214',
        # 't(1;22)': 'AML with t(1;22); RBM15::MKL1',
        # 'inv(3)': 'AML with inv(3); t(3;3); MECOM-r',
        # 't(3;3)': 'AML with inv(3); t(3;3); MECOM-r',
        # 't(6;11)': 'AML with t(v;11q23); KMT2A-r',
        # 't(1;11)': 'AML with t(v;11q23); KMT2A-r',
        # 't(4;11)': 'AML with t(v;11q23); KMT2A-r',
        # 'ins(6;11)': 'AML with t(v;11q23); KMT2A-r',
    }

    # First rule whose pattern occurs in the karyotype wins; None when nothing matches
    return next(
        (label for pattern, label in karyotype_rules.items()
         if pattern in structural_variation),
        None,
    )
            
def classify_annotated_diagnosis(diagnosis):
    """Map a free-text diagnosis annotation to a WHO 2022 label.

    Rules are checked in the order listed; the label of the first rule whose
    pattern is a substring of `diagnosis` is returned. Returns None when no
    pattern matches.
    """
    # Labels that several rules share
    npm1 = 'AML with mutated NPM1'
    cebpa = 'AML with mutated CEBPA'
    mds = 'MDS-related; secondary myeloid'
    inv16 = 'AML with inv(16); t(16;16); CBFB::MYH11'
    t8_21 = 'AML with t(8;21); RUNX1::RUNX1T1'
    kmt2a = 'AML with t(v;11q23); KMT2A-r'
    mpal_t = 'MPAL T-Lymphoblastic/Myeloid'
    hyperdiploid = 'B-ALL with hyperdiploidy'

    rules = (
        # AML / MDS
        ('mutated NPM1', npm1),
        ('mutated CEBPA', cebpa),
        ('myelodysplasia-related changes', mds),
        ('AML with mutated NPM1', npm1),
        ('AML with myelodysplasia-related changes', mds),
        ('AML with inv(16)(p13.1q22) or t(16;16)(p13.1;q22); CBFB-MYH11', inv16),
        ('AML with mutated CEBPA', cebpa),
        ('Therapy-related myeloid neoplasms', mds),
        ('PML-RARA', 'APL with t(15;17); PML::RARA'),
        ('AML with t(9;11)(p22;q23); MLLT3-MLL', kmt2a),
        ('AML with t(8;21)(q22;q22.1); RUNX1-RUNX1T1', t8_21),
        ('AML with inv(3)(q21q26.2) or t(3;3)(q21;q26.2); RPN1-EVI1',
         'AML with inv(3); t(3;3); MECOM-r'),
        ('Mixed phenotype acute leukaemia, T/myeloid', mpal_t),
        ('Myeloid leukaemia associated with Down syndrome', 'AML with Down syndrome'),
        ('AML with CBFB-MYH11', inv16),
        ('AML with RUNX1-RUNX1T1', t8_21),
        ('AML with BCR-ABL1', 'AML with t(9;22); BCR::ABL1'),
        ('AML inv(16)', inv16),
        ('AML t(9;11)', kmt2a),
        ('AML t(8;21)', t8_21),
        ('MDS-related or secondary myeloid neoplasms', mds),

        # ALL
        ('T-ALL', 'T-ALL NOS'),
        ('BCP-ALL HeH', hyperdiploid),
        ('BCP-ALL t(12;21)', 'B-ALL with t(12;21); ETV6::RUNX1'),
        ('BCP-ALL 11q23/MLL', 'B-ALL with t(v;11q23); KMT2A-r'),
        ('BCP-ALL t(1;19)', 'B-ALL with t(1;19); TCF3::PBX1'),
        ('BCP-ALL dic(9;20)', 'B-ALL dic(9;20)'),
        ('BCP-ALL t(9;22)', 'B-ALL with t(9;22); BCR::ABL1'),
        ('BCP-ALL iAMP21', 'B-ALL with iAMP21'),
        ('BCP-ALL <45chr', 'B-ALL with hypodiploidy'),
        ('BCP-ALL >67chr', hyperdiploid),

        # MPAL
        ('MPAL with MLL', 'MPAL with t(v;11q23.3)/KMT2A-r'),
        ('MPAL with T/M', mpal_t),
        ('MPAL with B/M', 'MPAL B-Lymphoblastic/Myeloid'),
    )

    for pattern, label in rules:
        if pattern in diagnosis:
            return label
    return None

def process_labels_who22(df):
    """Add WHO 2022 diagnosis columns to a clinical-labels dataframe.

    Each source column is converted to string and passed through its
    classifier. The non-null results are joined, in the priority order of
    `column_mappings`, into 'WHO 2022 Combined Diagnoses'; the first of them
    becomes 'WHO 2022 Diagnosis'. Rows with no match in any source column get
    NaN in both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Sample Type', 'Gene Fusion',
        'CEBPA mutation', 'NPM mutation', 'Karyotype' and 'Dx at Acquisition'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the two WHO 2022 columns added. The input frame
        is left untouched.
    """
    # Imported locally: the notebook has no explicit numpy import, so `np`
    # was previously only available if the star import happened to export it.
    import numpy as np

    # New column -> (source column, classifier). The order is the priority
    # used when several sources yield a diagnosis for the same sample.
    column_mappings = {
        'Annotated Controls'            : ('Sample Type', classify_controls),
        'Annotated Gene Fusion'         : ('Gene Fusion', classify_fusion),
        'Annotated CEBPA'               : ('CEBPA mutation', classify_cebpa),
        'Annotated NPM1'                : ('NPM mutation', classify_npm),
        'Annotated Karyotype'           : ('Karyotype', classify_karyotype),
        'Annotated Dx at Acquisition'   : ('Dx at Acquisition', classify_annotated_diagnosis),
    }
    annotated_cols = list(column_mappings)

    # Work on a copy so the caller's dataframe does not silently gain columns
    # (and so a sliced input does not trigger chained-assignment warnings).
    df = df.copy()

    # astype(str) turns missing values into text that matches no rule, so
    # the classifiers never see a non-string value.
    for new_col, (old_col, processing_func) in column_mappings.items():
        df[new_col] = df[old_col].astype(str).apply(processing_func)

    # Join the non-null labels with commas; an empty join means "no diagnosis"
    df['WHO 2022 Combined Diagnoses'] = (
        df[annotated_cols]
        .apply(lambda row: ','.join(row.dropna()), axis=1)
        .replace('', np.nan)
    )

    # The first label (highest-priority source) is the final diagnosis.
    # This relies on the labels themselves containing no commas.
    df['WHO 2022 Diagnosis'] = df['WHO 2022 Combined Diagnoses'].str.split(',').str[0]

    # Drop the intermediate per-source columns
    return df.drop(columns=annotated_cols)


### Execution

In [4]:
# Apply the WHO 2022 reclassification to the discovery clinical data
# and count samples per diagnosis (NaN = no rule matched)
df_labels2 = process_labels_who22(df_labels)
df_labels2['WHO 2022 Diagnosis'].value_counts(dropna=False)

WHO 2022 Diagnosis
NaN                                        866
AML with t(v;11q23); KMT2A-r               317
Otherwise-Normal Control                   251
T-ALL NOS                                  242
AML with mutated NPM1                      226
MDS-related; secondary myeloid             225
B-ALL with hyperdiploidy                   190
AML with inv(16); t(16;16); CBFB::MYH11    178
AML with t(8;21); RUNX1::RUNX1T1           176
B-ALL with t(12;21); ETV6::RUNX1           163
AML with NUP98-fusion                       95
AML with mutated CEBPA                      83
APL with t(15;17); PML::RARA                31
AML with CBFA2T3::GLIS2                     30
B-ALL with t(v;11q23); KMT2A-r              28
AML with t(6;9); DEK::NUP214                26
MPAL T-Lymphoblastic/Myeloid                25
B-ALL with t(1;19); TCF3::PBX1              23
B-ALL dic(9;20)                             20
MPAL B-Lymphoblastic/Myeloid                19
B-ALL with t(9;22); BCR::ABL1            

## Categorize ELN 2022, hematopoietic entity, age, and trisomy 8

### Functions

In [5]:
def process_df_labels(df):
    """
    Add age group, hematopoietic entity, trisomy 8 status and ELN 2022
    diagnosis columns to a clinical-labels dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age (years)', 'WHO 2022 Diagnosis' and
        'Karyotype'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which 'Age (years)' is coerced to numeric and the
        columns 'Age (group years)', 'Hematopoietic Entity',
        'Trisomy 8 Status' and 'ELN 2022 Diagnosis' are added. The input
        frame is left untouched.
    """
    # Imported locally: the notebook has no explicit numpy import, so `np`
    # was previously only available if the star import happened to export it.
    import numpy as np

    def first_match(text, mapping):
        """Return the value of the first key of `mapping` found in `text`, else None."""
        for key, value in mapping.items():
            if key in text:
                return value
        return None

    def categorize_age(age):
        """
        Function to categorize age into a specific range.
        Lower bounds are inclusive, upper bounds exclusive.
        """
        if pd.isnull(age):
            return np.nan
        elif age < 5:
            return '0-5'
        elif age < 13:
            return '5-13'
        elif age < 39:
            return '13-39'
        elif age < 60:
            return '39-60'
        else:
            return '60+'

    def categorize_subtypes(subtype):
        """
        Function to classify the main disease based on a given subtype.
        """
        mapping = {
            'AML'                           :'Acute myeloid leukemia (AML)',
            'ALL'                           :'Acute lymphoblastic leukemia (ALL)',
            'MDS'                           :'Myelodysplastic syndrome (MDS or MDS-like)',
            'MPAL'                          :'Mixed phenotype acute leukemia (MPAL)',
            'APL'                           :'Acute promyelocytic leukemia (APL)',
            'Otherwise-Normal Control'      :'Otherwise-Normal (Control)',
        }
        return first_match(subtype, mapping)

    def categorize_trisomy8(karyotype):
        """
        Function to classify trisomy 8 by karyotype as a separate column.
        """
        mapping = {
            '+8': 'Positive',
            'trisomy 8': 'Positive',
        }
        return first_match(karyotype, mapping)

    def categorize_ELN2022(subtype):
        """
        Function to reclassify samples according to ELN AML 2022.
        """
        mapping = {
            'AML with t(v;11q23); KMT2A-r'                  :'AML with t(v;11q23); KMT2A-r',
            'Otherwise-Normal Control'                      :'Otherwise-Normal Control',
            'AML with mutated CEBPA'                        :'AML with mutated CEBPA',
            'MDS-related; secondary myeloid'                :'MDS-related; secondary myeloid',
            'AML with inv(3); t(3;3); MECOM-r'              :'AML with inv(3); t(3;3); MECOM-r',
            'AML with mutated NPM1'                         :'AML with mutated NPM1',
            'AML with inv(16); t(16;16); CBFB::MYH11'       :'AML with inv(16); t(16;16); CBFB::MYH11',
            'AML with t(8;21); RUNX1::RUNX1T1'              :'AML with t(8;21); RUNX1::RUNX1T1',
            'AML with t(9;22); BCR::ABL1'                   :'AML with t(9;22); BCR::ABL1',
            'AML with t(6;9); DEK::NUP214'                  :'AML with t(6;9); DEK::NUP214',
            'APL with t(15;17); PML::RARA'                  :'APL with t(15;17); PML::RARA',
            'AML with t(1;22); RBM15::MKL1'                 :'AML with other recurring translocations',
            'AML with t(16;21); FUS::ERG'                   :'AML with other recurring translocations',
            'AML with t(8;16); KAT6A::CREBBP'               :'AML with other recurring translocations',
            'AML with CBFA2T3::GLIS2'                       :'AML with other recurring translocations',
            'AML with NUP98-fusion'                         :'AML with other recurring translocations',
            'AML with ETV6 fusion'                          :'AML with other recurring translocations',
        }
        return first_match(subtype, mapping)

    # Work on a copy so the caller's dataframe is not modified in place
    df = df.copy()

    # Convert 'Age (years)' to numeric, errors='coerce' will turn non-numeric data to NaN
    df['Age (years)'] = pd.to_numeric(df['Age (years)'], errors='coerce')
    df['Age (group years)'] = df['Age (years)'].apply(categorize_age)

    # astype(str) turns missing values into text that matches no rule
    who_diagnosis = df['WHO 2022 Diagnosis'].astype(str)

    df['Hematopoietic Entity'] = who_diagnosis.apply(categorize_subtypes)

    # No karyotype match is reported as 'Negative or Unknown', not as missing
    df['Trisomy 8 Status'] = (
        df['Karyotype'].astype(str).apply(categorize_trisomy8).fillna('Negative or Unknown')
    )

    df['ELN 2022 Diagnosis'] = who_diagnosis.apply(categorize_ELN2022)

    return df

### Execution

In [6]:
# Add the age group, hematopoietic entity, trisomy 8 and ELN 2022 columns
df_labels3 = process_df_labels(df_labels2)

# Count samples per hematopoietic entity (samples without an entity are omitted)
df_labels3['Hematopoietic Entity'].value_counts(dropna=True)

Hematopoietic Entity
Acute myeloid leukemia (AML)                  1185
Acute lymphoblastic leukemia (ALL)             700
Otherwise-Normal (Control)                     251
Myelodysplastic syndrome (MDS or MDS-like)     225
Mixed phenotype acute leukemia (MPAL)           50
Acute promyelocytic leukemia (APL)              31
Name: count, dtype: int64

## Apply Classification to Test Dataset

In [7]:
# Import functions to clean up clinical data
import sys
sys.path.append('../')
from source.clinical_data_cleanup_functions import *

# Call functions to merge, index and clean clinical data files
labels_aml02 = clean_aml02(merge_index_aml02())
labels_aml08 = clean_aml08(merge_index_aml08())

# Combine all clinical data labels into one dataframe
labels_test = pd.concat([labels_aml02, labels_aml08], axis=0, join='outer')

# Redefine output path (for troubleshooting purposes in case only this cell is run)
# mount = '../../../'
# The assignment below had degraded to a bare string, so `mount` was only
# defined if the discovery cell had been run first.
mount = '/mnt/d/'
# Same intermediate-files directory as the discovery cell; the previous value
# produced '/mnt/d//Intermediate_Files/'.
# NOTE(review): confirm the validation pickle lives in this directory.
output_path = mount + 'MethylScore/Intermediate_Files/'

# Read df_test
df_test = pd.read_pickle(output_path + '201samples_357839cpgs_withbatchcorrection_bvalues.pkl')

# Remove samples that are not in the methyl dataset
df_labels_test = labels_test.loc[labels_test.index.isin(df_test.index)].sort_index()

print('The clinical data has been indexed and cleaned.\n\
Exclusion of samples may be applied depending on the analysis.')

The clinical data has been indexed and cleaned.
Exclusion of samples may be applied depending on the analysis.


In [8]:
# Apply the WHO 2022 reclassification to the test clinical data
# and count samples per diagnosis (NaN = no rule matched)
df_labels_test2 = process_labels_who22(df_labels_test)
df_labels_test2['WHO 2022 Diagnosis'].value_counts(dropna=False)

WHO 2022 Diagnosis
NaN                                        99
AML with t(v;11q23); KMT2A-r               47
AML with t(8;21); RUNX1::RUNX1T1           29
AML with inv(16); t(16;16); CBFB::MYH11    23
AML with mutated CEBPA                      2
AML with t(6;9); DEK::NUP214                1
Name: count, dtype: int64

In [9]:
# Add the age group, hematopoietic entity, trisomy 8 and ELN 2022 columns to the test data
df_labels_test3 = process_df_labels(df_labels_test2)

# Count samples per hematopoietic entity (samples without an entity are omitted)
df_labels_test3['Hematopoietic Entity'].value_counts(dropna=True)

Hematopoietic Entity
Acute myeloid leukemia (AML)    102
Name: count, dtype: int64

In [10]:
# WHO 2022 diagnosis counts for the discovery data, including unclassified (NaN) samples
df_labels3['WHO 2022 Diagnosis'].value_counts(dropna=False)

WHO 2022 Diagnosis
NaN                                        866
AML with t(v;11q23); KMT2A-r               317
Otherwise-Normal Control                   251
T-ALL NOS                                  242
AML with mutated NPM1                      226
MDS-related; secondary myeloid             225
B-ALL with hyperdiploidy                   190
AML with inv(16); t(16;16); CBFB::MYH11    178
AML with t(8;21); RUNX1::RUNX1T1           176
B-ALL with t(12;21); ETV6::RUNX1           163
AML with NUP98-fusion                       95
AML with mutated CEBPA                      83
APL with t(15;17); PML::RARA                31
AML with CBFA2T3::GLIS2                     30
B-ALL with t(v;11q23); KMT2A-r              28
AML with t(6;9); DEK::NUP214                26
MPAL T-Lymphoblastic/Myeloid                25
B-ALL with t(1;19); TCF3::PBX1              23
B-ALL dic(9;20)                             20
MPAL B-Lymphoblastic/Myeloid                19
B-ALL with t(9;22); BCR::ABL1            

In [11]:
# ELN 2022 diagnosis counts for the discovery data; samples with no ELN category show as None
df_labels3['ELN 2022 Diagnosis'].value_counts(dropna=False)

ELN 2022 Diagnosis
None                                       1616
AML with t(v;11q23); KMT2A-r                317
Otherwise-Normal Control                    251
AML with mutated NPM1                       226
MDS-related; secondary myeloid              225
AML with inv(16); t(16;16); CBFB::MYH11     178
AML with t(8;21); RUNX1::RUNX1T1            176
AML with other recurring translocations     164
AML with mutated CEBPA                       83
APL with t(15;17); PML::RARA                 31
AML with t(6;9); DEK::NUP214                 26
AML with inv(3); t(3;3); MECOM-r             12
AML with t(9;22); BCR::ABL1                   3
Name: count, dtype: int64

## Save Clinical Datasets

In [12]:
# Save the full discovery clinical data (with WHO 2022 / ELN 2022 columns) as CSV
df_labels3.to_csv(output_path + 'discovery_clinical_data.csv')

# Save a short Excel version holding only the diagnosis, trial and vital-status columns
df_labels3[['WHO 2022 Diagnosis','ELN 2022 Diagnosis', 'Clinical Trial', 'Vital Status']].to_excel(output_path + 'discovery_clinical_data_short.xlsx')

# Save the processed test-set labels as the validation clinical data (CSV)
df_labels_test3.to_csv(output_path + 'validation_clinical_data.csv')

# Save individual datasets

# labels_aml05.to_excel(output_path + 'aml05_clinical_data.xlsx')
# labels_beataml.to_excel(output_path + 'beataml_clinical_data.xlsx')
# labels_0531.to_excel(output_path + '0531_clinical_data.xlsx')
# labels_amltcga.to_excel(output_path + 'amltcga_clinical_data.xlsx')
# labels_1031.to_excel(output_path + '1031_clinical_data.xlsx')
# labels_nordic_all.to_excel(output_path + 'nordic_all_clinical_data.xlsx')
# labels_mds_taml.to_excel(output_path + 'mds_taml_clinical_data.xlsx')
# labels_all_graal.to_excel(output_path + 'all_graal_clinical_data.xlsx')
# labels_target_all.to_excel(output_path + 'target_all_clinical_data.xlsx')
# labels_aml02.to_excel(output_path + 'aml02_clinical_data.xlsx')
# labels_aml08.to_excel(output_path + 'aml08_clinical_data.xlsx')