# DNA methylation-based diagnosis and prognosis of pediatric AML

We propose to leverage machine learning tools to develop DNA methylation-based signatures of clinical utility in pediatric AML.

## The AML Methylome

Interactive visualization of the diagnostic map of AML for pediatric/adolecent/young adult patients based solely on DNA methylation.

In [1]:
import pandas as pd

PaCMAP_path = '../Data/Processed_Data/PaCMAP_Results/'
input_path = '../Data/Processed_Data/'
output_path = '../Data/Processed_Data/'

x_train = pd.read_pickle(PaCMAP_path+'embedding.pkl')
x_test = pd.read_pickle(PaCMAP_path+'embedding_test.pkl')

y = pd.read_csv(input_path+'y.csv', index_col=0)
y['PaCMAP Output'] = 'PaCMAP Output'

labels = pd.read_excel(input_path+'y_plus_WHOclass.xlsx', index_col=0)
labels = pd.concat([y, labels], axis=1)
labels = labels[labels.index.isin(x_train.index)]['WHO Classification']

y = y.join(labels.to_frame('WHO Classification'))
y['KMT2A Fusions'] = y[y['WHO Classification'].isin(['AML with KMT2A-rearrangement'])]['Gene Fusion'] 

y_train = y[~y['Clinical Trial'].isin(['AML02','AML08'])]
y_test = y[y['Clinical Trial'].isin(['AML02','AML08'])]

from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap

from bokeh.layouts import layout
from bokeh.models import Div, Slider, TabPanel, Tabs, Legend
from bokeh.io import curdoc, output_notebook

output_notebook()
custom_color_palette = [
    '#1f77b4',
    '#ff7f0e', 
    '#2ca02c',
    '#d62728',
    '#9467bd', 
    '#7f7f7f',
    '#e377c2',
    '#e7ba52',
    '#bcbd22',
    '#17becf',
    '#393b79',
    '#8c564b',
    '#f7b6d2',
    '#c49c94'] 

# Reference: 
# https://github.com/d3/d3-3.x-api-reference/blob/master/Ordinal-Scales.md#categorical-colors

# sort the subtypes by frequency
sorted_subtypes = y['WHO Classification'].value_counts(
                  ).sort_values(ascending=False).index.to_list()

# zip the subtypes and colors together
custom_color_map = dict(zip(sorted_subtypes, custom_color_palette))


In [7]:
from bokeh.layouts import layout
# Define the list of columns to include in the plot
cols = ['PaCMAP Output','WHO Classification', 'FAB', 'FLT3 ITD', 'Age group (years)',
        'Complex Karyotype','Primary Cytogenetic Code' ,'Karyotype']

# Join the training data with the labels and reset the index
df = x_train.join(y_train[cols]).reset_index()

# Set the theme for the plot
curdoc().theme = 'light_minimal'

# Define a function for creating the Bokeh figure
def create_figure():
    return figure(width=600, height=600, sizing_mode='fixed',
                  x_axis_label='PaCMAP 1', y_axis_label='PaCMAP 2',
                  tools="pan,wheel_zoom,reset,save", active_drag="pan",
                  active_scroll="wheel_zoom",
                  tooltips=[("Diagnosis", "@{WHO Classification}"),
                            ("Karyotype", "@Karyotype")])

# Define a function for creating a scatter plot with color encoding by a given column
def create_scatter(df, p, hue):
    df = df[~df[hue].isna()]  # Filter out rows with NaN values for the hue column
    return p.scatter(x="PaCMAP 1", y="PaCMAP 2", source=df.copy(),
                     fill_alpha=0.8, size=5,
                     color=factor_cmap(field_name=hue, palette=custom_color_palette,
                                       factors=df[hue].value_counts().sort_values(
                                           ascending=False).index.to_list()),
                     legend_group=hue)

# Create the Bokeh figure and scatter plots for each column
tabs = Tabs(tabs=[TabPanel(child=create_figure(), title=title) for title in cols[:-1]],
            tabs_location='left')

points = [create_scatter(df, tab.child, hue=col) for tab, col in zip(tabs.tabs, cols)]
for p in tabs.tabs:
    p.child.toolbar.logo = None

# Define a slider for adjusting the size of the data points
slider = Slider(title="Adjust datapoint size", start=0, end=20, step=1, value=points[0].glyph.size)
for p in points:
    slider.js_link("value", p.glyph, "size")

# Create a layout for the plot and display it
div = Div(text="<b>The AML Diagnostic Map</b>\nInteractive visualization of the pediatric AML methylome:",
          width=200, height=85)
layout = layout([[[div, tabs, slider]]])

show(layout)

In [3]:
# Define the list of columns to include in the plot
cols = ['Primary Cytogenetic Code', 'FAB', 'FLT3 ITD', 'Age group (years)',
        'WHO Classification', 'Complex Karyotype', 'Karyotype']

# Join the training data with the labels and reset the index
df = x_train.join(y_train[cols]).reset_index()

# Set the theme for the plot
curdoc().theme = 'light_minimal'

# Define a function for creating the Bokeh figure
def create_figure():
    return figure(width=600, height=600, sizing_mode='fixed',
                  x_axis_label='PaCMAP 1', y_axis_label='PaCMAP 2',
                  tools="pan,wheel_zoom,reset,save", active_drag="pan",
                  active_scroll="wheel_zoom",
                  tooltips=[("Sample", "@index"), ("Karyotype", "@Karyotype")])

# Define a function for creating a scatter plot with color encoding by a given column
def create_scatter(df, p, hue):
    df = df[~df[hue].isna()]  # Filter out rows with NaN values for the hue column
    return p.scatter(x="PaCMAP 1", y="PaCMAP 2", source=df.copy(),
                     fill_alpha=0.8, size=5,
                     color=factor_cmap(field_name=hue, palette=custom_color_palette,
                                       factors=df[hue].value_counts().sort_values(
                                           ascending=False).index.to_list()),
                     legend_group=hue)

# Create the Bokeh figure and scatter plots for each column
tabs = Tabs(tabs=[TabPanel(child=create_figure(), title='PaCMAP Output'),
                  TabPanel(child=create_figure(), title='FAB'),
                  TabPanel(child=create_figure(), title='Complex Karyotype'),
                  TabPanel(child=create_figure(), title='FLT3 ITD'),
                  TabPanel(child=create_figure(), title='Primary Cytogenetic Code'),
                  TabPanel(child=create_figure(), title='WHO Classification'),
                  TabPanel(child=create_figure(), title='Age group (years)')],
            tabs_location='left')

points = [create_scatter(df, tab.child, hue=col) for tab, col in zip(tabs.tabs, cols)]
for p in tabs.tabs:
    p.child.toolbar.logo = None

# Define a slider for adjusting the size of the data points
slider = Slider(title="Adjust datapoint size", start=0, end=20, step=1, value=points[0].glyph.size)
for p in points:
    slider.js_link("value", p.glyph, "size")

# Create a layout for the plot and display it
div = Div(text="<b>The AML Diagnostic Map</b>\nInteractive visualization of the pediatric AML methylome:",
          width=200, height=85)
layout = layout([[[div, tabs, slider]]])

show(layout)


In [3]:
list = ['Primary Cytogenetic Code', 'FAB', 'FLT3 ITD','Age group (years)',
       'WHO Classification','Complex Karyotype', 'Karyotype']

df = x_train.join(y_train[list]).reset_index() # join embedding with labels
df['PaCMAP Output'] = 'PaCMAP Output'

curdoc().theme = 'light_minimal'

def fig():
    """
    Figure specs for Bokeh plot
    """
    
    fig = figure(
           width=600,
           height=600,
           sizing_mode='fixed',
           x_axis_label='PaCMAP 1',
           y_axis_label='PaCMAP 2',
           tools="pan,wheel_zoom, reset, save",
           active_drag="pan",
           active_scroll="wheel_zoom",
           tooltips=[("Sample", "@index"),
                     ("Karyotype", "@Karyotype"),])

    return(fig)

def scatter(df, p, hue):
    """
    Scatter plot of embedding with color by hue
    
    Parameters
    ----------
    p : bokeh.plotting.figure.Figure
        Bokeh figure object
    hue : str
        Column name of df to color by
    Returns
    -------
    points : bokeh.models.renderers.GlyphRenderer
        Bokeh glyph renderer object
        
    """
    df = df[~df[hue].isna()] # df where df hue is not nan
    points = p.scatter(x="PaCMAP 1",
                   y= "PaCMAP 2",
                   source=df.copy(),
                   fill_alpha=0.8,
                   size=5,
                   color=factor_cmap(field_name= hue,
                                     palette=custom_color_palette,
                                     factors= df[hue].value_counts(
                                     ).sort_values(ascending=False
                                     ).index.to_list()),
                   legend_group=hue)
    return(points)

p1 = fig()
points1 = scatter(df, p1, hue='PaCMAP Output')
tab1 = TabPanel(child=p1, title='PaCMAP Output')
p1.toolbar.logo = None

p2 = fig()
points2 = scatter(df, p2, hue='FAB')
tab2 = TabPanel(child=p2, title="FAB")
p2.toolbar.logo = None

p3 = fig()
points3 = scatter(df, p3, hue='Complex Karyotype')
tab3 = TabPanel(child=p3, title="Complex Karyotype")
p3.toolbar.logo = None

p4 = fig()
points4 = scatter(df, p4, hue='FLT3 ITD')
tab4 = TabPanel(child=p4, title="FLT3 ITD")
p4.toolbar.logo = None

p5 = fig()
points5 = scatter(df, p5, hue='Primary Cytogenetic Code')
tab5 = TabPanel(child=p5, title='Primary Cytogenetic Code')
p5.toolbar.logo = None

p6 = fig()
points6 = scatter(df, p6, hue='WHO Classification')
tab6 = TabPanel(child=p6, title='WHO Classification')
p6.toolbar.logo = None

p7 = fig()
points7 = scatter(df, p7, hue='Age group (years)')
tab7 = TabPanel(child=p7, title='Age group (years)')
p7.toolbar.logo = None


tabs = Tabs(tabs=[tab1, tab2, tab3, tab4, tab5, tab6, tab7], tabs_location='left')

div = Div(
    text="""
          <b> The AML Diagnostic Map</b>
            <br> Interactive visualization of the pediatric AML methylome:</br>
          """,
    width=200,
    height=85)

slider = Slider(
    title="Adjust datapoint size",
    start=0,
    end=20,
    step=1,
    value=(points1.glyph.size))

slider.js_link("value", points1.glyph, "size")
slider.js_link("value", points2.glyph, "size")
slider.js_link("value", points3.glyph, "size")
slider.js_link("value", points4.glyph, "size")
slider.js_link("value", points5.glyph, "size")
slider.js_link("value", points6.glyph, "size")

# create layout
layout = layout([[[div,tabs, slider]]])

# show result
show(layout)

In [3]:
from bokeh.layouts import layout

list = ['Primary Cytogenetic Code', 'FAB', 'FLT3 ITD','Age group (years)',
       'WHO Classification','Complex Karyotype', 'Karyotype']

df = x_train.join(y_train[list]).reset_index() # join embedding with labels
df['PaCMAP Output'] = 'PaCMAP Output'

curdoc().theme = 'light_minimal'

def fig():
    """
    Figure specs for Bokeh plot
    """
    
    fig = figure(
           width=600,
           height=600,
           sizing_mode='fixed',
           x_axis_label='PaCMAP 1',
           y_axis_label='PaCMAP 2',
           tools="pan,wheel_zoom, reset, save",
           active_drag="pan",
           active_scroll="wheel_zoom",
           tooltips=[("Sample", "@index"),
                     ("Karyotype", "@Karyotype"),])

    return(fig)

def scatter(df, p, hue):
    """
    Scatter plot of embedding with color by hue
    
    Parameters
    ----------
    p : bokeh.plotting.figure.Figure
        Bokeh figure object
    hue : str
        Column name of df to color by
    Returns
    -------
    points : bokeh.models.renderers.GlyphRenderer
        Bokeh glyph renderer object
        
    """
    df = df[~df[hue].isna()] # df where df hue is not nan
    points = p.scatter(x="PaCMAP 1",
                   y= "PaCMAP 2",
                   source=df.copy(),
                   fill_alpha=0.8,
                   size=5,
                   color=factor_cmap(field_name= hue,
                                     palette=custom_color_palette,
                                     factors= df[hue].value_counts(
                                     ).sort_values(ascending=False
                                     ).index.to_list()),
                   legend_group=hue)
    return(points)

def create_tabs(df):
    def create_tab(title, hue):
        p = fig()
        points = scatter(df, p, hue=hue)
        tab = TabPanel(child=p, title=title)
        p.toolbar.logo = None

        return tab

    tabs = Tabs(
        tabs=[create_tab(title, hue) for title, hue in list],
        tabs_location='left'
    )

    return tabs



div = Div(
    text="""
          <b> The AML Diagnostic Map</b>
            <br> Interactive visualization of the pediatric AML methylome:</br>
          """,
    width=200,
    height=85)

slider = Slider(
    title="Adjust datapoint size",
    start=0,
    end=20,
    step=1,
    value=(points1.glyph.size))

slider.js_link("value", points1.glyph, "size")
slider.js_link("value", points2.glyph, "size")
slider.js_link("value", points3.glyph, "size")
slider.js_link("value", points4.glyph, "size")
slider.js_link("value", points5.glyph, "size")
slider.js_link("value", points6.glyph, "size")
slider.js_link("value", points7.glyph, "size")

# create layout
layout = layout([[[tabs, slider]]])

# show result
show(layout)

NameError: name 'points1' is not defined

## Table of Contents

```{tableofcontents}
```
