# The Pediatric AML Methylome Atlas

In [1]:
import pandas as pd

# Locations of the PaCMAP embeddings and of the processed clinical tables.
PaCMAP_path = '../Data/Processed_Data/PaCMAP_Results/'
input_path = '../Data/Processed_Data/'
output_path = '../Data/Processed_Data/'

# Embedding coordinates for the training and the test samples.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
x_train = pd.read_pickle(PaCMAP_path + 'embedding.pkl')
x_test = pd.read_pickle(PaCMAP_path + 'embedding_test.pkl')

# Clinical annotations; every row receives the same 'PaCMAP Output' label.
y = pd.read_csv(input_path + 'y.csv', index_col=0)
y['PaCMAP Output'] = 'Patient Samples at Diagnosis'

# The WHO classification lives in a separate workbook and is joined on the index.
labels = pd.read_excel(input_path + 'y_plus_WHOclass.xlsx', index_col=0)['WHO Classification']
y = y.join(labels.to_frame('WHO Classification'))

# Keep the gene fusion only for KMT2A-rearranged cases; other rows become NaN
# through index alignment on assignment.
is_kmt2a = y['WHO Classification'].isin(['AML with KMT2A-rearrangement'])
y['KMT2A Fusions'] = y.loc[is_kmt2a, 'Gene Fusion']

# AML02 and AML08 samples make up the test split; all other trials are training.
in_test_trials = y['Clinical Trial'].isin(['AML02', 'AML08'])
y_train = y[~in_test_trials]
y_test = y[in_test_trials]

# Bokeh plotting entry points and the colour-mapping transform.
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap

# Widget/annotation models and document-level helpers used to build the atlas.
from bokeh.models import Div, Slider, TabPanel, Tabs, Legend
from bokeh.io import curdoc, output_notebook

# Direct Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

# Categorical colours; categories are matched to them in order of decreasing
# frequency.
# Reference:
# https://github.com/d3/d3-3.x-api-reference/blob/master/Ordinal-Scales.md#categorical-colors
custom_color_palette = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#7f7f7f', '#e377c2', '#e7ba52', '#bcbd22', '#17becf',
    '#393b79', '#8c564b', '#f7b6d2', '#c49c94',
]

# WHO subtypes ordered from the most to the least frequent.
subtype_counts = y['WHO Classification'].value_counts()
sorted_subtypes = subtype_counts.sort_values(ascending=False).index.to_list()

# Pair each subtype with a colour. zip() stops at the shorter sequence, so any
# subtype beyond the palette length gets no entry in the map.
custom_color_map = {
    subtype: color
    for subtype, color in zip(sorted_subtypes, custom_color_palette)
}
# Clinical annotation columns carried alongside the embedding coordinates.
cols = [
    'PaCMAP Output',
    'WHO Classification',
    'FAB',
    'FLT3 ITD',
    'Age group (years)',
    'Complex Karyotype',
    'Primary Cytogenetic Code',
    'Batch',
    'Sex',
    'MRD 1 Status',
    'Leucocyte counts (10⁹/L)',
    'Risk Group',
    'Race or ethnic group',
    'Clinical Trial',
    'Vital Status',
    'Sample Type',
    'Karyotype',
]

# Attach the selected annotations to the training embedding (index-aligned
# join), then turn the index into an ordinary column.
annotated_train = x_train.join(y_train[cols])
df = annotated_train.reset_index()

# Theme for the current Bokeh document ('dark_minimal' is the dark alternative).
curdoc().theme = 'light_minimal'

FileNotFoundError: [Errno 2] No such file or directory: '../Data/Processed_Data/PaCMAP_Results/embedding.pkl'

Interactive map of 1,142 pediatric AML patients based solely on their methylomes at diagnosis:


In [29]:
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, Legend
from bokeh.plotting import figure
#from bokeh.transform import factor_cmap

# Define a function for creating the Bokeh figure
def create_figure():
    """Build one empty, fixed-size figure for a tab of the atlas.

    Returns:
        A new Bokeh figure titled 'The Pediatric AML Methylome Atlas' with
        both axes spanning -40..40, pan/wheel-zoom/reset/save tools, and hover
        tooltips showing the WHO classification and the karyotype.
    """
    hover_fields = [("Diagnosis", "@{WHO Classification}"),
                    ("Karyotype", "@Karyotype")]
    figure_options = dict(
        title='The Pediatric AML Methylome Atlas',
        width=1000,
        height=600,
        sizing_mode='fixed',
        x_axis_label='PaCMAP 1',
        y_axis_label='PaCMAP 2',
        x_range=(-40, 40),
        y_range=(-40, 40),
        tools="pan,wheel_zoom,reset,save",
        active_drag="pan",
        active_scroll="auto",
        tooltips=hover_fields,
    )
    return figure(**figure_options)
# Define a function for creating the scatter plots
def create_scatters(df, p, hue):
    """Draw one scatter renderer per category of ``hue`` on figure ``p``.

    Args:
        df: DataFrame holding the 'PaCMAP 1' and 'PaCMAP 2' coordinate columns,
            the ``hue`` column, and any columns referenced by the figure's
            tooltips.
        p: Bokeh figure that receives the scatter glyphs.
        hue: Name of the column whose distinct values define the groups.

    Returns:
        A ``(renderers, items)`` tuple: ``renderers`` is the list of glyph
        renderers in order of decreasing category frequency, and ``items`` is
        the matching list of ``(label, [renderer])`` pairs for a ``Legend``.
        Both lists are empty when ``hue`` has no non-missing values.
    """
    # Rows without a value for the hue column cannot be assigned to a group.
    labelled = df[df[hue].notna()]
    # Most frequent category first, so it gets the first palette colour.
    categories = labelled[hue].value_counts().sort_values(ascending=False).index.to_list()

    n_colors = len(custom_color_palette)
    renderers = []
    items = []
    for rank, category in enumerate(categories):
        subset = labelled[labelled[hue] == category]
        # Cycle through the palette so that columns with more categories than
        # colours do not raise IndexError.
        color = custom_color_palette[rank % n_colors]
        # The y coordinate is 'PaCMAP 2', matching the figure's y-axis label.
        r = p.scatter(x="PaCMAP 1", y="PaCMAP 2", source=ColumnDataSource(subset),
                      fill_alpha=0.8, size=5,
                      color=color)
        renderers.append(r)
        # Legend labels must be text; category values may be bool or numeric.
        items.append((str(category), [r]))

    return renderers, items

# One tab, each with its own figure, per annotation column. The last entry of
# cols is skipped here, so it gets no tab of its own.
tab_columns = cols[:-1]
tabs = Tabs(
    tabs=[TabPanel(child=create_figure(), title=title) for title in tab_columns],
    tabs_location='left',
)

# points[k] holds the (renderers, legend items) pair drawn on tab k.
points = [create_scatters(df, tab.child, hue=col)
          for tab, col in zip(tabs.tabs, tab_columns)]

# Per-tab cosmetics: toolbar, clickable legend titled after the tab, SVG backend.
for panel, (_, legend_items) in zip(tabs.tabs, points):
    fig = panel.child
    fig.toolbar.logo = None
    fig.toolbar_location = 'above'
    fig.add_layout(Legend(items=legend_items, location='top_left'), 'right')
    fig.legend.click_policy = 'hide'
    # Add title to legend the same as the tab
    fig.legend.title = panel.title
    # Use the SVG backend so that exported plots are vector graphics
    fig.output_backend = "svg"

# Slider controlling the marker size of every renderer on every tab; its
# initial value is taken from the first renderer of the first tab.
slider = Slider(title="Adjust datapoint size", start=0, end=20, step=1,
                value=points[0][0][0].glyph.size)
all_renderers = [r for tab_renderers, _ in points for r in tab_renderers]
for r in all_renderers:
    slider.js_link("value", r.glyph, "size")

# Add a Div that only skips a line
div = Div(text="<br>", width=1000, height=10)

# NOTE: this rebinds the name `layout` from the imported function to the
# resulting layout object; re-import it before calling layout() again.
layout = layout([[[div, tabs, slider]]])

show(layout)