# PaCMAP Benchmark

```{contents}
```

## Load data

In [3]:
import pandas as pd

# Locations of the intermediate inputs and of the processed outputs
input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'

# Methylation table, ordered by its index.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
methyl_file = f'{input_path}df_batch_corrected.pkl'
df_methyl = pd.read_pickle(methyl_file).sort_index()

# Clinical table (first CSV column becomes the index), ordered the same way
clinical_file = f'{input_path}clinical_data.csv'
df_labels = pd.read_csv(clinical_file, index_col=0, low_memory=False).sort_index()

print(
    f' Dataset (df) contains {df_methyl.shape[1]} columns (5mC nucleotides/probes) and {df_methyl.shape[0]} rows (samples).')

 Dataset (df) contains 333352 columns (5mC nucleotides/probes) and 3330 rows (samples).


## Remove samples based on certain clinical features

### Select samples from the AAML1031, AAML0531, and AAML03P1 clinical trials

In [6]:
# Keep only the rows whose 'Clinical Trial' is one of the three listed trials
selected_trials = ['AAML0531', 'AAML1031', 'AAML03P1']
in_selected_trials = df_labels['Clinical Trial'].isin(selected_trials)
df1 = df_labels[in_selected_trials]

print(
    f'{df_labels.shape[0]-df1.shape[0]} samples were removed. {df1.shape[0]} samples remaining.')


2049 samples were removed. 1281 samples remaining.


### Select diagnostic bone marrow samples only

In [7]:
# Keep only the rows whose 'Sample Type' is one of the two accepted labels
accepted_sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow']
is_accepted_type = df1['Sample Type'].isin(accepted_sample_types)
df2 = df1[is_accepted_type]

print(
    f'{df1.shape[0]-df2.shape[0]} samples were removed. {df2.shape[0]} samples remaining.')

350 samples were removed. 931 samples remaining.


### Remove duplicate samples

In [8]:
# When a 'Patient_ID' occurs more than once, keep only its last row
df3 = df2.drop_duplicates(subset='Patient_ID', keep='last')

print(
    f'{df2.shape[0]-df3.shape[0]} samples were removed. {df3.shape[0]} samples remaining.')


7 samples were removed. 924 samples remaining.


### Match samples in clinical data to samples in methylation data

In [9]:
# Keep the methylation rows whose index also appears in the filtered clinical
# table, then drop the first column by position.
# NOTE(review): the dropped column is presumably the non-probe 'Batch' column — confirm.
has_clinical_record = df_methyl.index.isin(df3.index)
df_methyl_filtered = df_methyl[has_clinical_record].iloc[:, 1:]

print('Samples in clinical data matched to samples in methylation data.')

Samples in clinical data matched to samples in methylation data.


## Run PaCMAP

In [20]:
import pacmap

# PaCMAP settings. Note: hyperparameter tuning has been performed.
pacmap_settings = dict(n_components=2, n_neighbors=15,
                       MN_ratio=0.4, FP_ratio=16.0, random_state=42,
                       lr=0.1, num_iters=5000)
reducer = pacmap.PaCMAP(**pacmap_settings)

# Learn the two-component embedding from the filtered methylation values,
# which are handed to PaCMAP as a float16 array
embedding = reducer.fit_transform(df_methyl_filtered.to_numpy(dtype='float16'))



In [36]:
# Wrap the embedding in a DataFrame that reuses the index of the PaCMAP input
embedding_columns = ['PaCMAP 1', 'PaCMAP 2']
df_embedding = pd.DataFrame(data=embedding,
                            index=df_methyl_filtered.index,
                            columns=embedding_columns)

# Extra label columns: a constant tag, and the batch copied over by index alignment
df_labels['PaCMAP Output'] = 'Patient Samples'
df_labels['Batch'] = df_methyl['Batch']

## Visualize PaCMAP results

In [38]:
from bokeh.io import curdoc, output_notebook
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap

from bokeh.models import Div, Slider, TabPanel, Tabs, Legend

# Send Bokeh output to the notebook
output_notebook()

# Categorical colour palette (14 hex colours).
# Reference:
# https://github.com/d3/d3-3.x-api-reference/blob/master/Ordinal-Scales.md#categorical-colors
custom_color_palette = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#7f7f7f', '#e377c2', '#e7ba52', '#bcbd22', '#17becf',
    '#393b79', '#8c564b', '#f7b6d2', '#c49c94',
]

# Label columns carried into the plotting table
cols = [
    'PaCMAP Output',
    'FAB',
    'FLT3 ITD',
    'Age group (years)',
    'Complex Karyotype',
    'Primary Cytogenetic Code',
    'Batch',
    'Sex',
    'MRD 1 Status',
    'Leucocyte counts (10⁹/L)',
    'Risk Group',
    'Race or ethnic group',
    'Clinical Trial',
    'Vital Status',
    'Sample Type',
    'Karyotype',
]

# Attach the selected label columns to the embedding coordinates and move the
# index into a regular column
labels_for_plot = df_labels[cols]
df = df_embedding.join(labels_for_plot).reset_index()

# Plot theme ('dark_minimal' is the dark alternative)
curdoc().theme = 'light_minimal'

In [39]:
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, Legend
from bokeh.plotting import figure
#from bokeh.transform import factor_cmap

# Factory for the empty Bokeh figure used by every plot
def create_figure():
    """Return a new, empty Bokeh figure configured for the PaCMAP scatter plot.

    The figure is 1000 x 600 with fixed sizing, both axes span -40 to 40, and
    the hover tooltip shows the 'Karyotype' field of the plotted data source.
    """
    figure_options = dict(
        title='The Pediatric AML Methylome Atlas',
        width=1000,
        height=600,
        sizing_mode='fixed',
        x_axis_label='PaCMAP 1',
        y_axis_label='PaCMAP 2',
        x_range=(-40, 40),
        y_range=(-40, 40),
        tools="pan,wheel_zoom,reset,save",
        active_drag="pan",
        active_scroll="auto",
        tooltips=[("Karyotype", "@Karyotype")],
    )
    return figure(**figure_options)
# Define a function for creating the scatter plots
def create_scatters(df, p, hue):
    """Draw one scatter renderer per category of ``hue`` onto figure ``p``.

    Rows whose ``hue`` value is missing are skipped. Categories are drawn in
    order of decreasing frequency and coloured from ``custom_color_palette``.
    The palette is cycled when a column has more categories than the palette
    has colours, instead of raising ``IndexError``.

    Args:
        df: DataFrame holding 'PaCMAP 1', 'PaCMAP 2' and the ``hue`` column.
        p: Bokeh figure to draw on.
        hue: Name of the column whose categories define the groups.

    Returns:
        Tuple ``(renderers, items)``: the glyph renderers in drawing order and
        the matching ``(label, [renderer])`` pairs for building a ``Legend``.
    """
    df = df[~df[hue].isna()]  # Filter out rows with NaN values for the hue column
    categories = df[hue].value_counts().sort_values(ascending=False).index.to_list()
    n_colors = len(custom_color_palette)

    renderers = []
    items = []
    for i, category in enumerate(categories):
        source = ColumnDataSource(df[df[hue] == category])
        # Wrap around the palette so any number of categories can be drawn
        color = custom_color_palette[i % n_colors]
        r = p.scatter(x="PaCMAP 1", y="PaCMAP 2", source=source,
                      fill_alpha=0.8, size=5,
                      color=color)
        renderers.append(r)
        # Legend labels must be text; str() leaves string categories unchanged
        items.append((str(category), [r]))

    return renderers, items

# One tab per label column; the last entry of `cols` gets no tab of its own
tab_columns = cols[:-1]
tabs = Tabs(tabs=[TabPanel(child=create_figure(), title=column) for column in tab_columns],
            tabs_location='left')

# Draw the scatter renderers of every tab and keep (renderers, legend items) per tab
points = [create_scatters(df, panel.child, hue=column)
          for panel, column in zip(tabs.tabs, tab_columns)]

# Finish each tab's figure: toolbar, legend and output backend
for panel, (_, legend_items) in zip(tabs.tabs, points):
    plot = panel.child
    plot.toolbar.logo = None
    plot.toolbar_location = 'above'
    # The legend sits to the right of the plot, is titled like its tab, and
    # hides a group when its entry is clicked
    plot.add_layout(
        Legend(items=legend_items, location='top_left',
               click_policy='hide', title=panel.title),
        'right')
    # Save a high resolution version of the plot
    plot.output_backend = "svg"

# Slider that drives the marker size of every renderer in every tab;
# it starts at the size of the first renderer of the first tab
initial_size = points[0][0][0].glyph.size
slider = Slider(title="Adjust datapoint size", start=0, end=20, step=1, value=initial_size)
for tab_renderers, _ in points:
    for renderer in tab_renderers:
        slider.js_link("value", renderer.glyph, "size")

# Add a Div that only skips a line
div = Div(text="""<br>""", width=1000, height=10)

layout = layout([[[div, tabs, slider]]])

show(layout)