In [2]:
import os
import sys

# R_HOME is set here, before adtnormpy is imported below, so that the R
# installation inside the conda environment is the one that gets used.
# Prefer the environment of the running kernel (sys.prefix) so the notebook is
# not tied to a single machine; fall back to the original hard-coded location
# when that environment does not contain lib/R.
conda_envir_loc = sys.prefix
if not os.path.isdir(os.path.join(conda_envir_loc, "lib", "R")):
    conda_envir_loc = '/home/ubuntu/miniconda3/envs/ADTnormPy'
os.environ["R_HOME"] = conda_envir_loc + "/lib/R"
import adtnormpy
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
import seaborn as sns

## Load in your data
Here, we will load some example datasets for batch correction. These data can be passed to the adtnorm function in several different ways; here we will use a pandas.DataFrame.

In [3]:
# Local file names of the example datasets, and the version path component
# used in each dataset's download URL (same order as `files`).
files = [
    'pbmc_10k_protein_v3.h5',
    '5k_pbmc_protein_v3.h5',
    'pbmc_1k_protein_v3.h5',
]
cellranger_versions = ['3.0.0', '3.0.2', '3.0.0']
base_url = "http://cf.10xgenomics.com/samples/cell-exp/"

# Each URL has the form <base>/<version>/<dataset>/<dataset>_filtered_feature_bc_matrix.h5,
# where <dataset> is the file name with its trailing ".h5" (3 characters) removed.
urls = [
    f"{base_url}{version}/{name[:-3]}/{name[:-3]}_filtered_feature_bc_matrix.h5"
    for name, version in zip(files, cellranger_versions)
]

In [4]:
# Read each dataset from data/ (backup_url lets scanpy fetch a file that is not
# on disk yet) and keep every feature type, not only gene expression.
adatas = []
for file_name, url in zip(files, urls):
    dataset = sc.read_10x_h5('data/' + file_name, backup_url=url, gex_only=False)
    # Feature names can repeat; make them unique in place before concatenating.
    dataset.var_names_make_unique()
    adatas.append(dataset)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [5]:
# Stack the datasets; the originating file name of each cell is recorded in obs['sample'].
# NOTE(review): the saved output warns that observation names are not unique after
# this concatenation - confirm downstream steps tolerate duplicated barcodes.
adata = anndata.concat(adatas, merge='first', label='sample', keys=files)

# Restrict to the antibody-capture features and export them as a cells x markers DataFrame.
is_protein = adata.var.feature_types == 'Antibody Capture'
prot_df = adata[:, is_protein].to_df()

# Per-cell metadata handed to adtnorm together with the expression table.
obs = adata.obs

  utils.warn_names_duplicates("obs")


## Running ADTnorm

In [7]:
# Quick run restricted to a single marker (CD3) with fixed target landmark locations.
adtnorm_df = adtnormpy.adtnorm(
    prot_df,
    obs=obs,
    sample_column='sample',
    exclude_zeroes=True,
    peak_type='mode',
    save_outpath='ADTnormSave',
    save_landmark=True,
    save_fig=True,
    clean_adt_name=True,
    marker_to_process='CD3',
    # With "fixed", the run logs that the negative peak is aligned to 1 and the
    # right-most positive peak to 5.
    target_landmark_location="fixed",
)



[1] "'batch' column is not provided. It will be set as the same as the 'sample' column."
[1] "Will align negative peak to 1 and right-most positive peak to 5."
[1] "Note: ADTnorm will process the following ADT markers as provided: CD3"
Progress:  Each dot is a curve
...


In [8]:
# Inspect what adtnormpy produces for a custom target_landmark_location given as a Python list.
# NOTE(review): _process_kwargs is a private helper (leading underscore); this cell
# looks like a debugging aid - confirm it is meant to stay in the tutorial.
adtnormpy.ADTnormPy._process_kwargs(dict(target_landmark_location=[1.2,4]))['target_landmark_location']

0,1
1.2,4.0


First we'll run it through as a dataframe, providing ADT expression as a dataframe and a corresponding metadata dataframe.

In [9]:
# Normalize every ADT marker, one landmark-registration pass per marker.
# Landmarks and figures are written under save_outpath ('ADTnormSave').
adtnorm_df = adtnormpy.adtnorm(
    prot_df,
    obs=obs,
    sample_column='sample',
    exclude_zeroes=True,
    peak_type='mode',
    save_outpath='ADTnormSave',
    save_landmark=True,
    save_fig=True,
    # The run log lists the markers by short name (e.g. CD3 rather than
    # CD3_TotalSeqB) when this flag is set.
    clean_adt_name=True,
)



[1] "'batch' column is not provided. It will be set as the same as the 'sample' column."
[1] "Note: ADTnorm will process the following ADT markers as provided: CD3, CD4, CD8, CD14, CD15, CD16, CD56, CD19, CD25, CD45RA, CD45RO, PD1, TIGIT, CD127, IgG2a, IgG1, IgG2b"
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...


In [10]:
# Display the normalized table: one row per cell barcode, one column per ADT marker.
adtnorm_df

Unnamed: 0,CD3,CD4,CD8,CD14,CD15,CD16,CD56,CD19,CD25,CD45RA,CD45RO,PD1,TIGIT,CD127,IgG2a,IgG1,IgG2b
AAACCCAAGATTGTGA-1,1.913011,3.581841,1.565323,5.084027,2.080433,1.573063,1.793180,1.136582,1.477726,3.166289,3.078941,1.427612,1.151893,1.322484,1.200492,0.891687,1.167918
AAACCCACATCGGTTA-1,2.339913,3.454205,1.816354,5.043536,2.521969,3.174261,4.189953,1.136582,1.199244,3.304994,3.748452,1.427612,1.501344,1.389186,1.118552,1.344396,1.167918
AAACCCAGTACCGCGT-1,1.913011,3.922756,1.413146,6.073554,2.743418,2.518141,2.115727,1.575710,1.798884,6.981815,2.179546,1.934572,1.613337,1.626900,1.684101,1.926180,1.436742
AAACCCAGTATCGAAA-1,1.913011,1.518511,1.739053,1.866719,2.711444,5.718765,4.877872,1.330239,1.798884,6.873774,2.179546,2.108754,2.931431,1.830224,1.684101,1.788049,1.263894
AAACCCAGTCGTCATA-1,1.177589,1.668269,1.611502,1.828801,2.949905,6.253226,4.806057,1.236846,1.844707,6.728374,1.942487,2.202188,3.175057,1.876577,1.353014,1.688355,1.657929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCGATCAGGCATTT-1,6.094236,7.903278,2.406731,2.285466,2.131973,1.307704,1.260289,1.278513,2.487777,3.714524,4.065046,2.459846,1.235967,3.439004,1.468859,1.262869,1.204049
TTTCGATGTCAACGCC-1,6.247052,8.182787,2.344182,2.439319,2.547288,1.780975,1.166756,1.278513,1.818528,2.683397,5.289026,2.245026,1.336229,3.554467,1.543704,1.700944,1.076490
TTTGACTTCTCCGAGG-1,1.462260,3.834868,2.166254,5.690381,4.385559,2.114295,1.629485,,1.270985,2.492913,5.113585,1.403582,1.126277,,1.068050,1.042469,1.204049
TTTGATCCAAACCACT-1,2.290122,6.889439,1.839625,4.487260,1.764155,2.544269,1.260289,,1.270985,4.531825,4.639676,1.403582,1.126277,1.274380,1.292631,1.159371,1.204049


In [11]:
# We could also do this using an AnnData of protein information:
prot = adata[:,adata.var.feature_types == 'Antibody Capture'].copy()
# Note: to use an AnnData as an input, sparse matrices must be made dense.
# .toarray() is used instead of the .A shorthand because .A is not available on
# every SciPy sparse container; the hasattr guard leaves an already-dense
# matrix untouched instead of raising.
if hasattr(prot.X, "toarray"):
    prot.X = prot.X.toarray()
# return_location='ADTnorm' names the layer that receives the normalized values
# (see the `layers` entry when `prot` is displayed afterwards).
prot = adtnormpy.adtnorm(prot,
                         ADT_location=None,
                         sample_column='sample',
                         exclude_zeroes=True, 
                         return_location='ADTnorm',
                         peak_type='mode',
                         save_fig=True)

  utils.warn_names_duplicates("obs")


[1] "'batch' column is not provided. It will be set as the same as the 'sample' column."
[1] "Note: ADTnorm will process the following ADT markers as provided: CD3_TotalSeqB, CD4_TotalSeqB, CD8a_TotalSeqB, CD14_TotalSeqB, CD15_TotalSeqB, CD16_TotalSeqB, CD56_TotalSeqB, CD19_TotalSeqB, CD25_TotalSeqB, CD45RA_TotalSeqB, CD45RO_TotalSeqB, PD-1_TotalSeqB, TIGIT_TotalSeqB, CD127_TotalSeqB, IgG2a_control_TotalSeqB, IgG1_control_TotalSeqB, IgG2b_control_TotalSeqB"
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Each dot is a curve
...
Progress:  Ea

Now you can see that the landmark-registered expression has been added to the AnnData object as a layer. The name of that layer ('ADTnorm' here) was controlled by the "return_location" parameter.

In [12]:
# Display the AnnData summary; the normalized values appear under `layers`.
prot

AnnData object with n_obs × n_vars = 13825 × 17
    obs: 'sample'
    var: 'gene_ids', 'feature_types', 'genome'
    layers: 'ADTnorm'

## View plots
We can load the PDF files saved during landmark registration (written under save_outpath when save_fig=True) to see the distributions for each sample before and after landmark registration.

In [13]:
class PDF:
    '''Notebook display helper that embeds a PDF file in the rendered cell output.

    grabbed from: https://stackoverflow.com/questions/19470099/view-pdf-image-in-an-ipython-notebook

    Parameters
    ----------
    pdf : str
        Path (or URL) of the PDF to show.
    size : tuple
        (width, height) applied to the iframe in the HTML representation.
    '''
    def __init__(self, pdf, size=(200,200)):
        self.pdf = pdf
        self.size = size

    def _repr_html_(self):
        '''Return an <iframe> element pointing at the PDF, for HTML front ends.'''
        # Local import keeps the class self-contained within this cell.
        from html import escape

        # Quote and escape every attribute value so that paths containing spaces
        # or HTML-special characters still yield valid markup.
        return '<iframe src="{0}" width="{1}" height="{2}"></iframe>'.format(
            escape(str(self.pdf), quote=True),
            escape(str(self.size[0]), quote=True),
            escape(str(self.size[1]), quote=True),
        )

    def _repr_latex_(self):
        '''Return an \\includegraphics command for LaTeX-based exports.'''
        return r'\includegraphics[width=1.0\textwidth]{{{0}}}'.format(self.pdf)

In [14]:
# CD3 figure saved under ADTnormSave/figures; judging by its file name this is
# the arcsinh-transformed (pre-registration) view - confirm.
PDF('ADTnormSave/figures/ArcsinhTransformation_CD3_ADTnormPy.pdf',size=(600,150))

In [15]:
# CD3 figure saved under ADTnormSave/figures; judging by its file name this is
# the view after ADTnorm landmark registration - confirm.
PDF('ADTnormSave/figures/ADTnorm_CD3_ADTnormPy.pdf',size=(600,150))

## View landmarks
While running ADTnorm, you can also optionally save landmarks detected (peaks and valleys) to .rds files for each marker. Here, we've provided a function to load those landmarks into python, given the save_outpath used during ADTnorm.

In [16]:
# Load the landmarks saved during the run (save_landmark=True, save_outpath='ADTnormSave')
# into a dict keyed by marker name.
landmark_override = adtnormpy.landmarks_to_python('ADTnormSave')

In [17]:
# Each marker maps to 'peak_landmark_list' and 'valley_landmark_list' DataFrames indexed by sample.
landmark_override

{'CD127': {'peak_landmark_list':                                0         1
  pbmc_10k_protein_v3.h5  1.941562  4.021367
  5k_pbmc_protein_v3.h5   1.310240  2.795740
  pbmc_1k_protein_v3.h5   1.644436  3.984215,
  'valley_landmark_list':                                0
  pbmc_10k_protein_v3.h5  2.881360
  5k_pbmc_protein_v3.h5   2.052990
  pbmc_1k_protein_v3.h5   2.814325},
 'CD14': {'peak_landmark_list':                                0         1
  pbmc_10k_protein_v3.h5  2.653008  6.224008
  5k_pbmc_protein_v3.h5   1.784367  5.548419
  pbmc_1k_protein_v3.h5   2.170437  6.368777,
  'valley_landmark_list':                                0
  pbmc_10k_protein_v3.h5  3.827699
  5k_pbmc_protein_v3.h5   2.917539
  pbmc_1k_protein_v3.h5   3.796171},
 'CD15': {'peak_landmark_list':                                0
  pbmc_10k_protein_v3.h5  3.784464
  5k_pbmc_protein_v3.h5   1.764095
  pbmc_1k_protein_v3.h5   2.093036,
  'valley_landmark_list':                                0
  pbmc_10k_prot

We can also edit these right here in the notebook, or save the python landmarks to .csv files to edit offline:

In [18]:
# Manually set the CD3 valley (column 0) of the 10k sample to 4.5; this edits the
# DataFrame stored in landmark_override in place.
landmark_override['CD3']['valley_landmark_list'].loc['pbmc_10k_protein_v3.h5',0] = 4.5

In [19]:
# Confirm the edit: the 10k sample's valley should now read 4.5.
landmark_override['CD3']['valley_landmark_list']

Unnamed: 0,0
pbmc_10k_protein_v3.h5,4.5
5k_pbmc_protein_v3.h5,3.09887
pbmc_1k_protein_v3.h5,4.024145


In [20]:
# Write the (edited) landmark dict out as .csv files under the ADTnormSave directory
# so they can be adjusted offline.
adtnormpy.save_python_landmarks(landmark_override,'ADTnormSave')

Once we've finished editing the .csv files, we can load them back in as a dictionary using this function:

In [21]:
# Read the .csv landmarks back into a dict of the same layout as landmark_override.
loaded_from_csv = adtnormpy.load_python_landmarks('ADTnormSave')

In [22]:
# The round trip through .csv should preserve the edited CD3 valley (4.5 for the 10k sample).
loaded_from_csv['CD3']['valley_landmark_list']

Unnamed: 0,0
pbmc_10k_protein_v3.h5,4.5
5k_pbmc_protein_v3.h5,3.09887
pbmc_1k_protein_v3.h5,4.024145


We can also convert this to R to get the overrides saved as .rds files like we originally loaded in.

In [23]:
# Convert the Python landmark dict to its R form; per the surrounding text this
# also saves the overrides as .rds files under 'ADTnormSave'.
r_result = adtnormpy.landmarks_to_r(loaded_from_csv,'ADTnormSave')

## Advanced running of ADTnorm

ADTnormPy has options to support all of the keyword arguments specified in ADTnorm documentation. This includes using marker_to_process to limit to one (or a few) ADTs, using customize_landmark to open up a GUI for landmark editing, and providing overrides to landmark detection using override_landmark. Here, we'll use all of those features to perform ADTnorm on CD3 using the altered valley landmark that we specified above. 

Note, you will need to open up and respond to the popup created by customize_landmark to continue running this code. 

In [24]:
# Re-run ADTnorm on CD3 only, seeding landmark detection with the edited overrides
# and opening the interactive landmark editor.
# sample_column is passed explicitly, matching the other adtnorm calls in this notebook.
# NOTE: customize_landmark=True launches a Shiny app; the saved output shows
# "no DISPLAY environment variable specified" when run on a headless machine.
df = adtnormpy.adtnorm(
    prot_df,
    obs=obs,
    ADT_location=None,
    sample_column='sample',
    exclude_zeroes=True,
    peak_type='mode',
    save_outpath='./ADTnormSave',
    customize_landmark=True,
    override_landmark='ADTnormSave/CSV',
    marker_to_process='CD3',
    save_landmark=True,
    clean_adt_name=True,
)

Attempting to load override_landmark from .rds
Failed. Attempting to load override_landmark from .csv
Success, found overrides for: dict_keys(['IgG1', 'CD56', 'CD14', 'CD19', 'CD45RO', 'CD127', 'IgG2b', 'IgG2a', 'CD16', 'CD3', 'CD45RA', 'CD8', 'CD4', 'CD3_TotalSeqB', 'TIGIT', 'CD25', 'CD15', 'PD1'])




[1] "'batch' column is not provided. It will be set as the same as the 'sample' column."
[1] "Note: ADTnorm will process the following ADT markers as provided: CD3"


R[write to console]: Loading required package: shiny

R[write to console]: 
Listening on http://127.0.0.1:7617

Error: no DISPLAY environment variable specified
R[write to console]:  Removed 13 rows containing non-finite outside the scale range
(`stat_density_ridges()`).

R[write to console]:  Removed 13 rows containing non-finite outside the scale range
(`stat_density_ridges()`).

R[write to console]:  Removed 13 rows containing non-finite outside the scale range
(`stat_density_ridges()`).



Progress:  Each dot is a curve
...
