# Preprocess cfDNA fragments from Snyder cfDNA paper


data downloaded from: FinaleDB

You will need to format the cfDNA files using run_process_cfdna first


In [1]:
# imports

import os

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

import muon as mu
from muon import atac as ac


2024-04-19 15:18:02.049267: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-19 15:18:02.050322: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-19 15:18:02.070658: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-19 15:18:02.071182: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# load intermediate files

data_path = f"{os.getcwd()}/../data/"

results_file = f"{data_path}/cfDNA/cfdna_processed_all.h5ad"


# get TCGA regions
tcga_file = f"{data_path}/TCGA/tcga_atac_promoters.h5ad"
tcga_adata = ad.read_h5ad(tcga_file)
tcga_adata

# get the regions of interest
region_df = tcga_adata.var
region_df = region_df.iloc[:,0:3]
region_df = region_df.rename(columns={"seqnames":"Chromosome", "start":"Start", "end":"End"})
region_df



Unnamed: 0,Chromosome,Start,End
0,chr1,17238,17739
20,chr1,817118,817619
24,chr1,826524,827025
25,chr1,827303,827804
26,chr1,830679,831180
...,...,...,...
562680,chrX,155767443,155767944
562686,chrX,155880523,155881024
562687,chrX,155881036,155881537
562703,chrX,156003787,156004288


In [3]:
def make_adata_cfdna(frag_unzip_path, frag_path, cancer_type):

    with open(frag_unzip_path) as your_data:
        adata = pd.read_csv(your_data, delimiter='\t', header=None)

    # get the region info and counts
    adata.columns = ["Chromosome", "Start", "End", "Sample_id", "count"]
    samp_id = adata.Sample_id[0]
    adata_var = adata.iloc[:,0:3]
    adata_X = pd.DataFrame(adata.iloc[:,4])
    adata_X.columns = [samp_id]


    # get the sample info
    adata_obs = pd.DataFrame({"sample_id": adata_X.columns})
    adata_obs.index = adata_X.columns
    adata_obs["cancer_type"] = cancer_type

    # transpose to anndata format
    adata_X = adata_X.transpose()



    # remake anndata
    adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
    
    
    # get fragments file
    mu.atac.tl.locate_fragments(adata, frag_path)
    counts_cfdna = mu.atac.tl.count_fragments_features(adata, region_df)

    counts_cfdna.var['gene_ids'] = counts_cfdna.var.Chromosome.astype(str) + ":" + counts_cfdna.var.Start.astype(str)  + "-" +  counts_cfdna.var.End.astype(str)

    return counts_cfdna


In [4]:
# read in BRCA cancer type

frag_path =       f"{data_path}/cfDNA/EE86257.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86257.hg38.frag_filt_format.tsv"
adata_full_BRCA = make_adata_cfdna(frag_unzip_path, frag_path, "BRCA")


frag_path =       f"{data_path}/cfDNA/EE86269.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86269.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "BRCA")
adata_full_BRCA = adata_full_BRCA.concatenate(adata2)


frag_path =       f"{data_path}/cfDNA/EE86267.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86267.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "BRCA")
adata_full_BRCA = adata_full_BRCA.concatenate(adata2)


frag_path =       f"{data_path}/cfDNA/EE86256.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86256.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "BRCA")
adata_full_BRCA = adata_full_BRCA.concatenate(adata2)


frag_path =       f"{data_path}/cfDNA/EE86235.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86235.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "BRCA")
adata_full_BRCA = adata_full_BRCA.concatenate(adata2)


frag_path =       f"{data_path}/cfDNA/EE86228.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86228.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "BRCA")
adata_full_BRCA = adata_full_BRCA.concatenate(adata2)



  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
[W::hts_idx_load3] The index file is older than the data file: /home/natalie/projects/checkouts/buddi_atac/buddi_atac/preprocessing_qc/../data//cfDNA/EE86257.hg38.frag_filt_format_bgzip.tsv.gz.tbi
[W::hts_idx_load3] The index file is older than the data file: /home/natalie/projects/checkouts/buddi_atac/buddi_atac/preprocessing_qc/../data//cfDNA/EE86257.hg38.frag_filt_format_bgzip.tsv.gz.tbi
100%|██████████| 45782/45782 [00:27<00:00, 1658.70it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:11<00:00, 4111.43it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:11<00:00, 3842.15it/s]
  return A

In [5]:
# read in lung cancer type



frag_path =       f"{data_path}/cfDNA/EE86230.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86230.hg38.frag_filt_format.tsv"
adata_full_LUNG = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")


frag_path =       f"{data_path}/cfDNA/EE86264.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86264.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")
adata_full_BRCA = adata_full_BRCA.concatenate(adata2)

frag_path =       f"{data_path}/cfDNA/EE86233.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86233.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")
adata_full_LUNG = adata_full_LUNG.concatenate(adata2)

""" commented out for now -- file too big
frag_path =       f"{data_path}/cfDNA/EE86238.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86238.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")
adata_full_LUNG = adata_full_LUNG.concatenate(adata2)

frag_path =       f"{data_path}/cfDNA/EE86243.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86243.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")
adata_full_LUNG = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")

 """



  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:11<00:00, 3838.62it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:11<00:00, 4115.02it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:11<00:00, 3936.14it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


' \nfrag_path =       f"{data_path}/cfDNA/EE86238.hg38.frag_filt_format_bgzip.tsv.gz"\nfrag_unzip_path = f"{data_path}/cfDNA/EE86238.hg38.frag_filt_format.tsv"\nadata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")\nadata_full_LUNG = adata_full_LUNG.concatenate(adata2)\n\nfrag_path =       f"{data_path}/cfDNA/EE86243.hg38.frag_filt_format_bgzip.tsv.gz"\nfrag_unzip_path = f"{data_path}/cfDNA/EE86243.hg38.frag_filt_format.tsv"\nadata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")\nadata_full_LUNG = make_adata_cfdna(frag_unzip_path, frag_path, "LUNG")\n\n '

In [6]:
# read in liver cancer type

""" commented out for now -- file too big
frag_path =       f"{data_path}/cfDNA/EE86240.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86240.hg38.frag_filt_format.tsv"
adata_full_LIHC = make_adata_cfdna(frag_unzip_path, frag_path, "LIHC")
 """
frag_path =       f"{data_path}/cfDNA/EE86246.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86246.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LIHC")
adata_full_LIHC = make_adata_cfdna(frag_unzip_path, frag_path, "LIHC")

frag_path =       f"{data_path}/cfDNA/EE86258.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86258.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "LIHC")
adata_full_LIHC = adata_full_LIHC.concatenate(adata2)


  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:12<00:00, 3523.65it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:11<00:00, 3940.02it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
100%|██████████| 45782/45782 [00:12<00:00, 3805.74it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [7]:
# read in blood healthy type

""" commented out for now -- file too big
frag_path =       f"{data_path}/cfDNA/EE86240.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86240.hg38.frag_filt_format.tsv"
adata_full_blood = make_adata_cfdna(frag_unzip_path, frag_path, "blood")

frag_path =       f"{data_path}/cfDNA/EE86246.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86246.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "blood")
adata_full_blood = adata_full_blood.concatenate(adata2)

frag_path =       f"{data_path}/cfDNA/EE86258.hg38.frag_filt_format_bgzip.tsv.gz"
frag_unzip_path = f"{data_path}/cfDNA/EE86258.hg38.frag_filt_format.tsv"
adata2 = make_adata_cfdna(frag_unzip_path, frag_path, "blood")
adata_full_blood = adata_full_blood.concatenate(adata2)
 """

' frag_path =       f"{data_path}/cfDNA/EE86240.hg38.frag_filt_format_bgzip.tsv.gz"\nfrag_unzip_path = f"{data_path}/cfDNA/EE86240.hg38.frag_filt_format.tsv"\nadata_full_blood = make_adata_cfdna(frag_unzip_path, frag_path, "blood")\n\nfrag_path =       f"{data_path}/cfDNA/EE86246.hg38.frag_filt_format_bgzip.tsv.gz"\nfrag_unzip_path = f"{data_path}/cfDNA/EE86246.hg38.frag_filt_format.tsv"\nadata2 = make_adata_cfdna(frag_unzip_path, frag_path, "blood")\nadata_full_blood = adata_full_blood.concatenate(adata2)\n\nfrag_path =       f"{data_path}/cfDNA/EE86258.hg38.frag_filt_format_bgzip.tsv.gz"\nfrag_unzip_path = f"{data_path}/cfDNA/EE86258.hg38.frag_filt_format.tsv"\nadata2 = make_adata_cfdna(frag_unzip_path, frag_path, "blood")\nadata_full_blood = adata_full_blood.concatenate(adata2)\n '

In [8]:
counts_cfdna = adata_full_BRCA.concatenate(adata_full_LUNG, adata_full_LIHC) #, adata_full_blood)


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [9]:
counts_cfdna.var

Unnamed: 0,Chromosome,Start,End,gene_ids
0,chr1,17238,17739,chr1:17238-17739
20,chr1,817118,817619,chr1:817118-817619
24,chr1,826524,827025,chr1:826524-827025
25,chr1,827303,827804,chr1:827303-827804
26,chr1,830679,831180,chr1:830679-831180
...,...,...,...,...
562680,chrX,155767443,155767944,chrX:155767443-155767944
562686,chrX,155880523,155881024,chrX:155880523-155881024
562687,chrX,155881036,155881537,chrX:155881036-155881537
562703,chrX,156003787,156004288,chrX:156003787-156004288


In [10]:
counts_cfdna.obs

Unnamed: 0,sample_id,cancer_type,batch
BRCA_IC35-0-0-0-0-0-0-0,BRCA_IC35,BRCA,0
BRCA_IC48 -1-0-0-0-0-0-0,BRCA_IC48,BRCA,0
BRCA_IC46 -1-0-0-0-0-0,BRCA_IC46,BRCA,0
BRCA_IC34 -1-0-0-0-0,BRCA_IC34,BRCA,0
BRCA_IC12 -1-0-0-0,BRCA_IC12,BRCA,0
BRCA_IC04 -1-0-0,BRCA_IC04,BRCA,0
LUNG_IC42 -1-0,LUNG_IC42,LUNG,0
LUNG_IC06 -0-1,LUNG_IC06,LUNG,1
LUNG_IC10 -1-1,LUNG_IC10,LUNG,1
LIHC_IC23 -0-2,LIHC_IC23,LIHC,2


In [11]:
counts_cfdna.X

<11x45782 sparse matrix of type '<class 'numpy.float32'>'
	with 492486 stored elements in Compressed Sparse Row format>

# Write out

In [12]:
counts_cfdna.write_h5ad(results_file)