# Preprocess cfDNA fragments from Snyder cfDNA paper


Data downloaded from FinaleDB.


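Before running this notebook, preprocess the downloaded fragment file as follows:
1) Filter out rows whose fourth column is zero, writing the result to a new file

awk '!($4 == 0)' EE86257.hg38.frag.tsv > EE86257.hg38.frag_filt.tsv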


2) Reformat to the expected five-column fragment format (chromosome, start, end, sample ID, count)

awk '{ printf("%s\t%s\t%s\tBRCA_IC35\t%s\n", $1, $2, $3, $4) }' EE86257.hg38.frag_filt.tsv > EE86257.hg38.frag_filt_format.tsv

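3) bgzip -c EE86257.hg38.frag_filt_format.tsv > EE86257.hg38.frag_filt_format_bgzip.tsv.gz

   (`-c` writes to stdout so the uncompressed .tsv is kept; the notebook reads both the .tsv and the .tsv.gz.)

4) tabix -p bed EE86257.hg38.frag_filt_format_bgzip.tsv.gz

   (Re-run tabix whenever the .gz file is regenerated; otherwise htslib warns that the index is older than the data file.)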


In [1]:
# imports

import os

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

import muon as mu
from muon import atac as ac


2024-04-18 21:00:52.646673: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-18 21:00:52.707111: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-18 21:00:53.097316: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-18 21:00:53.099355: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# load intermediate files

# Root of the data directory, resolved relative to the current working directory.
# Paths are assembled with os.path.join so that no doubled "/" separators end up
# in the derived file names.
data_path = os.path.join(os.getcwd(), "..", "data")
cfdna_dir = os.path.join(data_path, "cfDNA")

# Destination of the processed cfDNA counts produced by this notebook.
results_file = os.path.join(cfdna_dir, "cfdna_processed.h5ad")

# Fragment file in two forms: bgzip-compressed (used for the indexed region
# lookups) and uncompressed (read directly with pandas).
frag_path = os.path.join(cfdna_dir, "EE86257.hg38.frag_filt_format_bgzip.tsv.gz")
frag_unzip_path = os.path.join(cfdna_dir, "EE86257.hg38.frag_filt_format.tsv")

# get TCGA regions
tcga_file = os.path.join(data_path, "TCGA", "tcga_atac_promoters.h5ad")
tcga_adata = ad.read_h5ad(tcga_file)

# get the regions of interest: the first three `var` columns, renamed to the
# Chromosome/Start/End labels used for the fragment counting below
region_df = (
    tcga_adata.var
    .iloc[:, 0:3]
    .rename(columns={"seqnames": "Chromosome", "start": "Start", "end": "End"})
)

# The selection above is positional, so fail loudly if the TCGA file layout ever
# changes and the rename no longer yields the expected coordinate columns.
missing_region_cols = {"Chromosome", "Start", "End"} - set(region_df.columns)
if missing_region_cols:
    raise ValueError(
        f"{tcga_file}: region table is missing columns {sorted(missing_region_cols)}; "
        f"found {list(region_df.columns)}"
    )

region_df



Unnamed: 0,Chromosome,Start,End
0,chr1,17238,17739
20,chr1,817118,817619
24,chr1,826524,827025
25,chr1,827303,827804
26,chr1,830679,831180
...,...,...,...
562680,chrX,155767443,155767944
562686,chrX,155880523,155881024
562687,chrX,155881036,155881537
562703,chrX,156003787,156004288


In [3]:
# Read the reformatted, uncompressed fragment table. The file has no header row;
# pandas opens the path itself, so no explicit file handle is needed.
frag_df = pd.read_csv(frag_unzip_path, sep="\t", header=None)

# get the region info and counts
# (assigning the names after reading makes a wrong column count raise immediately)
# NOTE(review): the fifth column is treated as a per-fragment count; confirm that
# this matches what the upstream fragment source actually stores in that column.
frag_df.columns = ["Chromosome", "Start", "End", "Sample_id", "count"]

# The whole file is labelled with the sample ID found on its first row, so make
# sure every row really belongs to that one sample instead of silently mixing them.
samp_id = frag_df["Sample_id"].iloc[0]
if not (frag_df["Sample_id"] == samp_id).all():
    raise ValueError(
        f"{frag_unzip_path} contains more than one sample ID; "
        f"expected only {samp_id!r}"
    )

# Per-fragment coordinates become the AnnData `var` table.
adata_var = frag_df[["Chromosome", "Start", "End"]]

# Per-fragment counts, as a single column named after the sample.
adata_X = frag_df[["count"]].rename(columns={"count": samp_id})

# get the sample info: one row, indexed by the sample ID
adata_obs = pd.DataFrame({"sample_id": adata_X.columns})
adata_obs.index = adata_X.columns

# transpose to anndata format (observations in rows, fragments in columns)
adata_X = adata_X.transpose()

# remake anndata
# NOTE(review): this builds a 1 x n_fragments object from the full fragment table.
# If downstream steps only need the sample row plus the fragment-file path, a much
# smaller AnnData would avoid loading every fragment into memory -- confirm against
# the muon documentation before changing.
adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
adata


  adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)


AnnData object with n_obs × n_vars = 1 × 181832045
    obs: 'sample_id'
    var: 'Chromosome', 'Start', 'End'

# Get promoter regions from TCGA and recount fragments


In [6]:
# Attach the compressed fragment file to `adata`, then count fragments over the
# TCGA-derived regions.
# NOTE(review): count_fragments_features is called with its default window
# settings; confirm in the muon docs whether regions are extended upstream by
# default, since the regions in region_df are already short fixed windows.
ac.tl.locate_fragments(adata, frag_path)
counts_cfdna = ac.tl.count_fragments_features(adata, region_df)

# Label every region as "<chromosome>:<start>-<end>".
region_chrom = counts_cfdna.var["Chromosome"].astype(str)
region_start = counts_cfdna.var["Start"].astype(str)
region_end = counts_cfdna.var["End"].astype(str)
counts_cfdna.var["gene_ids"] = region_chrom + ":" + region_start + "-" + region_end


[W::hts_idx_load3] The index file is older than the data file: /home/natalie/projects/checkouts/buddi_atac/buddi_atac/preprocessing_qc/../data//cfDNA/EE86257.hg38.frag_filt_format_bgzip.tsv.gz.tbi
[W::hts_idx_load3] The index file is older than the data file: /home/natalie/projects/checkouts/buddi_atac/buddi_atac/preprocessing_qc/../data//cfDNA/EE86257.hg38.frag_filt_format_bgzip.tsv.gz.tbi
  0%|          | 0/45782 [00:00<?, ?it/s]

100%|██████████| 45782/45782 [00:19<00:00, 2307.21it/s]
  return AnnData(X=mx, obs=adata.obs, var=features)


In [7]:
# Inspect the per-region annotations: coordinates plus the derived gene_ids label.
counts_cfdna.var

Unnamed: 0,Chromosome,Start,End,gene_ids
0,chr1,17238,17739,chr1:17238-17739
20,chr1,817118,817619,chr1:817118-817619
24,chr1,826524,827025,chr1:826524-827025
25,chr1,827303,827804,chr1:827303-827804
26,chr1,830679,831180,chr1:830679-831180
...,...,...,...,...
562680,chrX,155767443,155767944,chrX:155767443-155767944
562686,chrX,155880523,155881024,chrX:155880523-155881024
562687,chrX,155881036,155881537,chrX:155881036-155881537
562703,chrX,156003787,156004288,chrX:156003787-156004288


In [8]:
# Inspect the sample annotations carried over from `adata` (one row per sample).
counts_cfdna.obs

Unnamed: 0,sample_id
BRCA_IC35,BRCA_IC35


In [9]:
# Inspect the sample-by-region count matrix (shape, dtype and storage format).
counts_cfdna.X

<1x45782 sparse matrix of type '<class 'numpy.float32'>'
	with 34794 stored elements in Compressed Sparse Column format>

# Write out

In [10]:
# Save the region-level cfDNA counts to `results_file` in .h5ad format.
counts_cfdna.write_h5ad(results_file)