# Process TCGA regions

We first convert the raw TCGA ATAC count table to AnnData and keep only the promoter regions. Later we will filter specifically to variable and accessible/inaccessible regions.

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from matplotlib.pyplot import rc_context

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

import os

# scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
# print the versions of scanpy and its main dependencies
sc.logging.print_header()
# figure defaults: 80 dpi on a white background
sc.settings.set_figure_params(facecolor="white", dpi=80)


2024-04-18 20:37:55.583061: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-18 20:37:55.583941: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-18 20:37:55.603248: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-18 20:37:55.603805: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


scanpy==1.8.2 anndata==0.8.0 umap==0.5.2 numpy==1.24.3 scipy==1.13.0 pandas==1.5.3 scikit-learn==1.4.2 statsmodels==0.14.1 python-igraph==0.11.4 pynndescent==0.5.12


## Open files

In [3]:
# Input count table and output AnnData file, both located under ../data
# relative to the current working directory.
data_path = f"{os.getcwd()}/../data/"

in_file = f"{data_path}/TCGA/TCGA-ATAC_PanCan_Raw_Counts.tsv"
results_file = f"{data_path}/TCGA/tcga_atac_promoters.h5ad"

# Preview the raw tab-separated table. The path is handed to pandas directly:
# pandas opens and closes the file itself, so no explicit file handle is
# needed and decoding no longer depends on the locale default of open().
adata = pd.read_csv(in_file, sep="\t")
adata

Unnamed: 0,seqnames,start,end,name,score,annotation,GC,ACCx_025FE5F8_885E_433D_9018_7AE322A92285_X034_S09_L133_B1_T1_PMRG,ACCx_025FE5F8_885E_433D_9018_7AE322A92285_X034_S09_L134_B1_T2_PMRG,ACCx_2A5AE757_20D5_49B6_95FF_CAE08E8197A0_X012_S05_L033_B1_T1_P024,...,UCEC_939BD27B_C1F6_4C55_949E_511A1D916E28_X040_S04_L128_B1_T2_P101,UCEC_9D108B12_D50E_4280_AA1A_219BF0E1856D_X031_S01_L049_B1_T1_P076,UCEC_BAD10669_0C6A_4B48_94B3_CAB6C3799360_X029_S09_L017_B1_T1_P082,UCEC_BAD10669_0C6A_4B48_94B3_CAB6C3799360_X029_S09_L018_B1_T2_P077,UCEC_BDFE8123_081E_49AF_930B_2371D8DEC261_X030_S01_L025_B1_T1_P080,UCEC_BDFE8123_081E_49AF_930B_2371D8DEC261_X030_S01_L026_B1_T2_P078,UCEC_C335297F_2D63_4973_9182_FA18C28E001E_X037_S04_L055_B1_T1_P088,UCEC_C335297F_2D63_4973_9182_FA18C28E001E_X037_S04_L056_B1_T2_P089,UCEC_D820B024_6B3B_4B5B_866E_F9A8139C270B_X039_S09_L113_B1_T1_P099,UCEC_D820B024_6B3B_4B5B_866E_F9A8139C270B_X039_S09_L114_B1_T2_P098
0,chr1,17238,17739,PCPG_2,4.406228,Promoter,0.612774,19,28,17,...,10,24,6,6,16,8,2,8,15,16
1,chr1,102709,103210,LIHC_2,3.472406,Intron,0.449102,0,2,2,...,4,4,0,0,0,0,1,4,1,0
2,chr1,136494,136995,LIHC_3,8.211594,Distal,0.706587,3,6,1,...,8,8,0,2,1,2,1,0,2,5
3,chr1,180653,181154,TGCT_2,7.718365,Distal,0.558882,73,130,49,...,30,41,43,72,9,13,31,30,31,29
4,chr1,181202,181703,LGG_2,67.948112,Distal,0.758483,256,535,53,...,21,63,18,43,23,16,24,30,44,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562704,chrX,156005167,156005668,PCPG_87383,2.894347,Promoter,0.582834,8,14,5,...,9,14,9,9,5,8,13,13,7,17
562705,chrX,156008314,156008815,PCPG_87384,4.723303,Intron,0.542914,4,9,2,...,9,14,7,4,0,2,3,2,7,0
562706,chrX,156025204,156025705,THCA_93794,3.181337,Distal,0.602794,9,31,15,...,14,6,17,16,7,8,18,9,12,3
562707,chrX,156029185,156029686,TGCT_74401,5.304760,Distal,0.706587,33,65,8,...,8,4,4,11,0,0,2,0,2,6


In [2]:
# Input count table and output AnnData file, both located under ../data
# relative to the current working directory.
data_path = f"{os.getcwd()}/../data/"

in_file = f"{data_path}/TCGA/TCGA-ATAC_PanCan_Raw_Counts.tsv"
results_file = f"{data_path}/TCGA/tcga_atac_promoters.h5ad"

# Raw table: one row per region. The leading columns describe the region and
# every remaining column holds the counts of one sample.
peaks_raw = pd.read_csv(in_file, sep="\t")

# Region annotation columns that become .var. `GC` is a further per-region
# column that is not carried into .var and must not end up among the samples.
region_cols = ["seqnames", "start", "end", "name", "score", "annotation"]
unused_region_cols = ["GC"]

# get the region info and counts (selected by name, so a shifted column
# layout raises a KeyError instead of silently mixing metadata into the counts)
adata_var = peaks_raw[region_cols]
adata_X = peaks_raw.drop(columns=region_cols + unused_region_cols)

# get the sample info: one row per sample column, indexed by the sample id
adata_obs = pd.DataFrame(
    {"sample_id": adata_X.columns.to_numpy()},
    index=adata_X.columns,
)

# transpose to anndata format (samples as rows, regions as columns)
adata_X = adata_X.transpose()

# the wide raw table is not needed any more; drop the reference to it
del peaks_raw

# remake anndata
adata = ad.AnnData(adata_X, obs=adata_obs, var=adata_var)
adata





AnnData object with n_obs × n_vars = 796 × 562709
    obs: 'sample_id'
    var: 'seqnames', 'start', 'end', 'name', 'score', 'annotation'

In [3]:
# remove non promoter regions, i.e. keep the columns annotated as "Promoter"
promoter_idx = np.where((adata.var["annotation"] == "Promoter").to_numpy())[0]

# .copy() turns the subset into a standalone AnnData instead of a view, so
# that later assignments to .obs do not have to implicitly materialize it.
adata = adata[:, promoter_idx].copy()

adata.var

Unnamed: 0,seqnames,start,end,name,score,annotation
0,chr1,17238,17739,PCPG_2,4.406228,Promoter
20,chr1,817118,817619,GBM_4,5.098012,Promoter
24,chr1,826524,827025,PRAD_10,5.532052,Promoter
25,chr1,827303,827804,CESC_4,31.756936,Promoter
26,chr1,830679,831180,LIHC_13,7.906675,Promoter
...,...,...,...,...,...,...
562680,chrX,155767443,155767944,ACC_90704,13.200734,Promoter
562686,chrX,155880523,155881024,LIHC_121524,5.648079,Promoter
562687,chrX,155881036,155881537,CESC_56114,54.718996,Promoter
562703,chrX,156003787,156004288,BLCA_107924,11.245066,Promoter


In [4]:
# get cancer types: the part of the sample id before the first "_"
cancer_type = adata.obs["sample_id"].str.split("_").str[0]

# filter to cancer types of interest
ct_interest = ["BRCA", "COAD", "KIRP", "KIRC", "LIHC", "LUAD", "LUSC", "PRAD", "STAD"]
ct_index = np.flatnonzero(cancer_type.isin(ct_interest).to_numpy())

# Copy the subset so `adata` is a standalone object rather than a view; the
# column assignment below (and string-to-categorical conversion on write)
# then operate on real data instead of implicitly materializing a view.
adata = adata[ct_index, :].copy()

# Pool the two kidney and the two lung cohorts under one label each. The
# labels are built as a whole column and assigned once, instead of patching
# values through chained indexing on `adata.obs.scpred_CellType`.
ct_merged = {"KIRP": "KIDNEY", "KIRC": "KIDNEY", "LUAD": "LUNG", "LUSC": "LUNG"}
adata.obs["scpred_CellType"] = (
    cancer_type.iloc[ct_index].replace(ct_merged).to_numpy()
)



  adata.obs["scpred_CellType"] = adata.obs['sample_id'].str.split('_').str[0]


In [5]:
# write out the result: save the current `adata` (as left by the cells above)
# to `results_file` in h5ad format

adata.write_h5ad(results_file)


  df[key] = c
  df[key] = c


  df[key] = c
