# Split bamfiles by impurity bin

Reads are extracted only for the genomic coordinates of the genes of interest.

In [None]:
library(dplyr)
library(biomaRt)

### Load data

In [None]:
# Cohort to process; the bin and CRAM directories below are derived from it.
cancer <- "KIRC"
# RDS object holding the event table (read later with readRDS)
events_obj <- "../output/objects/psi.stats.ascdb.RDS"
# Directory scanned for the per-bin ID lists (*.txt)
fileids_dir <- paste0("figures_v3_data/", cancer, "_impurity_bins")
# Directory scanned for the <file_id>.cram files
cram_dir <- paste0("../data/tcga_crams/", cancer)

In [None]:
# Event table used to map the gene name to its event IDs
events <- readRDS(events_obj)

# Connection to the Ensembl human gene dataset (used for coordinate lookup)
hgMart <- useMart("ensembl")
hg <- useDataset("hsapiens_gene_ensembl", hgMart)

# IDs of the samples that have a CRAM file in `cram_dir`.
# The dot is escaped and the pattern anchored so that only a literal
# ".cram" suffix is matched and stripped; an unescaped "." is a regex
# wildcard and could remove the wrong part of an ID.
cram_ids <- list.files(cram_dir, pattern = "\\.cram$")
cram_ids <- sub("\\.cram$", "", cram_ids)

### Select gene

In [None]:
# Gene of interest, matched against the `gene_name` column of `events`
goi_name <- "CD46"

In [None]:
# Derive the gene ID(s) of the gene of interest from its event IDs:
# everything from the first ";" onwards is removed, then everything from the
# first "." onwards (presumably the Ensembl version suffix - confirm against
# the event_id format). Duplicates are collapsed at the end.
geneid <- events %>%
    ungroup() %>%
    filter(gene_name == goi_name) %>%
    distinct(event_id) %>%
    unlist() %>%
    {
        sub("\\..*", "", sub(";.*", "", .))
    } %>%
    unique()
geneid

In [None]:
# Look up the genomic location of each gene ID in the Ensembl mart `hg`
gene.coords <- getBM(
    attributes = c(
        "chromosome_name",
        "start_position",
        "end_position",
        "strand",
        "ensembl_gene_id"
    ),
    filters = "ensembl_gene_id",
    values = list(ensembl_gene_id = geneid),
    mart = hg
)

# Tag each row with the gene name and build a "chr<name>:<start>-<end>" region
# string, the form later passed to `samtools view`
gene.coords <- gene.coords %>%
    mutate(
        gene_name = goi_name,
        region = paste0("chr", chromosome_name, ":", start_position, "-", end_position)
    )

In [None]:
# Display the coordinates and region string retrieved for the gene of interest
gene.coords

## Build sample array 

In [None]:
# For every impurity-bin ID list (<bin>.txt inside `fileids_dir`):
#   * create an output directory <fileids_dir>/<bin> for the extracted BAMs,
#   * record the IDs that have no CRAM file in `missing_ids[[bin]]`,
#   * write commands/view_<gene>_<bin>.sh with one `samtools view` command per
#     available sample, extracting the region of the gene of interest.
# Reads `fileids_dir`, `cram_dir`, `cram_ids`, `goi_name`, `geneid` and
# `gene.coords` from the enclosing environment.
missing_ids <- list()

# The script directory is the same for every bin, so create it only once
dir.create("commands", showWarnings = FALSE)

# Only files with a literal ".txt" extension are ID lists
for (ib_file in list.files(fileids_dir, full.names = TRUE, pattern = "\\.txt$")) {

    # Bin name = file name without its ".txt" extension
    bin <- sub("\\.txt$", "", basename(ib_file))
    # Create one directory for all the bams in the same bin
    bin_dir <- file.path(fileids_dir, bin)
    dir.create(path = bin_dir, showWarnings = FALSE)

    # Prepare sample ids: first column of the file is the file ID
    # NOTE(review): `gene_id = geneid` assumes `geneid` has length 1 (or matches
    # the number of rows); mutate() fails otherwise - confirm for other genes.
    file_ids <- read.table(ib_file) %>%
        rename("file_id" = "V1") %>%
        mutate(cram = paste0(cram_dir, "/", file_id, ".cram"),
               gene_name = goi_name,
               gene_id = geneid) %>%
        mutate(out_bam = paste0(bin_dir, "/",
                                gene_name, "_",
                                file_id, ".bam")) %>%
        left_join(., gene.coords %>% distinct(gene_name, region), by = "gene_name")

    # IDs listed for this bin that have no CRAM file available
    missing_ids[[bin]] <- file_ids$file_id[!file_ids$file_id %in% cram_ids]

    # Write column with command to extract specific region from bam file
    # Example: samtools view -hb file.cram "chr1:207752037-207795513" -o region.bam
    file_ids <- file_ids %>%
        filter(file_id %in% cram_ids) %>%
        mutate(cmd = paste0('samtools view -hb ', cram, ' "', region, '" -o ', out_bam))

    # One command per line. The file name uses `goi_name`: `gene_name` exists
    # only as a column of `file_ids`, not as a variable in this scope.
    writeLines(file_ids$cmd,
               con = paste0("commands/view_", goi_name, "_", bin, ".sh"))
}

In [None]:
# Save every sample ID that had no CRAM file (all bins pooled), one per line
write.table(
    unlist(missing_ids),
    file = "missing_sample_ids.txt",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
)

In [None]:
# To run all commands
#system('sbatch run_all_commands.sh')

In [None]:
# Once all commands have finished
#samtools merge finalBamFile.bam *.bam