In [8]:
suppressPackageStartupMessages(library(RMySQL))

# Root folder of this analysis. It becomes the working directory, so the
# relative paths used in later cells are resolved against it.
home <- "/frazer01/projects/CARDIPS/analysis/family1070/ase_rnas_lactate"
setwd(dir = home)

source("/frazer01/home/matteo/my_software/cardips_functions.R") 
source("/frazer01/home/paola/my_software/ase_pipeline_functions.R")    



In [9]:
# Sample sheet read from the working directory; strings are kept as character.
rnas_table <- read.csv(file = "rna_for_ase.csv", stringsAsFactors = FALSE, header = TRUE)
# Bare name on its own line so the notebook displays the table.
rnas_table

uuid,subject,clone,name,Tissue,mapped_reads
1198c156-b1d3-440e-804f-5c66457eebe7,S07001,C4,CM.2_1_R1,iPSC-CM,15737698
7ed45406-0db1-4e52-9d2b-fd113e2df425,S07001,C4,CM.2_1_R2,iPSC-CM,18951503
ef1f05e2-fa53-4aa0-9490-2cd02eb9d8d5,S07003,C5,CM.2_3_R1,iPSC-CM,17888786
389af0a1-6076-4bae-9bb2-d8b42229f17a,S07003,C5,CM.2_3_R2,iPSC-CM,21261640
28e21163-cd01-4302-a75b-440b772d54fb,S07001,C2,UDID007,iPSC-CM_L_D25,22404879
0ad04243-b101-4c41-be59-7029d943df10,S07001,C2,UDID021,iPSC-CM_L_D25,19262945
b958f299-3ced-4ab9-9901-fe7008745909,S07001,C4,UDID043,iPSC-CM_L_D25,26032937
936f69d3-5ca1-4464-b090-325880a42c11,S07001,C2,UDID048,iPSC-CM_L_D25,24659209
34dc9f1a-5203-4354-8c9a-d2df18f0c645,S07003,C5,UDID001,iPSC-CM_L_D25,29257150
a5e7b56f-dd4b-42bd-9c13-afe3436ac9c0,S07003,C5,UDID020,iPSC-CM_L_D25,25515804


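For each tissue (e.g. iPSC-CM, iPSC-CM_L_D25) and each sample name (replicate) within it, collect the sample's BAM file(s) and run the ASE pipeline on them. Replicates of the same subject (S07001...) are processed separately and are not merged.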

Each sample goes to its own sub-folder (`<tissue>/<name>`) in /frazer01/projects/CARDIPS/analysis/family1070/ase_rnas_lactate/analysis

In [10]:
# Folder holding one sub-folder per RNA-seq sample uuid (see the bam path built from it).
bam_folder <- "/frazer01/projects/CARDIPS/pipeline/RNAseq/sample"

In [11]:
# Path of each sample's BAM: <bam_folder>/<uuid>/alignment/<uuid>_sorted_mdup.bam
rnas_table$bam <- paste0(bam_folder, "/", rnas_table$uuid, "/alignment/", rnas_table$uuid, "_sorted_mdup.bam")

In [12]:
source("/frazer01/home/paola/my_software/ase_pipeline_functions.R")    

# Settings handed unchanged to runAse() for every sample.
ppn        <- 8
vcf_phased <- "/projects/CARDIPS/analysis/family1070/sevenIndi.phased.variant.vcf.gz"
bed        <- "/frazer01/publicdata/gencode_v19_20151104/genes.bed"

# Working folders under `home`: results, scripts and logs.
analysis_dir <- file.path(home, "analysis")
sh_dir       <- file.path(home, "sh")
log_dir      <- file.path(home, "log")

# Create them if needed; warnings about already existing folders are silenced.
for (work_dir in c(analysis_dir, sh_dir, log_dir)) {
    dir.create(work_dir, showWarnings = FALSE)
}

for (tissue in sort(unique(rnas_table$Tissue))) {
    cat("Tissue : ", tissue, "\n")

    this_tissue <- rnas_table[rnas_table$Tissue == tissue, ]
    tissue_dir  <- file.path(analysis_dir, tissue)
    dir.create(tissue_dir, showWarnings = FALSE)

    # Iterate over sample names rather than subjects, so that replicates of
    # the same subject are NOT merged: each name gets its own folder and run.
    for (name in sort(unique(this_tissue$name))) {
        cat("Name:", name, "\n")

        this_subject <- this_tissue[this_tissue$name == name, ]
        subject      <- unique(this_subject$subject)
        bam_files    <- unique(this_subject$bam)

        # The folder is named after the sample, the BAM inside it after the subject.
        subject_dir  <- file.path(tissue_dir, name)
        out_bam      <- paste0(subject_dir, "/", subject, ".bam")
        full_name    <- paste(tissue, name, "genes", sep = "_")

        dir.create(subject_dir, showWarnings = FALSE)

        # runAse() comes from the sourced ase_pipeline_functions.R.
        sh_file <- runAse(
            full_name, sh_dir, subject, log_dir, subject, subject_dir,
            bam_files, out_bam, vcf_phased, bed, ppn,
            is.gz = TRUE, run = TRUE
        )
    }
}


Tissue :  iPSC-CM 
Name: CM.2_1_R1 
Name: CM.2_1_R2 
Name: CM.2_3_R1 
Name: CM.2_3_R2 
Tissue :  iPSC-CM_L_D16 
Name: CM.2_3_R4 
Tissue :  iPSC-CM_L_D25 
Name: UDID001 
Name: UDID007 
Name: UDID020 
Name: UDID021 
Name: UDID043 
Name: UDID044 
Name: UDID048 
Name: UDID067 


### After the previous analysis is done: merge data into a single table

In [17]:
# Folder that holds the per-tissue results and the merged tables.
analysis_dir <- file.path(home, "analysis")

In [18]:
# Merge the per-tissue / per-subject result tables into one long table, all_ase.
#
# For every tissue in rnas_table and every subject of that tissue, three
# tab-separated files are read from <analysis_dir>/<tissue>/<subject>/mbased:
#   <subject>_locus.tsv, <subject>_snv.tsv and <subject>_mbased_input.tsv.
# They are merged on their shared columns and tagged with subject and tissue.
#
# Results left in the session:
#   all_ase          - all merged tables stacked by row
#   col_names_locus  - column names of the locus table (used by the next cells)
#   started_analysis - 1 once at least one subject has been read, else 0
started_analysis = 0
ase_tables       = list()   # one merged table per tissue/subject, bound once after the loops

for (tissue in sort(unique(rnas_table$Tissue)))
{
    cat("Tissue : ", tissue, "\n")
    this_tissue = rnas_table[rnas_table$Tissue == tissue,]
    tissue_dir  = paste(analysis_dir, tissue, sep = "/")
    
    for (subject in sort(unique(this_tissue$subject)))
    {
        cat("Subject:", subject, "\n")

        # NOTE(review): the cell that launches runAse() creates one folder per
        # sample *name* (<tissue>/<name>), whereas this path uses the subject.
        # Confirm where runAse() writes its "mbased" folder before relying on it.
        mbased_folder = paste(tissue_dir, subject, "mbased", sep = "/")

        locus_file    = paste(mbased_folder, "/", subject, "_locus.tsv"       , sep = "" )
        snv_file      = paste(mbased_folder, "/", subject, "_snv.tsv"         , sep = "" )
        in_file       = paste(mbased_folder, "/", subject, "_mbased_input.tsv", sep = "" )

        # All three inputs are required; a missing one skips the subject
        # instead of aborting the whole merge inside read.table().
        input_files   = c(locus_file, snv_file, in_file)
        missing_files = input_files[!file.exists(input_files)]

        if (length(missing_files) == 0)
        {
            locus = read.table(locus_file, header = TRUE , sep = "\t", stringsAsFactors = FALSE, row.names = 1, comment.char = "") 
            snv   = read.table(snv_file  , header = TRUE , sep = "\t", stringsAsFactors = FALSE, row.names = 1, comment.char = "") 
            inp   = read.table(in_file   , header = TRUE , sep = "\t", stringsAsFactors = FALSE, row.names = 1, comment.char = "") 

            # new_maf is maf where ref_is_major is TRUE and 1 - maf where it is FALSE.
            snv$new_maf = snv$maf
            snv[snv$ref_is_major == FALSE, "new_maf"] = 1 - snv[snv$ref_is_major == FALSE, "maf"]

            # "chrom:position" key shared by the SNV table and the input table.
            snv$coord   = paste(snv$chrom, snv$position, sep = ":")
            inp$coord   = paste(inp$contig, inp$position, sep = ":")

            # Row names of the locus table become an explicit column for merging.
            locus$locus = rownames(locus)
            snv$maf = NULL

            # merge() joins on all column names the two tables have in common.
            out_data          = merge(locus, snv)
            out_data          = merge(inp[, c("coord", "variantID", "expectedRefFreq", "binomialPValue")], out_data)
            out_data$subject  = subject
            out_data$tissue   = tissue

            if (started_analysis == 0)
            {
                started_analysis = 1
                col_names_locus  = colnames(locus)
            }

            ase_tables[[length(ase_tables) + 1]] = out_data
        }else
        {
            # Report exactly which file(s) are missing, one per line.
            cat(paste(missing_files, "does not exist\n"), sep = "")
        }
    }
    #break
}

if (length(ase_tables) > 0)
{
    # Single rbind over all collected tables instead of growing all_ase in the loop.
    all_ase = do.call(rbind, ase_tables)
}else
{
    warning("no input files were found for any tissue/subject; all_ase was not updated", call. = FALSE)
}

Tissue :  CM 
Subject: S07001 
Subject: S07002 
Subject: S07003 
Subject: S07004 
Subject: S07006 
Subject: S07007 
Subject: S07009 
Tissue :  iPSC 
Subject: S07001 
Subject: S07002 
Subject: S07003 
Subject: S07004 
Subject: S07006 
Subject: S07007 
Subject: S07009 


In [19]:
# Save the merged table as tab-separated text with a header and no row names.
write.table(all_ase, file = file.path(analysis_dir, "all_ase.txt"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

In [20]:
# For each tissue, turn the long all_ase table into three locus x subject
# tables and write them next to the tissue folders:
#   <analysis_dir>/<tissue>.p_val_ase.txt  (column p_val_ase)
#   <analysis_dir>/<tissue>.p_val_het.txt  (column p_val_het)
#   <analysis_dir>/<tissue>.maf.txt        (column major_allele_freq)
# Rows are the sorted loci, columns the sorted subjects; combinations without
# data stay NA. col_names_locus must still be defined from the merge cell.
all_ase  = read.table(paste(analysis_dir, "all_ase.txt", sep = "/"), header = TRUE , sep = "\t", stringsAsFactors = FALSE) 

for (tissue in sort(unique(rnas_table$Tissue)))
{
    cat("Tissue : ", tissue, "\n")

    # Locus-level columns only: one row per locus and subject.
    this = unique(all_ase[all_ase$tissue == tissue, c(col_names_locus, "subject")])

    # A tissue listed in rnas_table may have no rows in all_ase; skip it
    # rather than indexing into an empty table.
    if (nrow(this) == 0)
    {
        cat("No ASE results for tissue", tissue, "- skipping\n")
        next
    }

    out_root = paste(analysis_dir, "/", tissue, sep = "")

    loci     = sort(unique(this$locus  ))
    subjects = sort(unique(this$subject))

    # A plain matrix keeps the subject names as column names exactly as given.
    empty = matrix(NA_real_, nrow = length(loci), ncol = length(subjects), dimnames = list(loci, subjects))

    # (row, column) position of every record, so each table is filled in a
    # single assignment; for a repeated locus/subject pair the last record wins.
    cell = cbind(match(this$locus, loci), match(this$subject, subjects))

    p_val_ases = empty
    p_val_hets = empty
    mafs       = empty

    p_val_ases[cell] = this$p_val_ase
    p_val_hets[cell] = this$p_val_het
    mafs      [cell] = this$major_allele_freq

    write.table(p_val_ases, file = paste(out_root, "p_val_ase.txt", sep = "."), col.names = NA, row.names = TRUE, quote = FALSE, sep = "\t")
    write.table(p_val_hets, file = paste(out_root, "p_val_het.txt", sep = "."), col.names = NA, row.names = TRUE, quote = FALSE, sep = "\t")
    write.table(mafs      , file = paste(out_root, "maf.txt"      , sep = "."), col.names = NA, row.names = TRUE, quote = FALSE, sep = "\t")
}


Tissue :  CM 
Tissue :  iPSC 


In [21]:
# For each tissue, write <analysis_dir>/<tissue>.maf_by_snv.txt: one row per
# SNV coordinate ("chrom:position", used as row name) with its locus, ref and
# alt, followed by one column per subject holding that subject's new_maf.
# Subject/SNV combinations without data stay NA.
all_ase  = read.table(paste(analysis_dir, "all_ase.txt", sep = "/"), header = TRUE , sep = "\t", stringsAsFactors = FALSE) 
tissues  = sort(unique(rnas_table$Tissue))
#tissues  = c("CM")

for (tissue in tissues)
{
    cat("Tissue : ", tissue, "\n")

    this = unique(all_ase[all_ase$tissue == tissue, ])

    # A tissue listed in rnas_table may have no rows in all_ase; skip it
    # rather than indexing into an empty table.
    if (nrow(this) == 0)
    {
        cat("No ASE results for tissue", tissue, "- skipping\n")
        next
    }

    this$coord = paste(this$chrom, this$position, sep = ":")

    out_root = paste(analysis_dir, "/", tissue, sep = "")

    subjects = sort(unique(this$subject))

    heads = c("locus", "coord", "ref", "alt")

    # One annotation row per distinct locus/coord/ref/alt combination.
    # NOTE(review): row names must be unique, so this stops with an error if
    # the same coord occurs with more than one locus/ref/alt - confirm that
    # cannot happen in the input.
    snv_info = unique(this[,heads])
    rownames(snv_info) = snv_info$coord
    snv_info$coord = NULL

    # SNV x subject matrix filled in one assignment through (row, column)
    # positions; for a repeated coord/subject pair the last record wins.
    maf_matrix = matrix(NA_real_, nrow = nrow(snv_info), ncol = length(subjects), dimnames = list(rownames(snv_info), subjects))
    cell       = cbind(match(this$coord, rownames(snv_info)), match(this$subject, subjects))
    maf_matrix[cell] = this$new_maf

    # Annotation columns first, then one column per subject.
    mafs = cbind(snv_info, maf_matrix)

    write.table(mafs      , file = paste(out_root, "maf_by_snv.txt", sep = "."), col.names = NA, row.names = TRUE, quote = FALSE, sep = "\t")
    #break
}


Tissue :  CM 
Tissue :  iPSC 
