## Prepare tables of gene expression and genotypes to use for associations

In [41]:
# Directory holding the data tables of this notebook; the relative paths used
# in later cells are resolved against it.
home <- "/frazer01/home/paola/Family1070/private_output/Validation/data_tables"
setwd(dir = home)


In [2]:
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(limma))

### 1. Get all rna-seq uuid done so far in iPSC-CMs 
This table (from Margaret D.) contains all CARDiPS UUIDs for the different assays. Only successful iPSC-CMs (cTNT > 35%) were included.


In [3]:
# Load the assay master table (tab-separated, with a header row) and drop
# rows that exactly duplicate an earlier row.
main <- unique(
  read.table(
    "/frazer01/projects/CARDIPS/data/CARDiPS_production_metadata/assay_mastertable.txt",
    sep = "\t",
    header = TRUE
  )
)

# Cross-tabulate assay against dataset.
table(main$assay, main$dataset)

# Keep RNA assays on CM cells from the production dataset, then discard factor
# levels that no longer occur in the selection.
is_production_rna_cm <- with(
  main,
  dataset == "production" & assay == "rna" & cell == "CM"
)
rna_cm <- droplevels(main[which(is_production_rna_cm), ])
dim(rna_cm)

      
       baseline family1070 production timecourse
  atac        0         91        373         66
  chip        0        123        112         12
  rna       232         35        235         45

In [4]:
# Preview the first six rows of the selected samples.
head(rna_cm, n = 6L)

Unnamed: 0,subject_uuid,clone,passage,day,udid,assay_uuid,assay,assay_type,cell,dataset
6,000a5dcf-764a-4f69-ab1c-5950cec4bbbe,3,27,25,UDID106,eee96072-e740-4967-ba17-c9178796fde1,rna,,CM,production
12,032f42b3-42da-4a6a-9b01-cdf540e1866c,5,20,25,UDID184,cbab32a9-f80d-4093-a2dc-7bdcaa4f8357,rna,,CM,production
13,03346cae-a4f3-4481-92ff-d76db0c82468,2,22,25,UDID113,067afe59-e81d-4176-800d-f1b111c1ec82,rna,,CM,production
27,0bf3da28-3985-4c34-8197-5816fd73b588,5,27,25,UDID064,cdccb7cb-5770-4585-a1e1-4f0ac6d5b676,rna,,CM,production
37,10479353-d46d-478d-86f0-c6be025de81b,2,24,25,UDID138,e45ff871-b499-4c9a-8714-7d0b65eaa7b0,rna,,CM,production
47,10571d4e-c0e9-474b-80b6-75c59ebd1942,6,21,25,UDID270,c6e37323-0e85-4d3c-8942-2bac6c563359,rna,,CM,production


### 2. Get the gene expression RNA-Seq data 
Normalize, correct for TNNT2 expression, and export it.

In [7]:
# Read rsem_expected_counts.tsv (tab-separated). The first column supplies the
# row names and the header names are kept verbatim (check.names = FALSE).
raw <- read.table(
  file = "/projects/CARDIPS/pipeline/RNAseq/combined_files/rsem_expected_counts.tsv",
  sep = "\t",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)

In [8]:
# Read rsem_tpm.tsv (tab-separated). The first column supplies the row names
# and the header names are kept verbatim (check.names = FALSE).
tpm <- read.table(
  file = "/projects/CARDIPS/pipeline/RNAseq/combined_files/rsem_tpm.tsv",
  sep = "\t",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)

In [9]:
# Gene annotation table (tab-separated, with a header row).
gene_info <- read.table(
  "/publicdata/gencode_v19_20151104/gene_info.tsv",
  sep = "\t",
  header = TRUE
)

# For each gene, the text that follows the first "chr" in its `chrom` value
# (e.g. "chr1" -> "1").
chrom_parts <- str_split_fixed(gene_info$chrom, "chr", 2)
chromo <- chrom_parts[, 2]
unique(chromo)

# The labels "1" .. "22", used to recognise autosomes.
autoso <- as.character(seq_len(22))


In [10]:
# Keep only autosomal genes in both expression tables.
# Rows are selected by gene ID rather than by position: a positional logical
# mask built from `gene_info` would silently pick the wrong rows (or recycle /
# introduce NA rows) if `gene_info` and the expression tables differed in row
# order or row count.
autosomal_ids <- as.character(gene_info$gene_id[chromo %in% autoso])
raw_sel <- raw[rownames(raw) %in% autosomal_ids, ]
tpm_sel <- tpm[rownames(tpm) %in% autosomal_ids, ]

In [11]:
# Restrict both tables to the selected samples, with columns in the order the
# samples appear in `rna_cm`.
sample_ids <- as.character(rna_cm$assay_uuid)
raw_sel <- raw_sel[, sample_ids, drop = FALSE]
tpm_sel <- tpm_sel[, sample_ids, drop = FALSE]

# Keep genes whose mean TPM across those samples is above 2; the same row mask
# is applied to both tables.
is_expressed <- rowMeans(tpm_sel) > 2
raw_sel <- raw_sel[is_expressed, ]
tpm_sel <- tpm_sel[is_expressed, ]

dim(tpm_sel)
dim(raw_sel)

In [12]:
## Put the assay id first for DESeq: columns 6..end move ahead of columns 1..5.
rna_cm <- rna_cm[, c(6:ncol(rna_cm), 1:5)]

## Keep only samples that have a column in the raw count table.
has_counts <- as.character(rna_cm$assay_uuid) %in% colnames(raw)
rna_cm <- rna_cm[has_counts, ]

## Take the TNNT2 row of the filtered TPM table, order its columns like the
## rows of `rna_cm`, and store the values as a per-sample column.
tnnt2_id <- as.character(gene_info[gene_info$gene_name == "TNNT2", "gene_id"])
ctnt <- tpm_sel[tnnt2_id, ]
ctnt <- ctnt[, as.character(rna_cm$assay_uuid), drop = FALSE]
rna_cm$cTNT_TPM <- t(ctnt)[, 1]

In [13]:
# Save the sample table so later cells can reload it.
write.csv(x = rna_cm, file = "Production_cm_samples.csv")

In [None]:
# Round the expected counts to whole numbers before handing them to DESeq2.
counts <- round(raw_sel, digits = 0)

# Build the DESeq2 data set with subject as the design factor, then estimate
# size factors and dispersions (parametric fit).
workingData <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = rna_cm,
  design = ~ subject_uuid
)
workingData <- estimateSizeFactors(workingData)
workingData <- estimateDispersions(workingData, fitType = "parametric")

# Variance-stabilizing transformation, run with blind = TRUE.
vst_workingData <- varianceStabilizingTransformation(workingData, blind = TRUE)

# Export the transformed matrix, tab-separated, with assay UUIDs as the column
# names.
mat <- as.data.frame(assay(vst_workingData))
colnames(mat) <- as.character(rna_cm$assay_uuid)
write.table(
  mat,
  file = "Production_cms_vst_counts.txt",
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)


converting counts to integer mode
gene-wise dispersion estimates


In [9]:
# Reload the saved sample table; its first column holds the row names.
rna_cm <- read.csv(file = "Production_cm_samples.csv", row.names = 1)


In [10]:
# Preview the first six rows of the reloaded sample table.
head(rna_cm, n = 6L)

Unnamed: 0,assay_uuid,assay,assay_type,cell,dataset,subject_uuid,clone,passage,day,udid,cTNT_TPM
6,eee96072-e740-4967-ba17-c9178796fde1,rna,,CM,production,000a5dcf-764a-4f69-ab1c-5950cec4bbbe,3,27,25,UDID106,5687.32
12,cbab32a9-f80d-4093-a2dc-7bdcaa4f8357,rna,,CM,production,032f42b3-42da-4a6a-9b01-cdf540e1866c,5,20,25,UDID184,4710.74
13,067afe59-e81d-4176-800d-f1b111c1ec82,rna,,CM,production,03346cae-a4f3-4481-92ff-d76db0c82468,2,22,25,UDID113,3555.09
27,cdccb7cb-5770-4585-a1e1-4f0ac6d5b676,rna,,CM,production,0bf3da28-3985-4c34-8197-5816fd73b588,5,27,25,UDID064,6200.72
37,e45ff871-b499-4c9a-8714-7d0b65eaa7b0,rna,,CM,production,10479353-d46d-478d-86f0-c6be025de81b,2,24,25,UDID138,6089.46
47,c6e37323-0e85-4d3c-8942-2bac6c563359,rna,,CM,production,10571d4e-c0e9-474b-80b6-75c59ebd1942,6,21,25,UDID270,2532.27


In [6]:
# Reload the exported VST matrix: first column as row names, header names kept
# verbatim.
mat <- read.table(
  file = "Production_cms_vst_counts.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)

In [12]:
### Normalize using the residuals from a per-gene linear fit on TNNT2
### expression (cTNT_TPM), then add each gene's original mean back.
### NOTE(review): assumes the columns of `mat` are in the same order as the
### rows of `rna_cm` -- confirm.
ctnt_design <- model.matrix(~ cTNT_TPM, data = rna_cm)
fit <- lmFit(mat, ctnt_design)
res <- residuals(fit, mat)
gene_means <- rowMeans(as.matrix(mat))
mat_corrected <- res + gene_means
write.table(
  mat_corrected,
  file = "Production_cms_vst_counts_corrected_ctnt.txt",
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)

### 4. Get genotypes for selected SNPs and individuals 

In [38]:
# Run `module load cardips` through the system shell.
# NOTE(review): system() runs its command in a separate shell process, so this
# probably does not change the environment seen by later system() calls --
# confirm that bcftools is already on the PATH.
system(command = "module load cardips")

In [33]:
# Annotation table of ASE variants; fill = TRUE pads rows that have fewer
# fields than the others.
snps <- read.table(
  "../../Enrichment_annotations/Supplementary_table_ase_annotations.txt",
  header = TRUE,
  fill = TRUE
)

# Variants whose `mark` is NKX25.
nkx_snps <- subset(snps, mark == "NKX25")
dim(nkx_snps)

# Split each variantID at its first ":" into two columns and write them,
# tab-separated with no header or row names, as the regions file for bcftools.
nkx_snps <- str_split_fixed(nkx_snps$variantID, ":", 2)
write.table(
  nkx_snps,
  file = "NKX_ase_coord.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

# Query the merged VCF at those regions, writing one row per variant with
# CHROM, POS, ID, REF, ALT and one GT field per sample (header included via -H).
get_genotypes <- "bcftools query -R NKX_ase_coord.txt -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%GT]\n' -H /projects/CARDIPS/pipeline/WGS/mergedVCF/CARDIPS_201512.PASS.274.hg19.vcf.gz -o Cardips_NKX25_ase_gt.tsv"
system(command = get_genotypes)

In [None]:
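### Manual step: remove the leading "#" from the header line of Cardips_NKX25_ase_gt.tsv before running the next cell (read.table treats "#" as a comment character by default, so the header would otherwise be skipped)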

In [44]:
# Read the genotype table, keeping text columns as character and the header
# names unmodified.
gt <- read.table(
  "Cardips_NKX25_ase_gt.tsv",
  header = TRUE,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Recode genotype strings: "1/1" -> 2, "0/1" -> 1, "0/0" -> 0.
# Cells holding any other value are left unchanged.
gt[gt == "1/1"] <- 2
gt[gt == "0/1"] <- 1
gt[gt == "0/0"] <- 0

# Clean the column names: keep the text after the first "]" of every header
# field and, for columns 6 onward, only the part before the first ":".
names <- str_split_fixed(colnames(gt), "]", 2)[, 2]
names2 <- str_split_fixed(names[-(1:5)], ":", 2)[, 1]
colnames(gt) <- c(names[1:5], names2)
write.csv(gt, file = "Cardips_NKX25_ase_genotypes_coded.csv")

In [68]:
# Size of the sample table before the merge.
dim(rna_cm)

# Attach the columns of wgs.txt to each sample by matching the sample's
# subject_uuid to the subject_id column of that table.
wgs <- read.table("wgs.txt", header = TRUE)
rna_cm <- merge(
  x = rna_cm,
  y = wgs,
  by.x = "subject_uuid",
  by.y = "subject_id"
)
dim(rna_cm)

# Drop exact duplicate rows, then keep samples whose wgs_id is one of the
# column names of the genotype table.
rna_cm <- unique(rna_cm)
rna_cm <- subset(rna_cm, wgs_id %in% colnames(gt))
dim(rna_cm)

In [69]:
# Save the sample table that now carries the WGS ids.
write.csv(x = rna_cm, file = "Production_cm_samples_wgsid.csv")