In [1]:
library(repr)
# Set the plot width and height used by repr when rendering figures.
options(
  repr.plot.width = 14,
  repr.plot.height = 12
)

In [109]:
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(GenomicRanges))

In [58]:
# Directory that the input files in this notebook are read from.
base_url <- '/gpfs/commons/groups/sanjana_lab/cdai/TFscreen/'

In [59]:
# Show the entries under base_url whose names match the pattern 'rsem'.
list.files(path = base_url, pattern = 'rsem')

In [60]:
# Load the combined count table (gene_id, gene_name, then one column per
# sample/batch) into a data.table.
cnts <- base_url %>%
  paste0('/rsem_counts_B1B2_combined.csv') %>%
  fread(file = .)

In [61]:
# Drop the genes whose IDs appear in the first column (V1) of the header-less
# removal list.
remove_gene_ids <- fread(
  paste0(base_url, "/RNAseq_remove_gene_id_list.txt"),
  header = FALSE  # spelled out: `F` is an ordinary variable and can be reassigned
)[["V1"]]
cnts <- cnts[!(gene_id %in% remove_gene_ids)]

In [62]:
# Number of rows and columns left after filtering.
dim(cnts)

In [63]:
# Preview the first rows of the count table.
head(cnts)

gene_id,gene_name,S01_B1,S01_B2,S02_B1,S02_B2,S03_B1,S03_B2,S04_B1,S04_B2,⋯,S18_B1,S18_B2,S19_B1,S19_B2,S20_B1,S20_B2,S21_B1,S21_B2,S22_B1,S22_B2
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000000003.14,TSPAN6,817.0001,966.0001,789.0001,1226.0001,757.0001,723.0001,991.0001,1400.0001,⋯,1262.0001,2215.0001,1604.0001,2076.0001,1064.0001,2301.0001,175.0001,457.0001,294.0001,584.0001
ENSG00000000005.6,TNMD,19.0001,0.0001,0.0001,11.0001,11.0001,14.0001,1.0001,11.0001,⋯,0.0001,2.0001,0.0001,0.0001,0.0001,1.0001,0.0001,0.0001,0.0001,0.0001
ENSG00000000419.12,DPM1,762.0001,731.0001,746.0001,908.0001,809.0001,644.0001,516.0001,1264.0001,⋯,853.0001,901.0001,797.0001,791.0001,345.0001,1147.0001,378.0001,400.0001,255.0001,567.0001
ENSG00000000457.14,SCYL3,20.5201,67.4201,49.0501,70.2001,28.5701,27.2601,56.7901,14.3201,⋯,0.0001,58.1201,129.1501,46.1601,82.9401,80.7301,46.0801,31.0001,111.0001,124.8201
ENSG00000000460.17,C1orf112,114.4801,211.6401,160.9501,266.8001,153.4301,145.7401,101.2101,253.6701,⋯,146.0001,165.8801,114.8501,187.8401,118.0601,126.2701,54.9201,0.0001,0.0001,1.1801
ENSG00000000938.13,FGR,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,⋯,0.0001,0.0001,0.0001,0.0001,0.0001,2.0001,0.0001,0.0001,0.0001,0.0001


In [64]:
# Column names of the form S<...>_B<one character>, i.e. the sample columns.
cols1 <- str_subset(names(cnts), "^S.+_B.$")

In [65]:
# Keep the two gene identifier columns and coerce every sample column listed
# in cols1 to integer. as.integer() drops the fractional part (it does not
# round), so e.g. 20.5201 becomes 20.
cnts <- cbind(
  cnts[, .(gene_id, gene_name)],
  cnts[, lapply(.SD, as.integer), .SDcols = cols1]
)

In [66]:
# Preview the table after the integer conversion.
head(cnts)

gene_id,gene_name,S01_B1,S01_B2,S02_B1,S02_B2,S03_B1,S03_B2,S04_B1,S04_B2,⋯,S18_B1,S18_B2,S19_B1,S19_B2,S20_B1,S20_B2,S21_B1,S21_B2,S22_B1,S22_B2
<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
ENSG00000000003.14,TSPAN6,817,966,789,1226,757,723,991,1400,⋯,1262,2215,1604,2076,1064,2301,175,457,294,584
ENSG00000000005.6,TNMD,19,0,0,11,11,14,1,11,⋯,0,2,0,0,0,1,0,0,0,0
ENSG00000000419.12,DPM1,762,731,746,908,809,644,516,1264,⋯,853,901,797,791,345,1147,378,400,255,567
ENSG00000000457.14,SCYL3,20,67,49,70,28,27,56,14,⋯,0,58,129,46,82,80,46,31,111,124
ENSG00000000460.17,C1orf112,114,211,160,266,153,145,101,253,⋯,146,165,114,187,118,126,54,0,0,1
ENSG00000000938.13,FGR,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,2,0,0,0,0


In [67]:
# Sample sheet with one row per sample column: sample, condition, group.
all.sample.annotation <- base_url %>%
  paste0('/RNASeqSampleNames.csv') %>%
  fread()

In [68]:
# Preview the first rows of the sample sheet.
head(all.sample.annotation)

sample,condition,group
<chr>,<chr>,<chr>
S01_B1,WT.ES,ES
S01_B2,WT.ES,ES
S02_B1,WT.ES,ES
S02_B2,WT.ES,ES
S03_B1,K108.ES,KO.ES
S03_B2,K108.ES,KO.ES


In [74]:
# Names of the samples whose group label does not contain "KO".
cols2 <- all.sample.annotation[str_detect(group, "KO", negate = TRUE), sample]

In [85]:
# Count table restricted to the sample columns named in cols2.
mx <- cnts[, cols2, with = FALSE]


In [91]:
# Scratch check: dividing a vector by its sum gives each element's share.
x <- c(1, 2, 3, 4)
prop.table(x)

In [95]:
# Scale each sample column in cols2 to a per-million value
# (1e6 * count / column total), keeping the gene identifier columns.
# The column total is accumulated as double: sum() over an integer vector
# returns NA (with a warning) once the total exceeds .Machine$integer.max,
# which would turn the whole column into NA.
cnts2 <- cnts[, c(list(gene_id = gene_id, gene_name = gene_name),
    lapply(.SD, function(x) 1e6 * x / sum(as.numeric(x)))
), .SDcols = cols2]

In [99]:
# Per-gene mean of the normalised values across the cols2 samples, stored
# alongside the gene identifiers as mean_rpm.
row_means <- cnts2[, .(gene_id, gene_name, mean_rpm = rowMeans(.SD)),
                   .SDcols = cols2]

In [113]:
# Names of the 5000 genes with the highest mean_rpm (descending order).
# head() returns at most 5000 names; indexing rows with 1:5000 would instead
# pad the result with NA whenever the table has fewer than 5000 rows.
top_5k_genes <- head(row_means[order(-mean_rpm), gene_name], 5000L)

In [136]:
# Read the TSS window BED file and keep only the rows whose fourth column (V4)
# is one of the selected gene names.
# NOTE(review): V4 is presumably the gene name in this BED file — confirm.
top_5k_tss_region <- fread(
  '../../resources/annotations/hs38/gencode_v31_protein_tss_u1k_d1k.bed'
)[V4 %in% top_5k_genes]

In [138]:
# Write the filtered regions as a tab-separated file without a header row.
# The stray trailing comma (an empty extra argument) is removed and `F` is
# spelled out as FALSE, since `F` is an ordinary, reassignable variable.
fwrite(
  top_5k_tss_region,
  file = "../../results/reviews/top5k_byRNA_proteincoding_tss_u1k_d1k.bed",
  sep = "\t",
  col.names = FALSE
)