# MPRA Calculate and Filter for CPM

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


## Pool1

In [2]:
# Load the Pool 1 allelic DNA count table; the first line of the file is
# read as the column header.
dna_counts_pool <- read.table(
    file = "../data/pool1_dna_counts_allelic.txt",
    header = TRUE
)

In [3]:
# Preview the first six rows of the raw count table.
head(dna_counts_pool, n = 6L)

Unnamed: 0_level_0,V1.dna.r1_1_ref,V1.dna.r1_1_alt,V1.dna.r1_2_ref,V1.dna.r1_2_alt,V1.dna.r1_3_ref,V1.dna.r1_3_alt,V1.dna.r1_4_ref,V1.dna.r1_4_alt,V1.dna.r1_5_ref,V1.dna.r1_5_alt,⋯,V2.dna.r3_46_ref,V2.dna.r3_46_alt,V2.dna.r3_47_ref,V2.dna.r3_47_alt,V2.dna.r3_48_ref,V2.dna.r3_48_alt,V2.dna.r3_49_ref,V2.dna.r3_49_alt,V2.dna.r3_50_ref,V2.dna.r3_50_alt
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1_6147297_rs11583631_C_T,0,0,1,0,0,0,0,0,0,0,⋯,0,0,12,2,0,11,2,0,0,0
chr1_6147340_rs11584419_A_C,0,2,0,0,0,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
chr1_6157296_rs749435_T_C,0,7,22,10,5,2,5,1,2,38,⋯,41,15,222,8,0,5,40,17,47,15
chr1_6258793_rs3789524_A_G,68,106,148,111,10,32,67,90,119,183,⋯,355,73,259,304,305,723,190,336,376,611
chr1_6259024_rs3789522_C_T,1,34,12,26,28,22,12,17,52,62,⋯,529,187,76,132,116,743,341,218,544,157
chr1_6260662_rs11121478_A_C,0,1,0,1,1,0,0,0,0,0,⋯,0,4,0,0,0,0,0,0,13,3


In [4]:
# Build the per-element summary table for this pool:
#   1. sum the raw count columns into one ref and one alt total per library
#      (columns whose names start with "V1"/"V2" and end with "ref"/"alt"),
#   2. rescale each total to counts per million (CPM) of its library,
#   3. flag elements whose ref AND alt CPM are both >= 8 in V1 or in V2.
master_pool <- dna_counts_pool %>%
    mutate(
        V1_DNA_ref_count = rowSums(select(., starts_with("V1") & ends_with("ref"))),
        V1_DNA_alt_count = rowSums(select(., starts_with("V1") & ends_with("alt"))),
        V2_DNA_ref_count = rowSums(select(., starts_with("V2") & ends_with("ref"))),
        V2_DNA_alt_count = rowSums(select(., starts_with("V2") & ends_with("alt")))
    ) %>%
    mutate(
        # the CPM denominator is the library-wide total of ref + alt counts
        V1_DNA_ref_CPM = V1_DNA_ref_count / sum(V1_DNA_ref_count, V1_DNA_alt_count) * 1e6,
        V1_DNA_alt_CPM = V1_DNA_alt_count / sum(V1_DNA_ref_count, V1_DNA_alt_count) * 1e6,
        V2_DNA_ref_CPM = V2_DNA_ref_count / sum(V2_DNA_ref_count, V2_DNA_alt_count) * 1e6,
        V2_DNA_alt_CPM = V2_DNA_alt_count / sum(V2_DNA_ref_count, V2_DNA_alt_count) * 1e6
    ) %>%
    mutate(
        # the comparison already yields a logical vector
        DNA_CPM_QC = (V1_DNA_ref_CPM >= 8 & V1_DNA_alt_CPM >= 8) |
            (V2_DNA_ref_CPM >= 8 & V2_DNA_alt_CPM >= 8)
    ) %>%
    # keep only the derived columns, dropping the raw count columns
    select(
        V1_DNA_ref_count, V1_DNA_alt_count,
        V2_DNA_ref_count, V2_DNA_alt_count,
        V1_DNA_ref_CPM, V1_DNA_alt_CPM,
        V2_DNA_ref_CPM, V2_DNA_alt_CPM,
        DNA_CPM_QC
    ) %>%
    # move the row names (element identifiers) into a regular column
    rownames_to_column("testcre")

In [5]:
# Preview the aggregated counts, CPM values and QC flag.
head(master_pool, n = 6L)

Unnamed: 0_level_0,testcre,V1_DNA_ref_count,V1_DNA_alt_count,V2_DNA_ref_count,V2_DNA_alt_count,V1_DNA_ref_CPM,V1_DNA_alt_CPM,V2_DNA_ref_CPM,V2_DNA_alt_CPM,DNA_CPM_QC
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1,chr1_6147297_rs11583631_C_T,39,36,193,234,5.812979,5.365827,6.337149,7.683383,False
2,chr1_6147340_rs11584419_A_C,29,43,191,368,4.322472,6.409182,6.271479,12.083269,False
3,chr1_6157296_rs749435_T_C,870,899,5367,4969,129.674157,133.996629,176.225288,163.156969,True
4,chr1_6258793_rs3789524_A_G,9743,14828,46568,70300,1452.201509,2210.1246,1529.058916,2308.298441,True
5,chr1_6259024_rs3789522_C_T,7689,6910,35203,29103,1146.051258,1029.94072,1155.889474,955.596153,True
6,chr1_6260662_rs11121478_A_C,128,43,952,499,19.078497,6.409182,31.258892,16.38465,True


In [6]:
# Split each element identifier (testcre) on "_" into its five components.
# rbind() recycles shorter vectors with only a warning, so an identifier with
# a different number of fields would shift values into the wrong columns;
# check the field count up front and fail loudly instead.
pool_names <- strsplit(master_pool$testcre, "_", fixed = TRUE)
stopifnot(
    "every testcre must have exactly 5 '_'-separated fields" =
        lengths(pool_names) == 5L
)
# one row per element, one column per identifier field
pool_names <- data.frame(do.call(rbind, pool_names))
colnames(pool_names) <- c("chrom", "hg19_pos", "snp", "allele", "mutation")

In [7]:
# Prepend the parsed identifier columns to the count/CPM table.
# check.names = FALSE keeps the column names exactly as they are.
master_pool <- data.frame(pool_names, master_pool, check.names = FALSE)

In [8]:
# Preview the table with the identifier columns prepended.
head(master_pool, n = 6L)

Unnamed: 0_level_0,chrom,hg19_pos,snp,allele,mutation,testcre,V1_DNA_ref_count,V1_DNA_alt_count,V2_DNA_ref_count,V2_DNA_alt_count,V1_DNA_ref_CPM,V1_DNA_alt_CPM,V2_DNA_ref_CPM,V2_DNA_alt_CPM,DNA_CPM_QC
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1,chr1,6147297,rs11583631,C,T,chr1_6147297_rs11583631_C_T,39,36,193,234,5.812979,5.365827,6.337149,7.683383,False
2,chr1,6147340,rs11584419,A,C,chr1_6147340_rs11584419_A_C,29,43,191,368,4.322472,6.409182,6.271479,12.083269,False
3,chr1,6157296,rs749435,T,C,chr1_6157296_rs749435_T_C,870,899,5367,4969,129.674157,133.996629,176.225288,163.156969,True
4,chr1,6258793,rs3789524,A,G,chr1_6258793_rs3789524_A_G,9743,14828,46568,70300,1452.201509,2210.1246,1529.058916,2308.298441,True
5,chr1,6259024,rs3789522,C,T,chr1_6259024_rs3789522_C_T,7689,6910,35203,29103,1146.051258,1029.94072,1155.889474,955.596153,True
6,chr1,6260662,rs11121478,A,C,chr1_6260662_rs11121478_A_C,128,43,952,499,19.078497,6.409182,31.258892,16.38465,True


In [9]:
# Convert the parsed position from character to numeric and shift it by one
# to make it 1-based (assumes the identifiers carry 0-based positions -
# TODO confirm). The testcre column is left untouched, so it still holds the
# original coordinate.
master_pool$hg19_pos <- as.numeric(master_pool$hg19_pos) + 1

In [10]:
# Preview the table after the position shift.
head(master_pool, n = 6L)

Unnamed: 0_level_0,chrom,hg19_pos,snp,allele,mutation,testcre,V1_DNA_ref_count,V1_DNA_alt_count,V2_DNA_ref_count,V2_DNA_alt_count,V1_DNA_ref_CPM,V1_DNA_alt_CPM,V2_DNA_ref_CPM,V2_DNA_alt_CPM,DNA_CPM_QC
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1,chr1,6147298,rs11583631,C,T,chr1_6147297_rs11583631_C_T,39,36,193,234,5.812979,5.365827,6.337149,7.683383,False
2,chr1,6147341,rs11584419,A,C,chr1_6147340_rs11584419_A_C,29,43,191,368,4.322472,6.409182,6.271479,12.083269,False
3,chr1,6157297,rs749435,T,C,chr1_6157296_rs749435_T_C,870,899,5367,4969,129.674157,133.996629,176.225288,163.156969,True
4,chr1,6258794,rs3789524,A,G,chr1_6258793_rs3789524_A_G,9743,14828,46568,70300,1452.201509,2210.1246,1529.058916,2308.298441,True
5,chr1,6259025,rs3789522,C,T,chr1_6259024_rs3789522_C_T,7689,6910,35203,29103,1146.051258,1029.94072,1155.889474,955.596153,True
6,chr1,6260663,rs11121478,A,C,chr1_6260662_rs11121478_A_C,128,43,952,499,19.078497,6.409182,31.258892,16.38465,True


In [11]:
# Write the Pool 1 counts, CPM values and QC flag to the results directory.
# No sep/quote/row.names arguments are passed, so write.table's defaults apply.
write.table(
    x = master_pool,
    file = "../results/pool1_dna_counts_cpm.txt"
)

## Pool2

In [12]:
# Load the Pool 2 allelic DNA count table; the first line of the file is
# read as the column header. This overwrites the Pool 1 object of the same name.
dna_counts_pool <- read.table(
    file = "../data/pool2_dna_counts_allelic.txt",
    header = TRUE
)

In [13]:
# Preview the first six rows of the raw Pool 2 count table.
head(dna_counts_pool, n = 6L)

Unnamed: 0_level_0,V1.dna.r1_1_ref,V1.dna.r1_1_alt,V1.dna.r1_2_ref,V1.dna.r1_2_alt,V1.dna.r1_3_ref,V1.dna.r1_3_alt,V1.dna.r1_4_ref,V1.dna.r1_4_alt,V1.dna.r1_5_ref,V1.dna.r1_5_alt,⋯,V2.dna.r3_46_ref,V2.dna.r3_46_alt,V2.dna.r3_47_ref,V2.dna.r3_47_alt,V2.dna.r3_48_ref,V2.dna.r3_48_alt,V2.dna.r3_49_ref,V2.dna.r3_49_alt,V2.dna.r3_50_ref,V2.dna.r3_50_alt
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr2_201172085_rs67190025_C_T,0,0,3,0,0,0,0,0,0,0,⋯,0,0,7,0,0,9,4,0,9,0
chr2_201172627_rs7580924_G_C,0,0,0,6,0,0,6,0,1,0,⋯,24,13,2,4,9,1,10,1,0,0
chr2_201185918_rs13028959_C_A,47,21,64,39,34,29,52,8,21,14,⋯,170,106,299,270,85,98,661,59,34,191
chr2_201194503_rs112467613_C_CAA,106,61,161,138,176,66,105,125,57,133,⋯,613,209,233,202,918,320,437,228,557,300
chr3_38417344_rs196378_G_C,0,0,1,0,0,1,0,0,0,2,⋯,0,0,0,14,0,35,5,4,3,0
chr3_38417366_rs169046_T_C,2,4,0,1,0,1,0,0,0,0,⋯,2,0,7,7,1,4,14,8,0,4


In [14]:
# aggregate DNA counts
master_pool <- dna_counts_pool %>%  
    mutate(V1_DNA_ref_count = rowSums(select(., starts_with("V1") & ends_with("ref")))) %>% 
    mutate(V1_DNA_alt_count = rowSums(select(., starts_with("V1") & ends_with("alt")))) %>% 
    mutate(V2_DNA_ref_count = rowSums(select(., starts_with("V2") & ends_with("ref")))) %>% 
    mutate(V2_DNA_alt_count = rowSums(select(., starts_with("V2") & ends_with("alt")))) %>% 
    mutate(V1_DNA_ref_CPM = (V1_DNA_ref_count / (sum(V1_DNA_ref_count) + sum(V1_DNA_alt_count))) * 1000000) %>%
    mutate(V1_DNA_alt_CPM = (V1_DNA_alt_count / (sum(V1_DNA_ref_count) + sum(V1_DNA_alt_count))) * 1000000) %>%
    mutate(V2_DNA_ref_CPM = (V2_DNA_ref_count / (sum(V2_DNA_ref_count) + sum(V2_DNA_alt_count))) * 1000000) %>%
    mutate(V2_DNA_alt_CPM = (V2_DNA_alt_count / (sum(V2_DNA_ref_count) + sum(V2_DNA_alt_count))) * 1000000) %>%
    mutate(DNA_CPM_QC = ifelse((V1_DNA_ref_CPM>=8 & V1_DNA_alt_CPM>=8) | 
                                 (V2_DNA_ref_CPM>=8 & V2_DNA_alt_CPM>=8), TRUE, FALSE)) %>%
    select(V1_DNA_ref_count, V1_DNA_alt_count, V2_DNA_ref_count, V2_DNA_alt_count, V1_DNA_ref_CPM, 
           V1_DNA_alt_CPM, V2_DNA_ref_CPM, V2_DNA_alt_CPM, DNA_CPM_QC) %>% 
           rownames_to_column("testcre")

In [15]:
# Preview the aggregated counts, CPM values and QC flag.
head(master_pool, n = 6L)

Unnamed: 0_level_0,testcre,V1_DNA_ref_count,V1_DNA_alt_count,V2_DNA_ref_count,V2_DNA_alt_count,V1_DNA_ref_CPM,V1_DNA_alt_CPM,V2_DNA_ref_CPM,V2_DNA_alt_CPM,DNA_CPM_QC
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1,chr2_201172085_rs67190025_C_T,21,36,307,295,2.920649,5.006827,9.546576,9.17342,True
2,chr2_201172627_rs7580924_G_C,245,263,868,1119,34.074242,36.577655,26.991623,34.7968,True
3,chr2_201185918_rs13028959_C_A,5321,4688,19770,16641,740.0369,652.000186,614.774649,517.4742,True
4,chr2_201194503_rs112467613_C_CAA,21922,17420,79508,68046,3048.879708,2422.748131,2472.407829,2115.98158,True
5,chr3_38417344_rs196378_G_C,155,138,1087,667,21.557173,19.192838,33.801722,20.74126,True
6,chr3_38417366_rs169046_T_C,277,188,1586,1383,38.524755,26.146765,49.318796,43.00624,True


In [16]:
# Split each element identifier (testcre) on "_" into its five components.
# rbind() recycles shorter vectors with only a warning, so an identifier with
# a different number of fields would shift values into the wrong columns;
# check the field count up front and fail loudly instead.
pool_names <- strsplit(master_pool$testcre, "_", fixed = TRUE)
stopifnot(
    "every testcre must have exactly 5 '_'-separated fields" =
        lengths(pool_names) == 5L
)
# one row per element, one column per identifier field
pool_names <- data.frame(do.call(rbind, pool_names))
colnames(pool_names) <- c("chrom", "hg19_pos", "snp", "allele", "mutation")

In [17]:
# Prepend the parsed identifier columns to the count/CPM table.
# check.names = FALSE keeps the column names exactly as they are.
master_pool <- data.frame(pool_names, master_pool, check.names = FALSE)

In [18]:
# Preview the table with the identifier columns prepended.
head(master_pool, n = 6L)

Unnamed: 0_level_0,chrom,hg19_pos,snp,allele,mutation,testcre,V1_DNA_ref_count,V1_DNA_alt_count,V2_DNA_ref_count,V2_DNA_alt_count,V1_DNA_ref_CPM,V1_DNA_alt_CPM,V2_DNA_ref_CPM,V2_DNA_alt_CPM,DNA_CPM_QC
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1,chr2,201172085,rs67190025,C,T,chr2_201172085_rs67190025_C_T,21,36,307,295,2.920649,5.006827,9.546576,9.17342,True
2,chr2,201172627,rs7580924,G,C,chr2_201172627_rs7580924_G_C,245,263,868,1119,34.074242,36.577655,26.991623,34.7968,True
3,chr2,201185918,rs13028959,C,A,chr2_201185918_rs13028959_C_A,5321,4688,19770,16641,740.0369,652.000186,614.774649,517.4742,True
4,chr2,201194503,rs112467613,C,CAA,chr2_201194503_rs112467613_C_CAA,21922,17420,79508,68046,3048.879708,2422.748131,2472.407829,2115.98158,True
5,chr3,38417344,rs196378,G,C,chr3_38417344_rs196378_G_C,155,138,1087,667,21.557173,19.192838,33.801722,20.74126,True
6,chr3,38417366,rs169046,T,C,chr3_38417366_rs169046_T_C,277,188,1586,1383,38.524755,26.146765,49.318796,43.00624,True


In [19]:
# Convert the parsed position from character to numeric and shift it by one
# to make it 1-based (assumes the identifiers carry 0-based positions -
# TODO confirm). The testcre column is left untouched, so it still holds the
# original coordinate.
master_pool$hg19_pos <- as.numeric(master_pool$hg19_pos) + 1

In [20]:
# Preview the table after the position shift.
head(master_pool, n = 6L)

Unnamed: 0_level_0,chrom,hg19_pos,snp,allele,mutation,testcre,V1_DNA_ref_count,V1_DNA_alt_count,V2_DNA_ref_count,V2_DNA_alt_count,V1_DNA_ref_CPM,V1_DNA_alt_CPM,V2_DNA_ref_CPM,V2_DNA_alt_CPM,DNA_CPM_QC
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1,chr2,201172086,rs67190025,C,T,chr2_201172085_rs67190025_C_T,21,36,307,295,2.920649,5.006827,9.546576,9.17342,True
2,chr2,201172628,rs7580924,G,C,chr2_201172627_rs7580924_G_C,245,263,868,1119,34.074242,36.577655,26.991623,34.7968,True
3,chr2,201185919,rs13028959,C,A,chr2_201185918_rs13028959_C_A,5321,4688,19770,16641,740.0369,652.000186,614.774649,517.4742,True
4,chr2,201194504,rs112467613,C,CAA,chr2_201194503_rs112467613_C_CAA,21922,17420,79508,68046,3048.879708,2422.748131,2472.407829,2115.98158,True
5,chr3,38417345,rs196378,G,C,chr3_38417344_rs196378_G_C,155,138,1087,667,21.557173,19.192838,33.801722,20.74126,True
6,chr3,38417367,rs169046,T,C,chr3_38417366_rs169046_T_C,277,188,1586,1383,38.524755,26.146765,49.318796,43.00624,True


In [21]:
# Write the Pool 2 counts, CPM values and QC flag to the results directory.
# No sep/quote/row.names arguments are passed, so write.table's defaults apply.
write.table(
    x = master_pool,
    file = "../results/pool2_dna_counts_cpm.txt"
)