# MPRAnalyze Workflow for Preparing Count Data for Allelic Comparison

In [1]:
library(tidyverse)
library(ggplot2)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


## Pool1

In [2]:
# Load the pool 1 count tables (tab-separated) for the two library versions;
# the file names distinguish them as pgl4v1 and pgl4v2.
mpra_v1 <- read_tsv(file = "../data/mpra_qtigc_pgl4v1_pool1.txt")
mpra_v2 <- read_tsv(file = "../data/mpra_qtigc_pgl4v2_pool1.txt")

Rows: 50900 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (3): snp, testcre, barcode
dbl (13): dna.r1, dna.r2, dna.r3, rna.r1, rna.r2, rna.r3, rna.r4, rna.r5, rn...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 50900 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (3): snp, testcre, barcode
dbl (13): dna.r1, dna.r2, dna.r3, rna.r1, rna.r2, rna.r3, rna.r4, rna.r5, rn...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

In [3]:
# Total number of rows in each pool 1 table.
# NOTE(review): these are row counts of the raw tables, not counts of distinct
# test CREs, despite the variable names.
num_enhancers_v1 <- nrow(mpra_v1)
num_enhancers_v2 <- nrow(mpra_v2)

In [4]:
# Format DNA counts for v1: one row per test CRE and one column per
# (DNA replicate, barcode index) pair, with every count column prefixed "V1.".
dna_counts_v1 <- mpra_v1 %>%
    select(-starts_with("rna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("dna")) %>%
    rename_with(~ paste0("V1.", .), -testcre)

In [5]:
# Format DNA counts for v2: one row per test CRE and one column per
# (DNA replicate, barcode index) pair, with every count column prefixed "V2.".
dna_counts_v2 <- mpra_v2 %>%
    select(-starts_with("rna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("dna")) %>%
    rename_with(~ paste0("V2.", .), -testcre)

In [6]:
# Join the v1 and v2 DNA tables on the test-CRE id (only ids present in both
# are kept), then turn the result into a matrix with the ids as row names.
dna_counts <- inner_join(dna_counts_v1, dna_counts_v2, by = "testcre") %>%
    column_to_rownames(var = "testcre") %>%
    as.matrix()

In [7]:
# Preview the first six rows of the merged DNA count matrix.
head(dna_counts, n = 6L)

Unnamed: 0,V1.dna.r1_1,V1.dna.r1_2,V1.dna.r1_3,V1.dna.r1_4,V1.dna.r1_5,V1.dna.r1_6,V1.dna.r1_7,V1.dna.r1_8,V1.dna.r1_9,V1.dna.r1_10,⋯,V2.dna.r3_41,V2.dna.r3_42,V2.dna.r3_43,V2.dna.r3_44,V2.dna.r3_45,V2.dna.r3_46,V2.dna.r3_47,V2.dna.r3_48,V2.dna.r3_49,V2.dna.r3_50
chr1_6147297_rs11583631_C_T_ref,0,1,0,0,0,0,0,0,0,7,⋯,0,0,0,5,0,0,12,0,2,0
chr1_6147297_rs11583631_C_T_alt,0,0,0,0,0,1,2,0,0,0,⋯,0,0,2,0,0,0,2,11,0,0
chr1_6147340_rs11584419_A_C_ref,0,0,0,0,1,0,0,0,0,0,⋯,2,0,1,0,5,0,0,0,0,0
chr1_6147340_rs11584419_A_C_alt,2,0,0,0,0,2,0,0,0,0,⋯,0,0,0,3,0,0,0,0,0,0
chr1_6157296_rs749435_T_C_ref,0,22,5,5,2,8,0,3,2,4,⋯,4,16,14,21,4,41,222,0,40,47
chr1_6157296_rs749435_T_C_alt,7,10,2,1,38,2,4,2,4,9,⋯,12,16,14,127,71,15,8,5,17,15


In [8]:
# Format RNA counts for v1: one row per test CRE and one column per
# (RNA replicate, barcode index) pair, with every count column prefixed "V1.".
rna_counts_v1 <- mpra_v1 %>%
    select(-starts_with("dna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("rna")) %>%
    rename_with(~ paste0("V1.", .), -testcre)

In [9]:
# Format RNA counts for v2: one row per test CRE and one column per
# (RNA replicate, barcode index) pair, with every count column prefixed "V2.".
rna_counts_v2 <- mpra_v2 %>%
    select(-starts_with("dna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("rna")) %>%
    rename_with(~ paste0("V2.", .), -testcre)

In [10]:
# Join the v1 and v2 RNA tables on the test-CRE id (only ids present in both
# are kept), then turn the result into a matrix with the ids as row names.
rna_counts <- inner_join(rna_counts_v1, rna_counts_v2, by = "testcre") %>%
    column_to_rownames(var = "testcre") %>%
    as.matrix()

In [11]:
# Break every DNA row name on "_" and label the six resulting fields.
# NOTE(review): assumes each row name has exactly six "_"-separated parts.
dna_enc_names <- rownames(dna_counts) %>%
    strsplit("_") %>%
    do.call(rbind, .) %>%
    data.frame() %>%
    setNames(c("chrom", "pos", "snp", "allele", "mutation", "alleletype"))

In [12]:
# Put the ref and alt DNA counts of each enhancer on a single row:
# attach the parsed name fields, spread every count column by allele type,
# then rebuild the enhancer id (without the allele-type suffix) as row name.
dna_counts_allelic <- dna_counts %>%
    as.data.frame() %>%
    mutate(dna_enc_names) %>%
    pivot_wider(
        names_from = alleletype,
        values_from = contains("dna"),
        names_sep = "_"
    ) %>%
    unite(col = "enhancer", chrom, pos, snp, allele, mutation, sep = "_") %>%
    column_to_rownames(var = "enhancer")

In [13]:
# Break every RNA row name on "_" and label the six resulting fields.
# NOTE(review): assumes each row name has exactly six "_"-separated parts.
rna_enc_names <- rownames(rna_counts) %>%
    strsplit("_") %>%
    do.call(rbind, .) %>%
    data.frame() %>%
    setNames(c("chrom", "pos", "snp", "allele", "mutation", "alleletype"))

In [14]:
# Put the ref and alt RNA counts of each enhancer on a single row:
# attach the parsed name fields, spread every count column by allele type,
# then rebuild the enhancer id (without the allele-type suffix) as row name.
rna_counts_allelic <- rna_counts %>%
    as.data.frame() %>%
    mutate(rna_enc_names) %>%
    pivot_wider(
        names_from = alleletype,
        values_from = contains("rna"),
        names_sep = "_"
    ) %>%
    unite(col = "enhancer", chrom, pos, snp, allele, mutation, sep = "_") %>%
    column_to_rownames(var = "enhancer")

In [15]:
# Write the pool 1 allelic DNA counts to disk (write.table defaults:
# space-separated, quoted strings, row names included).
write.table(dna_counts_allelic, file = "../data/pool1_dna_counts_allelic.txt")

In [16]:
# Write the pool 1 allelic RNA counts to disk (write.table defaults:
# space-separated, quoted strings, row names included).
write.table(rna_counts_allelic, file = "../data/pool1_rna_counts_allelic.txt")

## Pool2

In [17]:
# Load the pool 2 count tables (tab-separated) for the two library versions;
# the file names distinguish them as pgl4v1 and pgl4v2.
# These assignments overwrite the pool 1 objects of the same name.
mpra_v1 <- read_tsv(file = "../data/mpra_qtigc_pgl4v1_pool2.txt")
mpra_v2 <- read_tsv(file = "../data/mpra_qtigc_pgl4v2_pool2.txt")

Rows: 50900 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (3): snp, testcre, barcode
dbl (13): dna.r1, dna.r2, dna.r3, rna.r1, rna.r2, rna.r3, rna.r4, rna.r5, rn...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 50900 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (3): snp, testcre, barcode
dbl (13): dna.r1, dna.r2, dna.r3, rna.r1, rna.r2, rna.r3, rna.r4, rna.r5, rn...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

In [18]:
# Total number of rows in each pool 2 table.
# NOTE(review): these are row counts of the raw tables, not counts of distinct
# test CREs, despite the variable names.
num_enhancers_v1 <- nrow(mpra_v1)
num_enhancers_v2 <- nrow(mpra_v2)

In [19]:
# Format pool 2 DNA counts for v1: one row per test CRE and one column per
# (DNA replicate, barcode index) pair, with every count column prefixed "V1.".
dna_counts_v1 <- mpra_v1 %>%
    select(-starts_with("rna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("dna")) %>%
    rename_with(~ paste0("V1.", .), -testcre)

In [20]:
# Format pool 2 DNA counts for v2: one row per test CRE and one column per
# (DNA replicate, barcode index) pair, with every count column prefixed "V2.".
dna_counts_v2 <- mpra_v2 %>%
    select(-starts_with("rna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("dna")) %>%
    rename_with(~ paste0("V2.", .), -testcre)

In [21]:
# Join the pool 2 v1 and v2 DNA tables on the test-CRE id (only ids present in
# both are kept), then turn the result into a matrix with ids as row names.
dna_counts <- inner_join(dna_counts_v1, dna_counts_v2, by = "testcre") %>%
    column_to_rownames(var = "testcre") %>%
    as.matrix()

In [22]:
# Preview the first six rows of the merged pool 2 DNA count matrix.
head(dna_counts, n = 6L)

Unnamed: 0,V1.dna.r1_1,V1.dna.r1_2,V1.dna.r1_3,V1.dna.r1_4,V1.dna.r1_5,V1.dna.r1_6,V1.dna.r1_7,V1.dna.r1_8,V1.dna.r1_9,V1.dna.r1_10,⋯,V2.dna.r3_41,V2.dna.r3_42,V2.dna.r3_43,V2.dna.r3_44,V2.dna.r3_45,V2.dna.r3_46,V2.dna.r3_47,V2.dna.r3_48,V2.dna.r3_49,V2.dna.r3_50
chr2_201172085_rs67190025_C_T_ref,0,3,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,7,0,4,9
chr2_201172085_rs67190025_C_T_alt,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,1,0,0,9,0,0
chr2_201172627_rs7580924_G_C_ref,0,0,0,6,1,0,6,1,5,0,⋯,2,4,4,0,0,24,2,9,10,0
chr2_201172627_rs7580924_G_C_alt,0,6,0,0,0,0,2,0,0,0,⋯,17,0,10,0,8,13,4,1,1,0
chr2_201185918_rs13028959_C_A_ref,47,64,34,52,21,36,2,13,73,32,⋯,30,53,50,135,20,170,299,85,661,34
chr2_201185918_rs13028959_C_A_alt,21,39,29,8,14,25,24,27,45,15,⋯,440,18,86,41,226,106,270,98,59,191


In [23]:
# Format pool 2 RNA counts for v1: one row per test CRE and one column per
# (RNA replicate, barcode index) pair, with every count column prefixed "V1.".
rna_counts_v1 <- mpra_v1 %>%
    select(-starts_with("dna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("rna")) %>%
    rename_with(~ paste0("V1.", .), -testcre)

In [24]:
# Format pool 2 RNA counts for v2: one row per test CRE and one column per
# (RNA replicate, barcode index) pair, with every count column prefixed "V2.".
rna_counts_v2 <- mpra_v2 %>%
    select(-starts_with("dna"), -barcode, -snp) %>%
    # Index barcodes within each test CRE rather than assuming exactly 50
    # consecutive rows per CRE; the result is identical when that layout holds,
    # and CREs with fewer barcodes are padded with NA by pivot_wider().
    mutate(row = row_number(), .by = testcre) %>%
    pivot_wider(names_from = row, values_from = starts_with("rna")) %>%
    rename_with(~ paste0("V2.", .), -testcre)

In [25]:
# Remove the low-quality RNA replicates (v1: r2, v2: r7) from the wide tables.
# The prefix includes the trailing "_" that separates the replicate name from
# the barcode index, so only that replicate is matched (an unanchored
# "V1.rna.r2" prefix would also match r20-r29 if such replicates existed).
rna_counts_v1 <- rna_counts_v1 %>%
    select(-starts_with("V1.rna.r2_"))

rna_counts_v2 <- rna_counts_v2 %>%
    select(-starts_with("V2.rna.r7_"))

In [26]:
# Join the pool 2 v1 and v2 RNA tables on the test-CRE id (only ids present in
# both are kept), then turn the result into a matrix with ids as row names.
rna_counts <- inner_join(rna_counts_v1, rna_counts_v2, by = "testcre") %>%
    column_to_rownames(var = "testcre") %>%
    as.matrix()

In [27]:
# Break every pool 2 DNA row name on "_" and label the six resulting fields.
# NOTE(review): assumes each row name has exactly six "_"-separated parts.
dna_enc_names <- rownames(dna_counts) %>%
    strsplit("_") %>%
    do.call(rbind, .) %>%
    data.frame() %>%
    setNames(c("chrom", "pos", "snp", "allele", "mutation", "alleletype"))

In [28]:
# Put the ref and alt DNA counts of each pool 2 enhancer on a single row:
# attach the parsed name fields, spread every count column by allele type,
# then rebuild the enhancer id (without the allele-type suffix) as row name.
dna_counts_allelic <- dna_counts %>%
    as.data.frame() %>%
    mutate(dna_enc_names) %>%
    pivot_wider(
        names_from = alleletype,
        values_from = contains("dna"),
        names_sep = "_"
    ) %>%
    unite(col = "enhancer", chrom, pos, snp, allele, mutation, sep = "_") %>%
    column_to_rownames(var = "enhancer")

In [29]:
# Break every pool 2 RNA row name on "_" and label the six resulting fields.
# NOTE(review): assumes each row name has exactly six "_"-separated parts.
rna_enc_names <- rownames(rna_counts) %>%
    strsplit("_") %>%
    do.call(rbind, .) %>%
    data.frame() %>%
    setNames(c("chrom", "pos", "snp", "allele", "mutation", "alleletype"))

In [30]:
# Put the ref and alt RNA counts of each pool 2 enhancer on a single row:
# attach the parsed name fields, spread every count column by allele type,
# then rebuild the enhancer id (without the allele-type suffix) as row name.
rna_counts_allelic <- rna_counts %>%
    as.data.frame() %>%
    mutate(rna_enc_names) %>%
    pivot_wider(
        names_from = alleletype,
        values_from = contains("rna"),
        names_sep = "_"
    ) %>%
    unite(col = "enhancer", chrom, pos, snp, allele, mutation, sep = "_") %>%
    column_to_rownames(var = "enhancer")

In [31]:
# Write the pool 2 allelic DNA counts to disk (write.table defaults:
# space-separated, quoted strings, row names included).
write.table(dna_counts_allelic, file = "../data/pool2_dna_counts_allelic.txt")

In [32]:
# Write the pool 2 allelic RNA counts to disk (write.table defaults:
# space-separated, quoted strings, row names included).
write.table(rna_counts_allelic, file = "../data/pool2_rna_counts_allelic.txt")

---

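Note: because the low-quality RNA replicates (V1 r2 and V2 r7) were dropped from the pool 2 counts, the RNA annotation table must be modified to match. The cells below remove the corresponding annotation rows (V1 batch 2 and V2 batch 7) and save a pool 2-specific copy.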

In [33]:
# Load the merged allelic RNA annotation table as the starting point for the
# pool 2-specific annotation.
rna_annot_allelic_p2 <- read.table(
    file = "../data/merged_rna_annot_pool_allelic.txt"
)

In [34]:
# Preview the first six annotation rows as read from disk.
head(rna_annot_allelic_p2, n = 6L)

Unnamed: 0_level_0,version,batch,barcode,alleletype,barcode_allelic,barcode_allelic_version
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<chr>,<chr>
V1:1:1:ref,V1,1,1,ref,1.ref,V1.1.ref
V1:1:1:alt,V1,1,1,alt,1.alt,V1.1.alt
V1:1:2:ref,V1,1,2,ref,2.ref,V1.2.ref
V1:1:2:alt,V1,1,2,alt,2.alt,V1.2.alt
V1:1:3:ref,V1,1,3,ref,3.ref,V1.3.ref
V1:1:3:alt,V1,1,3,alt,3.alt,V1.3.alt


In [35]:
# Convert every annotation column to a factor.
# across(everything(), ...) replaces the superseded mutate_if() call that
# used a hand-built all-TRUE predicate to select every column.
rna_annot_allelic_p2 <- rna_annot_allelic_p2 %>%
    mutate(across(everything(), as.factor))

In [36]:
# Preview the first six annotation rows after the factor conversion.
head(rna_annot_allelic_p2, n = 6L)

Unnamed: 0_level_0,version,batch,barcode,alleletype,barcode_allelic,barcode_allelic_version
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
V1:1:1:ref,V1,1,1,ref,1.ref,V1.1.ref
V1:1:1:alt,V1,1,1,alt,1.alt,V1.1.alt
V1:1:2:ref,V1,1,2,ref,2.ref,V1.2.ref
V1:1:2:alt,V1,1,2,alt,2.alt,V1.2.alt
V1:1:3:ref,V1,1,3,ref,3.ref,V1.3.ref
V1:1:3:alt,V1,1,3,alt,3.alt,V1.3.alt


In [37]:
# Drop the annotation rows for the replicates removed from the pool 2 RNA
# counts: version V1 batch 2 and version V2 batch 7. Both conditions must hold
# for a row to be kept.
rna_annot_allelic_p2 <- rna_annot_allelic_p2 %>%
    filter(
        !(version == "V1" & batch == 2),
        !(version == "V2" & batch == 7)
    )

In [38]:
# Save the pool 2-specific RNA annotation table (write.table defaults:
# space-separated, quoted strings, row names included).
write.table(
    rna_annot_allelic_p2,
    file = "../data/merged_rna_annot_pool_allelic_p2.txt"
)