# Data Wrangling

* Saving filtered raw counts
* Saving normalized filtered raw counts
* Saving filtered DGD rescaled decoder outputs
* Saving normalized filtered rescaled decoder outputs
* Saving filtered log2_fold_changes
* Saving log2 FC pseudonormal filtered 
* Saving log2_fold_changes using the normalized rescaled decoder outputs and the normalized  counts
* Saving some miReact results

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.0     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


### Saving filtered raw counts

In [2]:
fold <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/DGD/data/full_TCGA/fold_change_home_calc_mat.rds") # random fold change to get DGD's genes
dim(fold)
head(fold)

Unnamed: 0,TCGA-EW-A6SA-01A-21R-A32P-07,TCGA-E2-A14W-01A-11R-A12D-07,TCGA-EW-A1PD-01A-11R-A144-07,TCGA-55-1594-01A-01R-0946-07,TCGA-49-6742-11A-01R-1858-07,TCGA-50-5932-11A-01R-1755-07,TCGA-95-7947-01A-11R-2187-07,TCGA-CG-4476-01A-01R-1157-13,TCGA-05-4410-01A-21R-1858-07,TCGA-CG-5717-01A-11R-1602-13,⋯,TCGA-D8-A1Y1-01A-21R-A14M-07,TCGA-E2-A14X-01A-11R-A115-07,TCGA-A8-A08R-01A-11R-A034-07,TCGA-E2-A573-01A-11R-A29R-07,TCGA-HN-A2OB-01A-21R-A27Q-07,TCGA-E9-A1RD-11A-33R-A157-07,TCGA-A2-A4S3-01A-21R-A266-07,TCGA-AR-A0TV-01A-21R-A084-07,TCGA-AR-A2LQ-01A-22R-A18M-07,TCGA-C8-A12Z-01A-11R-A115-07
ENSG00000187634,-0.6317677,0.19599825,-0.1619515,-1.486681,0.30304992,0.59487826,-0.3883295,-0.3438531,-0.7189528,-0.4431814,⋯,-1.6715163,-0.5213546,-1.1708322,-0.2970428,0.8706182,0.6235211,0.0131509,-0.7870907,0.8002608,0.7622859
ENSG00000188976,0.1657946,0.6307111,0.476069,-0.1102203,0.25347576,0.27276218,-0.1240866,0.001012279,0.239031,-0.24221367,⋯,-0.5259638,0.4126003,0.001202689,0.1791519,0.4889731,0.3397694,0.2747676,0.3613891,0.2376195,-0.4288155
ENSG00000187961,-0.1387403,-0.574404,-0.2801535,-0.1924673,-0.08400568,-0.09605455,-1.3871883,-0.12497533,-0.4042621,-0.4346363,⋯,-0.7810574,-1.228252,-0.83509153,-1.2075816,-0.2132758,-0.4217334,-0.9551033,-0.57213616,-0.7094557,-0.8572454
ENSG00000187583,-1.7987337,-0.006567637,-2.0238833,-4.072919,-0.0008413264,0.10709301,-3.607912,-2.718879,-0.7510945,-0.09493367,⋯,-1.8078252,-0.2641785,-1.9824735,-1.6360275,-0.2521651,0.7681422,-1.685308,-1.6223006,0.0677278,-2.532913
ENSG00000187642,-1.4372202,0.46162483,-1.7370303,-3.5281708,0.41116753,0.04026748,-2.3723087,-2.585098,-0.1580654,-0.8502631,⋯,-1.590072,0.5290232,-1.7186714,-1.1932127,0.5421603,1.2870241,-1.2129118,-0.08827896,1.1369029,-1.1988158
ENSG00000188290,-0.9131405,-2.4552076,-0.6497661,-0.9287488,-0.12929769,0.06905619,0.3191285,-0.3057822,-0.2436821,0.79916173,⋯,-1.2299562,-1.964431,-1.6631109,-1.5358509,-0.9253867,0.605512,-0.2448734,0.09419273,0.127314,-0.5046041


In [3]:
setwd("/faststorage/project/jsp_student_projects/shared_data/TCGA_counts")
exp <- readRDS("TCGA_mrna_counts_match.rds")
rnames <- unlist(lapply(rownames(exp), function(x) unlist(strsplit(x, split = "[.]"))[1]))
length(rnames) == length(unique(rnames)) # True # check for duplicates
rownames(exp) <- rnames
dim(exp)
head(exp)

Unnamed: 0,TCGA-EW-A6SA-01A-21R-A32P-07,TCGA-E2-A14W-01A-11R-A12D-07,TCGA-EW-A1PD-01A-11R-A144-07,TCGA-55-1594-01A-01R-0946-07,TCGA-49-6742-11A-01R-1858-07,TCGA-50-5932-11A-01R-1755-07,TCGA-95-7947-01A-11R-2187-07,TCGA-CG-4476-01A-01R-1157-13,TCGA-05-4410-01A-21R-1858-07,TCGA-CG-5717-01A-11R-1602-13,⋯,TCGA-D8-A1Y1-01A-21R-A14M-07,TCGA-E2-A14X-01A-11R-A115-07,TCGA-A8-A08R-01A-11R-A034-07,TCGA-E2-A573-01A-11R-A29R-07,TCGA-HN-A2OB-01A-21R-A27Q-07,TCGA-E9-A1RD-11A-33R-A157-07,TCGA-A2-A4S3-01A-21R-A266-07,TCGA-AR-A0TV-01A-21R-A084-07,TCGA-AR-A2LQ-01A-22R-A18M-07,TCGA-C8-A12Z-01A-11R-A115-07
ENSG00000186092,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000278566,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000273547,1,0,1,0,0,0,3,3,0,0,⋯,5,1,0,0,0,0,0,2,0,0
ENSG00000187634,1413,1562,912,501,321,234,590,821,271,672,⋯,596,1128,582,1353,1333,458,1128,671,982,1350
ENSG00000188976,9723,6455,7050,4238,1791,1280,4973,3666,2325,3717,⋯,4419,9720,5661,5226,4777,3152,5675,6891,2900,2706
ENSG00000187961,1310,299,439,459,93,80,226,291,172,296,⋯,334,296,339,387,450,158,347,404,146,131


In [4]:
# remove all genes not found in seqs since miReact does not recognize them
seqs <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/miReact_ver2/seqs/hs.utr3.seqs.rds")
idx <- rownames(exp) %in% seqs[[1]]
#idx <- intersect(rownames(exp), as.data.frame(seqs[1]))
cat("previous dim: ", dim(exp), "\n")
exp <- exp[idx,]
cat("current dim:  ", dim(exp))


previous dim:  19784 10682 
current dim:   18861 10682

In [5]:
# remove all genes not found in DGD
row_names <- intersect(rownames(exp), rownames(fold))
filter <- exp[row_names, , drop = FALSE] 
dim(filter)
any(rowSums(filter)==0) # FALSE


In [None]:
head(filter)
#saveRDS(filter,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/preprocessed/TCGA_counts/counts_filtered.rds" )

### Saving normalized filtered raw counts

In [22]:
#norm_exp <- readRDS("/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/preprocessed/TCGA_counts/counts_filtered.rds")
norm_exp <- readRDS("/faststorage/project/jsp_student_projects/miRNA_DGD_DE_F2024/data/filtered_by_paper/counts_filtered.rds")# filtered


In [23]:
norm_exp <- sweep(norm_exp, 2, colSums(norm_exp), FUN = '/')*median(colSums(norm_exp))
head(norm_exp)

Unnamed: 0,TCGA-AA-3542-01A-02R-1873-07,TCGA-CW-6088-11A-01R-1672-07,TCGA-CZ-5989-01A-11R-1672-07,TCGA-B4-5832-01A-11R-1672-07,TCGA-B8-5549-11A-01R-1541-07,TCGA-AK-3447-01A-01R-1766-07,TCGA-CZ-5462-11A-01R-1503-07,TCGA-G9-6369-01A-21R-1965-07,TCGA-FC-A5OB-01A-11R-A29R-07,TCGA-FC-A6HD-01A-11R-A31N-07,⋯,TCGA-DU-7309-01A-11R-2090-07,TCGA-HT-7681-01A-11R-2403-07,TCGA-S9-A7IQ-01A-21R-A34F-07,TCGA-DB-A64L-01A-11R-A29R-07,TCGA-HT-8558-01A-21R-2404-07,TCGA-P5-A5F4-01A-11R-A28M-07,TCGA-HT-7607-01A-11R-2090-07,TCGA-E1-A7Z3-01A-11R-A34R-07,TCGA-DU-A6S6-01A-21R-A32Q-07,TCGA-HT-7610-01A-21R-2090-07
ENSG00000187634,1494.0008,616.27351,53.098268,56.912428,755.70866,203.068116,593.50479,1979.9579,1033.5306,4500.51,⋯,645.7077219,1010.422936,733.670672,739.824587,600.84912,1532.06761,434.204651,832.859524,1092.058547,680.103863
ENSG00000188976,7051.0342,2531.12333,376.868192,197.770687,2721.25661,1062.95655,3024.18449,6119.3319,5625.6573,11448.7322,⋯,3061.8783869,2866.729992,4382.502862,3939.23075,4060.940027,4464.917163,2801.501225,3872.978106,3799.477974,3629.458521
ENSG00000187961,383.2437,184.74866,5.180319,2.845621,131.38925,42.014093,84.13231,246.5087,330.598,753.2393,⋯,291.4541089,359.99768,454.680605,333.957062,310.8415,959.951168,183.455471,747.035103,729.59301,516.288125
ENSG00000187583,162.3914,19.34192,12.950797,5.691243,24.6906,1.40047,44.64163,244.5366,196.6015,653.5644,⋯,0.8051218,1.104287,6.507057,4.875286,1.66671,3.613367,2.403347,0.0,3.496452,3.35688
ENSG00000187642,58.4609,36.68295,28.491753,11.382486,35.27228,0.0,22.32082,416.1067,176.8315,796.1373,⋯,3.2204874,6.625724,13.014114,7.312928,5.833487,14.453468,8.812272,2.417589,12.820324,5.371008
ENSG00000188290,532.6438,60.02664,3.885239,4.268432,135.79829,7.002349,72.68574,2232.3828,845.7158,1042.1702,⋯,222.21363,682.44959,636.8782,496.060308,186.671571,861.185803,114.55953,821.980372,771.550435,318.232212


In [24]:
#norm_exp <- log2(norm_exp+1e-6)
norm_exp <- log2(norm_exp+1)

In [25]:
#pseudocount = 1e-6
#saveRDS(norm_exp,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/pseudocount_1e-6/counts_filtered_normalized.rds" )
#pseudocount = 1
#saveRDS(norm_exp,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/pseudocount_1/counts_filtered_normalized.rds" )
#Filtered
#saveRDS(norm_exp,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/filtered_by_paper/counts_filtered_normalized.rds" )

### Saving normalized filtered rescaled decoder outputs
Normalizing the rescaled decoder outputs

In [20]:
# For both pseudocounts
filtered <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/DGD_expression/DGD_rescaled_FINAL.rds")

exp_counts <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/preprocessed/TCGA_counts/counts_filtered.rds")
# For filtered

In [26]:
# For filtered by paper
filtered <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/DGD_expression/DGD_rescaled_FINAL.rds")
exp_counts <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/filtered_by_paper/counts_filtered.rds")
col_names <- intersect(colnames(filtered), colnames(exp_counts))
filtered <- filtered[,col_names , drop = FALSE] 
dim(filtered)


In [27]:
norm_DGD <- sweep(filtered, 2, colSums(filtered), FUN = '/')*median(colSums(exp_counts))

head(norm_DGD)

Unnamed: 0,TCGA-EW-A6SA-01A-21R-A32P-07,TCGA-E2-A14W-01A-11R-A12D-07,TCGA-EW-A1PD-01A-11R-A144-07,TCGA-55-1594-01A-01R-0946-07,TCGA-49-6742-11A-01R-1858-07,TCGA-50-5932-11A-01R-1755-07,TCGA-95-7947-01A-11R-2187-07,TCGA-CG-4476-01A-01R-1157-13,TCGA-05-4410-01A-21R-1858-07,TCGA-CG-5717-01A-11R-1602-13,⋯,TCGA-D8-A1Y1-01A-21R-A14M-07,TCGA-E2-A14X-01A-11R-A115-07,TCGA-A8-A08R-01A-11R-A034-07,TCGA-E2-A573-01A-11R-A29R-07,TCGA-HN-A2OB-01A-21R-A27Q-07,TCGA-E9-A1RD-11A-33R-A157-07,TCGA-A2-A4S3-01A-21R-A266-07,TCGA-AR-A0TV-01A-21R-A084-07,TCGA-AR-A2LQ-01A-22R-A18M-07,TCGA-C8-A12Z-01A-11R-A115-07
ENSG00000187634,1842.056,1741.8648,946.08813,1538.3395,423.37848,340.52876,693.518,940.49688,916.0871,1069.9102,⋯,1371.0811,972.2779,1084.9735,1878.4529,888.29907,228.716042,1691.2917,1100.2867,699.87112,990.0674
ENSG00000188976,7286.66,5322.6176,4695.44817,5014.9812,2441.98946,2328.8321,4860.481,3304.88934,4038.9219,5151.9028,⋯,4588.0816,4379.2458,4682.6432,5216.2955,4148.51098,1912.625126,7097.2356,5096.6851,3051.91771,4530.5333
ENSG00000187961,1212.2869,568.949,494.56427,574.1789,159.6701,187.71089,531.4184,286.0377,467.0272,470.6394,⋯,412.8108,415.0719,500.473,1011.6737,635.57717,163.661161,1017.8911,569.7564,296.12358,295.7288
ENSG00000187583,632.8512,627.7733,325.9376,550.2377,68.51189,38.26817,1228.6347,253.60011,1013.5548,4256.9034,⋯,525.677,750.9493,659.7224,1626.3753,366.13612,32.230718,846.9914,532.1796,234.99378,783.9469
ENSG00000187642,193.5129,230.5986,70.88109,158.8185,22.06796,15.68443,252.4636,66.04033,231.1615,1227.177,⋯,175.7901,142.0406,181.2705,689.8508,99.69498,9.433059,266.108,143.8944,48.58651,139.5265
ENSG00000188290,1538.3721,484.2531,518.36929,278.4365,201.1384,141.78758,364.7809,160.98072,230.7436,643.3052,⋯,469.5716,481.2751,403.3513,2060.9258,551.41539,38.404204,738.1876,360.5358,192.32608,192.1785


In [28]:
#norm_DGD <- log2(norm_DGD+1e-6)
norm_DGD <- log2(norm_DGD+1)

In [29]:
#saveRDS(norm_DGD,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/pseudocount_1e-6/rescaled_decoder_outputs_filtered_normalized.rds" )
#saveRDS(norm_DGD,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/pseudocount_1/rescaled_decoder_outputs_filtered_normalized.rds" )
saveRDS(norm_DGD,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/filtered_by_paper/rescaled_decoder_outputs_filtered_normalized.rds" )

### Saving log2 FC pseudonormal filtered 


Using Mortens method of calculating the fold change scores using the median expression.
In the DGD version we take the $$\log_{2}\left(\frac{exp}{pred\_means}\right)$$ but this time we use that:

$$\log_{2}(a) - \log_{2}(b) = \log_{2}\left(\frac{a}{b}\right)$$


In [30]:
#exp <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/pseudocount_1e-6/counts_filtered_normalized.rds")
#exp <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/pseudocount_1/counts_filtered_normalized.rds")
#
exp <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/filtered_by_paper/counts_filtered_normalized.rds")


head(exp)
pred_means_morten <- apply(exp,1,median) 
FC_morten <- apply(exp,2,function(x)(x-pred_means_morten))
head(FC_morten)

Unnamed: 0,TCGA-AA-3542-01A-02R-1873-07,TCGA-CW-6088-11A-01R-1672-07,TCGA-CZ-5989-01A-11R-1672-07,TCGA-B4-5832-01A-11R-1672-07,TCGA-B8-5549-11A-01R-1541-07,TCGA-AK-3447-01A-01R-1766-07,TCGA-CZ-5462-11A-01R-1503-07,TCGA-G9-6369-01A-21R-1965-07,TCGA-FC-A5OB-01A-11R-A29R-07,TCGA-FC-A6HD-01A-11R-A31N-07,⋯,TCGA-DU-7309-01A-11R-2090-07,TCGA-HT-7681-01A-11R-2403-07,TCGA-S9-A7IQ-01A-21R-A34F-07,TCGA-DB-A64L-01A-11R-A29R-07,TCGA-HT-8558-01A-21R-2404-07,TCGA-P5-A5F4-01A-11R-A28M-07,TCGA-HT-7607-01A-11R-2090-07,TCGA-E1-A7Z3-01A-11R-A34R-07,TCGA-DU-A6S6-01A-21R-A32Q-07,TCGA-HT-7610-01A-21R-2090-07
ENSG00000187634,10.545931,9.269766,5.75751,5.855801,9.563594,7.672907,9.215545,10.951982,10.014761,12.136193,⋯,9.33697,9.982171,9.520954,9.532988,9.233258,10.582206,8.76555,9.703661,10.094155,9.411731
ENSG00000188976,12.783824,11.306132,8.561739,7.634961,11.410587,10.055224,11.562807,12.579394,12.458062,13.483026,⋯,11.5806724,11.485693,12.097868,11.944064,11.987953,12.124741,11.452499,11.9196,11.891965,11.825936
ENSG00000187961,8.585878,7.537208,2.627681,1.943217,7.048642,5.426738,6.411635,7.951335,8.373291,9.558878,⋯,8.1920664,8.495846,8.831879,8.387832,8.284669,9.908319,7.527129,9.546962,9.512924,9.014824
ENSG00000187583,7.352188,4.346384,3.802276,2.742274,4.683169,1.263317,5.512279,7.939794,7.62645,9.354391,⋯,0.8520962,1.073332,2.908247,2.554659,1.415061,2.20582,1.766954,0.0,2.168787,2.123295
ENSG00000187642,5.893869,5.23584,4.88224,3.630229,5.180796,0.0,4.543546,8.704273,7.474367,9.638684,⋯,2.0774096,2.930874,3.808809,3.055357,2.772622,3.949859,3.294587,1.772979,3.78872,2.671522
ENSG00000188290,9.059733,5.931367,2.288429,2.397374,7.095906,3.000424,6.203313,11.125015,9.725734,10.026759,⋯,7.8022813,9.416691,9.317137,8.957277,7.552066,9.751855,6.852492,9.684714,9.593485,8.318462


Unnamed: 0,TCGA-AA-3542-01A-02R-1873-07,TCGA-CW-6088-11A-01R-1672-07,TCGA-CZ-5989-01A-11R-1672-07,TCGA-B4-5832-01A-11R-1672-07,TCGA-B8-5549-11A-01R-1541-07,TCGA-AK-3447-01A-01R-1766-07,TCGA-CZ-5462-11A-01R-1503-07,TCGA-G9-6369-01A-21R-1965-07,TCGA-FC-A5OB-01A-11R-A29R-07,TCGA-FC-A6HD-01A-11R-A31N-07,⋯,TCGA-DU-7309-01A-11R-2090-07,TCGA-HT-7681-01A-11R-2403-07,TCGA-S9-A7IQ-01A-21R-A34F-07,TCGA-DB-A64L-01A-11R-A29R-07,TCGA-HT-8558-01A-21R-2404-07,TCGA-P5-A5F4-01A-11R-A28M-07,TCGA-HT-7607-01A-11R-2090-07,TCGA-E1-A7Z3-01A-11R-A34R-07,TCGA-DU-A6S6-01A-21R-A32Q-07,TCGA-HT-7610-01A-21R-2090-07
ENSG00000187634,0.5824023,-0.6937622,-4.206018,-4.107727,-0.3999341,-2.290621,-0.7479836,0.9884542,0.051232284,2.172665,⋯,-0.6265582,0.01864243,-0.442574385,-0.43054009,-0.73027021,0.61867735,-1.1979781,-0.2598677,0.1306267,-0.55179725
ENSG00000188976,0.6845525,-0.7931393,-3.537532,-4.46431,-0.6886839,-2.044048,-0.5364638,0.480123,0.358791165,1.383755,⋯,-0.5185988,-0.61357773,-0.001402738,-0.15520681,-0.11131799,0.02546956,-0.6467719,-0.1796711,-0.2073061,-0.27333516
ENSG00000187961,0.2212159,-0.8274539,-5.736981,-6.421445,-1.3160197,-2.937924,-1.953027,-0.4133264,0.008629546,1.194217,⋯,-0.1725954,0.1311839,0.467217304,0.02317051,-0.07999273,1.54365746,-0.8375331,1.1823003,1.1482623,0.65016241
ENSG00000187583,-0.3483619,-3.3541663,-3.898274,-4.958276,-3.0173815,-6.437233,-2.1882715,0.2392444,-0.074100298,1.653841,⋯,-6.8484538,-6.62721833,-4.79230263,-5.14589108,-6.28548887,-5.49472999,-5.9335959,-7.7005501,-5.531763,-5.57725472
ENSG00000187642,-0.7721797,-1.4302092,-1.783809,-3.03582,-1.4852534,-6.666049,-2.1225027,2.0382235,0.808317817,2.972635,⋯,-4.5886395,-3.73517473,-2.857240456,-3.6106923,-3.89342722,-2.71619035,-3.3714619,-4.89307,-2.8773295,-3.99452748
ENSG00000188290,0.6793659,-2.449,-6.091938,-5.982994,-1.2844609,-5.379944,-2.1770538,2.7446476,1.345366666,1.646392,⋯,-0.5780859,1.03632386,0.936769901,0.57690983,-0.82830095,1.37148773,-1.5278748,1.304347,1.213118,-0.06190484


In [31]:
#saveRDS(FC_morten,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/pseudocount_1e-6/morten_log2_fold_change_scores.rds" )
#saveRDS(FC_morten,"/faststorage/project/jsp_student_projects/miRNA_DGD_DE_F2024/data/pseudocount_1/morten_log2_fold_change_scores.rds" )
saveRDS(FC_morten,"/faststorage/project/jsp_student_projects/miRNA_DGD_DE_F2024/data/filtered_by_paper/morten_log2_fold_change_scores.rds" )

### DGD fold change calculation

In [12]:
# for pseudocount = 1e-6
exp <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/pseudocount_1e-6/counts_filtered_normalized.rds")
pred_means_DGD <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/pseudocount_1e-6/rescaled_decoder_outputs_filtered_normalized.rds")
FC_DGD <- exp-pred_means_DGD
head(FC_DGD)

Unnamed: 0,TCGA-EW-A6SA-01A-21R-A32P-07,TCGA-E2-A14W-01A-11R-A12D-07,TCGA-EW-A1PD-01A-11R-A144-07,TCGA-55-1594-01A-01R-0946-07,TCGA-49-6742-11A-01R-1858-07,TCGA-50-5932-11A-01R-1755-07,TCGA-95-7947-01A-11R-2187-07,TCGA-CG-4476-01A-01R-1157-13,TCGA-05-4410-01A-21R-1858-07,TCGA-CG-5717-01A-11R-1602-13,⋯,TCGA-D8-A1Y1-01A-21R-A14M-07,TCGA-E2-A14X-01A-11R-A115-07,TCGA-A8-A08R-01A-11R-A034-07,TCGA-E2-A573-01A-11R-A29R-07,TCGA-HN-A2OB-01A-21R-A27Q-07,TCGA-E9-A1RD-11A-33R-A157-07,TCGA-A2-A4S3-01A-21R-A266-07,TCGA-AR-A0TV-01A-21R-A084-07,TCGA-AR-A2LQ-01A-22R-A18M-07,TCGA-C8-A12Z-01A-11R-A115-07
ENSG00000187634,-0.6690956,0.10430325,-0.2616608,-1.6052005,-0.07266354,0.330637546,-0.4468625,-0.3576655,-0.7380466,-0.6792083,⋯,-1.8186441,-0.4835141,-1.26786412,-0.38547385,0.7560948,0.5174638,-0.08544263,-0.81098918,0.55027805,0.549531
ENSG00000188976,0.1296037,0.53982259,0.3776404,-0.2295698,-0.12058053,0.008442869,-0.1806268,-0.0120198,0.2224058,-0.4792184,⋯,-0.6708856,0.4524334,-0.09556162,0.09059078,0.3740467,0.2363872,0.17627898,0.33765982,-0.01201886,-0.6413488
ENSG00000187961,-0.1747046,-0.66661234,-0.3806571,-0.3097152,-0.45308044,-0.358534756,-1.4471696,-0.1368108,-0.4219527,-0.6772766,⋯,-0.9223471,-1.1856045,-0.93130241,-1.29842938,-0.3276069,-0.5351137,-1.05365986,-0.59348423,-0.95858668,-1.0725378
ENSG00000187583,-1.8373751,-0.09886767,-2.1309072,-4.2326392,-0.37925395,-0.138239904,-3.6563055,-2.7557187,-0.7713062,-0.3316729,⋯,-1.9473882,-0.2252726,-2.07679922,-1.72155429,-0.368421,0.6345923,-1.7760766,-1.65341786,-0.18547781,-2.7442575
ENSG00000187642,-1.4905083,0.37230353,-1.8324869,-3.5975047,0.03275488,-0.292001024,-2.4122159,-2.5064603,-0.1754008,-1.0894703,⋯,-1.752496,0.5740644,-1.80520636,-1.28153542,0.4259042,1.1848386,-1.32813375,-0.10648977,0.88542524,-1.4075156
ENSG00000188290,-0.9489165,-2.54955163,-0.7467762,-1.0418243,-0.50515466,-0.18821884,0.2613114,-0.3224437,-0.2611461,0.5617456,⋯,-1.3677945,-1.9361331,-1.75840198,-1.62425139,-1.0416426,0.5004089,-0.34242748,0.07028516,-0.1163754,-0.7159485


In [14]:
# for pseudocount = 1
exp <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/pseudocount_1/counts_filtered_normalized.rds")
pred_means_DGD <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/pseudocount_1/rescaled_decoder_outputs_filtered_normalized.rds")
FC_DGD <- exp-pred_means_DGD
head(FC_DGD)

Unnamed: 0,TCGA-EW-A6SA-01A-21R-A32P-07,TCGA-E2-A14W-01A-11R-A12D-07,TCGA-EW-A1PD-01A-11R-A144-07,TCGA-55-1594-01A-01R-0946-07,TCGA-49-6742-11A-01R-1858-07,TCGA-50-5932-11A-01R-1755-07,TCGA-95-7947-01A-11R-2187-07,TCGA-CG-4476-01A-01R-1157-13,TCGA-05-4410-01A-21R-1858-07,TCGA-CG-5717-01A-11R-1602-13,⋯,TCGA-D8-A1Y1-01A-21R-A14M-07,TCGA-E2-A14X-01A-11R-A115-07,TCGA-A8-A08R-01A-11R-A034-07,TCGA-E2-A573-01A-11R-A29R-07,TCGA-HN-A2OB-01A-21R-A27Q-07,TCGA-E9-A1RD-11A-33R-A157-07,TCGA-A2-A4S3-01A-21R-A266-07,TCGA-AR-A0TV-01A-21R-A084-07,TCGA-AR-A2LQ-01A-22R-A18M-07,TCGA-C8-A12Z-01A-11R-A115-07
ENSG00000187634,-0.6686186,0.10424362,-0.2613479,-1.6032246,-0.07248217,0.329743656,-0.4460837,-0.3572203,-0.7369617,-0.6783719,⋯,-1.8159016,-0.4829046,-1.26593326,-0.38523102,0.7554111,0.5155075,-0.0853889,-0.80996871,0.54960366,0.5490546
ENSG00000188976,0.1295861,0.53973521,0.3775673,-0.2295186,-0.12052736,0.008439137,-0.1805859,-0.01201603,0.2223531,-0.4791045,⋯,-0.6706933,0.4523418,-0.09553983,0.09057339,0.3739646,0.2362695,0.1762548,0.33759882,-0.01201478,-0.6411648
ENSG00000187961,-0.1745465,-0.66507752,-0.3797495,-0.3090949,-0.44966303,-0.356308948,-1.4423449,-0.13629447,-0.4208714,-0.675385,⋯,-0.9191271,-1.1810469,-0.92860975,-1.2962831,-0.3270103,-0.5310561,-1.0520873,-0.59215622,-0.95386324,-1.0670088
ENSG00000187583,-1.8313373,-0.09869957,-2.1155841,-4.1853065,-0.37282541,-0.134431784,-3.6423054,-2.72243103,-0.7702684,-0.3315825,⋯,-1.939328,-0.2249377,-2.06955809,-1.71945159,-0.3672406,0.61856,-1.7718215,-1.64743318,-0.18461185,-2.7334759
ENSG00000187642,-1.4767106,0.37083943,-1.7803592,-3.4975244,0.03130602,-0.272140214,-2.3870238,-2.40600937,-0.1745713,-1.0881023,⋯,-1.7326639,0.570641,-1.78495544,-1.27845209,0.4221173,1.1027936,-1.3197304,-0.10570228,0.87158356,-1.3900978
ENSG00000188290,-0.9480161,-2.5347206,-0.7448321,-1.0361892,-0.50206741,-0.186765478,0.2606364,-0.32014188,-0.2598708,0.5609995,⋯,-1.3627974,-1.9274252,-1.74964589,-1.62274686,-1.0387899,0.4892908,-0.3418875,0.07008916,-0.11572808,-0.7110009


In [32]:
# for Filtered by paper AND pseudocount = 1
exp <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/filtered_by_paper/counts_filtered_normalized.rds")
pred_means_DGD <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/filtered_by_paper/rescaled_decoder_outputs_filtered_normalized.rds")
dim(exp)
dim(pred_means_DGD)
FC_DGD <- exp-pred_means_DGD
head(FC_DGD)

Unnamed: 0,TCGA-AA-3542-01A-02R-1873-07,TCGA-CW-6088-11A-01R-1672-07,TCGA-CZ-5989-01A-11R-1672-07,TCGA-B4-5832-01A-11R-1672-07,TCGA-B8-5549-11A-01R-1541-07,TCGA-AK-3447-01A-01R-1766-07,TCGA-CZ-5462-11A-01R-1503-07,TCGA-G9-6369-01A-21R-1965-07,TCGA-FC-A5OB-01A-11R-A29R-07,TCGA-FC-A6HD-01A-11R-A31N-07,⋯,TCGA-DU-7309-01A-11R-2090-07,TCGA-HT-7681-01A-11R-2403-07,TCGA-S9-A7IQ-01A-21R-A34F-07,TCGA-DB-A64L-01A-11R-A29R-07,TCGA-HT-8558-01A-21R-2404-07,TCGA-P5-A5F4-01A-11R-A28M-07,TCGA-HT-7607-01A-11R-2090-07,TCGA-E1-A7Z3-01A-11R-A34R-07,TCGA-DU-A6S6-01A-21R-A32Q-07,TCGA-HT-7610-01A-21R-2090-07
ENSG00000187634,-0.30195364,-1.497479,-4.129844,-4.732295,0.8343864,-0.7429563,-0.2243236,1.07316998,0.1738456,2.0715715,⋯,-1.08518,0.05546264,-0.56381925,-1.343109,-0.56326682,2.7384978,-1.959212,-0.4013139,0.6411496,-0.5411083
ENSG00000188976,-0.04741618,-1.072059,-3.635615,-4.657355,0.1561554,-1.1307868,-0.6843728,0.88857146,0.4779507,1.1518566,⋯,-0.5833174,-0.61110265,-0.09554698,-0.405022,-0.03077239,1.2226483,-1.340745,-0.3960265,0.3159922,-0.3198475
ENSG00000187961,-1.65882732,-1.617481,-6.325247,-7.22465,-0.2793155,-2.1332964,-2.6447818,-0.21376099,-0.4971572,0.677338,⋯,-0.5007611,-0.20484325,-0.13814907,-1.596121,-1.02952247,2.5449628,-2.465655,0.3902309,1.2980049,0.8018231
ENSG00000187583,-1.95581213,-4.950012,-4.550596,-6.364257,-1.4360193,-4.0319718,-4.7517356,-0.05229478,-2.3601813,-2.7015362,⋯,-8.1886785,-8.48115987,-6.45965302,-8.113672,-7.10511008,-2.8486255,-7.960952,-9.0584778,-5.7138179,-7.4931558
ENSG00000187642,-1.70985226,-2.619643,-1.285301,-3.690062,0.6529772,-4.0604309,-3.4420883,2.63731518,-0.3846183,-0.6236183,⋯,-5.3884837,-4.22940693,-3.70112844,-6.376874,-3.88122605,0.5667684,-4.766692,-5.4058791,-1.8431561,-4.4631771
ENSG00000188290,-1.52839317,-2.991226,-6.732188,-5.729003,-0.5632932,-4.1573032,-2.3115224,3.78533653,1.869348,0.6951584,⋯,-1.0759891,0.50297855,0.65767172,-2.0525,-1.55754339,4.4515773,-2.677304,1.1867196,1.9985928,0.7246718


In [33]:
#saveRDS(FC_DGD,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/pseudocount_1e-6/log2_fold_changes_filtered_normalized.rds")
#saveRDS(FC_DGD,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/pseudocount_1/log2_fold_changes_filtered_normalized.rds")
saveRDS(FC_DGD,"/faststorage/project/jsp_student_projects//miRNA_DGD_DE_F2024/data/filtered_by_paper/log2_fold_changes_filtered_normalized.rds")

### rescale DGD


Rescaling the negative binomials by the means of the raw counts


In [9]:
pred_means_DGD <- read_csv("/faststorage/project/jsp_student_projects/miRNA_DGD_DE_F2024/DGD/data/full_TCGA/decoder_outputs.csv")
obs_counts_DGD <- read_csv("/faststorage/project/jsp_student_projects/miRNA_DGD_DE_F2024/DGD/data/full_TCGA/samples_preprocessed.csv")

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m10682[39m [1mColumns: [22m[34m16884[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m     (1): ...1
[32mdbl[39m (16883): ENSG00000187634, ENSG00000188976, ENSG00000187961, ENSG00000187...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m10682[39m [1mColumns: [22m[34m16884[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m     (1): ...1
[32mdbl[39m (16883): ENSG00000187634, ENSG00000188976, ENSG00000187961, ENSG00000187...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specif

In [10]:
pred_means_DGD <- pred_means_DGD %>%
    column_to_rownames("...1")

obs_counts_DGD <- obs_counts_DGD %>%
    column_to_rownames("...1")

In [11]:
# Filter away the genes that we do not want
exp <- readRDS("/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/preprocessed/TCGA_counts/counts_filtered.rds")
dim(obs_counts_DGD)
dim(pred_means_DGD)
row_names <- intersect(rownames(exp), colnames(obs_counts_DGD))
obs_counts_DGD <- obs_counts_DGD[,row_names , drop = FALSE] 
row_names <- intersect(rownames(exp), colnames(pred_means_DGD))
pred_means_DGD <- pred_means_DGD[,row_names , drop = FALSE] 
dim(obs_counts_DGD)
dim(pred_means_DGD)

In [12]:
head(obs_counts_DGD)
dim(obs_counts_DGD)


Unnamed: 0_level_0,ENSG00000187634,ENSG00000188976,ENSG00000187961,ENSG00000187583,ENSG00000187642,ENSG00000188290,ENSG00000187608,ENSG00000188157,ENSG00000131591,ENSG00000162571,⋯,ENSG00000129824,ENSG00000067646,ENSG00000099715,ENSG00000114374,ENSG00000067048,ENSG00000183878,ENSG00000154620,ENSG00000165246,ENSG00000012817,ENSG00000198692
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-EW-A6SA-01A-21R-A32P-07,1413,9723,1310,216,84,972,834,26778,2022,69,⋯,7408,1192,1,846,6239,2542,9,600,239,93
TCGA-E2-A14W-01A-11R-A12D-07,1562,6455,299,489,249,69,538,4353,480,3,⋯,9090,800,2,1398,1813,558,105,791,130,55
TCGA-EW-A1PD-01A-11R-A144-07,912,7050,439,86,23,357,1281,15155,773,24,⋯,12622,855,7,2183,3703,1745,165,1072,3496,1612
TCGA-55-1594-01A-01R-0946-07,501,4238,459,29,13,134,1300,10850,553,52,⋯,1354,144,0,220,678,200,14,78,571,179
TCGA-49-6742-11A-01R-1858-07,321,1791,93,42,18,113,831,15436,175,396,⋯,9588,686,4,793,2766,783,100,288,892,842
TCGA-50-5932-11A-01R-1755-07,234,1280,80,19,7,68,759,9293,153,57,⋯,4081,573,4,683,2230,580,47,146,962,463


In [13]:
cat("nrow: ", nrow(obs_counts_DGD))
rownames(obs_counts_DGD)[1:5]

nrow:  10682

In [16]:
# find mean gene count of samples 
mean_obs <- as.data.frame(apply(obs_counts_DGD,1,mean))
dim(mean_obs)
# multiply that by each col to rescale
rescaled <- apply(pred_means_DGD,2,function(x)(x*mean_obs))
rescaled <- as.data.frame(rescaled)
colnames(rescaled) <- colnames(pred_means_DGD)

In [17]:
dim(rescaled)
head(rescaled)

Unnamed: 0_level_0,ENSG00000187634,ENSG00000188976,ENSG00000187961,ENSG00000187583,ENSG00000187642,ENSG00000188290,ENSG00000187608,ENSG00000188157,ENSG00000131591,ENSG00000162571,⋯,ENSG00000129824,ENSG00000067646,ENSG00000099715,ENSG00000114374,ENSG00000067048,ENSG00000183878,ENSG00000154620,ENSG00000165246,ENSG00000012817,ENSG00000198692
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TCGA-EW-A6SA-01A-21R-A32P-07,2310.3406,9139.063,1520.47262,793.73359,242.707389,1929.45474,1117.4473,19028.07,2114.8164,119.57313,⋯,21167.689,1321.7553,20.864318,4348.5882,10000.2374,1760.1786,268.40429,322.68874,7318.5681,2515.4001
TCGA-E2-A14W-01A-11R-A12D-07,1428.9973,4366.588,466.7564,515.01488,189.179295,397.27332,784.3496,9610.073,608.6834,48.46396,⋯,5561.191,184.07626,4.312737,417.6698,940.1876,196.7939,38.47986,103.97807,421.1055,374.1556
TCGA-EW-A1PD-01A-11R-A144-07,1116.3503,5540.462,583.56823,384.59477,83.637167,611.65731,933.7001,18154.421,925.8916,57.36299,⋯,8383.794,609.51948,18.114367,1387.1117,2730.0287,810.8627,75.31684,448.34581,1964.2439,733.4288
TCGA-55-1594-01A-01R-0946-07,1479.2856,4822.466,552.13725,529.11513,152.721794,267.74784,1295.8194,12008.591,642.7878,107.65976,⋯,6419.025,601.84567,13.059228,1134.4092,1923.8761,735.7088,143.75839,260.08426,1771.9658,1239.4983
TCGA-49-6742-11A-01R-1858-07,288.1781,1662.172,108.68155,46.63352,15.020845,136.90749,1052.495,7026.947,212.7491,719.74591,⋯,2684.682,91.99661,3.260468,188.865,396.7989,128.9595,16.07005,37.4735,208.1401,196.5468
TCGA-50-5932-11A-01R-1755-07,173.4054,1185.897,95.58689,19.48707,7.986891,72.20164,728.1915,5954.528,174.7308,55.77457,⋯,1794.643,144.90459,3.529298,268.0972,691.4012,170.7816,12.44671,44.51676,361.9491,142.6248


In [18]:
saveRDS(t(rescaled),"/home/elisalaegsgaard/jsp_student_projects/miRNA_DGD_DE_F2024/data/DGD_expression/DGD_rescaled_FINAL.rds")