In [1]:
library(OUTRIDER)
library(dplyr)

Loading required package: BiocParallel

Loading required package: GenomicFeatures

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

Loading required package: stats4


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: GenomeInfoDb

“package ‘GenomeInfoDb’ was built

In [3]:
# ---- Input locations ----
# Parent directory holding one OUTRIDER result folder per DROP group.
ods_parent_dir <- "/s/project/gtex_genetic_diagnosis/v8/processed_results/aberrant_expression/gencode34/outrider"
# Tab-separated per-sample annotation table.
sample_annotation_path <- "/s/project/gtex_genetic_diagnosis/v8/sample_annotation.tsv"
# Tab-separated table with DROP_GROUP and subtissue columns.
drop_group_map_path <- "/data/nasif12/home_if12/tsi/projects/rep_scripts/config/gtex_v8_old_dna/drop_groups.tsv"

# ---- Output locations ----
output_dir <- "/s/project/promoter_prediction/sex_analysis/bcv"
output_count_filename <- "gtexv8_rnaseq_counts.csv"
output_annot_filename <- "sample_annotation.csv"
# Alternative file names for a subsampled export:
# output_count_filename <- "gtexv8_rnaseq_subset_counts.csv"
# output_annot_filename <- "sample_subset_annotation.csv"

## Extract necessary sample annotation

In [4]:
# Read the DROP-group table and key its rows by DROP_GROUP so that it can be
# used as a lookup table.
drop_df <- drop_group_map_path %>%
  read.table(sep = "\t", header = TRUE, quote = "")
row.names(drop_df) <- drop_df$DROP_GROUP

In [5]:
# Load the sample annotation, keeping only the columns used downstream, and
# give them the names used in the rest of the notebook.
sample_annot <- read.table(
  sample_annotation_path,
  sep = "\t", header = TRUE, quote = ""
)[, c("RNA_ID", "INDIVIDUAL_ID", "SEX", "tissue_type", "TISSUE")] %>%
  # Recode sex: "Male" -> 0, "Female" -> 1; any other value is passed through
  # unchanged. Because the fallback is the original character column, the
  # result is typically a character vector ("0" / "1").
  mutate(SEX = ifelse(SEX == "Male", 0, ifelse(SEX == "Female", 1, SEX))) %>%
  dplyr::rename(
    sampleID = RNA_ID,
    individualID = INDIVIDUAL_ID,
    sex = SEX,
    tissue = tissue_type,
    DROP_GROUP = TISSUE
  )
rownames(sample_annot) <- sample_annot$sampleID

# Look up the subtissue of every sample through its DROP group. match() is
# used instead of indexing drop_df by character row names because data-frame
# row-name indexing partially matches, which can silently return the wrong row
# for a group that is missing from drop_df.
drop_group_idx <- match(sample_annot$DROP_GROUP, rownames(drop_df))
unmatched_groups <- unique(sample_annot$DROP_GROUP[is.na(drop_group_idx)])
if (length(unmatched_groups) > 0) {
  warning(
    "No subtissue found for DROP group(s): ",
    paste(unmatched_groups, collapse = ", "),
    call. = FALSE
  )
}
sample_annot$subtissue <- drop_df[drop_group_idx, "subtissue"]
head(sample_annot)

Unnamed: 0_level_0,sampleID,individualID,sex,tissue,DROP_GROUP,subtissue
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F,1,Uterus,Uterus,Uterus
GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,1,Adipose Tissue,Adipose_-_Subcutaneous,Adipose - Subcutaneous
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,1,Brain,Brain_-_Cortex,Brain - Cortex
GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F,1,Vagina,Vagina,Vagina
GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F,1,Adipose Tissue,Adipose_-_Visceral_Omentum,Adipose - Visceral (Omentum)
GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,1,Heart,Heart_-_Atrial_Appendage,Heart - Atrial Appendage


In [6]:
# Show the dimensions (rows, columns) of the annotation table.
dim(sample_annot)

In [7]:
# Number of distinct subtissue values in the annotation.
length(unique(sample_annot$subtissue))

In [8]:
# # Filter the data to keep tissues that have both sexes (male and female)
# sample_annot = sample_annot %>%
#   group_by(`subtissue`) %>%                             # Group by subtissue
#   filter(all(c(0, 1) %in% `sex`)) %>%      # Check for both sexes
#   ungroup()
# sample_annot$subtissue %>% unique %>% length

In [9]:
# sample_annot <- sample_annot %>%
#   group_by(`tissue`, `sex`) %>%   # Group by tissue and sex
#   filter(n() >= 50) %>%                # Keep only groups with at least 50 samples
#   ungroup()        
# sample_annot$subtissue %>% unique %>% length

In [10]:
# Distinct DROP groups present in the annotation, in order of first appearance.
drop_groups <- unique(sample_annot$DROP_GROUP)

In [11]:
# # Sample n rows from each tissue-sex combination
# sample_annot <- sample_annot %>%
#   group_by(`tissue`, `sex`) %>%                      # Group by tissue and sex
#   sample_n(20, replace = FALSE) %>%               # Take n samples from each group
#   ungroup()    
# dim(sample_annot)

In [12]:
# Export the annotation table (row names included, fields unquoted).
write.csv(
  sample_annot,
  file = file.path(output_dir, output_annot_filename),
  row.names = TRUE,
  quote = FALSE
)

## Extract OUTRIDER count table

In [13]:
# List the immediate sub-directories of the OUTRIDER parent directory and name
# each one by its folder name, so it can be looked up by DROP group.
rds_dirs <- list.dirs(path = ods_parent_dir, full.names = TRUE, recursive = FALSE)
names(rds_dirs) <- basename(rds_dirs)

# Fail early if a DROP group has no result directory; otherwise the lookup
# below would yield NA and produce the bogus path "NA/ods_unfitted.Rds".
missing_groups <- setdiff(drop_groups, names(rds_dirs))
if (length(missing_groups) > 0) {
  stop(
    "No OUTRIDER directory under '", ods_parent_dir, "' for DROP group(s): ",
    paste(missing_groups, collapse = ", "),
    call. = FALSE
  )
}

# Keep only the directories of the DROP groups in use, in that order.
rds_dirs <- rds_dirs[drop_groups]
rds_files <- paste0(rds_dirs, "/ods_unfitted.Rds")

# Report unreadable inputs here rather than midway through loading.
missing_files <- rds_files[!file.exists(rds_files)]
if (length(missing_files) > 0) {
  stop(
    "Missing OUTRIDER file(s): ",
    paste(missing_files, collapse = ", "),
    call. = FALSE
  )
}
rds_files

In [14]:
# Load every OUTRIDER dataset and extract its count matrix with counts().
mat_list <- lapply(rds_files, function(rds_path) {
  ods <- readRDS(rds_path)
  counts(ods)
})

In [15]:
# Show the dimensions of the first count matrix in mat_list.
dim(mat_list[[1]])

In [16]:
# Column-bind the per-group count matrices and transpose the result, so that
# the former columns (indexed by sample ID further down) become rows.
# cbind() joins matrices purely by row position, so the matrices are first
# aligned on the row names they all share. Without this, a differing gene
# order would silently misalign counts and a differing gene set would error.
combined_matrix <- local({
  stopifnot(length(mat_list) > 0)
  gene_ids <- lapply(mat_list, rownames)
  if (any(vapply(gene_ids, is.null, logical(1)))) {
    stop(
      "Every count matrix needs row names (gene IDs) to be combined safely.",
      call. = FALSE
    )
  }
  # Shared IDs, in the order of the first matrix.
  shared_genes <- Reduce(intersect, gene_ids)
  n_dropped <- length(unique(unlist(gene_ids))) - length(shared_genes)
  if (n_dropped > 0) {
    message(
      n_dropped,
      " gene(s) not present in every count matrix were dropped before combining."
    )
  }
  aligned <- lapply(mat_list, function(m) m[shared_genes, , drop = FALSE])
  t(do.call(cbind, aligned))
})

In [17]:
# Show the dimensions of the combined (transposed) count matrix.
dim(combined_matrix)

In [18]:
# Select and order the matrix rows to follow sample_annot$sampleID.
# Report missing samples explicitly instead of failing with
# "subscript out of bounds".
missing_samples <- setdiff(sample_annot$sampleID, rownames(combined_matrix))
if (length(missing_samples) > 0) {
  stop(
    length(missing_samples),
    " annotated sample(s) have no counts, e.g.: ",
    paste(head(missing_samples, 5), collapse = ", "),
    call. = FALSE
  )
}
# drop = FALSE keeps a matrix even when only one sample is selected.
combined_matrix <- combined_matrix[sample_annot$sampleID, , drop = FALSE]

In [19]:
# Show the dimensions again after selecting rows by sample_annot$sampleID.
dim(combined_matrix)

In [20]:
# Sanity check: count matrix rows whose ID is absent from the annotation.
# NOTE(review): the matrix rows were just selected by sample_annot$sampleID,
# so this is expected to be 0.
length(setdiff(rownames(combined_matrix), sample_annot$sampleID))

In [21]:
# Export the combined count matrix (row names included, fields unquoted).
write.csv(
  combined_matrix,
  file = file.path(output_dir, output_count_filename),
  row.names = TRUE,
  quote = FALSE
)