In [1]:
suppressPackageStartupMessages({
  library(biomaRt)
  library(dplyr)
  library(readr)
  library(readxl)
})

In [30]:
# Added an extra tab to the beginning of the file
s1_expr <- readr::read_tsv("../data/alberts2018elife_yeast/raw/SI_Data_01_expressionValues.txt")

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m1012[39m [1mColumns: [22m[34m5721[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m    (1): ...1
[32mdbl[39m (5720): YAL062W, YAL061W, YAL060W, YAL059W, YAL058W, YAL056W, YAL055W, Y...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [31]:
s1_expr <- s1_expr |>
  dplyr::rename(segregant_id = `...1`) |>
  dplyr::mutate(segregant_id = stringr::str_split_i(segregant_id, "-", 1)) |>  # Remove the uninformative part of the segregant id
  tibble::column_to_rownames("segregant_id") |>
  t() |>
  as.data.frame() |>
  tibble::rownames_to_column("gene_id")

In [32]:
readr::write_tsv(s1_expr, "../data/alberts2018elife_yeast/processed/alberts2018_expression_logtpm.tsv")

In [33]:
s2_cov <- readxl::read_excel("../data/alberts2018elife_yeast/raw/elife-35471-data2.xlsx")

In [34]:
s2_cov <- dplyr::rename(s2_cov, segregant_id = segregant, od_covariate = OD_covariate)

In [35]:
readr::write_tsv(s2_cov, "../data/alberts2018elife_yeast/processed/alberts2018_segregant_covariates.tsv")

In [13]:
# Added an extra tab to the beginning of the file
s3_genotype <- readr::read_tsv("../data/alberts2018elife_yeast/raw/SI_Data_03_genotypes.txt")

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m1012[39m [1mColumns: [22m[34m42053[39m
[36m──[39m [1mColumn specification[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m     (1): ...1
[32mdbl[39m (42052): chrI:33040_A/G, chrI:33048_A/C, chrI:33070_A/T, chrI:33077_G/A,...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [14]:
s3_genotype <- s3_genotype |>
  dplyr::rename(segregant_id = `...1`) |>
  tibble::column_to_rownames("segregant_id") |>
  t() |>
  as.data.frame() |>
  tibble::rownames_to_column("variant_id") |>
  dplyr::mutate(
    chr = stringr::str_extract(variant_id, "chr([IVX]+)\\:", group = 1),
    pos = as.numeric(stringr::str_extract(variant_id, "\\:([0-9]+)\\_", group = 1)),
    ref = stringr::str_extract(variant_id, "\\_([ATGC]+)/", group = 1),
    alt = stringr::str_extract(variant_id, "/([ATGC]+)$", group = 1),
    .after = variant_id
  ) |>
  dplyr::mutate(variant_id = gsub("[:/]", "_", variant_id))

In [15]:
head(s3_genotype)

Unnamed: 0_level_0,variant_id,chr,pos,ref,alt,A01_01,A01_02,A01_03,A01_04,A01_05,⋯,A11_86,A11_87,A11_88,A11_89,A11_91,A11_92,A11_93,A11_94,A11_95,A11_96
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,chrI_33040_A_G,I,33040,A,G,1,-1,1,1,-1,⋯,1,1,1,1,1,1,-1,-1,1,-1
2,chrI_33048_A_C,I,33048,A,C,1,-1,1,1,-1,⋯,1,1,1,1,1,1,-1,-1,1,-1
3,chrI_33070_A_T,I,33070,A,T,1,-1,1,1,-1,⋯,1,1,1,1,1,1,-1,-1,1,-1
4,chrI_33077_G_A,I,33077,G,A,1,-1,1,1,-1,⋯,1,1,1,1,1,1,-1,-1,1,-1
5,chrI_33147_G_T,I,33147,G,T,1,-1,1,1,-1,⋯,1,1,1,1,1,1,-1,-1,1,-1
6,chrI_33152_T_C,I,33152,T,C,1,-1,1,1,-1,⋯,1,1,1,1,1,1,-1,-1,1,-1


In [16]:
readr::write_tsv(s3_genotype, "../data/alberts2018elife_yeast/processed/alberts2018_genotypes.tsv")

In [12]:
s6_genes <- readxl::read_excel("../data/alberts2018elife_yeast/raw/elife-35471-data6.xlsx")

In [13]:
bm_scerevisiae <- biomaRt::getBM(
  attributes = c("ensembl_gene_id", "external_gene_name", "chromosome_name", "strand", "transcription_start_site"), 
  mart = biomaRt::useDataset(
    dataset = "scerevisiae_eg_gene", 
    mart = useMart(biomart = "fungi_mart", host = "https://fungi.ensembl.org")
  )
)

In [14]:
s6_genes <- s6_genes |>
  dplyr::select(gene_id = gene, essential, is_tf = isTF) |>
  dplyr::inner_join(bm_scerevisiae, dplyr::join_by(gene_id == ensembl_gene_id)) |>
  dplyr::relocate(c(essential, is_tf), .after = transcription_start_site)

In [16]:
readr::write_tsv(s6_genes, "../data/alberts2018elife_yeast/processed/alberts2018_genes.tsv")