# Comparison with the R version of `tximport`

This file is included to provide the code used to generate the output from `tximport` that the test in `test_correctness.py` compares against. It will not be run automatically by `pytest`, and the assertions provided at the end are redundant, since they are already included in `test_correctness.py`.

In [1]:
%load_ext rpy2.ipython

In [2]:
%%R
# Print the version string of the R installation this notebook runs against.
R.version.string

[1] "R version 4.3.1 (2023-06-16)"


In [3]:
%%R
# Generate the reference gene-level count tables with the R `tximport` package,
# one CSV per `countsFromAbundance` setting.
library(tximport)
library(readr)

# Directory holding the salmon quantification files and the transcript-to-gene
# mapping; the resulting count tables are written here as well.
dir <- "./data/fabry_disease"

# The mapping is parsed as tab-delimited via `read_tsv()` even though the file
# carries a `.csv` extension.
tx2gene <- read_tsv(file.path(dir, "transcript_gene_mapping_human.csv"))

# The four salmon quantification files that are imported together.
files <- file.path(
  dir,
  c(
    "SRR16504309_wt.sf",
    "SRR16504310_wt.sf",
    "SRR16504311_ko.sf",
    "SRR16504312_ko.sf"
  )
)

# Every value of `countsFromAbundance` for which a table is produced. The value
# is also used verbatim as the suffix of the output file name.
countsFromAbundanceOptions <- c("no", "scaledTPM", "lengthScaledTPM")

for (idx in seq_along(countsFromAbundanceOptions)) {
  # NOTE(review): per the tximport documentation, `ignoreTxVersion` and
  # `ignoreAfterBar` trim the transcript identifiers before they are matched
  # against `tx2gene` -- confirm against the installed tximport version.
  txi <- tximport(
    files,
    type = "salmon",
    tx2gene = tx2gene,
    countsFromAbundance = countsFromAbundanceOptions[idx],
    ignoreTxVersion = TRUE,
    ignoreAfterBar = TRUE
  )
  print(idx)

  # Build the file name directly rather than patching a template path with a
  # regular expression: an unescaped "." in a pattern matches any character,
  # and indexing a character vector never yields NULL, so a NULL guard around
  # the suffix would always pass.
  writePath <- file.path(
    dir,
    paste0("counts_tximport_", countsFromAbundanceOptions[idx], ".csv")
  )
  write.csv(txi$counts, writePath)
}

Rows: 244191 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): transcript_id, gene_id

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] 1
[1] 2
[1] 3


reading in files with read_tsv
1 2 3 4 
transcripts missing from tx2gene: 31380
summarizing abundance
summarizing counts
summarizing length
reading in files with read_tsv
1 2 3 4 
transcripts missing from tx2gene: 31380
summarizing abundance
summarizing counts
summarizing length
reading in files with read_tsv
1 2 3 4 
transcripts missing from tx2gene: 31380
summarizing abundance
summarizing counts
summarizing length


In [4]:
# Run the `pytximport` command-line tool three times on the same four salmon
# quantification files and the same transcript-to-gene mapping file. Each run
# writes its own CSV (`-o`): `counts_pytximport_no.csv`,
# `counts_pytximport_scaledTPM.csv` and `counts_pytximport_lengthScaledTPM.csv`.
# The first run passes no `-c` option; the second and third pass
# `-c scaled_tpm` and `-c length_scaled_tpm` respectively.
# NOTE(review): `-c` presumably selects the counts-from-abundance mode and the
# first run relies on its default -- confirm against `pytximport --help`.
!pytximport -i ./data/fabry_disease/SRR16504309_wt.sf -i ./data/fabry_disease/SRR16504310_wt.sf -i ./data/fabry_disease/SRR16504311_ko.sf -i ./data/fabry_disease/SRR16504312_ko.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/fabry_disease/counts_pytximport_no.csv
!pytximport -i ./data/fabry_disease/SRR16504309_wt.sf -i ./data/fabry_disease/SRR16504310_wt.sf -i ./data/fabry_disease/SRR16504311_ko.sf -i ./data/fabry_disease/SRR16504312_ko.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/fabry_disease/counts_pytximport_scaledTPM.csv -c scaled_tpm
!pytximport -i ./data/fabry_disease/SRR16504309_wt.sf -i ./data/fabry_disease/SRR16504310_wt.sf -i ./data/fabry_disease/SRR16504311_ko.sf -i ./data/fabry_disease/SRR16504312_ko.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/fabry_disease/counts_pytximport_lengthScaledTPM.csv -c length_scaled_tpm

2024-06-04 19:00:50,027: Starting the import.
Reading quantification files: 4it [00:01,  2.84it/s]
2024-06-04 19:00:51,561: Converting transcript-level expression to gene-level expression.
2024-06-04 19:00:51,981: Not all transcripts are present in the mapping. 31380 out of 253181 missing.
2024-06-04 19:00:52,335: Matching gene_ids.
2024-06-04 19:00:52,542: Creating gene abundance.
2024-06-04 19:00:52,820: Creating gene counts.
2024-06-04 19:00:52,936: Creating lengths.
2024-06-04 19:00:53,089: Replacing missing lengths.
2024-06-04 19:00:58,156: Creating gene expression dataset.
2024-06-04 19:00:58,193: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.
2024-06-04 19:00:58,273: Finished the import in 8.25 seconds.
2024-06-04 19:00:59,769: Starting the import.
Reading quantification files: 4it [00:01,  2.66it/s]
2024-06-04 19:01:01,409: Converting transcript-level expression to gene-level expression.
2024-06-04 19:01:01,837: Not all transcripts are present

In [5]:
import pandas as pd

def _read_counts(tool):
    """Load the three count tables written by ``tool``.

    The tables are returned in the order ``no``, ``scaledTPM``,
    ``lengthScaledTPM``, matching the suffixes of the CSV file names.
    """
    return tuple(
        pd.read_csv(f"./data/fabry_disease/counts_{tool}_{mode}.csv")
        for mode in ("no", "scaledTPM", "lengthScaledTPM")
    )


# Tables produced by the R package.
(
    counts_tximport_no,
    counts_tximport_scaledTPM,
    counts_tximport_lengthScaledTPM,
) = _read_counts("tximport")

# Tables produced by the Python command-line tool.
(
    counts_pytximport_no,
    counts_pytximport_scaledTPM,
    counts_pytximport_lengthScaledTPM,
) = _read_counts("pytximport")

_pairs = (
    (counts_tximport_no, counts_pytximport_no),
    (counts_tximport_scaledTPM, counts_pytximport_scaledTPM),
    (counts_tximport_lengthScaledTPM, counts_pytximport_lengthScaledTPM),
)

# Overwrite the pytximport column labels with the tximport ones so that the
# frame comparison below is not affected by differing header names.
for _expected, _actual in _pairs:
    _actual.columns = _expected.columns

# Each pytximport table has to equal its tximport counterpart.
for _expected, _actual in _pairs:
    pd.testing.assert_frame_equal(_expected, _actual)