# Comparison with the R version of `tximport`

This file is included to provide the code used to generate the output from `tximport` that the test in `test_correctness.py` compares against. It will not be run automatically by `pytest`, and the assertions provided at the end are redundant, since they are already included in `test_correctness.py`.

In [1]:
# Load the rpy2 IPython extension; the `%%R` cells in this notebook rely on it.
%load_ext rpy2.ipython

In [2]:
%%R
# Print the version string of the R installation that runs the `%%R` cells,
# so the reference output can be traced back to a specific R release.
R.version.string

[1] "R version 4.3.1 (2023-06-16)"


In [3]:
%%R
# Generate the gene-level reference counts with the R version of tximport:
# one CSV file per `countsFromAbundance` option, written next to the input data.
library(tximport)
library(readr)
dir <- "./data/fabry_disease"
# The mapping file carries a .csv extension but is read as tab-separated values.
tx2gene <- read_tsv(file.path(dir, "transcript_gene_mapping_human.csv"))
files <- c(
  file.path(dir, "SRR16504309_wt.sf"),
  file.path(dir, "SRR16504310_wt.sf"),
  file.path(dir, "SRR16504311_ko.sf"),
  file.path(dir, "SRR16504312_ko.sf")
)
countsFromAbundanceOptions <- c("no", "scaledTPM", "lengthScaledTPM")
for (option in countsFromAbundanceOptions) {
    txi <- tximport(
        files,
        type = "salmon",
        tx2gene = tx2gene,
        countsFromAbundance = option,
        ignoreTxVersion = TRUE,
        ignoreAfterBar = TRUE
    )
    # Build the output name directly instead of patching ".csv" with a regular
    # expression: an unescaped "." matches any character, and an element of a
    # character vector is never NULL, so the former guard was always TRUE.
    writePath <- file.path(dir, paste0("counts_tximport_", option, ".csv"))
    write.csv(txi$counts, writePath)
}

Rows: 244191 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): transcript_id, gene_id

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


reading in files with read_tsv
1 2 3 4 
transcripts missing from tx2gene: 31380
summarizing abundance
summarizing counts
summarizing length
reading in files with read_tsv
1 2 3 4 
transcripts missing from tx2gene: 31380
summarizing abundance
summarizing counts
summarizing length
reading in files with read_tsv
1 2 3 4 
transcripts missing from tx2gene: 31380
summarizing abundance
summarizing counts
summarizing length


In [4]:
# Run the pytximport command line tool on the same four salmon quantification
# files and the same transcript-to-gene mapping file that the R cell uses.
# Each call writes one CSV that is later compared with its tximport counterpart.
# NOTE(review): `-c` presumably selects the counts-from-abundance mode - confirm
# against the pytximport CLI help.

# No `-c` flag: output goes to counts_pytximport_no.csv.
!pytximport -i ./data/fabry_disease/SRR16504309_wt.sf -i ./data/fabry_disease/SRR16504310_wt.sf -i ./data/fabry_disease/SRR16504311_ko.sf -i ./data/fabry_disease/SRR16504312_ko.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/fabry_disease/counts_pytximport_no.csv
# `-c scaled_tpm`: output goes to counts_pytximport_scaledTPM.csv.
!pytximport -i ./data/fabry_disease/SRR16504309_wt.sf -i ./data/fabry_disease/SRR16504310_wt.sf -i ./data/fabry_disease/SRR16504311_ko.sf -i ./data/fabry_disease/SRR16504312_ko.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/fabry_disease/counts_pytximport_scaledTPM.csv -c scaled_tpm
# `-c length_scaled_tpm`: output goes to counts_pytximport_lengthScaledTPM.csv.
!pytximport -i ./data/fabry_disease/SRR16504309_wt.sf -i ./data/fabry_disease/SRR16504310_wt.sf -i ./data/fabry_disease/SRR16504311_ko.sf -i ./data/fabry_disease/SRR16504312_ko.sf -t salmon -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/fabry_disease/counts_pytximport_lengthScaledTPM.csv -c length_scaled_tpm

2024-06-11 17:08:34,425: Starting the import.
Reading quantification files: 4it [00:01,  3.45it/s]
2024-06-11 17:08:35,700: Converting transcript-level expression to gene-level expression.
2024-06-11 17:08:36,111: Not all transcripts are present in the mapping. 31380 out of 253181 missing.
2024-06-11 17:08:36,392: Matching gene_ids.
2024-06-11 17:08:36,550: Creating gene abundance.
2024-06-11 17:08:36,964: Creating gene counts.
2024-06-11 17:08:37,043: Creating lengths.
2024-06-11 17:08:37,164: Replacing missing lengths.
2024-06-11 17:08:42,420: Creating gene expression dataset.
2024-06-11 17:08:42,450: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.
2024-06-11 17:08:42,521: Finished the import in 8.10 seconds.
2024-06-11 17:08:44,060: Starting the import.
Reading quantification files: 4it [00:01,  3.54it/s]
2024-06-11 17:08:45,298: Converting transcript-level expression to gene-level expression.
2024-06-11 17:08:45,688: Not all transcripts are present

In [5]:
import pandas as pd

FABRY_DIR = "./data/fabry_disease"


def read_counts(tool: str, mode: str) -> pd.DataFrame:
    """Read the counts CSV written by `tool` for the given counts-from-abundance `mode`."""
    return pd.read_csv(f"{FABRY_DIR}/counts_{tool}_{mode}.csv")


# Reference tables produced by the R version of tximport.
counts_tximport_no = read_counts("tximport", "no")
counts_tximport_scaledTPM = read_counts("tximport", "scaledTPM")
counts_tximport_lengthScaledTPM = read_counts("tximport", "lengthScaledTPM")

# Tables produced by the pytximport command line tool.
counts_pytximport_no = read_counts("pytximport", "no")
counts_pytximport_scaledTPM = read_counts("pytximport", "scaledTPM")
counts_pytximport_lengthScaledTPM = read_counts("pytximport", "lengthScaledTPM")

count_pairs = (
    (counts_tximport_no, counts_pytximport_no),
    (counts_tximport_scaledTPM, counts_pytximport_scaledTPM),
    (counts_tximport_lengthScaledTPM, counts_pytximport_lengthScaledTPM),
)

# Column labels are not part of the comparison: every pytximport table takes
# over the labels of its tximport counterpart before the frames are compared.
for reference, candidate in count_pairs:
    candidate.columns = reference.columns

for reference, candidate in count_pairs:
    pd.testing.assert_frame_equal(reference, candidate)

## Compare outputs for transcript-level summarization

In [6]:
%%R
# Generate the transcript-level (txOut = TRUE) reference counts with the R
# version of tximport: one CSV file per `countsFromAbundance` option.
dir <- "./data/salmon"
files_protein_coding <- c(
  file.path(dir, "quant.sf")
)
# The mapping file lives in the fabry_disease directory; it carries a .csv
# extension but is read as tab-separated values.
tx2gene <- read_tsv(file.path("./data/fabry_disease", "transcript_gene_mapping_human.csv"))
countsFromAbundanceOptions <- c("scaledTPM", "dtuScaledTPM")
for (option in countsFromAbundanceOptions) {
    txi <- tximport(
        files_protein_coding,
        type = "salmon",
        tx2gene = tx2gene,
        txOut = TRUE,
        countsFromAbundance = option,
        ignoreTxVersion = TRUE,
        ignoreAfterBar = TRUE
    )
    # Build the output name directly instead of patching ".csv" with a regular
    # expression: an unescaped "." matches any character, and an element of a
    # character vector is never NULL, so the former guard was always TRUE.
    writePath <- file.path(dir, paste0("counts_tximport_", option, ".csv"))
    write.csv(txi$counts, writePath)
}

Rows: 244191 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): transcript_id, gene_id

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


reading in files with read_tsv
1 
reading in files with read_tsv
1 


In [7]:
# Run pytximport on the single salmon quantification file with `-tx` and
# `-c dtu_scaled_tpm`, writing counts_pytximport_dtuScaledTPM.csv.
# NOTE(review): `-tx` presumably requests transcript-level output (the
# counterpart of `txOut = TRUE` in the R cell) - confirm against the CLI help.
!pytximport -i ./data/salmon/quant.sf -m ./data/fabry_disease/transcript_gene_mapping_human.csv -o ./data/salmon/counts_pytximport_dtuScaledTPM.csv -t salmon -tx -c dtu_scaled_tpm

2024-06-11 17:09:03,452: Starting the import.
Reading quantification files: 1it [00:00, 295.94it/s]
2024-06-11 17:09:03,827: Setting counts to length scaled TPM.
2024-06-11 17:09:03,828: Saving the gene-level expression to: data/salmon/counts_pytximport_dtuScaledTPM.csv.
2024-06-11 17:09:03,830: Finished the import in 0.38 seconds.


In [8]:
# Load the dtuScaledTPM counts written by tximport and by pytximport, using the
# first CSV column as the index.
counts_tximport_dtuScaledTPM = pd.read_csv("./data/salmon/counts_tximport_dtuScaledTPM.csv", index_col=0)
counts_pytximport_dtuScaledTPM = pd.read_csv("./data/salmon/counts_pytximport_dtuScaledTPM.csv", index_col=0)

# cut the transcript version from the index *before* sorting, so that both
# frames are ordered by the same (unversioned) key; sorting on the versioned
# IDs first can yield a different row order than sorting on the stripped IDs
counts_tximport_dtuScaledTPM.index = counts_tximport_dtuScaledTPM.index.str.split(".").str[0]
counts_tximport_dtuScaledTPM = counts_tximport_dtuScaledTPM.sort_index()
counts_pytximport_dtuScaledTPM = counts_pytximport_dtuScaledTPM.sort_index()

# Column labels are not part of the comparison: the pytximport frame takes over
# the labels of the tximport frame.
counts_pytximport_dtuScaledTPM.columns = counts_tximport_dtuScaledTPM.columns

pd.testing.assert_frame_equal(counts_tximport_dtuScaledTPM, counts_pytximport_dtuScaledTPM)