# Isomatrix Tools

> Tools for converting isomatrix files into anndata objects for integration with the Scanpy ecosystem.


In [1]:
#| default_exp isomatrix_tools

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import pandas as pd
import scanpy as sc
from scanpy import AnnData
from scipy.sparse import csr_matrix
import warnings

def isomatrix_to_anndata(file_path:str,  # The path to the isomatrix csv file to be read.
                        sparse:bool=True  # Flag to determine if the output should be a sparse matrix.
) -> AnnData: # The converted isomatrix as a scanpy compatible  anndata object
    """
    This function converts an isomatrix txt file (SiCeLoRe output) into an AnnData object compatible with scanpy

    """
    
    # Read in the data from the file
    df = pd.read_csv(file_path, sep='\t', index_col=0)
    # Filter out rows where the transcriptId is "undef"
    df = df.loc[df["transcriptId"] != "undef"]
    
    df = df.reset_index()
    df = df.transpose()
    
    # Extract the rows with 'gene_id', 'transcript_id', 'nb_exons' from the DataFrame
    additional_info_rows = df.loc[df.index.intersection(['geneId', 'transcriptId', 'nbExons'])]
    # Drop 'gene_id', 'transcript_id', 'nb_exons' rows from the DataFrame if they exist
    df = df.drop(['geneId', 'transcriptId', 'nbExons'], errors='ignore')

    # Convert the DataFrame to a sparse matrix if the sparse flag is True
    if sparse:
        matrix = csr_matrix(df.values.astype('float32'))
    else:
        try:
            matrix = df.values.astype('float32')
        except ValueError:
            print("Error: Non-numeric data present in the DataFrame. Cannot convert to float.")
            return None
    
    # Convert the matrix to an AnnData object
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        anndata = sc.AnnData(X=matrix, obs=pd.DataFrame(index=df.index), var=pd.DataFrame(index=df.columns))
    
    # Add additional information to the AnnData object vars
    for info in ['geneId', 'transcriptId', 'nbExons']:
        if info in additional_info_rows.index:
            anndata.var[info] = additional_info_rows.loc[info, :].values
            if info == 'nbExons':
                anndata.var[info] = anndata.var[info].astype('int32')
    
    return anndata

In [4]:
#| export
def download_test_data() -> str: #The absolute path of the extracted file 'sample_isomatrix.txt' if the download is successful.
    """
    This function downloads a test data file from a specified URL, saves it locally, and extracts it.
    """
    import urllib.request
    import gzip
    import shutil
    import os

    # URL of the file to be downloaded
    url = "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3748nnn/GSM3748087/suppl/GSM3748087%5F190c.isoforms.matrix.txt.gz"

    # Download the file from `url` and save it locally under `file.txt.gz`:
    urllib.request.urlretrieve(url, 'file.txt.gz')

    # Check if the file is downloaded correctly
    if os.path.exists('file.txt.gz'):
        print("File downloaded successfully")
        # Now we need to extract the file
        with gzip.open('file.txt.gz', 'rb') as f_in:
            with open('sample_isomatrix.txt', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        print("File extracted successfully")
        return os.path.abspath('sample_isomatrix.txt')
    else:
        print("Failed to download the file")
        return None


Example usage of `isomatrix_to_anndata`: We can use the `download_test_data` function to download a small isoform matrix dataset for demonstrating the functionality.


In [5]:
from longreadtools.isomatrix_tools import * 
test_file = download_test_data() 

File downloaded successfully
File extracted successfully


In [6]:
anndata = isomatrix_to_anndata(test_file)

Lets take a look at the anndata object generated from the isomatrix.

In [24]:
anndata.var

Unnamed: 0,geneId,transcriptId
0,Klc2,ENSMUST00000156717.1
1,Capn15,ENSMUST00000212520.1
2,Klc2,ENSMUST00000025798.12
3,Eva1c,ENSMUST00000231280.1
4,Atg5,ENSMUST00000039286.4
...,...,...
20829,Kcnj9,ENSMUST00000062387.7
20830,Iqcg,ENSMUST00000115100.8
20831,Nt5dc2,ENSMUST00000227096.1
20832,Emg1,ENSMUST00000004379.7


In [8]:
#| hide
def test_isomatrix_to_anndata():
    # Test with a known file
    test_file = download_test_data()
    anndata = isomatrix_to_anndata(test_file)

    # Check the type of the returned object
    assert isinstance(anndata, sc.AnnData), "The returned object is not an AnnData object."

    # Check the dimensions of the AnnData object
    assert anndata.shape == (190, 20834), "The dimensions of the AnnData object are not as expected."

    # Check the var names of the AnnData object
    assert 'geneId' in anndata.var, "The 'geneId' is not in the var of the AnnData object."
    assert 'transcriptId' in anndata.var, "The 'transcriptId' is not in the var of the AnnData object."




In [14]:
test_file = download_test_data()

File downloaded successfully
File extracted successfully


In [15]:
pd.read_csv(test_file,  sep='\t', index_col=0)

Unnamed: 0_level_0,transcriptId,CAACTAGAGCTGTTCA,CCACCTACAAAGTCAA,AACCATGAGACCACGA,GATGAGGTCGACCAGC,GCACTCTTCTATCGCC,TGCCCTAAGGCCCGTT,TTCGGTCAGCTGGAAC,GGATTACAGACGCAAC,GTCTCGTTCCTCTAGC,...,TCGGGACAGCCCAGCT,GTTACAGTCTCGATGA,ATCATGGAGAAGAAGC,CGCGTTTCAGCTGTAT,TTCTTAGAGATCTGCT,AAGACCTAGATTACCC,CTGAAGTAGGTAGCTG,TAGTGGTAGGCCCTTG,GATGAAAAGCAATATG,TTCTTAGTCTGTTGAG
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Klc2,ENSMUST00000156717.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Capn15,ENSMUST00000212520.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Klc2,ENSMUST00000025798.12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Eva1c,ENSMUST00000231280.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Atg5,ENSMUST00000039286.4,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Kcnj9,ENSMUST00000062387.7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Iqcg,ENSMUST00000115100.8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Nt5dc2,ENSMUST00000227096.1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,7,0,7,0,0,0
Emg1,ENSMUST00000004379.7,0,0,3,0,0,0,2,1,1,...,1,0,0,0,1,1,4,2,2,1


Often, it may be necessary to convert more than one isomatrix in bulk. The function `multiple_isomatrix_conversion` has been designed for this purpose. It leverages Python's multiprocessing capabilities to perform this task in a fast and efficient manner.


In [27]:
# Adjust the simulate_isomatrix function to allow specifying the number of rows and columns
# and generate the sample IDs as part of the function parameters.

import numpy as np 
from pandas import DataFrame

def simulate_isomatrix(num_genes, # int, number of genes (groups of rows)
                       num_transcripts_per_gene, # int, number of transcripts per gene
                       num_samples, # int, number of samples (columns)
                       sparsity=0.95, # float, fraction of zeros in the data (default 0.95)
                       max_expression=100, # int, maximum expression level for any transcript in any sample
                       seed=0 # int, random seed for reproducibility
                      ) -> DataFrame : # DataFrame with simulated transcript expression data for demonstration purposes.
    """
    Simulate transcript expression data to match the structure of the first image provided by the user.
    Allows specifying the number of genes, transcripts per gene, and samples.
    """
    # Set random seed for reproducibility
    np.random.seed(seed)
    
    # Calculate total number of transcripts
    total_transcripts = num_genes * num_transcripts_per_gene
    
    # Generate random data
    data = np.random.rand(total_transcripts, num_samples)
    
    # Apply sparsity
    zero_mask = np.random.rand(total_transcripts, num_samples) > sparsity
    data[~zero_mask] = 0  # Set a fraction of data to 0 based on sparsity
    
    # Scale data to have values up to max_expression
    data = np.ceil(data * max_expression).astype(int)
    
    # Generate transcript and sample labels
    transcript_ids = [f"ENSMUST00000{str(i).zfill(6)}.1" for i in range(1, total_transcripts + 1)]
    gene_ids = [f"Gene_{(i // num_transcripts_per_gene) + 1}" for i in range(total_transcripts)]
    nb_exons = np.random.randint(1, 21, total_transcripts)  # Assuming 1-20 exons based on typical gene structures
    sample_ids = [f"CACCTACACGTCAAC{str(i).zfill(2)}" for i in range(1, num_samples + 1)]
    
    # Create DataFrame
    df = pd.DataFrame(data, index=gene_ids, columns=sample_ids)
    df.index.name = 'geneId'  # Add index name
    df.insert(0, 'transcriptId', transcript_ids)
    df.insert(2, 'nbExons', nb_exons)
    
    return df

# Example: Simulate data for 10 genes with 5 transcripts each, for 20 samples
simulated_data = simulate_isomatrix(10, 5, 20)
simulated_data.head(10)  # Display the first 10 rows to show multiple transcripts per gene


ModuleNotFoundError: No module named 'np'

In [20]:
#| export
from multiprocessing import Pool
import os
from functools import partial

def multiple_isomatrix_conversion(file_paths: list, # A list of file paths to be converted.
                                  verbose: bool = False # If True, print progress messages.
                                  ):
    """
    This function takes a list of file paths, converts each file from isomatrix to anndata format, 
    and saves the converted file in the same location with the same name but with a .h5ad extension.
    """
    
    def convert_and_save_file(sample, verbose):
        anndata = isomatrix_to_anndata(sample)
        anndata.write_h5ad(sample.replace('.txt', '.h5ad'))
        if verbose:
            print(f"File {sample.replace('.txt', '.h5ad')} was successfully written to disk.")

    with Pool() as p:
        p.map(partial(convert_and_save_file, verbose=verbose), file_paths)



Here is an example of how to use the function to convert several Isomatrix objects at once. The input is a list of paths to the Isomatrix text files.


In [11]:
multiple_isomatrix_conversion(individual_runs, verbose=True)

AttributeError: Can't pickle local object 'multiple_isomatrix_conversion.<locals>.convert_and_save_file'

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()