# Isomatrix Tools

> Tools for converting isomatrix files into anndata objects for integration with the Scanpy ecosystem.


In [6]:
#| default_exp isomatrix_tools

In [7]:
#| hide
from nbdev.showdoc import *

In [8]:
#| export
import pandas as pd
import scanpy as sc
from scanpy import AnnData
from scipy.sparse import csr_matrix

def isomatrix_to_anndata(file_path:str,  # The path to the tab-separated isomatrix file to be read.
                        sparse:bool=True  # Flag to determine if the output should be a sparse matrix.
) -> AnnData: # The converted isomatrix as a scanpy compatible anndata object
    """
    Read an isomatrix file and convert it into an AnnData object.

    The file is parsed as tab-separated text. Rows whose ``transcriptId`` is
    "undef" are dropped. The first three columns of the table (stored as
    ``gene_id``, ``transcript_id`` and ``nb_exons``) become per-variable
    annotations; every remaining column becomes one observation, named after
    its column header.

    Returns:
    anndata (AnnData): Observations x variables matrix of float32 values
    (CSR sparse when `sparse` is True, dense otherwise). Variables are named
    "<first column>_<second column>" and `var` carries the columns `gene_id`,
    `transcript_id` and `nb_exons` (int32).

    Raises:
    KeyError: If the file has no ``transcriptId`` column.
    """

    # Read in the data from the file; the first column is parsed as the index
    # and turned back into a regular column by reset_index() below.
    table = pd.read_csv(file_path, sep='\t', index_col=0)

    # Filter out rows where the transcriptId is "undef"
    table = table.loc[table["transcriptId"] != "undef"].reset_index()

    # Separate the three annotation columns from the count columns *before*
    # transposing. Transposing the whole mixed-type table would force every
    # count into a Python object, which is very slow and memory hungry for
    # matrices with thousands of cells and tens of thousands of transcripts.
    annotations = table.iloc[:, :3]
    counts = table.iloc[:, 3:]

    gene_ids = annotations.iloc[:, 0]
    transcript_ids = annotations.iloc[:, 1]
    exon_counts = annotations.iloc[:, 2]

    # Variable names combine the first two annotation columns
    var_names = pd.Index(gene_ids.astype(str) + "_" + transcript_ids.astype(str))

    # Cast to float32 first, then transpose so rows are observations
    values = counts.to_numpy(dtype='float32').T

    # Convert to a sparse matrix if the sparse flag is True
    matrix = csr_matrix(values) if sparse else values

    # Convert the matrix to an AnnData object
    anndata = sc.AnnData(X=matrix, obs=pd.DataFrame(index=counts.columns), var=pd.DataFrame(index=var_names))

    # Add additional information to the AnnData object vars
    anndata.var['gene_id'] = gene_ids.to_numpy()
    anndata.var['transcript_id'] = transcript_ids.to_numpy()
    anndata.var['nb_exons'] = exon_counts.to_numpy().astype('int32')

    return anndata

In [9]:

def test_isomatrix_to_anndata():
    """
    Check isomatrix_to_anndata against a known isomatrix file.

    Loads the reference file as a dense matrix and verifies the result type,
    its shape, the first five variable and observation names, and the
    annotation columns stored in ``var``.
    """
    # Reference input and the values it is expected to produce
    isomatrix_path = "/data/analysis/data_mcandrew/000-sclr-discovair/D498_BIOP_INT/D498_BIOP_INT_isomatrix.txt"
    expected_shape = (3494, 52277)
    leading_var_names = [
        'CYP4F12_ENST00000548501',
        'CALCB_ENST00000324229',
        'MYOF_ENST00000371489',
        'SLC27A3_ENST00000368659',
        'TMEM161B-AS1_ENST00000669353',
    ]
    leading_obs_names = [
        'AGGAAATGTACAAGCG',
        'GCCATTCGTCGGAACA',
        'TCGACCTCAGTGTGCC',
        'CGTAGTATCAGTGTGT',
        'GCCAGGTGTCTAACTG',
    ]
    var_annotation_columns = ['gene_id', 'transcript_id', 'nb_exons']

    # Convert the reference file without building a sparse matrix
    result = isomatrix_to_anndata(isomatrix_path, sparse=False)

    # Type and dimensions
    assert isinstance(result, sc.AnnData), "The returned object is not an instance of AnnData."
    assert result.shape == expected_shape, "The shape of the AnnData object is not as expected."

    # Leading variable and observation names
    assert result.var_names[:5].tolist() == leading_var_names, "The var names of the AnnData object are not as expected."
    assert result.obs_names[:5].tolist() == leading_obs_names, "The obs names of the AnnData object are not as expected."

    # Annotation columns attached to the variables
    assert result.var.columns.tolist() == var_annotation_columns, "The additional vars of the AnnData object are not as expected."

# Run the test right away so this cell fails if any assertion does not hold.
test_isomatrix_to_anndata()

In [10]:
#| hide
# Export the cells marked for export into the package module.
import nbdev
nbdev.nbdev_export()

AssertionError: /data/analysis/data_mcandrew/longreadtools/longreadtools/tests.py does not exist