In [1]:
#| hide
from longreadtools.core import *

# longreadtools

> A working repository for the development of tools for the analysis of long-read RNA sequencing data.

## Install

```sh
pip install git+https://github.com/cobioda/longreadtools.git

```

## How to use

Here we use the `isomatrix_tools` module to convert our isomatrix `.txt` files in bulk into `anndata` objects with the `multiple_isomatrix_conversion` function. Then we use the `concatenate_anndata` function to generate a single concatenated `anndata` object for downstream analysis.


In [2]:
from longreadtools.isomatrix_tools import *

In [3]:

# Importing required libraries
import os
import re

# Root directory that contains one sub-directory per run
directory = '/data/analysis/data_mcandrew/000-sclr-discovair/'

# Run directories are named <sample>_BIOP_INT or <sample>_BIOP_NAS.
# The underscore is required in front of BIOP for both suffixes; the previous
# pattern omitted it for the NAS alternative and so also accepted names such
# as 'xBIOP_NAS'. A raw string keeps the regex free of escape surprises.
pattern = re.compile(r'.*_BIOP_(INT|NAS)$')

# Getting a list of all entries in the directory
all_files = os.listdir(directory)

# Keeping only the entries whose name matches the pattern, as full paths
matching_files = [os.path.join(directory, f) for f in all_files if pattern.match(f)]

# Printing the list of matching run directories
print(matching_files)

# Path of the isomatrix file inside each run directory:
#   <directory>/<run>/<run>_isomatrix.txt
isomatrix_paths = [os.path.join(run, f'{os.path.basename(run)}_isomatrix.txt') for run in matching_files]

# NOTE(review): `individual_runs` is not used by any cell shown here and points
# at <directory>/<run>_isomatrix.txt, which is NOT the nested path stored in
# `isomatrix_paths`. It is kept (with its original value) only in case a later
# cell relies on the name; remove it if nothing does.
individual_runs = [f'{run}_isomatrix.txt' for run in matching_files]


['/data/analysis/data_mcandrew/000-sclr-discovair/D498_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D492_BIOP_NAS', '/data/analysis/data_mcandrew/000-sclr-discovair/D494_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D500_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D494_BIOP_NAS', '/data/analysis/data_mcandrew/000-sclr-discovair/D496_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D499_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D493_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D493_BIOP_NAS', '/data/analysis/data_mcandrew/000-sclr-discovair/D534_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D490_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D500_BIOP_NAS', '/data/analysis/data_mcandrew/000-sclr-discovair/D495_BIOP_INT', '/data/analysis/data_mcandrew/000-sclr-discovair/D492_BIOP_INT']


In [4]:
# Convert every isomatrix .txt file in bulk; with return_paths=True the call
# hands back the paths of the .h5ad files it wrote (see the output below).
converted_isomatrix_paths = multiple_isomatrix_conversion(
    isomatrix_paths,
    verbose=True,
    return_paths=True,
)


File /data/analysis/data_mcandrew/000-sclr-discovair/D498_BIOP_INT/D498_BIOP_INT_isomatrix.h5ad was successfully written to disk.
File /data/analysis/data_mcandrew/000-sclr-discovair/D500_BIOP_NAS/D500_BIOP_NAS_isomatrix.h5ad was successfully written to disk.
File /data/analysis/data_mcandrew/000-sclr-discovair/D500_BIOP_INT/D500_BIOP_INT_isomatrix.h5ad was successfully written to disk.
File /data/analysis/data_mcandrew/000-sclr-discovair/D493_BIOP_NAS/D493_BIOP_NAS_isomatrix.h5ad was successfully written to disk.
File /data/analysis/data_mcandrew/000-sclr-discovair/D494_BIOP_NAS/D494_BIOP_NAS_isomatrix.h5ad was successfully written to disk.
File /data/analysis/data_mcandrew/000-sclr-discovair/D493_BIOP_INT/D493_BIOP_INT_isomatrix.h5ad was successfully written to disk.
File /data/analysis/data_mcandrew/000-sclr-discovair/D499_BIOP_INT/D499_BIOP_INT_isomatrix.h5ad was successfully written to disk.
File /data/analysis/data_mcandrew/000-sclr-discovair/D494_BIOP_INT/D494_BIOP_INT_isomatrix

In [5]:
# Display the .h5ad paths returned by the bulk conversion
converted_isomatrix_paths

['/data/analysis/data_mcandrew/000-sclr-discovair/D498_BIOP_INT/D498_BIOP_INT_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D492_BIOP_NAS/D492_BIOP_NAS_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D494_BIOP_INT/D494_BIOP_INT_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D500_BIOP_INT/D500_BIOP_INT_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D494_BIOP_NAS/D494_BIOP_NAS_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D496_BIOP_INT/D496_BIOP_INT_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D499_BIOP_INT/D499_BIOP_INT_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D493_BIOP_INT/D493_BIOP_INT_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D493_BIOP_NAS/D493_BIOP_NAS_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/D534_BIOP_INT/D534_BIOP_INT_isomatrix.h5ad',
 '/data/analysis/data_mcandrew/000-sclr-discovair/

In [6]:
# Combine the converted per-run .h5ad files into a single AnnData object.
# The progress bar reports that features are standardized "via union" across the files.
# NOTE(review): the run emits a duplicate-"obs"-names warning — presumably the
# same barcodes occur in several runs; confirm whether obs names need to be made unique.
andata_concat = concatenate_anndata(converted_isomatrix_paths)

Standardizing anndata features via union: 100%|██████████| 14/14 [09:10<00:00, 39.34s/it]
  utils.warn_names_duplicates("obs")


In [7]:
# Display a summary of the concatenated AnnData object (dimensions, obs and var columns)
andata_concat

AnnData object with n_obs × n_vars = 122872 × 89177
    obs: 'batch'
    var: 'geneId', 'transcriptId', 'nbExons'

In [8]:
# Inspect the variable annotation table (geneId, transcriptId, nbExons) of the concatenated object
andata_concat.var

Unnamed: 0_level_0,geneId,transcriptId,nbExons
transcriptId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENST00000548501,CYP4F12,ENST00000548501,4
ENST00000324229,CALCB,ENST00000324229,5
ENST00000371489,MYOF,ENST00000371489,15
ENST00000368659,SLC27A3,ENST00000368659,2
ENST00000669353,TMEM161B-AS1,ENST00000669353,4
...,...,...,...
ENST00000597528,ZNF274,ENST00000597528,3
ENST00000624896,PCDHB14,ENST00000624896,2
ENST00000453216,TRAM2-AS1,ENST00000453216,3
ENST00000589481,ZNF532,ENST00000589481,4
