In [1]:
import os
# Request 50 threads for MKL, numexpr and OpenMP through their environment
# variables. They are assigned before numpy/scanpy/scvelo are imported below,
# presumably so the libraries read the limits when they initialise (TODO confirm).
os.environ["MKL_NUM_THREADS"] = "50"
os.environ["NUMEXPR_NUM_THREADS"] = "50"
os.environ["OMP_NUM_THREADS"] = "50"

import scanpy as sc
import scvelo as scv
import pandas as pd
import numpy as np
import scanpy.external as sce

%matplotlib inline
from matplotlib import pyplot as plt

# Show complete DataFrames when they are displayed (no row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
sc.set_figure_params(dpi = 150, dpi_save = 150, format = 'png')
# Set the verbosity on scanpy's global settings object. Instantiating a fresh
# ScanpyConfig(verbosity=0) only builds a throwaway object (its repr used to be
# this cell's output) and leaves the active settings untouched.
sc.settings.verbosity = 0

#scv.settings.presenter_view = True  # set max width size for presenter view
#scv.set_figure_params('scvelo')

In [2]:
import anndata
from scipy import sparse

def _read_velocyto_layer(mtx_path):
    """Read one Velocyto MatrixMarket file in a single pass and transpose it.

    Leading ``%`` comment lines are skipped, the size line supplies the matrix
    shape and entry count, and the remaining ``row col value`` triplets are
    loaded with numpy.

    Parameters
    ----------
    mtx_path : str
        Path to a coordinate-format ``.mtx`` file with 1-based indices.

    Returns
    -------
    scipy sparse matrix
        The transposed matrix (file columns become rows) with float64 values.

    Raises
    ------
    ValueError
        If no valid size line follows the comment header.
    """
    with open(mtx_path) as handle:
        line = handle.readline()
        while line.startswith('%'):
            line = handle.readline()
        n_rows, n_cols, n_entries = (int(field) for field in line.split()[:3])
        if n_entries == 0:
            # Nothing to parse; keep the (n, 3) layout so the slicing below works.
            entries = np.empty((0, 3))
        else:
            # ndmin=2 keeps a file with a single entry two-dimensional.
            entries = np.loadtxt(handle, ndmin=2)

    values = entries[:, 2]
    # Subtract 1 because the file is 1-based while csr_matrix expects 0-based
    # integer indices.
    row_idx = entries[:, 0].astype(int) - 1
    col_idx = entries[:, 1].astype(int) - 1
    layer = sparse.csr_matrix((values, (row_idx, col_idx)), shape=(n_rows, n_cols))
    return layer.transpose()


def buildAnndataFromStarCurr(path):
    """Generate an anndata object from the STAR aligner output folder.

    Parameters
    ----------
    path : str
        Folder containing the ``Gene/raw`` and ``Velocyto/raw`` sub-folders.
        A trailing path separator is accepted but no longer required.

    Returns
    -------
    anndata.AnnData
        Copy of an object whose ``X`` holds the transposed ``Gene/raw`` counts
        and whose layers ``spliced``, ``unspliced`` and ``ambiguous`` hold the
        transposed Velocyto matrices, so that cells are rows and genes are
        columns as expected by AnnData objects.
    """
    gene_dir = os.path.join(path, 'Gene', 'raw')
    velocyto_dir = os.path.join(path, 'Velocyto', 'raw')

    # Load read counts and transpose them to cells x genes.
    X = sc.read_mtx(os.path.join(gene_dir, 'matrix.mtx')).X.transpose()

    # Load the 3 matrices containing spliced, unspliced and ambiguous reads.
    spliced = _read_velocyto_layer(os.path.join(velocyto_dir, 'spliced.mtx'))
    unspliced = _read_velocyto_layer(os.path.join(velocyto_dir, 'unspliced.mtx'))
    ambiguous = _read_velocyto_layer(os.path.join(velocyto_dir, 'ambiguous.mtx'))

    # Load cell identifiers; the single column becomes the index.
    obs = pd.read_csv(os.path.join(velocyto_dir, 'barcodes.tsv'),
                      header = None, index_col = 0)

    # Remove index column name to make it compliant with the anndata format
    obs.index.name = None

    # Load gene identifiers; the second column of the file becomes the index.
    var = pd.read_csv(os.path.join(velocyto_dir, 'features.tsv'), sep='\t',
                      names = ('gene_ids', 'feature_types'), index_col = 1)

    # Build AnnData object to be used with ScanPy and ScVelo
    adata = anndata.AnnData(X = X, obs = obs, var = var,
                            layers = {'spliced': spliced, 'unspliced': unspliced, 'ambiguous': ambiguous})
    adata.var_names_make_unique()

    # Subset Cells based on STAR filtering
    #selected_barcodes = pd.read_csv(path+'Gene/filtered/barcodes.tsv', header = None)
    #adata = adata[selected_barcodes[0]]

    return adata.copy()

In [3]:
# Build the AnnData object (counts plus spliced/unspliced/ambiguous layers)
# from the Solo.out folder of the d9dv run.
d9dv_velocity = buildAnndataFromStarCurr("/nfsdata/data/data-runs/mistr_starsolo/d9dv/Solo.out/")

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [6]:
# Load the previously saved d9dv AnnData object and display its summary.
d9dv = sc.read_h5ad("/home/kgr851/seurat_data_for_python/velocity/adata/raw/scvi/dv_mistr/new/d9dv/d9dv.h5ad")
d9dv

AnnData object with n_obs × n_vars = 6437 × 19011
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'percent.mt', 'S.Score', 'G2M.Score', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.4', 'seurat_clusters', 'day', 'model', 'SCT_snn_res.0.5', 'tissue', 'source'
    var: 'features'
    obsm: 'X_umap'

In [7]:
# buildAnndataFromStarCurr already calls var_names_make_unique() before
# returning, so this repeat call looks redundant (harmless) — TODO confirm.
d9dv_velocity.var_names_make_unique()
# Read the object from disk again; d9dv is rebound further down, so presumably
# this reload is what keeps the cell re-runnable.
d9dv = sc.read_h5ad("/home/kgr851/seurat_data_for_python/velocity/adata/raw/scvi/dv_mistr/new/d9dv/d9dv.h5ad")
# Gene names of the loaded object; they are flagged as 'highly_variable' below.
hvgs = d9dv.var_names
metadata = d9dv.obs.copy()
# Replace d9dv with the AnnData built from its .raw slot, then restore the
# per-cell metadata saved above.
d9dv = d9dv.raw.to_adata()
d9dv.obs = metadata
# Use the '_index' column of var as the gene names and drop the column
# (presumably it holds the feature names of the raw slot — TODO confirm).
d9dv.var_names = d9dv.var['_index']
del d9dv.var['_index']
# Mark every gene as not highly variable, then flag the genes collected in hvgs.
d9dv.var['highly_variable'] = False
d9dv.var.loc[hvgs, 'highly_variable'] = True
# Merge the expression object with the velocity object and display the result.
merged = scv.utils.merge(d9dv, d9dv_velocity)
merged

AnnData object with n_obs × n_vars = 6437 × 18441
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'percent.mt', 'S.Score', 'G2M.Score', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.4', 'seurat_clusters', 'day', 'model', 'SCT_snn_res.0.5', 'tissue', 'source', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size'
    var: 'highly_variable', 'gene_ids', 'feature_types'
    obsm: 'X_umap'
    layers: 'spliced', 'unspliced', 'ambiguous'

In [9]:
# Write the merged object to disk.
# NOTE(review): this path contains 'dv_mistr//d9dv', whereas the input above was
# read from 'dv_mistr/new/d9dv' — confirm the missing 'new' component is intended.
merged.write("/home/kgr851/seurat_data_for_python/velocity/adata/raw/scvi/dv_mistr//d9dv/d9dv.velocity.raw.h5ad")