# SpaceRanger Data Extraction Code:

This notebook extracts a .csv file from the SpaceRanger `outs` folder of a sample. Running it on an `outs` folder writes a .csv file containing the gene expression counts for every spot barcode. *In combination with* the coordinates and spatial projection information in the **spatial** folder, this allows for easy statistical analysis of the gene expression data of a SpaceRanger sample.

In [4]:
import csv
import gzip
import os
import scipy.io
import pandas as pd
import numpy as np

Before running, the only thing that needs to be checked is that the file paths below point to the correct directory. Without a GPU, the process can take several minutes.

In [5]:
# define MEX directory (the filtered_feature_bc_matrix folder inside the sample's outs folder)
matrix_dir = "V1_62_human_outs/filtered_feature_bc_matrix"
# read in MEX format matrix as table
mat_filtered = scipy.io.mmread(os.path.join(matrix_dir, "matrix.mtx.gz"))

# features.tsv.gz is tab-separated with one row per feature. Parse it ONCE and
# inside a `with` block so the gzip handle is closed deterministically, instead
# of re-opening and re-decompressing the file for every column we need.
features_path = os.path.join(matrix_dir, "features.tsv.gz")
with gzip.open(features_path, mode="rt") as features_file:
    feature_rows = list(csv.reader(features_file, delimiter="\t"))

# list of transcript ids, e.g. 'ENSG00000187634'
feature_ids = [row[0] for row in feature_rows]

# list of gene names, e.g. 'SAMD11'
gene_names = [row[1] for row in feature_rows]

# list of feature_types, e.g. 'Gene Expression'
feature_types = [row[2] for row in feature_rows]

# list of barcodes, e.g. 'AAACATACAAAACG-1'
barcodes_path = os.path.join(matrix_dir, "barcodes.tsv.gz")
with gzip.open(barcodes_path, mode="rt") as barcodes_file:
    barcodes = [row[0] for row in csv.reader(barcodes_file, delimiter="\t")]

In [6]:
# Wrap the sparse count matrix in a pandas DataFrame and label its columns
# with the barcodes read earlier.
matrix = pd.DataFrame.sparse.from_spmatrix(mat_filtered)
matrix.columns = barcodes

# Prepend the per-feature annotation columns, in this left-to-right order.
annotation_columns = (
    ("feature_id", feature_ids),
    ("gene", gene_names),
    ("feature_type", feature_types),
)
for position, (label, values) in enumerate(annotation_columns):
    matrix.insert(loc=position, column=label, value=values)

# Show a (pandas-truncated) text preview of the labelled table.
print(matrix)

# Write the full table to disk; the resulting CSV will be a very large file.
matrix.to_csv(
    "V1_62_human_matrix.csv",
    index=False,
)

            feature_id             gene     feature_type  AAACAGCTTTCAGAAG-1   
0      ENSG00000243485      MIR1302-2HG  Gene Expression                   0  \
1      ENSG00000237613          FAM138A  Gene Expression                   0   
2      ENSG00000186092            OR4F5  Gene Expression                   0   
3      ENSG00000238009  ENSG00000238009  Gene Expression                   0   
4      ENSG00000239945  ENSG00000239945  Gene Expression                   0   
...                ...              ...              ...                 ...   
37728  ENSG00000277836  ENSG00000277836  Gene Expression                   0   
37729  ENSG00000278633  ENSG00000278633  Gene Expression                   0   
37730  ENSG00000276017  ENSG00000276017  Gene Expression                   0   
37731  ENSG00000278817  ENSG00000278817  Gene Expression                   0   
37732  ENSG00000277196  ENSG00000277196  Gene Expression                   0   

       AAACAGGGTCTATATT-1  AAACCGGGTAGG