In [None]:
import pandas as pd
import scipy.io
import scipy.sparse
import scanpy as sc
import gzip

# Paths
matrix_path = "../../data/matrix.mtx.gz"
barcodes_path = "../../data/barcodes.tsv.gz"
features_path = "../../data/features.tsv.gz"

# Load the Matrix Market count matrix. The gzip handle is opened in a `with`
# block so it is closed as soon as parsing finishes instead of being leaked.
with gzip.open(matrix_path, "rb") as matrix_handle:
    matrix = scipy.io.mmread(matrix_handle).tocsr()

# The barcode/feature files are tab-separated (.tsv) and have no header row;
# only the first column of each is used as the identifier.
barcodes = pd.read_csv(barcodes_path, sep="\t", header=None, compression="gzip")[0].values
features = pd.read_csv(features_path, sep="\t", header=None, compression="gzip")[0].values

# The matrix is transposed below, so on disk it must be (features x barcodes).
# Fail early with a readable message if the three input files disagree.
if matrix.shape != (len(features), len(barcodes)):
    raise ValueError(
        f"matrix shape {matrix.shape} does not match "
        f"(n_features={len(features)}, n_barcodes={len(barcodes)})"
    )

# Transpose matrix to have (cells x genes). Transposing a CSR matrix yields
# CSC, so convert back to CSR to keep row (per-cell) access efficient.
adata = sc.AnnData(X=matrix.T.tocsr())

# Assign barcodes and features
adata.obs_names = barcodes  # cells
adata.var_names = features  # genes

print(adata)

AnnData object with n_obs × n_vars = 35276 × 47096


In [None]:
import pandas as pd

# Path to GTF
gtf_path = "../../data/gencode.v43.annotation.gtf.gz"

# Load GTF. The file has no header row, so the nine tab-separated fields are
# named explicitly; everything from a "#" onwards on a line is treated as a
# comment and skipped by the parser.
gtf = pd.read_csv(
    gtf_path,
    sep="\t",
    comment="#",
    header=None,
    names=[
        "chromosome", "source", "feature", "start", "end",
        "score", "strand", "frame", "attribute"
    ],
    compression="gzip"
)

# Filter genes only (.copy() so the column assignments below operate on an
# independent frame and do not raise SettingWithCopyWarning)
gtf_genes = gtf[gtf["feature"] == "gene"].copy()

# Extract gene_id and gene_name from the free-text attribute field.
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame, which is what a single-column assignment expects.
gtf_genes["gene_id"] = gtf_genes["attribute"].str.extract('gene_id "([^"]+)"', expand=False)
gtf_genes["gene_name"] = gtf_genes["attribute"].str.extract('gene_name "([^"]+)"', expand=False)

# Remove version numbers: keep only the text before the first "."
gtf_genes["gene_id"] = gtf_genes["gene_id"].str.split(".", n=1).str[0]

# Build mapping. Rows lacking either an id or a name are dropped so the
# mapping never contains NaN keys/values. Version-stripped ids can collide;
# the last occurrence wins, matching plain dict-construction semantics.
gene_pairs = (
    gtf_genes[["gene_id", "gene_name"]]
    .dropna()
    .drop_duplicates(subset="gene_id", keep="last")
)
id_to_symbol = dict(zip(gene_pairs["gene_id"], gene_pairs["gene_name"]))

if not id_to_symbol:
    raise ValueError(f"No gene_id/gene_name pairs could be extracted from {gtf_path}")


In [14]:
# Map gene symbols into adata
# A positional (RangeIndex) Series is used so the fallback below cannot be
# affected by index alignment if var_names contains duplicates.
gene_ids = pd.Series(adata.var_names.to_numpy(), dtype="object")
gene_symbols = gene_ids.map(id_to_symbol)

# IDs absent from the mapping come back as NaN. Report how many there are and
# fall back to the gene ID itself so the column stays purely string-valued.
# NOTE(review): mixed str/NaN object columns have been reported to cause
# trouble when writing h5ad — confirm for the installed anndata version.
n_unmapped = int(gene_symbols.isna().sum())
print(f"{n_unmapped} of {adata.n_vars} genes have no symbol in the mapping; using the gene ID instead")
adata.var["gene_symbol"] = gene_symbols.fillna(gene_ids).to_numpy()

# Check
print(adata.var.head())


                gene_symbol
ENSG00000000003      TSPAN6
ENSG00000000419        DPM1
ENSG00000000457       SCYL3
ENSG00000000460    C1orf112
ENSG00000001036       FUCA2


In [None]:
# Persist the annotated AnnData object next to the raw input files.
annotated_h5ad_path = "../../data/breast_cancer_raw_annotated.h5ad"
adata.write(annotated_h5ad_path)
