In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import matplotlib.axes as axt
import glob
import anndata
import bbknn
import celltypist 
from celltypist import models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80,facecolor='white',color_map='viridis')
sc.logging.print_header()

scanpy==1.10.4 anndata==0.11.3 umap==0.5.6 numpy==1.26.4 scipy==1.15.1 pandas==2.2.3 scikit-learn==1.6.1 statsmodels==0.14.4 igraph==0.11.8 pynndescent==0.5.13


In [5]:
data_path = '../data/SCP1884/'

In [6]:
#load the main dataframe
adata = sc.read_mtx(data_path + 'CO_EPI.scp.matrix.mtx')

In [7]:
adata.shape

(28663, 97788)

In [8]:
adata = adata.transpose()

In [9]:
adata.shape

(97788, 28663)

In [10]:
gene_data = pd.read_csv(data_path + 'CO_EPI.scp.features2.tsv')
epi_barcodes = pd.read_csv(data_path + 'CO_EPI.scp.barcodes.tsv')

In [11]:
gene_data

Unnamed: 0,gene_name
0,AL627309.1
1,AP006222.2
2,RP4-669L17.10
3,RP11-206L10.3
4,RP11-206L10.2
...,...
28658,RP5-974N19.1
28659,CTB-60B18.18
28660,CTD-2616J11.3
28661,AC006272.2


In [12]:
#assign gene names to data matrix
gene_data = gene_data[gene_data.gene_name.notnull()]
notNa = gene_data.index
notNa = notNa.to_list()

In [13]:
#remove any NA genes
adata = adata[:,notNa]
adata.var = gene_data
adata.var.set_index('gene_name', inplace=True)
adata.var.index.name = None
adata.var_names_make_unique()

In [14]:
meta_data = pd.read_table(data_path + 'scp_metadata_combined.v2.txt')

  meta_data = pd.read_table(data_path + 'scp_metadata_combined.v2.txt')


In [15]:
meta_data["barcodes"] = meta_data["NAME"]

In [16]:
meta_data['Health'] = meta_data['Type']
meta_data['disease'] = meta_data['disease__ontology_label']
meta_data['Cluster'] = meta_data['Celltype']
meta_data['Location'] = meta_data['organ__ontology_label']
meta_data['Sample'] = meta_data['biosample_id']

meta_data

Unnamed: 0,NAME,biosample_id,n_genes,n_counts,Chem,Site,Type,donor_id,Layer,Celltype,...,library_preparation_protocol__ontology_label,organ,organ__ontology_label,disease,disease__ontology_label,barcodes,Health,Cluster,Location,Sample
0,TYPE,group,numeric,numeric,group,group,group,group,group,group,...,group,group,group,group,group,TYPE,group,group,group,group
1,N105446_L-GTGTGGCTCCGTCAAA,N105446_L,5135,19014,v3,CO,NonI,105446,L,Fibroblasts ADAMDEC1,...,10x 3' v3,UBERON_0001155,colon,Crohn's disease,Crohn's disease,N105446_L-GTGTGGCTCCGTCAAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
2,N105446_L-CAATACGAGTCCCTAA,N105446_L,5119,18425,v3,CO,NonI,105446,L,Endothelial cells CD36,...,10x 3' v3,UBERON_0001155,colon,Crohn's disease,Crohn's disease,N105446_L-CAATACGAGTCCCTAA,NonI,Endothelial cells CD36,colon,N105446_L
3,N105446_L-CCCTGATAGTGTTCCA,N105446_L,5024,18305,v3,CO,NonI,105446,L,Fibroblasts ADAMDEC1,...,10x 3' v3,UBERON_0001155,colon,Crohn's disease,Crohn's disease,N105446_L-CCCTGATAGTGTTCCA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
4,N105446_L-CATTGTTAGAGCCCAA,N105446_L,4817,17791,v3,CO,NonI,105446,L,Fibroblasts ADAMDEC1,...,10x 3' v3,UBERON_0001155,colon,Crohn's disease,Crohn's disease,N105446_L-CATTGTTAGAGCCCAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
720629,N119540_L2-CACTGGGAGCTGACCC,N119540_L2,270,460,v3,TI,NonI,119540,L,Plasma cells,...,10x 3' v3,UBERON_0002116,ileum,Crohn's disease,Crohn's disease,N119540_L2-CACTGGGAGCTGACCC,NonI,Plasma cells,ileum,N119540_L2
720630,N119540_L2-CGAGGCTTCTCACTCG,N119540_L2,267,454,v3,TI,NonI,119540,L,Cycling cells,...,10x 3' v3,UBERON_0002116,ileum,Crohn's disease,Crohn's disease,N119540_L2-CGAGGCTTCTCACTCG,NonI,Cycling cells,ileum,N119540_L2
720631,N119540_L2-CTCAAGATCTACGCGG,N119540_L2,327,453,v3,TI,NonI,119540,L,Plasma cells,...,10x 3' v3,UBERON_0002116,ileum,Crohn's disease,Crohn's disease,N119540_L2-CTCAAGATCTACGCGG,NonI,Plasma cells,ileum,N119540_L2
720632,N119540_L2-TATTCCATCGCCTAGG,N119540_L2,268,430,v3,TI,NonI,119540,L,Cycling cells,...,10x 3' v3,UBERON_0002116,ileum,Crohn's disease,Crohn's disease,N119540_L2-TATTCCATCGCCTAGG,NonI,Cycling cells,ileum,N119540_L2


In [17]:

meta_data = meta_data.drop(columns=["NAME","Site","Type","Celltype","organ__ontology_label","biosample_id","n_genes","n_counts","Chem","donor_id","Layer","sex","species","species__ontology_label","library_preparation_protocol","library_preparation_protocol__ontology_label","organ","disease","disease__ontology_label"])
meta_data

Unnamed: 0,barcodes,Health,Cluster,Location,Sample
0,TYPE,group,group,group,group
1,N105446_L-GTGTGGCTCCGTCAAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
2,N105446_L-CAATACGAGTCCCTAA,NonI,Endothelial cells CD36,colon,N105446_L
3,N105446_L-CCCTGATAGTGTTCCA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
4,N105446_L-CATTGTTAGAGCCCAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
...,...,...,...,...,...
720629,N119540_L2-CACTGGGAGCTGACCC,NonI,Plasma cells,ileum,N119540_L2
720630,N119540_L2-CGAGGCTTCTCACTCG,NonI,Cycling cells,ileum,N119540_L2
720631,N119540_L2-CTCAAGATCTACGCGG,NonI,Plasma cells,ileum,N119540_L2
720632,N119540_L2-TATTCCATCGCCTAGG,NonI,Cycling cells,ileum,N119540_L2


In [18]:
epi_barcodes.index = epi_barcodes["barcodes"]

In [19]:
meta_data.index = meta_data["barcodes"]
meta_data

Unnamed: 0_level_0,barcodes,Health,Cluster,Location,Sample
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TYPE,TYPE,group,group,group,group
N105446_L-GTGTGGCTCCGTCAAA,N105446_L-GTGTGGCTCCGTCAAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
N105446_L-CAATACGAGTCCCTAA,N105446_L-CAATACGAGTCCCTAA,NonI,Endothelial cells CD36,colon,N105446_L
N105446_L-CCCTGATAGTGTTCCA,N105446_L-CCCTGATAGTGTTCCA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
N105446_L-CATTGTTAGAGCCCAA,N105446_L-CATTGTTAGAGCCCAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
...,...,...,...,...,...
N119540_L2-CACTGGGAGCTGACCC,N119540_L2-CACTGGGAGCTGACCC,NonI,Plasma cells,ileum,N119540_L2
N119540_L2-CGAGGCTTCTCACTCG,N119540_L2-CGAGGCTTCTCACTCG,NonI,Cycling cells,ileum,N119540_L2
N119540_L2-CTCAAGATCTACGCGG,N119540_L2-CTCAAGATCTACGCGG,NonI,Plasma cells,ileum,N119540_L2
N119540_L2-TATTCCATCGCCTAGG,N119540_L2-TATTCCATCGCCTAGG,NonI,Cycling cells,ileum,N119540_L2


In [20]:
epi_barcodes = epi_barcodes.drop(columns=["barcodes"])
epi_barcodes

N105446_L-ATTGTTCCAAACGTGG
N105446_L-TCGACGGGTGAGACCA
N105446_L-AGTAACCGTTAAGGGC
N105446_L-GCAGGCTTCGCTAAAC
N105446_L-ATCTTCATCTGAGAGG
...
N130084_L-GTGGTTACAGTTCCAA
N130084_L-TATCCTATCGTTCATT
N130084_L-TCATCCGGTATGATCC
N130084_L-GGTCACGGTTAGCGGA
N130084_L-TGGTGATAGTTGGAAT


In [21]:
meta_data = meta_data.drop(columns=["barcodes"])
meta_data

Unnamed: 0_level_0,Health,Cluster,Location,Sample
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TYPE,group,group,group,group
N105446_L-GTGTGGCTCCGTCAAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
N105446_L-CAATACGAGTCCCTAA,NonI,Endothelial cells CD36,colon,N105446_L
N105446_L-CCCTGATAGTGTTCCA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
N105446_L-CATTGTTAGAGCCCAA,NonI,Fibroblasts ADAMDEC1,colon,N105446_L
...,...,...,...,...
N119540_L2-CACTGGGAGCTGACCC,NonI,Plasma cells,ileum,N119540_L2
N119540_L2-CGAGGCTTCTCACTCG,NonI,Cycling cells,ileum,N119540_L2
N119540_L2-CTCAAGATCTACGCGG,NonI,Plasma cells,ileum,N119540_L2
N119540_L2-TATTCCATCGCCTAGG,NonI,Cycling cells,ileum,N119540_L2


In [22]:
cell_meta = pd.concat([epi_barcodes, meta_data], axis=1, join="inner")

In [23]:
cell_meta

Unnamed: 0_level_0,Health,Cluster,Location,Sample
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N105446_L-ATTGTTCCAAACGTGG,NonI,Paneth cells,colon,N105446_L
N105446_L-TCGACGGGTGAGACCA,NonI,Paneth cells,colon,N105446_L
N105446_L-AGTAACCGTTAAGGGC,NonI,Paneth cells,colon,N105446_L
N105446_L-GCAGGCTTCGCTAAAC,NonI,Goblet cells MUC2 TFF1,colon,N105446_L
N105446_L-ATCTTCATCTGAGAGG,NonI,Goblet cells MUC2 TFF1-,colon,N105446_L
...,...,...,...,...
N130084_L-GTGGTTACAGTTCCAA,NonI,Tuft cells,colon,N130084_L
N130084_L-TATCCTATCGTTCATT,NonI,Enterocytes CA1 CA2 CA4-,colon,N130084_L
N130084_L-TCATCCGGTATGATCC,NonI,Enterocytes BEST4,colon,N130084_L
N130084_L-GGTCACGGTTAGCGGA,NonI,Enterocytes CA1 CA2 CA4-,colon,N130084_L


In [24]:
#assign barcodes to matrix
adata.obs = cell_meta
adata.obs_names_make_unique()

In [25]:
adata.shape

(97788, 28663)

In [26]:
#check correct loading of barcodes
sc.get.var_df(adata)

AL627309.1
AP006222.2
RP4-669L17.10
RP11-206L10.3
RP11-206L10.2
...
RP5-974N19.1
CTB-60B18.18
CTD-2616J11.3
AC006272.2
CTA-929C8.7


In [27]:
#check correct loading of gene names
sc.get.obs_df(adata)

N105446_L-ATTGTTCCAAACGTGG
N105446_L-TCGACGGGTGAGACCA
N105446_L-AGTAACCGTTAAGGGC
N105446_L-GCAGGCTTCGCTAAAC
N105446_L-ATCTTCATCTGAGAGG
...
N130084_L-GTGGTTACAGTTCCAA
N130084_L-TATCCTATCGTTCATT
N130084_L-TCATCCGGTATGATCC
N130084_L-GGTCACGGTTAGCGGA
N130084_L-TGGTGATAGTTGGAAT


In [28]:
#write data as h5ad for easy loading later
adata.write_h5ad(data_path + 'SCP1884.epi.h5ad')

... storing 'Health' as categorical
... storing 'Cluster' as categorical
... storing 'Location' as categorical
... storing 'Sample' as categorical
