## create a single anndata file with our raw count data.


In [1]:

import anndata as ad
import pandas as pd
from scipy.sparse import csr_matrix, vstack, hstack, coo_matrix
import numpy as np
from pathlib import Path


In [2]:
data_folder = Path.cwd() / "data"
pattern = f"chunk0*_output_data.h5ad"
chunk_fs = sorted(data_folder.glob(pattern))
chunk_fs

[PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0010_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0020_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0030_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0040_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0050_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0060_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0070_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0080_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0090_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0100_output_data.h5ad'),
 PosixPath('/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0110_output_d

In [3]:
adata = ad.read_h5ad("/Users/ergonyc/Projects/SingleCell/labelator/data/brain_atlas_anndata.h5ad")
adata.var_names_make_unique()
features = adata.var_names.tolist()

del adata

In [4]:
# Assuming 'targets' is a set of strings that you're looking for
targets_set = set(features)  # Make sure this is a set for O(1) lookups

matched_lines = []  # List to store matched lines if necessary
count_matrix_file_path = "./data/brain_atlas_full_counts_table.csv"
# Open the large CSV file
output_file_path = './data/counts_3k.csv'



In [11]:




def read_csv_to_coo_matrix(file_path, filter_features=None):

    if filter_features is None:
        targets_set = set()
        process_all = True
    else:
        targets_set = set(filter_features)
        process_all = False


    # Initialize lists to hold the COO data
    row_indices = []
    col_indices = []
    data = []
    ids = []
    keep_rows = []

    row_idx = 0

    with open(file_path, 'r') as file:
        print(f"opened {file_path}")
        # Read the header line
        header = file.readline()
        # Read the rest of the lines
        for frow_idx, line in enumerate(file):
            # Get the first entry by reading up to the first comma
            first_entry = line.split(',', 1)[0]
            # Check if the first entry is a target
            if first_entry in targets_set or process_all:
                keep_rows.append(frow_idx)
                # read the entire line
                id,*cnts = line.strip().split(',')
                ids.append(id)
                # Split the line into values and convert them to 8-bit integers
                values = np.array(cnts, dtype=np.uint8)
                
                # Get the non-zero indices and corresponding values
                non_zero_indices = np.nonzero(values)[0]
                non_zero_values = values[non_zero_indices]
                
                # Append the data to the COO format lists
                row_indices.extend([row_idx] * len(non_zero_indices))
                col_indices.extend(non_zero_indices)
                data.extend(non_zero_values)
                # incriment the row index
                row_idx += 1


    # Create the COO matrix
    num_rows = row_idx
    header = header.strip().split(',')
    # remove "genes" from header
    header = header[1:]
    num_cols = len(header)
    matrix_coo = coo_matrix((data, (row_indices, col_indices)), shape=(num_rows, num_cols), dtype=np.uint8)
    
    return matrix_coo, header, ids, keep_rows





def make_anndata_from_bigcsv(big_csv_file_path, filter_features=None, meta_obs=None, meta_var=None):
    """ create an anndata object from a large csv file """

    # Call the function with the file path
    sparse_matrix_coo, cols, idxs, kept = read_csv_to_coo_matrix(big_csv_file_path, filter_features=filter_features)

    # Convert the COO matrix to a CSR matrix
    if meta_obs is None:
        # obs_ = obs_.merge(meta_obs, left_index=True, right_index=True)
        meta_obs = pd.DataFrame(cols, index=cols, columns=['cell_ids'])
    else:
        meta_obs['cell_ids'] = cols

    if meta_var is None:
        # var_ = var_.merge(meta_var, left_index=True, right_index=True)
        meta_var = pd.DataFrame({'gene_ids':idxs, 'big_idx':kept}, index=idxs)
    else:
        meta_var['gene_ids'] = idxs
        meta_var['big_idx'] = kept


    adat_out = ad.AnnData(X=sparse_matrix_coo.transpose().tocsr(), obs=meta_obs, var=meta_var)
    return adat_out


In [7]:

# # Call the function with the file path
# sparse_matrix_coo, cols, idxs = read_csv_to_coo_matrix(output_file_path)


In [47]:
sparse_matrix_csr = sparse_matrix_coo.transpose().tocsr()
# If you need to convert to CSR
# sparse_matrix_csr = sparse_matrix_coo.tocsr()


In [8]:
obs = pd.read_csv("data/cell_barcode_labels.csv", index_col=0)



In [9]:
big_csv_file_path = "./data/brain_atlas_full_counts_table.csv"
# Open the large CSV file
small_csv_file_path = './data/counts_3k.csv'


In [12]:
ad1 = make_anndata_from_bigcsv(big_csv_file_path, filter_features=features, meta_obs=obs, meta_var=None)


opened ./data/brain_atlas_full_counts_table.csv


In [13]:
ad2 = make_anndata_from_bigcsv(small_csv_file_path, filter_features=features, meta_obs=obs, meta_var=None)


opened ./data/counts_3k.csv


In [14]:
ad3 = make_anndata_from_bigcsv(small_csv_file_path, meta_obs=obs, meta_var=None)


opened ./data/counts_3k.csv


In [16]:
ad1.write_h5ad("./data/ad1.h5ad")
ad2.write_h5ad("./data/ad2.h5ad")
ad3.write_h5ad("./data/ad3.h5ad")

del ad1, ad2, ad3, obs

NameError: name 'ad1' is not defined

In [32]:

var_ = pd.DataFrame(idxs,index=idxs, columns=['gene_ids'])
obs_ = pd.DataFrame(cols, columns=['cell_ids'])


In [33]:
var_.head()

Unnamed: 0,gene_ids
PRDM16,PRDM16
SLC2A5,SLC2A5
SPSB1,SPSB1
SLC25A33,SLC25A33
CORT,CORT


In [22]:
var_.head()

Unnamed: 0,gene_ids
0,PRDM16
1,SLC2A5
2,SPSB1
3,SLC25A33
4,CORT


In [37]:
obs['cell_ids'] = cols

In [38]:

adat = ad.AnnData(X=sparse_matrix_csr, obs=obs, var=var_)

In [39]:
adat.obs.head()


Unnamed: 0_level_0,seurat_clusters,cell_type,sample,cell_ids
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GGCCTAATCGATTTAG-1_1,11,Mature neurons,KEN-1070-ARC,GGCCTAATCGATTTAG-1_1
TAGTAACGTAGTCAAT-1_1,5,Mature neurons,KEN-1070-ARC,TAGTAACGTAGTCAAT-1_1
GAAAGCCAGCAGCTCA-1_1,2,Oligodendrocytes,KEN-1070-ARC,GAAAGCCAGCAGCTCA-1_1
ACTCACCTCCTCCCTC-1_1,5,Mature neurons,KEN-1070-ARC,ACTCACCTCCTCCCTC-1_1
CTTCATCCAATCGCAC-1_1,11,Mature neurons,KEN-1070-ARC,CTTCATCCAATCGCAC-1_1


In [5]:

feats = []
with open(count_matrix_file_path, 'r') as file:
    # Read and store the first line if header information is needed
    header_line = file.readline()
    with open(output_file_path, 'w') as out_file:
        # Iterate over each subsequent line in the file
        out_file.write(header_line)  # Optionally write the header line

        for line in file:
            # Get the first entry by reading up to the first comma
            first_entry = line.split(',', 1)[0]
            feats.append(first_entry)
            # Check if the first entry is a target
            if first_entry in targets_set:
                out_file.write(line)


#                 # If it is a target, process the entire line
#                 matched_lines.append(line.strip())  # Remove newline character
                
#                 # If you need to do more processing on this line, you can do it here
#                 # For example, you could split the line by commas and do something with the fields
#                 # fields = line.strip().split(',')


In [8]:

# # Do something with the matched lines, like writing them to a new file
linesarr=[]
with open(output_file_path, 'r') as file:
    colnms = file.readline()
    for line in file:
        # vals = np.fromstring(line, sep=',')
        linesarr.append(line)
        


In [10]:
colnms = colnms.split(',')
colnms[:10]

['genes',
 'GGCCTAATCGATTTAG-1_1',
 'TAGTAACGTAGTCAAT-1_1',
 'GAAAGCCAGCAGCTCA-1_1',
 'ACTCACCTCCTCCCTC-1_1',
 'CTTCATCCAATCGCAC-1_1',
 'CAATCCTGTGTCCTGC-1_1',
 'GTTAATGTCAAGCTAC-1_1',
 'AAGCCACGTTGCAATG-1_1',
 'CTTGGACCAATCATGT-1_1']

In [9]:
ln = linesarr[0]
id,*cnts = ln.split(',')

ic = map(int,cnts)

In [11]:
ic = list(ic)   

In [15]:

np.array(cnts,np.uint8)

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [16]:
ids =[]
counts = []
for ln in linesarr:
    id,*cnts = ln.split(',')
    ids.append(id)
    counts.append(np.array(cnts,np.uint8))


In [17]:
X = np.stack(counts, axis=1)

In [19]:
len(ids)

3000

In [26]:
cols = cols.strip().split(',')
len(cols)

713627

In [27]:
len(idxs), len(cols), sparse_matrix_coo.shape

(3000, 713627, (3000, 713627))

In [36]:
type(cols)

list

In [30]:
var_.head()

Unnamed: 0,gene_ids
0,PRDM16
1,SLC2A5
2,SPSB1
3,SLC25A33
4,CORT


In [32]:
obs.shape

(713626, 3)

In [28]:

feats = []
larr = []
with open(count_matrix_file_path, 'r') as file:
    # Read and store the first line if header information is needed
    header_line = file.readline()
    for line in file:
        # Get the first entry by reading up to the first comma
        first_entry = line.split(',', 1)[0]
        feats.append(first_entry)
        # Check if the first entry is a target
        if first_entry in targets_set:
            # remove the first entry from the line
            line = line.lstrip(f"{first_entry},")
            vals = np.fromstring(line, sep=',', dtype="f")
            #array([nan,  1.,  2.,  3.], dtype)
            if vals.shape[0] == 713626:
                larr.append(vals)
            else:
                print(f"error: {first_entry}: {vals.shape} {line[:111]}")
                larr.append(line)




error: SLC25A33: (713625,) 1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0
error: ARHGEF10L: (713583,) 2,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,0
error: EMC1-AS1: (713625,) 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
error: MICOS10: (713618,) 2,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
error: LINC01141: (713248,) 3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
error: PTP4A2: (713625,) 0,2,0,1,1,0,0,0,0,2,0,1,1,0,1,1,0,0,0,0,0,1,2,1,2,0,0,0,0,0,0,1,1,0,0,1,1,0,2,0,0,0,1,1,0,1,1,1,1,0,0,0,1,1,0,0
error: MARCKSL1: (713625,) 0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
error: S100PBP: (713610,) 2,

KeyboardInterrupt: 

In [24]:
larr[100].shape

# shapes[-100:]

(713626,)

In [18]:
X = np.stack(larr, axis=1)

ValueError: all input arrays must have the same shape

In [8]:
X.shape

(2140524722,)

In [6]:
atlas_3k = pd.read_csv(output_file_path)

In [None]:
genes_ = atlas_3k['genes']
cells_ = atlas_3k.columns[1:]
X = csr_matrix(atlas_3k.iloc[:,1:].values.T)



In [None]:
obs_file_path = "./data/cell_barcode_labels.csv"
cells = pd.read_csv(obs_file_path,index_col=0)


var_ = pd.DataFrame(index = genes_)
obs_ = cells
obs_['cells_alt'] = cells_.to_list()
adata_3k = ad.AnnData(X=X, obs=obs_, var=var_)

adata_3k


In [None]:

adata_3k.obs_names = cells.index
adata_3k.write_h5ad("./data/brain_atlas_3k.h5ad")

In [5]:

# Specify a larger sample size (e.g., 1e7 bytes)
sample_size = 1e7


# Specify data types
# Assuming that the first column is object (like string) and the rest are uint8.
dtypes = {0: 'object'}
for col in range(1, 713626):  # Adjust the range based on the number of columns
    dtypes[col] = 'uint8'

# Define the chunk size
chunk_size = 100  # 
# Create an empty list to store sparse matrices from each chunk
chunk_counter = 0
sparse_matrices = []
genes = []
# 1. Read the CSV in chunks using a context manager
with pd.read_csv(count_matrix_file_path, header=0,dtype=dtypes, chunksize=chunk_size) as reader:
    for chunk in reader:

        print(f"chunk number {chunk_counter}")
        # 2. get the genes
        gene_chunk = chunk['genes']

        sparse_chunk = csr_matrix(chunk.iloc[:,1:].values.T)
        genes.append(gene_chunk)
        sparse_matrices.append(sparse_chunk)



KeyboardInterrupt: 

In [None]:
sparse_matrix = hstack(sparse_matrices)
genes_ = pd.DataFrame(index = pd.concat(genes, axis=0))
genes_['chunk'] = chunk_counter

# adata_dict = {}
# adata_dict["X"] = sparse_matrix.transpose()
# adata_dict["obs"] = cells
# adata_dict["var"] = genes_
# # adata_dict["dtype"] = np.float64
# # adata_dict["obsm"] = dict(
# #     a=da.random.random((M, 100)),
# # )
# # adata_dict["layers"] = dict(
# #     a=da.random.random((M, N)),
# # )
print(f"matrix shape {sparse_matrix.shape} \n var shape {genes_.shape} obs shape {cells.shape[0]}")


adata = ad.AnnData(X=sparse_matrix, obs=cells, var=genes_)
# adata = ad.AnnData(**adata_dict)
h5ad_file_path = f"./data/chunk2{chunk_counter+1:04d}_output_data.h5ad"
adata.write_h5ad(h5ad_file_path)
del adata
print(f"wrote {genes_.shape} genes to adata file {h5ad_file_path}")

In [2]:
import scanpy as sc
import scvi

sc.set_figure_params(figsize=(4, 4))

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

Global seed set to 0
  from .autonotebook import tqdm as notebook_tqdm
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [5]:
print(f"working with {len(features)} features")

working with 3000 features


In [41]:
# load the first file 
adata0 = ad.read_h5ad(chunk_fs[0])
# subset it to the features in features.
keep_feaets = adata0.var_names.isin(features)

In [42]:

adata0 = adata0[:, keep_feaets]
adata0


View of AnnData object with n_obs × n_vars = 713626 × 30
    obs: 'seurat_clusters', 'cell_type', 'sample'

In [43]:
for chunk_n, chunk in enumerate(chunk_fs[1:]):
    print(f"loading {chunk}")
    adata = ad.read_h5ad(chunk)
    keep_feats = adata.var_names.isin(features)
    adata = adata[:, keep_feats]
    adata0 = ad.concat([adata0, adata], axis=1)
    print(f"adata0 shape: {adata0.shape}")
    if chunk_n % 10 == 0:
        print(f"writing to disk")
        adata0.write_h5ad(f"brain_atlas_anndata_{chunk_n}.h5ad")
        print(f"done writing to disk")

loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0020_output_data.h5ad
adata0 shape: (713626, 129)
writing to disk
done writing to disk
loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0030_output_data.h5ad
adata0 shape: (713626, 197)
loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0040_output_data.h5ad
adata0 shape: (713626, 287)
loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0050_output_data.h5ad
adata0 shape: (713626, 360)
loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0060_output_data.h5ad
adata0 shape: (713626, 506)
loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0070_output_data.h5ad
adata0 shape: (713626, 617)
loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0080_output_data.h5ad
adata0 shape: (713626, 737)
loading /Users/ergonyc/Projects/SingleCell/labelator/data/chunk0090_output_data.h5ad
adata0 shape: (713626, 875)
loading /Users/ergonyc/Projects/SingleCell/labelator/data/c

In [44]:
print(f"writing to disk")
adata0.write_h5ad(f"brain_atlas_anndata_FINAL.h5ad")
print(f"done writing to disk")

writing to disk
done writing to disk


In [82]:
adata = ad.read_h5ad("brain_atlas_anndata_FINAL.h5ad")

In [83]:
adata.var_names_make_unique()

In [84]:
adata

AnnData object with n_obs × n_vars = 713626 × 3000

In [85]:
obs = pd.read_csv("data/cell_barcode_labels.csv", index_col=0)

In [86]:
adata.obs = obs

In [87]:
# Suppose you want to change the dtype to float32
adata.X = adata.X.astype('float32')
# After changing the dtype, you can save the AnnData object if needed
adata.write_h5ad('adata_float32.h5ad')


In [80]:
X = adata.X


In [81]:
X

<HDF5 sparse dataset: format 'csr', shape (713626, 3000), type '|u1'>

In [88]:

sc.pp.filter_genes(adata, min_counts=3)


In [89]:


adata.layers["counts"] = adata.X.copy()  # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata  # freeze the state in `.raw`


In [101]:

# mitochondrial genes
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True
)
adata

AnnData object with n_obs × n_vars = 713626 × 3000
    obs: 'seurat_clusters', 'cell_type', 'sample', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'n_counts', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'log1p'
    layers: 'counts'

In [93]:


clean_samples_path = Path.cwd() / "data" / "Model Combinations - clean_samples_138.csv"
clean_samples = pd.read_csv(clean_samples_path)

batch_mapper = dict(zip(clean_samples["sample"], clean_samples["batch"]))

adata.obs["batch"] = adata.obs["sample"].map(batch_mapper)

Unnamed: 0,sample,batch
0,KEN-1070-ARC,batch1
1,KEN-1092-ARC,batch1
2,KEN-1095-ARC,batch1
3,KEN-1127-ARC,batch1
4,KEN-1132-ARC,batch1
...,...,...
133,UMARY-819-ARC,batch5
134,UMARY-871-ARC,batch5
135,UMARY-879-ARC,batch5
136,UMARY-914-ARC,batch5


In [102]:
adata.write_h5ad("brain_atlas_anndata_FULL.h5ad")


In [103]:
adata = ad.read_h5ad("brain_atlas_anndata_FULL.h5ad")

In [104]:
adata.obs.head()

Unnamed: 0_level_0,seurat_clusters,cell_type,sample,batch,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_ribo,log1p_total_counts_ribo,pct_counts_ribo
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
GGCCTAATCGATTTAG-1_1,11,Mature neurons,KEN-1070-ARC,batch1,1310,7.178545,2034.776367,7.618632,4.502719,0.0,0.0,0.0,0.0,0.0,0.0
TAGTAACGTAGTCAAT-1_1,5,Mature neurons,KEN-1070-ARC,batch1,1294,7.166266,2045.917969,7.624091,4.451315,0.0,0.0,0.0,0.0,0.0,0.0
GAAAGCCAGCAGCTCA-1_1,2,Oligodendrocytes,KEN-1070-ARC,batch1,1209,7.098376,2015.303955,7.609021,4.578828,0.0,0.0,0.0,0.0,0.0,0.0
ACTCACCTCCTCCCTC-1_1,5,Mature neurons,KEN-1070-ARC,batch1,1125,7.026427,1856.347168,7.526905,5.043708,0.0,0.0,0.0,0.0,0.0,0.0
CTTCATCCAATCGCAC-1_1,11,Mature neurons,KEN-1070-ARC,batch1,1108,7.011214,1831.069092,7.513201,5.125524,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    categorical_covariate_keys=["batch"],
    continuous_covariate_keys=[],
)

In [106]:
adata

AnnData object with n_obs × n_vars = 713626 × 3000
    obs: 'seurat_clusters', 'cell_type', 'sample', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', '_scvi_batch', '_scvi_labels'
    var: 'n_counts', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'log1p', '_scvi_uuid', '_scvi_manager_uuid'
    obsm: '_scvi_extra_categorical_covs'
    layers: 'counts'

In [10]:

# Create empty DataFrames for .obs and .var
obs = pd.DataFrame()
var_ = pd.DataFrame()
# Initialize the AnnData object with the empty DataFrames
adata_empty = ad.AnnData(X=None, obs=obs, var=var_)
adata_empty.write_h5ad("master.h5ad")

master_ad = ad.read_h5ad("master.h5ad", backed="r+")

adata1 = ad.read_h5ad(chunk_fs[0])
adata2 = ad.read_h5ad(chunk_fs[2])


# adata1 = ad.read_h5ad(chunk_fs[0], backed="r")
# adata2 = ad.read_h5ad(chunk_fs[1], backed="r")
# adata3 = ad.read_h5ad(chunk_fs[3], backed="r")





In [29]:
adata.obs["sample"]

cells
GGCCTAATCGATTTAG-1_1       KEN-1070-ARC
TAGTAACGTAGTCAAT-1_1       KEN-1070-ARC
GAAAGCCAGCAGCTCA-1_1       KEN-1070-ARC
ACTCACCTCCTCCCTC-1_1       KEN-1070-ARC
CTTCATCCAATCGCAC-1_1       KEN-1070-ARC
                              ...      
GTTGTGAGTCGCAATA-1_138    UMARY-933-ARC
GAAGTCAAGCCACAAT-1_138    UMARY-933-ARC
CTGGACCAGGCTGTGC-1_138    UMARY-933-ARC
TCCTCACAGGAGTAAT-1_138    UMARY-933-ARC
GCAGCCAGTTGTGATG-1_138    UMARY-933-ARC
Name: sample, Length: 709811, dtype: category
Categories (138, object): ['KEN-1066-ARC', 'KEN-1070-ARC', 'KEN-1092-ARC', 'KEN-1095-ARC', ..., 'UMARY-5120-ARC', 'UMARY-5123-ARC', 'UMARY-5171-ARC', 'UMARY-5179-ARC']

In [42]:
# adata2.var_names_make_unique()
keep_var = [(v in features) for v in adata2.var_names] 

# keep_obs = [(o in clean_samples['samples']) for o in adata2.obs['sample']]

# adata2.obs['sample'].isin(clean_samples['sample']).value_counts()

adata1.var['features'] = adata1.var_names

adata1.var['features'].isin(features).value_counts()

features
False    100
Name: count, dtype: int64

In [34]:
clean_samples[:1]

Unnamed: 0,sample,batch
0,KEN-1070-ARC,batch1


In [59]:
master_ad.X = adata1.X
master_ad.obs = adata1.obs
master_ad.var = adata1.var



((20000, 900), (20000, 1000))

In [9]:
adata1.var_names_make_unique()

In [60]:
adata1_sm.write_h5ad("adata1.h5ad")
adata2_sm.write_h5ad("adata2.h5ad")

X_sm = hstack([adata1_sm.X, adata2_sm.X])
adata1_sm.X.shape

(20000, 900)

In [11]:
adata1.x.shape

AttributeError: 'AnnData' object has no attribute 'x'

In [7]:
adata_concatenated = ad.concat([master_ad, adata1], axis=1, join='outer')


KeyError: "Unable to synchronously open object (object 'X' doesn't exist)"

In [23]:
adata1

AnnData object with n_obs × n_vars = 713626 × 900 backed at '/Users/ergonyc/Projects/SingleCell/labelator/data/chunk0010_output_data.h5ad'
    obs: 'seurat_clusters', 'cell_type', 'sample'

In [3]:





# function to collect all the
def list_h_files(data_folder: Path, file_type: str, postfix: Union[str, None] = None) -> List:
    """
    get a list of all the filetypes
    TODO: aics has cleaner functions than this "lambda"
    should this use Path methods? or return Path?
    """

    if postfix is not None:
        return sorted(data_folder.glob(f"*{postfix}{file_type}"))
    else:
        return sorted(data_folder.glob(f"*{file_type}"))

    # if prefix is not None:

AnnData object with n_obs × n_vars = 709811 × 3000
    obs: 'seurat_clusters', 'cell_type', 'sample'

In [6]:

sc.pp.filter_genes(adata, min_counts=3)

adata

AnnData object with n_obs × n_vars = 709811 × 3000
    obs: 'seurat_clusters', 'cell_type', 'sample'
    var: 'n_counts'

In [7]:
adata.var

Unnamed: 0,n_counts
PRDM16,74424.0
SLC2A5,23150.0
SPSB1,68033.0
SLC25A33,98671.0
CORT,32179.0
...,...
TMLHE-AS1,73827.0
TMLHE,188169.0
LINC00278,41040.0
PCDH11Y,435956.0
