## **To recreate the study results, please install the package versions listed in main_requirements.txt**

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
import math
import seaborn as sns
import os
# %config IPCompleter.greedy=True
%load_ext autoreload
%autoreload 2

# quiet scanpy logging (verbosity 0 -- presumably errors only; confirm against scanpy settings docs)
sc.settings.verbosity = 0
# print the versions of scanpy and its core dependencies so the run is traceable
sc.logging.print_header()
# use seaborn's "paper" plotting context for all figures in this notebook
sns.set_context("paper")

scanpy==1.10.3 anndata==0.10.9 umap==0.5.7 numpy==1.26.4 scipy==1.13.1 pandas==2.2.3 scikit-learn==1.6.1 statsmodels==0.14.4 igraph==0.8.3 pynndescent==0.5.13


In [2]:
# import local module containing misc code, helps keep notebooks clean from commonly used functions
import new_misc_code as nmc

## **Load raw UMI count data**

In [3]:
# load the count matrices saved after nuclei- and gene-level quality control
# (explicit .h5ad reader, same call style as the second load below)
adata = sc.read_h5ad(
    "../data/Processed_data_RNA-all_full-counts-and-downsampled-CPM.h5ad"
)

In [4]:
# read in adata post clustering and dev-traj assignment
# (its obs / obsm / uns are copied onto the matching raw-count nuclei further down)
adata2 = sc.read_h5ad( "../data/Processed_data_RNA-gaba_full-counts-and-downsampled-CPM.h5ad")

In [5]:
# peek at the nuclei identifiers of the full data set
# (the displayed values are of the form "<barcode>-<sample>_<age>_<chemistry version>")
adata.obs_names.values

array(['AAACCTGAGAGTCGGT-RL1612_34d_v2', 'AAACCTGAGCCGCCTA-RL1612_34d_v2',
       'AAACCTGAGTCGAGTG-RL1612_34d_v2', ...,
       'TTTGTTGGTAAGGTCG-RL2132_25yr_v3',
       'TTTGTTGGTTCGGCTG-RL2132_25yr_v3',
       'TTTGTTGTCGTCCTCA-RL2132_25yr_v3'], dtype=object)

In [None]:
# add obs and obsm data to matching raw count nuclei
# keep only the nuclei that are also present in the clustered object; .copy() turns the
# view returned by the subset into a real AnnData before its annotations are replaced
keep_mk = nmc.member_test( adata.obs_names.values, adata2.obs_names.values)
adata = adata[keep_mk].copy()

# verify barcode alignment BEFORE copying annotations: assigning a DataFrame to .obs
# replaces obs_names with that DataFrame's index, so a check made afterwards always
# passes and misaligned metadata would go unnoticed
assert np.array_equal( adata.obs_names.values, adata2.obs_names.values), (
    "barcodes of adata and adata2 differ in content or order; "
    "reorder adata to adata2.obs_names before transferring annotations"
)

# copy obs so that the in-place QC metrics below (and later column edits) do not
# silently modify adata2.obs, which would otherwise be the very same DataFrame object
adata.obs = adata2.obs.copy()
adata.obsm = adata2.obsm
adata.uns = adata2.uns
# recompute per-nucleus / per-gene QC metrics on the subsetted raw counts
sc.pp.calculate_qc_metrics( adata, inplace=True)

In [None]:
# sanity check: number of positions at which the barcodes of the two objects disagree
# (expected to be 0)
np.not_equal( adata.obs_names.values, adata2.obs_names.values).sum()

In [None]:
# UMAP of the transferred embedding, coloured by batch, to confirm the metadata landed correctly
sc.pl.umap(
    adata,
    color=['batch'],
    size=2,
    add_outline=True,
    legend_fontsize=5,
    legend_fontoutline=0.5,
)

## **Clean up features a bit**

In [None]:
# change unknown NaNs in year collected to common date
# assign the filled column back rather than calling fillna(inplace=True) on the column
# selection: the chained in-place form raises a FutureWarning in pandas 2.2 and stops
# modifying the frame once Copy-on-Write is the default
adata.obs['Collection_year'] = adata.obs['Collection_year'].fillna( 9999)

In [None]:
# get list of batches, taken from the 'batch_order' entry stored in uns
# (the bulking cell below reads the same entry again, so it also runs on its own)
batches = adata.uns['batch_order']

## **Bulk by batch**

In [None]:
# create the output directory for the limma-voom input files; exist_ok=True makes the
# cell safe to re-run (a plain `mkdir` errors once the directory exists) and avoids
# depending on a particular shell
os.makedirs( "../data/limma-voom/", exist_ok=True)

In [None]:
# Build one pseudo-bulk expression profile and one metadata row per batch, then write
# both tables to ../data/limma-voom/ as CSV.

# set minimum number of nuclei needed to make a bulk data set
min_cells = 10
# create dataframe to hold bulked data: rows are batches, columns are genes
# labelled "<gene_id>--<gene_name>"
batches = adata.uns['batch_order']
columns = [ ("--").join((x,y)) for x,y in zip( adata.var['gene_ids'].values, adata.var_names.values)]
bulk = pd.DataFrame( index=batches, columns=columns)
# only want obs that are same across the batch, i.e. what can be controlled for
# NOTE(review): this keeps every obs column with at most len(batches) distinct values;
# that is a heuristic and does not by itself prove a column is constant within a batch
obs_col_mk = (adata.obs.nunique()<=len(batches)).values
obs_cols = np.array( adata.obs_keys())[obs_col_mk]
# drop all dev_traj observations
obs_cols = [ii for ii in obs_cols if 'traj' not in ii]
obs = pd.DataFrame(  index=batches, columns=obs_cols)
obs['Num_Cells'] = 0
# loop through batches
for batch_itr in batches:
    batch_mk = adata.obs['batch'].values==batch_itr
    num_cells = batch_mk.sum()
    # skip batches with too few nuclei; the previous test compared min_cells with the
    # literal 10, which is never true, so no batch was ever filtered out
    if num_cells < min_cells:
        bulk.drop( batch_itr, axis=0, inplace=True)
        obs.drop(  batch_itr, axis=0, inplace=True)
        continue
    batch_adata = adata[batch_mk]
    # sum counts over the batch's nuclei; np.asarray(...).ravel() flattens the result
    # whether X is a sparse matrix (sum gives np.matrix) or a dense array
    bulk.loc[batch_itr] = np.asarray( batch_adata.X.sum( axis=0)).ravel()
    obs.loc[batch_itr,'Num_Cells'] = num_cells
    # take the metadata of the batch's first nucleus, selected by column label so it
    # matches the 'traj'-filtered obs_cols exactly
    obs.loc[batch_itr,obs_cols] = batch_adata.obs.iloc[0][obs_cols]

# save files (counts are written transposed: genes as rows, batches as columns)
bulk.T.to_csv( f"../data/limma-voom/bulked-by-batch_pseudo-bulk-cts_min{min_cells}.csv")
obs.to_csv(  f"../data/limma-voom/bulked-by-batch_obs-cts_min{min_cells}.csv")

In [None]:
# example output: per-batch metadata table, one row per batch
# (written to CSV in this same orientation)
obs.head()

In [None]:
# example output, un-transposed: rows are batches, columns are genes
# (the CSV on disk is written transposed via bulk.T)
bulk.head()