In [None]:
import numpy as np
import scanpy as sc
import anndata as ad
import pandas as pd

from preprocessing import set_random_seed

# Fix the random seed through the project helper so repeated runs start from the same state.
# NOTE(review): which libraries get seeded is decided inside `preprocessing.set_random_seed`,
# which is not visible in this notebook — confirm there.
set_random_seed(1234)

# Directory the input files are read from and the generated .h5ad files are written to.
import os

# os.path.abspath('') resolves to the current working directory; backslashes are
# converted so the same string handling works on Windows.
working_dir = os.path.abspath('').replace('\\', '/')
# Drop the '/create_datasets' part (if present) and point at the sibling 'dataset' folder.
path = working_dir.replace('/create_datasets', '') + '/dataset/'

# Brain Immune Atlas Preparation Instructions

To utilize data from the Brain Immune Atlas, follow these steps to download and prepare the necessary files for your analysis. 
The files are large; we recommend at least 16 GB of RAM to run this script.

## Step 1: Navigate to the Download Page

- Go to [https://www.brainimmuneatlas.org/download.php](https://www.brainimmuneatlas.org/download.php) in your web browser.

## Step 2: Download Required Datasets

For each species of interest, download the specified files.

### Brain Immune Atlas: Human GBM

- Newly diagnosed GBM: full aggregate: Gene-cell count matrix and Cell annotation matrix.
- Newly diagnosed GBM: TAM: Cell annotation matrix.
- Recurrent GBM: full aggregate: Gene-cell count matrix and Cell annotation matrix.
- Recurrent GBM: TAM: Cell annotation matrix.
- Recurrent GBM: Mg-TAM: Cell annotation matrix.
- DC: recurrent + newly diagnosed GBM: TAM: Cell annotation matrix.

### Brain Immune Atlas: Mouse transplanted GBM 

- Full aggregate: Gene-cell count matrix and Cell annotation matrix.
- DC: Cell annotation matrix.
- TAM: Cell annotation matrix.
- Mg-TAM: Cell annotation matrix.

## Step 3: Unzip the files

After downloading, use a file decompression tool to unzip each file. Then place all files into the `scSpecies/dataset` folder.

## Step 4: Run the code

Afterwards, run the code below, which processes these files into species-specific `.h5ad` files.
The output will be:

- `glio_human.h5ad`
- `glio_mouse.h5ad`

In [None]:
def update_df(df, df_update):
    """Copy fine cell-type labels from a subset annotation table into ``df``.

    Rows are matched on the ``cell`` column. For every cell present in both
    tables, ``df['cell_type_fine']`` is overwritten with the ``cluster`` value
    from ``df_update``; cells that appear only in ``df`` keep their current label.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``cell`` and ``cell_type_fine``.
        It is modified in place.
    df_update : pandas.DataFrame
        Table with at least the columns ``cell`` and ``cluster``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the in-place update.
    """
    fine_labels = df_update.rename(columns={"cluster": "cell_type_fine"})

    # Positions (not labels) of the shared cell identifiers in each table.
    # np.intersect1d reports the first occurrence of every shared value.
    _, pos_in_df, pos_in_update = np.intersect1d(
        df["cell"].to_numpy(), fine_labels["cell"].to_numpy(), return_indices=True
    )

    # DataFrame.update aligns on index *labels*, so translate the positions into
    # df's own labels. For a default 0..n-1 index this equals the positions; for
    # any other index, using the raw positions would update the wrong rows or none.
    fine_labels = fine_labels.iloc[pos_in_update].set_index(df.index[pos_in_df])

    df.update(fine_labels[["cell_type_fine"]])
    return df

In [None]:
# human
# Coarse annotation tables of the recurrent (R) and newly diagnosed (ND) full aggregates,
# stacked into a single table with a fresh 0..n-1 row index.
annot_R_All = pd.read_csv(path + "annot_Human_R_GBM_Full.csv")
annot_ND_All = pd.read_csv(path + "annot_Human_ND_GBM_Full.csv")
annot_All = pd.concat([annot_R_All, annot_ND_All], ignore_index=True)

# Subset annotation tables; their 'cluster' column supplies the fine labels.
annot_R_TAM = pd.read_csv(path + "annot_Human_R_GBM_TAM.csv")
annot_R_mgTAM = pd.read_csv(path + "annot_Human_R_GBM_mg-TAM.csv")
annot_ND_TAM = pd.read_csv(path + "annot_Human_ND_GBM_TAM.csv")
annot_RND_DC = pd.read_csv(path + "annot_Human_DC.csv")

annot_human = pd.DataFrame(annot_All[['cell', 'cluster', 'sample', 'ident']])
annot_human['cell_type_fine'] = 'Unknown'

# Applied in this order, so a later table overrides an earlier one for cells listed in both.
for fine_table in (annot_R_TAM, annot_R_mgTAM, annot_ND_TAM, annot_RND_DC):
    annot_human = update_df(annot_human, fine_table)

annot_human = annot_human.rename(columns={'cluster': 'cell_type_coarse', 'sample': 'batch'})

# For these coarse types, cells that received no fine label reuse the coarse label.
for coarse_label in ('T cells', 'NK cells', 'Regulatory T cells', 'Plasma B', 'B cells', 'Mast cells', 'Monocytes', 'Prolif. TAM'):
    still_unlabelled = (annot_human['cell_type_coarse'] == coarse_label) & (annot_human['cell_type_fine'] == 'Unknown')
    annot_human.loc[still_unlabelled, 'cell_type_fine'] = coarse_label

# Unlabelled 'TAM 1' / 'TAM 2' cells get no fallback fine label here (a former
# 'Mo-TAM' / 'Mg-TAM' fallback is disabled), so they are dropped with the other 'Unknown' cells.

# Remove cells with a missing fine label, and cells with the fine label 'TAM', which
# inconsistently contains only dendritic cells in the coarse label.
annot_human = annot_human[~annot_human['cell_type_fine'].isin(['Unknown', 'TAM'])]

# Per fine label, keep only the cells whose coarse label is the expected one:
# the most frequent coarse label of that group, except for 'Hypoxic Mg-TAM'.
keep_cells = []
for fine_label in np.unique(annot_human['cell_type_fine'].to_numpy()):
    if fine_label == 'Unknown':
        continue
    members = annot_human[annot_human['cell_type_fine'] == fine_label]
    if fine_label == 'Hypoxic Mg-TAM':
        # TAM 2 contains Mg-TAM cells, not Mo-TAM
        expected_coarse = 'TAM 2'
    else:
        expected_coarse = members['cell_type_coarse'].value_counts().index[0]
    keep_cells.extend(members.loc[members['cell_type_coarse'] == expected_coarse, 'cell'].to_list())

annot_human = annot_human[annot_human['cell'].isin(keep_cells)]

# Within coarse type 'TAM 1', mark the cells of the least frequent fine label for removal.
tam1_cells = annot_human[annot_human['cell_type_coarse'] == 'TAM 1']
rarest_fine = tam1_cells['cell_type_fine'].value_counts().index[-1]
rem_cells = tam1_cells.loc[tam1_cells['cell_type_fine'] == rarest_fine, 'cell'].to_list()

# Within these coarse types, mark every cell whose fine label is not the most frequent one.
for coarse_label in ('T cells', 'Regulatory T cells'):
    members = annot_human[annot_human['cell_type_coarse'] == coarse_label]
    dominant_fine = members['cell_type_fine'].value_counts().index[0]
    rem_cells.extend(members.loc[members['cell_type_fine'] != dominant_fine, 'cell'].to_list())

annot_human = annot_human[~annot_human['cell'].isin(rem_cells)]

# Index the table by cell identifier and drop the index name.
annot_human = annot_human.set_index('cell').rename_axis(None)

In [None]:
# mouse
# Full-aggregate (coarse) annotation plus the subset tables that supply the fine labels.
annot_All = pd.read_csv(path + "annot_mouse_GBM_Full.csv")
annot_TAM = pd.read_csv(path + "annot_mouse_GBM_TAM.csv")
annot_mgTAM = pd.read_csv(path + "annot_mouse_GBM_mg-TAM.csv")
annot_DC = pd.read_csv(path + "annot_mouse_DC.csv")

annot_mouse = pd.DataFrame(annot_All[['cell', 'cluster', 'sample', 'ident']])
annot_mouse['cell_type_fine'] = 'Unknown'

# Applied in this order, so a later table overrides an earlier one for cells listed in both.
for fine_table in (annot_TAM, annot_mgTAM, annot_DC):
    annot_mouse = update_df(annot_mouse, fine_table)

annot_mouse = annot_mouse.rename(columns={'cluster': 'cell_type_coarse', 'sample': 'batch'})

# For these coarse types, cells that received no fine label reuse the coarse label.
for coarse_label in ('T cells', 'NK cells', 'Regulatory T cells', 'plasma B cells', 'B cells', 'Mast cells'):
    still_unlabelled = (annot_mouse['cell_type_coarse'] == coarse_label) & (annot_mouse['cell_type_fine'] == 'Unknown')
    annot_mouse.loc[still_unlabelled, 'cell_type_fine'] = coarse_label

# Remove cells with a missing fine label, and cells with the fine label 'TAM', which
# inconsistently contains only dendritic cells in the coarse label.
annot_mouse = annot_mouse[~annot_mouse['cell_type_fine'].isin(['Unknown', 'TAM'])]

# Remove cells with a labelling conflict between 'cell_type_fine' and 'cell_type_coarse':
# per fine label, only cells carrying that group's most frequent coarse label are kept.
keep_cells = []
for fine_label in np.unique(annot_mouse['cell_type_fine'].to_numpy()):
    members = annot_mouse[annot_mouse['cell_type_fine'] == fine_label]
    dominant_coarse = members['cell_type_coarse'].value_counts().index[0]
    keep_cells.extend(members.loc[members['cell_type_coarse'] == dominant_coarse, 'cell'].to_list())

annot_mouse = annot_mouse[annot_mouse['cell'].isin(keep_cells)]

# Index the table by cell identifier and drop the index name.
annot_mouse = annot_mouse.set_index('cell').rename_axis(None)

In [None]:
# Rename coarse and fine cell labels so that all datasets share one vocabulary.
# Each harmonised label is listed once, followed by every raw spelling that maps to it.
_label_aliases = {
    'cDCs 1': ('DC 1', 'cDC1', 'DC1'),
    'cDCs 2': ('DC 2', 'cDC2', 'DC2'),
    'Mig. DCs': ('DC 3', 'DC3', 'MigDC'),
    'pDCs': ('pDC', 'DC 4', 'DC4'),
    'Mo-TAM': ('TAM 1',),
    'Mg-TAM': ('TAM 2',),
    'Regulatory T': ('Regulatory T cells',),
    'Plasma': ('Plasma B cells', 'Plasma B', 'plasma B cells'),
    'Prol. TAM': ('prol. TAM',),
    'Prol. cDCs 1': ('prol. cDC1', 'prol. DC'),
    'Prol. cDCs 2': ('prol. cDC2',),
    'SEPP1+ Mo-TAM': ('SEPP1-hi Mo-TAM', 'SEPP1-lo Mo-TAM', 'Sepp1+(a) Mo-TAM', 'Sepp1+(b) Mo-TAM'),
    'B Cells': ('B cells',),
    'NK Cells': ('NK cells',),
    'Mast Cells': ('Mast cells',),
    'T Cells': ('T cells',),
}

# Flatten to the raw-label -> harmonised-label lookup.
translation_dict = {
    raw_label: harmonised
    for harmonised, raw_labels in _label_aliases.items()
    for raw_label in raw_labels
}

# Apply the label harmonisation to both label columns of both species tables.
for annot_table in (annot_mouse, annot_human):
    for label_column in ('cell_type_coarse', 'cell_type_fine'):
        annot_table[label_column] = annot_table[label_column].replace(translation_dict)

In [None]:
# Folder holding the mouse count matrix with its gene and barcode lists.
mouse_matrix_dir = path + "filtered_feature_bc_matrix_MouseTransplantedGBM/filtered_gene_bc_matrices/mm10/"

# The matrix is transposed after loading so that rows take the barcodes (obs)
# and columns take the gene names (var) assigned below.
adata_mouse = sc.read_mtx(mouse_matrix_dir + "matrix.mtx").transpose()
mouse_genes = pd.read_csv(mouse_matrix_dir + "genes.tsv", header=None, delimiter="\t")
adata_mouse.var_names = list(mouse_genes[1])
mouse_barcodes = pd.read_csv(mouse_matrix_dir + "barcodes.tsv", header=None)
adata_mouse.obs_names = list(mouse_barcodes[0])

# Remove gene duplicates: every gene name that occurs more than once is dropped entirely.
gene_occurrences = adata_mouse.var_names.value_counts()
unique_genes = gene_occurrences[gene_occurrences == 1].index.tolist()
adata_mouse = adata_mouse[:, adata_mouse.var_names.isin(unique_genes)].copy()

# Restrict both the matrix and the annotation to the shared cells, in the same order.
_, rows_in_adata, rows_in_annot = np.intersect1d(adata_mouse.obs_names, annot_mouse.index, return_indices=True)
adata_mouse = adata_mouse[rows_in_adata]
annot_mouse = annot_mouse.iloc[rows_in_annot]

adata_mouse.obs = annot_mouse
adata_mouse.write_h5ad(path + 'glio_mouse.h5ad')

In [None]:
# Folders holding the recurrent (R) and newly diagnosed (ND) human count matrices.
recurrent_dir = path + "filtered_feature_bc_matrix_HumanRecurrentGBM/filtered_gene_bc_matrices/GRCh38/"
newly_diagnosed_dir = path + "filtered_feature_bc_matrix_HumanNewlyDiagnGBM/filtered_feature_bc_matrix/"

# Each matrix is transposed after loading so that rows take the barcodes (obs)
# and columns take the gene names (var) assigned below.
adata_human_R = sc.read_mtx(recurrent_dir + "matrix.mtx").transpose()
recurrent_genes = pd.read_csv(recurrent_dir + "features.tsv", header=None, delimiter="\t")
adata_human_R.var_names = list(recurrent_genes[1])
recurrent_barcodes = pd.read_csv(recurrent_dir + "barcodes.tsv", header=None)
adata_human_R.obs_names = list(recurrent_barcodes[0])

adata_human_ND = sc.read_mtx(newly_diagnosed_dir + "matrix.mtx.gz").transpose()
newly_genes = pd.read_csv(newly_diagnosed_dir + "features.tsv.gz", header=None, delimiter="\t")
adata_human_ND.var_names = list(newly_genes[1])
newly_barcodes = pd.read_csv(newly_diagnosed_dir + "barcodes.tsv.gz", header=None)
adata_human_ND.obs_names = list(newly_barcodes[0])

# Drop from ND the barcodes that also occur in R, so the concatenated object
# does not contain the same barcode from both matrices.
_, _, shared_pos_in_ND = np.intersect1d(adata_human_R.obs_names, adata_human_ND.obs_names, return_indices=True)
keep_ND_rows = np.ones(adata_human_ND.n_obs, dtype=bool)
keep_ND_rows[shared_pos_in_ND] = False
adata_human_ND = adata_human_ND[keep_ND_rows]

# Remove gene duplicates: gene names occurring more than once in ND are dropped entirely.
# NOTE(review): the unique-gene list is derived from the ND matrix only and then also
# used to filter R — presumably both matrices share the same feature list; verify.
gene_occurrences = adata_human_ND.var_names.value_counts()
unique_genes = gene_occurrences[gene_occurrences == 1].index.tolist()

adata_human_ND = adata_human_ND[:, adata_human_ND.var_names.isin(unique_genes)].copy()
adata_human_R = adata_human_R[:, adata_human_R.var_names.isin(unique_genes)].copy()

adata_human = ad.concat([adata_human_R, adata_human_ND], join='outer')

# Restrict both the matrix and the annotation to the shared cells, in the same order.
_, rows_in_adata, rows_in_annot = np.intersect1d(adata_human.obs_names, annot_human.index, return_indices=True)
adata_human = adata_human[rows_in_adata]
annot_human = annot_human.iloc[rows_in_annot]

adata_human.obs = annot_human
adata_human.write_h5ad(path + 'glio_human.h5ad')