In [18]:
import numpy as np
import scanpy as sc
import pandas as pd

from preprocessing import set_random_seed

set_random_seed(1234)

import os

path = os.path.abspath('').replace('\\', '/')
path = path.replace('/create_datasets', '')+'/dataset/'

# Liver Cell Atlas Data Preparation Instructions

To utilize data from the Liver Cell Atlas, follow these steps to download and prepare the necessary files for your analysis. Ensure you have adequate storage space and the required software to unzip files.
The files are large, we recommend at least 32GB of RAM to run this script.

## Step 1: Navigate to the Download Page

- Go to [www.livercellatlas.org/download.php](http://www.livercellatlas.org/download.php) in your web browser.

## Step 2: Download Required Datasets

For each species of interest, download the specified files.

### Liver Cell Atlas: Human

- All liver cells: Gene-cell count matrix and Cell annotation matrix.
- Myeloid cells: Cell annotation matrix.
- Lymphoid cells: Cell annotation matrix.
- CD45- cells: Cell annotation matrix.

### Liver Cell Atlas: Mouse (StSt)

- All liver cells: Gene-cell count matrix and Cell annotation matrix.
- Myeloid cells: Cell annotation matrix.
- Lymphoid cells: Cell annotation matrix.
- CD45- cells: Cell annotation matrix.
- Fibroblasts: Cell annotation matrix.

### Liver Cell Atlas: Mouse (NAFLD)

- All liver cells: Gene-cell count matrix and Cell annotation matrix.
- Myeloid cells: Cell annotation matrix.
- Lymphoid cells: Cell annotation matrix.
- Fibroblasts: Cell annotation matrix.

### Liver Cell Atlas: Other Species

- **Chicken**: Gene-cell count matrix and Cell annotation matrix.
- **Pig**: Gene-cell count matrix and Cell annotation matrix.
- **Hamster**: Gene-cell count matrix and Cell annotation matrix.
- **Monkey**: Gene-cell count matrix and Cell annotation matrix.

## Step 3: Unzip the Files

After downloading, use a file decompression tool to unzip each file. Then place all files into the `scSpecies/dataset` folder 

## Step 4: Run the Code

Afterwards rum the code which processes these files into species specific `.h5ad` files.
The Output will be: 

- `liver_mouseStSt.h5ad`
- `liver_mouseNafld.h5ad`
- `liver_human.h5ad`
- `liver_hamster.h5ad`
- `liver_pig.h5ad`
- `liver_chicken.h5ad`
- `liver_monkey.h5ad`

In [19]:
def return_matrix(data_path):
    counts = sc.read_mtx(data_path+"/matrix.mtx.gz").transpose()
    counts.var_names = list(pd.read_csv(data_path+"/features.tsv.gz", header=None)[0])
    counts.obs_names = list(pd.read_csv(data_path+"/barcodes.tsv.gz", header=None)[0])
    return counts

def update_df(df, df_update):
    df_update = df_update.rename(columns={"annot": "cell_type_fine"})
    _, ind_All, ind_fine = np.intersect1d(np.array(df.loc[:,"cell"]), np.array(df_update.loc[:,"cell"]), return_indices=True)
    df_update = df_update.iloc[ind_fine].set_index(ind_All)
    df.update(df_update.loc[:, ['cell_type_fine']])
    return df

In [20]:
#human dataset
annot_All = pd.read_csv(path+"rawData_human/annot_humanAll.csv")
annot_CD45neg = pd.read_csv(path+"rawData_human/annot_humanCD45neg.csv")
annot_Lymphoid = pd.read_csv(path+"rawData_human/annot_humanLymphoid.csv")
annot_Myeloid = pd.read_csv(path+"rawData_human/annot_humanMyeloid.csv")

annot_human = pd.DataFrame(annot_All[['cell', 'annot', 'sample', 'patient', 'typeSample', 'diet']])
annot_human['cell_type_fine'] = 'N/A'

annot_human = update_df(annot_human, annot_CD45neg)
annot_human = update_df(annot_human, annot_Lymphoid)
annot_human = update_df(annot_human, annot_Myeloid)

annot_human.rename(columns={'annot': 'cell_type_coarse', 'sample': 'batch'}, inplace=True)

for ct in ['Neutrophils', 'Basophils']:
    ind = np.intersect1d(np.where(annot_human['cell_type_coarse'] == ct)[0], np.where(annot_human['cell_type_fine'] == 'N/A')[0])
    annot_human.loc[ind, 'cell_type_fine'] = ct

#remove cells with missing fine cell label:
annot_human = annot_human[annot_human['cell_type_fine'] != 'N/A']

#remove NucSeq samples
annot_human = annot_human[annot_human['typeSample'] != 'nucSeq']

#remove cells with labeling conflict between 'cell_type_fine' and 'cell_type_coarse'
keep_cells = []
for ct_fine in np.unique(annot_human['cell_type_fine'].to_numpy()):
    sub_df = annot_human[annot_human['cell_type_fine'] == ct_fine]
    if sub_df['cell_type_coarse'].value_counts()[0] > 20:
        keep_ind = np.where(sub_df['cell_type_coarse'] == sub_df['cell_type_coarse'].value_counts().index[0])[0]
        keep_cells += sub_df.iloc[keep_ind]['cell'].to_list()

annot_human = annot_human[annot_human['cell'].isin(keep_cells)]

annot_human = annot_human.set_index('cell').rename_axis(None)

  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0

In [21]:
#mouse StSt dataset
annot_All = pd.read_csv(path+"rawData_mouseStSt/annot_mouseStStAll.csv")
annot_CD45neg = pd.read_csv(path+"rawData_mouseStSt/annot_mouseStStCD45neg.csv")
annot_Lymphoid = pd.read_csv(path+"rawData_mouseStSt/annot_mouseStStLymphoid.csv")
annot_Myeloid = pd.read_csv(path+"rawData_mouseStSt/annot_mouseStStMyeloid.csv")
annot_Fibro = pd.read_csv(path+"rawData_mouseStSt/annot_mouseStStFibro.csv")

annot_mouseStSt = pd.DataFrame(annot_All[['cell', 'annot', 'sample', 'typeSample']])
annot_mouseStSt['cell_type_fine'] = 'N/A'

annot_mouseStSt = update_df(annot_mouseStSt, annot_CD45neg)
annot_mouseStSt = update_df(annot_mouseStSt, annot_Lymphoid)
annot_mouseStSt = update_df(annot_mouseStSt, annot_Myeloid)
annot_mouseStSt = update_df(annot_mouseStSt, annot_Fibro)

annot_mouseStSt.rename(columns={'annot': 'cell_type_coarse', 'sample': 'batch'}, inplace=True)

for ct in ['Neutrophils', 'Basophils']:
    ind = np.intersect1d(np.where(annot_mouseStSt['cell_type_coarse'] == ct)[0], np.where(annot_mouseStSt['cell_type_fine'] == 'N/A')[0])
    annot_mouseStSt.loc[ind, 'cell_type_fine'] = ct

#remove cells with missing fine cell label:
annot_mouseStSt = annot_mouseStSt[annot_mouseStSt['cell_type_fine'] != 'N/A']

#remove NucSeq samples
annot_mouseStSt = annot_mouseStSt[annot_mouseStSt['typeSample'] != 'nucSeq']

#remove cells with labeling conflict between 'cell_type_fine' and 'cell_type_coarse'
keep_cells = []
for ct_fine in np.unique(annot_mouseStSt['cell_type_fine'].to_numpy()):
    sub_df = annot_mouseStSt[annot_mouseStSt['cell_type_fine'] == ct_fine]
    if sub_df['cell_type_coarse'].value_counts()[0] > 20:
        keep_ind = np.where(sub_df['cell_type_coarse'] == sub_df['cell_type_coarse'].value_counts().index[0])[0]
        keep_cells += sub_df.iloc[keep_ind]['cell'].to_list()
    
annot_mouseStSt = annot_mouseStSt[annot_mouseStSt['cell'].isin(keep_cells)]  

annot_mouseStSt = annot_mouseStSt.set_index('cell').rename_axis(None)  

  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0] > 20:
  if sub_df['cell_type_coarse'].value_counts()[0

In [22]:
#mouse Nafld dataset
annot_All = pd.read_csv(path+"rawData_mouseNafld/annot_mouseNafldAll.csv")
annot_Lymphoid = pd.read_csv(path+"rawData_mouseNafld/annot_mouseNafldLymphoid.csv")
annot_Myeloid = pd.read_csv(path+"rawData_mouseNafld/annot_mouseNafldMyeloid.csv")
annot_Fibro = pd.read_csv(path+"rawData_mouseNafld/annot_mouseNafldFibro.csv")

annot_mouseNafld = pd.DataFrame(annot_All[['cell', 'annot', 'sample', 'typeSample']])
annot_mouseNafld['cell_type_fine'] = 'N/A'

annot_mouseNafld = update_df(annot_mouseNafld, annot_Lymphoid)
annot_mouseNafld = update_df(annot_mouseNafld, annot_Myeloid)
annot_mouseNafld = update_df(annot_mouseNafld, annot_Fibro)

annot_mouseNafld.rename(columns={'annot': 'cell_type_coarse', 'sample': 'batch'}, inplace=True)

for ct in ['Neutrophils', 'Basophils', 'Hepatocytes', 'Endothelial cells', 'Cholangiocytes']:
    ind = np.intersect1d(np.where(annot_mouseNafld['cell_type_coarse'] == ct)[0], np.where(annot_mouseNafld['cell_type_fine'] == 'N/A')[0])
    annot_mouseNafld.loc[ind, 'cell_type_fine'] = ct
    
#remove cells with missing fine cell label:
annot_mouseNafld = annot_mouseNafld[annot_mouseNafld['cell_type_fine'] != 'N/A']

#remove NucSeq samples
annot_mouseNafld = annot_mouseNafld[annot_mouseNafld['typeSample'] != 'nucSeq']

#remove cells with labeling conflict between 'cell_type_fine' and 'cell_type_coarse'
keep_cells = []
for ct_fine in np.unique(annot_mouseNafld['cell_type_fine'].to_numpy()):
    if ct_fine != 'Pre-MoKC and MoKC':
        sub_df = annot_mouseNafld[annot_mouseNafld['cell_type_fine'] == ct_fine]
    keep_ind = np.where(sub_df['cell_type_coarse'] == sub_df['cell_type_coarse'].value_counts().index[0])[0]
    keep_cells += sub_df.iloc[keep_ind]['cell'].to_list()
    
annot_mouseNafld = annot_mouseNafld[annot_mouseNafld['cell'].isin(keep_cells)]  

annot_mouseNafld = annot_mouseNafld.set_index('cell').rename_axis(None)  

In [23]:
#Animal datasets
#Pig dataset
annot_pig = pd.read_csv(path+"rawData_pig/annot_pig.csv")
annot_pig.rename(columns={'annot': 'cell_type_coarse', 'sample': 'batch'}, inplace=True)
annot_pig = annot_pig[['cell', 'cell_type_coarse', 'batch', 'typeSample']]

annot_pig = annot_pig.set_index('cell').rename_axis(None)  

#Chicken dataset
annot_chicken = pd.read_csv(path+"rawData_chicken/annot_chicken.csv")
annot_chicken.rename(columns={'annot': 'cell_type_coarse', 'sample': 'batch'}, inplace=True)
annot_chicken = annot_chicken[['cell', 'cell_type_coarse', 'batch', 'typeSample']]

annot_chicken = annot_chicken.set_index('cell').rename_axis(None)  

#Monkey dataset
annot_monkey = pd.read_csv(path+"rawData_monkey/annot_monkey.csv")
annot_monkey.rename(columns={'annot': 'cell_type_coarse', 'sample': 'batch'}, inplace=True)
annot_monkey = annot_monkey[['cell', 'cell_type_coarse', 'batch', 'typeSample']]

annot_monkey = annot_monkey.set_index('cell').rename_axis(None)  

#Hamster dataset
annot_hamster = pd.read_csv(path+"rawData_hamster/annot_hamster.csv")
annot_hamster.rename(columns={'annot': 'cell_type_coarse', 'sample': 'batch'}, inplace=True)
annot_hamster = annot_hamster[['cell', 'cell_type_coarse', 'batch', 'typeSample']]

annot_hamster = annot_hamster.set_index('cell').rename_axis(None)  

In [24]:
#rename coarse and fine cell labels to have consistent labels among all datasets
translation_dict_fine = {
    'RM CD8+ T cells': 'RM CD8+ T Cells',
    'B cells': 'B Cells',
    'Mesothelial cells': 'Mesothelial Cells',
    'NK cells': 'NK Cells',
    'NKT cells': 'NKT Cells',
    'Capsule fibroblasts': 'Capsule Fibroblasts',
    'CD8 Effector Memory T cells': 'CD8 Eff. Memory T',  
    'Circulating TEM': 'Circ. Eff. Memory T',
    'Circulating NK': 'Circ. NK',
    'CTLs': 'Cytotoxic CD8+',
    'Endothelial cells': 'Endothelials',
    'Endothelial': 'Endothelials',    
    'Gd': 'Gamma-Delta T',
    'Gd T cells': 'Gamma-Delta T',
    'Mig cDCs': 'Mig. DCs',
    'Mig. cDCs': 'Mig. DCs',
    'Mig.cDCs': 'Mig. DCs', 
    'Mono': 'Monocytes',
    'NK': 'NK Cells',
    'NKT': 'NKT Cells',
    'Stellate cells': 'Stellate Cells',
    'Naive/CM CD4+ T cells': 'Naive/CM CD4+ T',
    'Pat.Mono': 'Pat. Monocytes',
    'Patrolling Monocytes': 'Pat. Monocytes',
    'Naïve CD8+ T cells': 'Naive CD8+ T',
    'Naïve CD4+ T cells': 'Naive CD4+ T',    
    'Naive CD8+': 'Naive CD8+ T',
    'Naive CD4+': 'Naive CD4+ T',  
    'Naive CD4': 'Naive CD4+ T',  
    'Naive CD8': 'Naive CD8+ T',  
    'Teff memory': 'CD8 Eff. Memory T',
    'Central Vein Endothelial cells': 'Central Vein ECs',
    'Portal vein ECs': 'Portal Vein ECs',    
    'Portain Vein Endothelial cells': 'Portal Vein ECs',
    'Portal Vein Endothelial cells': 'Portal Vein ECs',
    'Plasma cells': 'Plasma',
    'TRegs': 'Regulatory T', 
    'Treg': 'Regulatory T', 
    'Trans Monocytes 1': 'Trans. Monocytes 1', 
    'Trans Monocytes 2': 'Trans. Monocytes 2',
    'Peritoneal macs': 'Peritoneal Macs',
    'Peritoneal Macrophages': 'Peritoneal Macs',
    'Lymphatic Endothelial cells': 'Lymphatic ECs',
    'Circulating TEM': 'Circ. Eff. Memory T',
    'ILC1s': 'ILCs',
    'resKCs': 'KCs',
    'cDC1s': 'cDCs 1', 
    'cDC2s': 'cDCs 2',
    'CV and Capsule': 'CV/Capsule',
    'Th17 cells': 'Th 17',    
    'Th17s': 'Th 17',
    'Th1s': 'Th 1',
    'CD4+ KLRB1 T cells': 'CD4+ KLRB1 Th',
    'LAMs': 'Bile-duct LAMs',
    'T helper': 'CD4+ T helper',
    
}

annot_mouseStSt.cell_type_fine = annot_mouseStSt.cell_type_fine.replace(translation_dict_fine)
annot_human.cell_type_fine = annot_human.cell_type_fine.replace(translation_dict_fine)    
annot_mouseNafld.cell_type_fine = annot_mouseNafld.cell_type_fine.replace(translation_dict_fine)


translation_dict_coarse = {
    'Cholangio': 'Cholangiocytes',  
    'B cells': 'B Cells',
    'Fibroblasts': 'Stromal',
    'Endothelial cells': 'Endothelials',
    'LSECs': 'Endothelials',    
    'B/Plasma cells': 'B/Plasma',
    'ILC1s': 'ILCs',
    'KCs': 'KCs',
    'Kupffer cells': 'KCs',
    'Mig cDCs': 'Mig. cDCs',
    'Mig.cDCs': 'Mig. cDCs',  
    'Mono': 'Mono/Mono Derived',
    'Monocytes & Monocyte-derived cells': 'Mono/Mono Derived',
    'Mono/mono-derived cells': 'Mono/Mono Derived',
    'Mono+mono derived cells': 'Mono/Mono Derived',
    'Monocytes & Monocyte-derived cells': 'Mono/Mono Derived',
    'Monocytes': 'Mono/Mono Derived',
    'Plasma cells': 'Plasma',
    'cDC1s': 'cDCs', 
    'cDC2s': 'cDCs', 
    'NK cells': 'NK Cells',
    'T cells': 'T Cells',
     
        
}

annot_mouseStSt.cell_type_coarse = annot_mouseStSt.cell_type_coarse.replace(translation_dict_coarse)
annot_human.cell_type_coarse = annot_human.cell_type_coarse.replace(translation_dict_coarse)    
annot_hamster.cell_type_coarse = annot_hamster.cell_type_coarse.replace(translation_dict_coarse)
annot_monkey.cell_type_coarse = annot_monkey.cell_type_coarse.replace(translation_dict_coarse)   
annot_chicken.cell_type_coarse = annot_chicken.cell_type_coarse.replace(translation_dict_coarse)    
annot_mouseNafld.cell_type_coarse = annot_mouseNafld.cell_type_coarse.replace(translation_dict_coarse)
annot_pig.cell_type_coarse = annot_pig.cell_type_coarse.replace(translation_dict_coarse)       

#make coarse cell type labels more transferable among species    
annot_human.loc[annot_human.cell_type_fine.isin(['Pre-moKCs and moKCs', 'matLAMs']), 'cell_type_coarse'] = 'Mono/Mono Derived'
annot_human.loc[annot_human.cell_type_fine.isin(['KCs']), 'cell_type_coarse'] = 'KCs'

annot_human.loc[annot_human.cell_type_fine.isin(['Circ. NK', 'Tissue Resident NK']), 'cell_type_coarse'] = 'NK Cells'
annot_human.loc[annot_human.cell_type_fine.isin(['NKT Cells']), 'cell_type_coarse'] = 'T Cells'     

In [25]:
adata_mouseStSt = return_matrix(path+'rawData_mouseStSt/countTable_mouseStSt')
_, ind_a, ind_b = np.intersect1d(adata_mouseStSt.obs_names, annot_mouseStSt.index, return_indices=True)

adata_mouseStSt = adata_mouseStSt[ind_a]
annot_mouseStSt = annot_mouseStSt.iloc[ind_b]

adata_mouseStSt.obs = annot_mouseStSt
adata_mouseStSt.write_h5ad(path+'liver_mouseStSt.h5ad')


adata_mouseNafld = return_matrix(path+'rawData_mouseNafld/countTable_mouseNafld')
_, ind_a, ind_b = np.intersect1d(adata_mouseNafld.obs_names, annot_mouseNafld.index, return_indices=True)

adata_mouseNafld = adata_mouseNafld[ind_a]
annot_mouseNafld = annot_mouseNafld.iloc[ind_b]

adata_mouseNafld.obs = annot_mouseNafld
adata_mouseNafld.write_h5ad(path+'liver_mouseNafld.h5ad')


adata_human = return_matrix(path+'rawData_human/countTable_human')
_, ind_a, ind_b = np.intersect1d(adata_human.obs_names, annot_human.index, return_indices=True)

adata_human = adata_human[ind_a]
annot_human = annot_human.iloc[ind_b]

adata_human.obs = annot_human
adata_human.write_h5ad(path+'liver_human.h5ad')


adata_hamster = return_matrix(path+'rawData_hamster/')
_, ind_a, ind_b = np.intersect1d(adata_hamster.obs_names, annot_hamster.index, return_indices=True)

adata_hamster = adata_hamster[ind_a]
annot_hamster = annot_hamster.iloc[ind_b]

adata_hamster.obs = annot_hamster
adata_hamster.write_h5ad(path+'liver_hamster.h5ad')


adata_pig = return_matrix(path+'rawData_pig/')
_, ind_a, ind_b = np.intersect1d(adata_pig.obs_names, annot_pig.index, return_indices=True)

adata_pig = adata_pig[ind_a]
annot_pig = annot_pig.iloc[ind_b]

adata_pig.obs = annot_pig
adata_pig.write_h5ad(path+'liver_pig.h5ad')


adata_chicken = return_matrix(path+'rawData_chicken/')
_, ind_a, ind_b = np.intersect1d(adata_chicken.obs_names, annot_chicken.index, return_indices=True)

adata_chicken = adata_chicken[ind_a]
annot_chicken = annot_chicken.iloc[ind_b]

adata_chicken.obs = annot_chicken
adata_chicken.write_h5ad(path+'liver_chicken.h5ad')


adata_monkey = return_matrix(path+'rawData_monkey/')
_, ind_a, ind_b = np.intersect1d(adata_monkey.obs_names, annot_monkey.index, return_indices=True)

adata_monkey = adata_monkey[ind_a]
annot_monkey = annot_monkey.iloc[ind_b]

adata_monkey.obs = annot_monkey
adata_monkey.write_h5ad(path+'liver_monkey.h5ad')