In [1]:
import anndata as ad
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

### Method 1: Downloading preprocessed data directly from the authors

In scEval, the authors refer to PancreasCross as the hPancreas dataset used in Chen, Jiawei, et al. "Transformer for one stop interpretable cell type annotation." Nature Communications 14.1 (2023): 223. In this paper, the authors preprocessed the hPancreas labels by standardizing and cleaning the annotations from various sources. The "stellate cells" have been modified to "PSC" (Pancreas stellate cell) for consistency. Additionally, annotations such as "PP contaminated" or "Beta activated" have been removed due to their ambiguous meaning. To ensure clarity and uniformity, we used the pre-processed data with standardized annotations from Chen, Jiawei, et al. downloaded at the following link: https://figshare.com/articles/dataset/Pre-processed_data_for_benchmarking/24637044?file=43295985

In [2]:
ROOT_PANCREAS_CROSS = "/dccstor/bmfm-targets/data/omics/transcriptome/scRNA/finetune/PancreasCross/h5ad/"

# load files downloaded from link above
train = ad.read_h5ad(ROOT_PANCREAS_CROSS + "hPancreas_train_adata.h5ad")
test = ad.read_h5ad(ROOT_PANCREAS_CROSS + "hPancreas_test_adata.h5ad")

train.obs = train.obs.rename(columns={'Celltype2': 'celltype_sceval'})
test.obs = test.obs.rename(columns={'Celltype2': 'celltype_sceval'})

# concatenate datasets
hpancreas= ad.concat([train, test], axis=0, join='outer', merge='same')

# create split cols
split_cross = ['train'] * train.shape[0] + ['test'] * test.shape[0]
split_cross_series = pd.Series(split_cross, index=hpancreas.obs.index, name='split_cross')

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_indices, dev_indices = next(splitter.split(train.obs, train.obs['celltype_sceval']))

split_cross_series.iloc[train_indices] = 'train'
split_cross_series.iloc[dev_indices] = 'dev'

hpancreas.obs['split_cross'] = split_cross_series
hpancreas.obs['split_cross'].value_counts()

  concat_annot = pd.concat(


split_cross
train    9540
test     4218
dev      1060
Name: count, dtype: int64

In [3]:
hpancreas.write(ROOT_PANCREAS_CROSS + 'pancreascross_sceval_split.h5ad')

### Method 2: Preprocessing the data ourselves

This is an attempt to process the original PancreasCross dataset ourselves and see if we arrive at the same split as the authors. 

The PancreassCross dataset is distributed across 5 different GEO studies:
- Train: Data comes from 2 studies: Baron (GSE84133) and Muraro (GSE85241).
- Test: Data comes from 3 studies: Xin (GSE81608), Segerstolpe (E-MTAB-5061), Lawlor (GSE86473).

All of them are downloaded and are located under *'/dccstor/bmfm-targets/data/omics/transcriptome/scRNA/finetune/Pancreas/*'. 

Below, we preprocess the datasets individually.


In [4]:
ROOT_PANCREAS = '/dccstor/bmfm-targets/data/omics/transcriptome/scRNA/finetune/Pancreas/'

baron_raw = ad.read_h5ad( ROOT_PANCREAS + "Baron/baron.h5ad")
muraro_raw = ad.read_h5ad(ROOT_PANCREAS + "Muraro/muraro.h5ad")
xin_raw = ad.read_h5ad(ROOT_PANCREAS + "Xin/xin.h5ad")
segerstolpe_raw = ad.read_h5ad(ROOT_PANCREAS + "Segerstolpe/segerstolpe.h5ad")
lawlor_raw = ad.read_h5ad(ROOT_PANCREAS + "Lawlor/lawlor.h5ad")

baron_raw.obs['dataset'] = 'baron'
muraro_raw.obs['dataset'] = 'muraro'
xin_raw.obs['dataset'] = 'xin'
segerstolpe_raw.obs['dataset'] = 'segerstolpe'
lawlor_raw.obs['dataset'] = 'lawlor'

##### Renaming columns to be consistent between datasets

In [5]:
baron_raw.obs = baron_raw.obs.rename(columns={'label': 'celltype'})

muraro_raw.obs = muraro_raw.obs.rename(columns = {'label': 'celltype'})

xin_raw.obs = xin_raw.obs.rename(columns={'donor.id': 'donor',
                                   'condition': 'disease',
                                     'Sample.name': 'sample',
                                       'ethnicity': 'race',
                                       'cell.type': 'celltype'})

segerstolpe_raw.obs = segerstolpe_raw.obs.rename(columns={'Disease': 'disease',
                                                           'Donor': 'donor',
                                                            'Quality': 'quality',
                                                            'label': 'celltype'})

lawlor_raw.obs = lawlor_raw.obs.rename(columns={'cell type': 'celltype',
                                                 'Sex': 'sex'})

#### Rename disease status and race

In [6]:
rename_dict_disease_xin = {
    'Healthy': 'normal', #lets rename all healthy cells as normal
    'T2D': 'type II diabetes mellitus' # we will always name diabetes as "type II diabetes mellitus"
}

rename_dict_disease_lawlor = {
    'Type 2 Diabetic': 'type II diabetes mellitus',
    'Non-Diabetic': 'normal'
}

rename_dict_race_xin = { #here we went back to the paper to understand that C, AA, AI and H mean
    'C': 'White',
    'AA': 'African American',
    'AI': 'American Indian',
    'H': 'Hispanic'
}


#### Rename cell types


Each of the 5 datasets assigned their own names to the cell types. Some of them represent the same type (e.g., cell type "duct" in the Muraro dataset is the same as "ductal cell" in Segerstolpe dataset. All of them are renamed to "ductal", which is the label encountered in scEval). <br>
Here, we map all cell types to have the same name as they have in scEval:
- Gamma cells are referred as PP cells (information provided by the author).
- All contaminated cells in the Xin dataset were removed from the scEval dataset, so we rename them as 'unknown'.
- Stellate cells are considered PSC (pancreas stellate cells). 
- It's not clear what the authors did with mesenchymal cells (present in the Muraro dataset) or co-expression cells (in the Segerstolpe dataset), so we renamed them as 'unknown'

In [7]:
rename_dict_celltype_baron = {
    'gamma': 'PP', 
    'activated_stellate': 'PSC', 
    'quiescent_stellate': 'PSC'
}

rename_dict_celltype_muraro = {
    'NA': 'unknown',
    'duct': 'ductal',
    'pp': 'PP',
    'unclear': 'unknown',
    'mesenchymal': 'mesenchymal'
}

rename_dict_celltype_xin = {
    'beta.contaminated': 'unknown',
    'alpha.contaminated': 'unknown',
    'PP.contaminated': 'unknown',
    'delta.contaminated': 'unknown'
}

rename_dict_celltype_segerstolpe = {
    'NA': 'unknown',
    'gamma cell': 'PP',
    'alpha cell': 'alpha',
    'beta cell': 'beta',
    'acinar cell': 'acinar',
    'epsilon cell': 'epsilon',
    'unclassified cell': 'unknown',
    'ductal cell': 'ductal',
    'delta cell': 'delta',
    'endothelial cell': 'endothelial',
    'PSC cell': 'PSC',
    'MHC class II cell': 'MHC class II',
    'mast cell': 'mast',
    'co-expression cell': 'unknown',
    'unclassified endocrine cell': 'unknown'
}

rename_dict_celltype_lawlor = {
    'Gamma/PP': 'PP',
    'Alpha': 'alpha',
    'Beta': 'beta',
    'Ductal': 'ductal',
    'Delta': 'delta',
    'Acinar': 'acinar',
    'None/Other': 'unknown',
    'Stellate': 'PSC'
}

In [8]:
def rename_categories_with_mapping(df_data, column, rename_dict, posfix):

    df_data.obs[column] = df_data.obs[column].astype(str)
    df_data.obs[column + posfix] = df_data.obs[column].map(rename_dict).fillna(df_data.obs[column])
    df_data.obs[column + posfix] = df_data.obs[column + posfix].astype('category')
    
    return df_data


In [9]:
def preprocess_datasets(datasets):
    for name, df_data in datasets.items():
        if name == 'xin_raw':
            df_data = rename_categories_with_mapping(df_data, 'disease', rename_dict_disease_xin, posfix = '')
            df_data = rename_categories_with_mapping(df_data, 'race', rename_dict_race_xin, posfix = '')
            df_data = rename_categories_with_mapping(df_data, 'celltype', rename_dict_celltype_xin, posfix = '_sceval')
        
        elif name == 'lawlor_raw':
            df_data = rename_categories_with_mapping(df_data, 'disease', rename_dict_disease_lawlor, posfix = '')
            df_data = rename_categories_with_mapping(df_data, 'celltype', rename_dict_celltype_lawlor, posfix = '_sceval')
        
        elif name == 'baron_raw':
            df_data = rename_categories_with_mapping(df_data, 'celltype', rename_dict_celltype_baron, posfix = '_sceval')
        
        elif name == 'muraro_raw':
            df_data = rename_categories_with_mapping(df_data, 'celltype', rename_dict_celltype_muraro, posfix = '_sceval')
        
        elif name == 'segerstolpe_raw':
            df_data = rename_categories_with_mapping(df_data, 'celltype', rename_dict_celltype_segerstolpe, posfix = '_sceval')
        
        datasets[name] = df_data
    return datasets


In [10]:
# Apply the preprocessing all at once to the dictionary
datasets = {
    'baron_raw': baron_raw,
    'muraro_raw': muraro_raw,
    'xin_raw': xin_raw,
    'segerstolpe_raw': segerstolpe_raw,
    'lawlor_raw': lawlor_raw
}

processed_datasets = preprocess_datasets(datasets)
processed_datasets

{'baron_raw': AnnData object with n_obs × n_vars = 8569 × 20125
     obs: 'donor', 'celltype', 'dataset', 'celltype_sceval'
     uns: 'X_name',
 'muraro_raw': AnnData object with n_obs × n_vars = 3072 × 19046
     obs: 'celltype', 'donor', 'plate', 'dataset', 'celltype_sceval'
     var: 'symbol', 'chr'
     uns: 'X_name',
 'xin_raw': AnnData object with n_obs × n_vars = 1600 × 39851
     obs: 'sample', 'donor', 'disease', 'age', 'race', 'gender', 'celltype', 'dataset', 'celltype_sceval'
     var: 'gene.id', 'symbol'
     uns: 'X_name',
 'segerstolpe_raw': AnnData object with n_obs × n_vars = 3514 × 25454
     obs: 'celltype', 'disease', 'donor', 'quality', 'dataset', 'celltype_sceval'
     var: 'symbol', 'refseq'
     uns: 'X_name',
 'lawlor_raw': AnnData object with n_obs × n_vars = 638 × 26616
     obs: 'title', 'age', 'bmi', 'celltype', 'disease', 'islet unos id', 'race', 'sex', 'dataset', 'celltype_sceval'
     uns: 'X_name'}

In [11]:
baron_processed = processed_datasets['baron_raw']
muraro_processed = processed_datasets['muraro_raw']
xin_processed= processed_datasets['xin_raw']
segerstolpe_processed = processed_datasets['segerstolpe_raw']
lawlor_processed = processed_datasets['lawlor_raw']

In [12]:
# Concatenate the datasets
train_data = ad.concat([baron_processed, muraro_processed], axis=0, join='outer', merge='same')
test_data = ad.concat([xin_processed, segerstolpe_processed, lawlor_processed], axis=0, join='outer', merge='same')
all_data = ad.concat([train_data, test_data], axis=0, join='outer', merge='same')

# Create split_cross column
split_cross = ['train'] * train_data.shape[0] + ['test'] * test_data.shape[0]

# Convert the list to a Pandas Series
split_cross_series = pd.Series(split_cross, index=all_data.obs.index, name='split_cross')

# Randomly assign 'train' and 'dev' within the train_data
split_train_dev = np.random.choice(['train', 'dev'], size=train_data.shape[0], p=[0.9, 0.1])
split_cross_series.loc[train_data.obs.index] = split_train_dev

# Add the split_cross column to the AnnData object
all_data.obs['split_cross'] = split_cross_series

all_data

AnnData object with n_obs × n_vars = 17393 × 69147
    obs: 'donor', 'celltype', 'dataset', 'celltype_sceval', 'plate', 'sample', 'disease', 'age', 'race', 'gender', 'quality', 'title', 'bmi', 'islet unos id', 'sex', 'split_cross'

In [13]:
all_data.obs['age'] = all_data.obs['age'].astype(str)
all_data.obs['celltype_sceval'] = all_data.obs['celltype_sceval'].replace('unknown', np.nan)
all_data.obs['split_cross'].value_counts()

  all_data.obs['celltype_sceval'] = all_data.obs['celltype_sceval'].replace('unknown', np.nan)


split_cross
train    10440
test      5752
dev       1201
Name: count, dtype: int64

In [14]:
all_data.write_h5ad('/dccstor/bmfm-targets/data/omics/transcriptome/scRNA/finetune/PancreasCross/h5ad/pancreascross.h5ad')

### Final remarks

If we want to compare results to scEVAL, we should use *'pancreascross_sceval_split.h5ad*. This is the dataset directly downloaded from the authors, and already processed by the authors. Alternatively, we can use *pancreascross.h5ad*, where we included the metadata of the cells. This metadata was obtained directly from the GEO files. The splits from *pancreascross.h5ad* follow the same splits used by scEVAL, but the number of cells in train and test are different. This is because the authors removed cells according to their own QC criteria, and it's not very clear how to reach exactly the same split.