In [54]:
# Standard Libraries
import os # operating system dependent functionality
from collections import Counter # counting elements in an iterable

# External Libraries
import numpy as np # numerical operations on data arrays and matrices
import pandas as pd # data manipulation and analysis
import matplotlib.pyplot as plt # plotting and visualizations

# Bioinformatics and Data Analysis 
import anndata # handling annotated data, particularly in genomics
import scanpy as sc # single-cell RNA-seq data analysis
import scipy # scientific and technical computations

In [55]:
# access raw data from research paper by Baron et al: "A Single-Cell Transcriptomic Map of the Human and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure"
# The location can be overridden with the BARON_RAW_DATA_DIR environment variable so the notebook
# also runs outside the original author's scratch space; the default is the original path.
BASE_DIR = os.environ.get(
    "BARON_RAW_DATA_DIR",
    "/home/sr2464/palmer_scratch/C2S_Files_Syed/Cross_Species_Datasets/mouse_human_pancreas_tissue_Baron_et_al/raw_data",
)

In [56]:
# list contents of raw data folder
# os.listdir returns entries in arbitrary order, so sort them to get a reproducible listing
sorted(os.listdir(BASE_DIR))

['GSE84133_RAW.tar',
 'GSM2230757_human1_umifm_counts.csv',
 'GSM2230757_human1_umifm_counts.csv.gz',
 'GSM2230757_human1_umifm_counts_merge_two_cols.csv',
 'GSM2230758_human2_umifm_counts.csv',
 'GSM2230758_human2_umifm_counts.csv.gz',
 'GSM2230759_human3_umifm_counts.csv',
 'GSM2230759_human3_umifm_counts.csv.gz',
 'GSM2230760_human4_umifm_counts.csv',
 'GSM2230760_human4_umifm_counts.csv.gz',
 'GSM2230761_mouse1_umifm_counts.csv',
 'GSM2230761_mouse1_umifm_counts.csv.gz',
 'GSM2230762_mouse2_umifm_counts.csv',
 'GSM2230762_mouse2_umifm_counts.csv.gz']

In [58]:
def load_csv_into_anndata(fname, base_dir=None):
    """Load one Baron et al. UMI-count CSV into an AnnData object.

    The CSV is read with its first column as the index (the manual cell IDs).
    It must contain a ``barcode`` and an ``assigned_cluster`` column; every
    other column is treated as the count vector of one gene.

    Input:
        fname: CSV file name without the ``.csv`` extension. It is also stored
            as the ``batch_sample`` label of every cell.
        base_dir: directory holding the CSV. Defaults to the notebook-level
            ``BASE_DIR`` when omitted.
    Output:
        AnnData object whose ``X`` is a float32 CSR matrix (cells x genes),
        whose ``obs`` holds ``cell_barcodes``, ``cell_manual_ids``,
        ``cell_types`` and ``batch_sample``, and whose ``var`` holds
        ``gene_names``.
    Raises:
        KeyError: if a required metadata column is missing from the CSV.
    """
    # per-cell metadata columns; everything else in the table is a gene
    metadata_cols = ["barcode", "assigned_cluster"]

    if base_dir is None:
        base_dir = BASE_DIR

    # read csv file into a pandas dataframe, using the first column (cell IDs) as the index
    adata_df = pd.read_csv(os.path.join(base_dir, f"{fname}.csv"), index_col=0)

    # fail early with a readable message instead of an opaque lookup error further down
    missing_cols = [col for col in metadata_cols if col not in adata_df.columns]
    if missing_cols:
        raise KeyError(f"{fname}.csv is missing required column(s): {missing_cols}")

    # extract certain attributes related to each cell (aka sample or observation)
    cell_barcodes = adata_df["barcode"].tolist()
    cell_manual_ids = adata_df.index.tolist()
    cell_types = adata_df["assigned_cluster"].tolist()
    batch_sample = [fname] * len(cell_manual_ids)

    # isolate the gene count columns by name rather than by position, so the result
    # does not depend on where the metadata columns sit in the file
    counts_df = adata_df.drop(columns=metadata_cols)

    # convert the counts into a sparse matrix format (CSR) for memory efficiency
    cellxgene_matrix = scipy.sparse.csr_matrix(counts_df.to_numpy().astype(np.float32))

    # convert gene names to uppercase for consistency
    # NOTE: names that differ only by capitalisation collide after this step, so var names may
    # contain duplicates (anndata warns about it); run var_names_make_unique() on the result if needed
    gene_names = [name.upper() for name in counts_df.columns]

    # create DataFrame for obs (observation) annotations containing cell barcodes, IDs, types, and batch information
    obs_df = pd.DataFrame({
        "cell_barcodes": cell_barcodes,
        "cell_manual_ids": cell_manual_ids,
        "cell_types": cell_types,
        "batch_sample": batch_sample,
    }, index=cell_manual_ids)

    # create DataFrame for var (variable) annotations containing gene names
    var_df = pd.DataFrame({
        "gene_names": gene_names
    }, index=gene_names)

    # create anndata object using the count matrix (X), observation annotations (obs), and variable annotations (var)
    return anndata.AnnData(
        X=cellxgene_matrix,
        obs=obs_df,
        var=var_df,
    )

The Baron et al. paper provides uniformly sequenced single-cell RNA-seq data for 4 human donors and 2 mice. We load each of these six samples below.

In [59]:
# load human #1 (sample GSM2230757) single-cell RNA UMI counts as an AnnData object;
# the bare name on the last line displays the object's summary
human1_adata = load_csv_into_anndata(fname="GSM2230757_human1_umifm_counts")
human1_adata

AnnData object with n_obs × n_vars = 1937 × 20125
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_names'

In [60]:
# load human #2 (sample GSM2230758) single-cell RNA UMI counts as an AnnData object;
# the bare name on the last line displays the object's summary
human2_adata = load_csv_into_anndata(fname="GSM2230758_human2_umifm_counts")
human2_adata

AnnData object with n_obs × n_vars = 1724 × 20125
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_names'

In [61]:
# load human #3 (sample GSM2230759) single-cell RNA UMI counts as an AnnData object;
# the bare name on the last line displays the object's summary
human3_adata = load_csv_into_anndata(fname="GSM2230759_human3_umifm_counts")
human3_adata

AnnData object with n_obs × n_vars = 3605 × 20125
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_names'

In [62]:
# load human #4 (sample GSM2230760) single-cell RNA UMI counts as an AnnData object;
# the bare name on the last line displays the object's summary
human4_adata = load_csv_into_anndata(fname="GSM2230760_human4_umifm_counts")
human4_adata

AnnData object with n_obs × n_vars = 1303 × 20125
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_names'

In [63]:
# load mouse #1 (sample GSM2230761) single-cell RNA UMI counts as an AnnData object;
# the loader upper-cases gene names, which is why anndata may warn about duplicate var names here
mouse1_adata = load_csv_into_anndata(fname="GSM2230761_mouse1_umifm_counts")
mouse1_adata

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 822 × 14878
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_names'

In [64]:
# load mouse #2 (sample GSM2230762) single-cell RNA UMI counts as an AnnData object;
# the loader upper-cases gene names, which is why anndata may warn about duplicate var names here
mouse2_adata = load_csv_into_anndata(fname="GSM2230762_mouse2_umifm_counts")
mouse2_adata

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 1064 × 14878
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_names'

In [66]:
# data processing: both mouse samples were loaded with duplicate variable (gene) names,
# so de-duplicate them in place; each repeated name gets a numeric suffix appended
for mouse_sample_adata in (mouse1_adata, mouse2_adata):
    mouse_sample_adata.var_names_make_unique()

# Combine mouse and human samples into a single adata per species

In [68]:
# stack the cells (observations) of the two mouse samples row-wise into one AnnData object
# join="inner" (the default, spelled out here) keeps only the genes present in every input sample
mouse_adata = anndata.concat(
    [mouse1_adata, mouse2_adata],
    axis=0,
    join="inner",
)

# the concatenated object comes back without the per-sample var columns,
# so store the gene names again as an explicit "gene_name" column of var
mouse_adata.var["gene_name"] = mouse_adata.var_names.tolist()
mouse_adata

AnnData object with n_obs × n_vars = 1886 × 14878
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_name'

In [71]:
# inspect the first five mouse cells (rows of the obs annotation table)
mouse_adata.obs.head()

Unnamed: 0,cell_barcodes,cell_manual_ids,cell_types,batch_sample
mouse1_lib1.final_cell_0001,AAGTGAAAG-CGACTCCT,mouse1_lib1.final_cell_0001,beta,GSM2230761_mouse1_umifm_counts
mouse1_lib1.final_cell_0002,TGCAAGGG-GGCATGCT,mouse1_lib1.final_cell_0002,ductal,GSM2230761_mouse1_umifm_counts
mouse1_lib1.final_cell_0003,GAGAGCCCAT-CTTCTGGA,mouse1_lib1.final_cell_0003,delta,GSM2230761_mouse1_umifm_counts
mouse1_lib1.final_cell_0004,AAATCAGA-GTTGCACG,mouse1_lib1.final_cell_0004,schwann,GSM2230761_mouse1_umifm_counts
mouse1_lib1.final_cell_0005,GAGGCGTTAG-GGCTACTA,mouse1_lib1.final_cell_0005,delta,GSM2230761_mouse1_umifm_counts


In [72]:
# count how many mouse cells carry each cell-type label
# (Counter keeps labels in order of first appearance, not sorted by frequency)
Counter(mouse_adata.obs["cell_types"])

Counter({'beta': 894,
         'ductal': 275,
         'delta': 218,
         'schwann': 6,
         'quiescent_stellate': 47,
         'endothelial': 139,
         'gamma': 41,
         'alpha': 191,
         'macrophage': 36,
         'immune_other': 8,
         'activated_stellate': 14,
         'B_cell': 10,
         'T_cell': 7})

In [20]:
# save the combined mouse AnnData object to the file specified by the path
# NOTE(review): this path sits under a different user directory (dor3 / C2S_Files_Daphne) than
# BASE_DIR (sr2464 / C2S_Files_Syed) — confirm it is the intended destination
mouse_h5ad_path = "/home/dor3/palmer_scratch/C2S_Files_Daphne/Cross_Species_Datasets/mouse_human_pancreas_tissue_Baron_et_al/processed_data/mouse_pancreas_all_samples_raw_adata.h5ad"
# create the destination folder first so the write does not fail on a missing directory
os.makedirs(os.path.dirname(mouse_h5ad_path), exist_ok=True)
mouse_adata.write_h5ad(mouse_h5ad_path)

In [73]:
# stack the cells (observations) of the four human samples row-wise into one AnnData object
# join="inner" (the default, spelled out here) keeps only the genes present in every input sample
human_adata = anndata.concat(
    [human1_adata, human2_adata, human3_adata, human4_adata],
    axis=0,
    join="inner",
)

# the concatenated object comes back without the per-sample var columns,
# so store the gene names again as an explicit "gene_name" column of var
human_adata.var["gene_name"] = human_adata.var_names.tolist()
human_adata

AnnData object with n_obs × n_vars = 8569 × 20125
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample'
    var: 'gene_name'

In [74]:
# inspect the first five human cells (rows of the obs annotation table)
human_adata.obs.head()

Unnamed: 0,cell_barcodes,cell_manual_ids,cell_types,batch_sample
human1_lib1.final_cell_0001,GATGACGGAC-GGTGGGAT,human1_lib1.final_cell_0001,acinar,GSM2230757_human1_umifm_counts
human1_lib1.final_cell_0002,GAGCGTTGCT-ACCTTCTT,human1_lib1.final_cell_0002,acinar,GSM2230757_human1_umifm_counts
human1_lib1.final_cell_0003,CTTACGGG-CCATTACT,human1_lib1.final_cell_0003,acinar,GSM2230757_human1_umifm_counts
human1_lib1.final_cell_0004,GATGTACACG-TTAAACTG,human1_lib1.final_cell_0004,acinar,GSM2230757_human1_umifm_counts
human1_lib1.final_cell_0005,GAGATTGCGA-GTCGTCGT,human1_lib1.final_cell_0005,acinar,GSM2230757_human1_umifm_counts


In [76]:
# count the number of cells contributed by each batch/sample identifier
Counter(human_adata.obs["batch_sample"])

Counter({'GSM2230757_human1_umifm_counts': 1937,
         'GSM2230758_human2_umifm_counts': 1724,
         'GSM2230759_human3_umifm_counts': 3605,
         'GSM2230760_human4_umifm_counts': 1303})

In [75]:
# count how many human cells carry each cell-type label
# NOTE(review): the human labels spell 't_cell' in lower case while the mouse labels use 'T_cell';
# harmonise the labels before comparing cell types across species
Counter(human_adata.obs["cell_types"])

Counter({'acinar': 958,
         'beta': 2525,
         'delta': 601,
         'activated_stellate': 284,
         'ductal': 1077,
         'alpha': 2326,
         'epsilon': 18,
         'gamma': 255,
         'endothelial': 252,
         'quiescent_stellate': 173,
         'macrophage': 55,
         'schwann': 13,
         'mast': 25,
         't_cell': 7})

In [77]:
# inspect the primary data matrix of human_adata: the gene expression counts (rows = cells; cols = genes);
# displaying it shows only the sparse-matrix summary, not the values
human_adata.X

<8569x20125 sparse matrix of type '<class 'numpy.float32'>'
	with 16171764 stored elements in Compressed Sparse Row format>

In [78]:
# since the primary data matrix is sparse, peek at its stored values instead:
# .data holds the stored (non-zero) entries, so this shows the first 10 of them
human_adata.X.data[:10]

array([4., 6., 1., 1., 2., 1., 1., 1., 1., 1.], dtype=float32)

In [28]:
# save the combined human AnnData object to the file specified by the path
# NOTE(review): this path sits under a different user directory (dor3 / C2S_Files_Daphne) than
# BASE_DIR (sr2464 / C2S_Files_Syed) — confirm it is the intended destination
human_h5ad_path = "/home/dor3/palmer_scratch/C2S_Files_Daphne/Cross_Species_Datasets/mouse_human_pancreas_tissue_Baron_et_al/processed_data/human_pancreas_all_samples_raw_adata.h5ad"
# create the destination folder first so the write does not fail on a missing directory
os.makedirs(os.path.dirname(human_h5ad_path), exist_ok=True)
human_adata.write_h5ad(human_h5ad_path)

# Old loading code for 1 sample

In [29]:
# read the raw mouse #1 count table; the first CSV column (cell IDs) becomes the index
fname = "GSM2230761_mouse1_umifm_counts"
mouse1_csv_path = os.path.join(BASE_DIR, f"{fname}.csv")
adata_mouse1_df = pd.read_csv(mouse1_csv_path, index_col=0)
# (rows, columns) of the raw table
adata_mouse1_df.shape

(822, 14880)

In [30]:
# preview the first five rows of the raw mouse #1 table (metadata columns followed by gene counts)
adata_mouse1_df.head()

Unnamed: 0,barcode,assigned_cluster,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010F05Rik,0610010K14Rik,0610011F06Rik,...,Zw10,Zwilch,Zwint,Zxdb,Zxdc,Zyg11b,Zyx,Zzef1,Zzz3,l7Rn6
mouse1_lib1.final_cell_0001,AAGTGAAAG-CGACTCCT,beta,0,0,0,0,0,2,0,1,...,0,0,1,0,0,0,0,1,0,1
mouse1_lib1.final_cell_0002,TGCAAGGG-GGCATGCT,ductal,2,0,0,0,0,1,0,1,...,0,0,1,0,0,0,1,0,0,1
mouse1_lib1.final_cell_0003,GAGAGCCCAT-CTTCTGGA,delta,0,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,2
mouse1_lib1.final_cell_0004,AAATCAGA-GTTGCACG,schwann,0,0,0,0,1,0,0,0,...,1,0,1,0,0,1,2,0,0,3
mouse1_lib1.final_cell_0005,GAGGCGTTAG-GGCTACTA,delta,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0


In [31]:
# read the raw human #1 count table; the first CSV column (cell IDs) becomes the index
# fname is reused further down as the batch label, so it stays a notebook-level variable
fname = "GSM2230757_human1_umifm_counts"
human1_csv_path = os.path.join(BASE_DIR, f"{fname}.csv")
adata_human1_df = pd.read_csv(human1_csv_path, index_col=0)
# (rows, columns) of the raw table
adata_human1_df.shape

(1937, 20127)

In [32]:
# preview the first five rows of the raw human #1 table (metadata columns followed by gene counts)
adata_human1_df.head()

Unnamed: 0,barcode,assigned_cluster,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AA06,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
human1_lib1.final_cell_0001,GATGACGGAC-GGTGGGAT,acinar,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,1
human1_lib1.final_cell_0002,GAGCGTTGCT-ACCTTCTT,acinar,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,4,0,1,0
human1_lib1.final_cell_0003,CTTACGGG-CCATTACT,acinar,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
human1_lib1.final_cell_0004,GATGTACACG-TTAAACTG,acinar,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,3,1,0,0
human1_lib1.final_cell_0005,GAGATTGCGA-GTCGTCGT,acinar,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [33]:
# per-cell barcodes from the raw human #1 table; preview the first three
cell_barcodes = adata_human1_df["barcode"].tolist()
cell_barcodes[:3]

['GATGACGGAC-GGTGGGAT', 'GAGCGTTGCT-ACCTTCTT', 'CTTACGGG-CCATTACT']

In [34]:
# per-cell type labels (the "assigned_cluster" column); preview the first three
cell_types = adata_human1_df["assigned_cluster"].tolist()
cell_types[:3]

['acinar', 'acinar', 'acinar']

In [35]:
# values of the trailing "pk" column, kept here as per-cell metadata; preview the first three
pk_list = adata_human1_df["pk"].tolist()
pk_list[:3]

[1, 0, 0]

In [36]:
# manual cell IDs are the index of the raw table; preview the first three
cell_manual_ids = adata_human1_df.index.tolist()
cell_manual_ids[:3]

['human1_lib1.final_cell_0001',
 'human1_lib1.final_cell_0002',
 'human1_lib1.final_cell_0003']

In [37]:
# label every cell with the sample it came from
# NOTE: relies on `fname` still holding the human #1 file name set in the cell that read adata_human1_df
batch_sample = [fname] * len(cell_manual_ids)

In [38]:
# gene counts only: skip the first two columns (barcode, assigned_cluster) and the last column (pk)
# NOTE: load_csv_into_anndata slices with 2: instead and therefore keeps `pk` as a gene column
gene_counts_df = adata_human1_df.iloc[:, 2:-1]
dense_counts = gene_counts_df.to_numpy().astype(np.float32)
# store as CSR sparse matrix for memory efficiency
cellxgene_matrix = scipy.sparse.csr_matrix(dense_counts)
# report shape, dtype and container type, one per line
for matrix_property in (cellxgene_matrix.shape, cellxgene_matrix.dtype, type(cellxgene_matrix)):
    print(matrix_property)

(1937, 20124)
float32
<class 'scipy.sparse._csr.csr_matrix'>


In [39]:
# print the first 10 stored values of the sparse matrix, then display its summary repr
print(cellxgene_matrix.data[:10])
cellxgene_matrix

[4. 6. 1. 1. 2. 1. 1. 1. 1. 1.]


<1937x20124 sparse matrix of type '<class 'numpy.float32'>'
	with 3726243 stored elements in Compressed Sparse Row format>

In [40]:
# preview only the gene-count columns: skips the first two columns (barcode, assigned_cluster) and the last one (pk)
adata_human1_df.iloc[:, 2:-1].head()

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AA06,AAAS,AACS,AACSP1,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
human1_lib1.final_cell_0001,0,4,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2,0,0
human1_lib1.final_cell_0002,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,1,4,0,1
human1_lib1.final_cell_0003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
human1_lib1.final_cell_0004,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,3,1,0
human1_lib1.final_cell_0005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [41]:
# gene names are the column labels between the two leading metadata columns and the trailing pk column
raw_gene_names = adata_human1_df.columns[2:-1].tolist()
print(len(raw_gene_names))
# upper-case every name for consistency, then preview the first three
gene_names = list(map(str.upper, raw_gene_names))
gene_names[:3]

20124


['A1BG', 'A1CF', 'A2M']

In [42]:
# obs (observation) annotations: one row per cell, indexed by the manual cell ID
obs_columns = {
    "cell_barcodes": cell_barcodes,
    "cell_manual_ids": cell_manual_ids,
    "cell_types": cell_types,
    "batch_sample": batch_sample,
    "pk_list": pk_list,
}
obs_df = pd.DataFrame(obs_columns, index=cell_manual_ids)

# var (variable) annotations: one row per gene, indexed by the gene name
var_df = pd.DataFrame({"gene_names": gene_names}, index=gene_names)

# assemble the AnnData object from the count matrix (X) and both annotation tables
adata = anndata.AnnData(X=cellxgene_matrix, obs=obs_df, var=var_df)

In [43]:
# display the summary of the manually assembled AnnData object (dimensions plus obs/var column names)
adata

AnnData object with n_obs × n_vars = 1937 × 20124
    obs: 'cell_barcodes', 'cell_manual_ids', 'cell_types', 'batch_sample', 'pk_list'
    var: 'gene_names'

In [44]:
# preview the first five rows of the cell (obs) annotations
adata.obs.head()

Unnamed: 0,cell_barcodes,cell_manual_ids,cell_types,batch_sample,pk_list
human1_lib1.final_cell_0001,GATGACGGAC-GGTGGGAT,human1_lib1.final_cell_0001,acinar,GSM2230757_human1_umifm_counts,1
human1_lib1.final_cell_0002,GAGCGTTGCT-ACCTTCTT,human1_lib1.final_cell_0002,acinar,GSM2230757_human1_umifm_counts,0
human1_lib1.final_cell_0003,CTTACGGG-CCATTACT,human1_lib1.final_cell_0003,acinar,GSM2230757_human1_umifm_counts,0
human1_lib1.final_cell_0004,GATGTACACG-TTAAACTG,human1_lib1.final_cell_0004,acinar,GSM2230757_human1_umifm_counts,0
human1_lib1.final_cell_0005,GAGATTGCGA-GTCGTCGT,human1_lib1.final_cell_0005,acinar,GSM2230757_human1_umifm_counts,1


In [45]:
# preview the first five rows of the gene (var) annotations
adata.var.head()

Unnamed: 0,gene_names
A1BG,A1BG
A1CF,A1CF
A2M,A2M
A2ML1,A2ML1
A4GALT,A4GALT


In [46]:
# check whether the .raw attribute of this AnnData is unset (True means nothing is stored there)
adata.raw is None

True

In [47]:
# read a raw human count table; the first CSV column (cell IDs) becomes the index
# NOTE(review): the variable is named adata_human2_df but the file read is the human4 sample
# (GSM2230760) — confirm which one is intended and align the variable name or the file name
adata_human2_df = pd.read_csv(
    os.path.join(BASE_DIR, "GSM2230760_human4_umifm_counts.csv"), 
    index_col=0
)
# (rows, columns) of the raw table
adata_human2_df.shape

(1303, 20127)

In [48]:
# preview the first five rows of the table loaded above (despite the variable name, it holds the human4 sample file)
adata_human2_df.head()

Unnamed: 0,barcode,assigned_cluster,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AA06,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
human4_lib1.final_cell_0001,AATATCTTC-AGTGAAAG,ductal,0,1,0,0,0,0,0,2,...,2,0,0,0,0,2,2,1,1,0
human4_lib1.final_cell_0002,AGGCAACG-GCATGGGT,delta,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
human4_lib1.final_cell_0003,AACGCAGAG-TTGTCGCC,delta,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
human4_lib1.final_cell_0004,CGGCTTAC-CGGGCTTT,ductal,0,0,0,0,2,0,0,0,...,0,0,0,0,0,1,4,0,1,0
human4_lib1.final_cell_0005,AAGCTACGG-TGTAGTTT,ductal,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,4,0,0,1
