In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import re
import os
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

Each sample undergoes minimal pre-processing before the samples are pooled together.

In [2]:
# Inspect the data directory to see which sample files are available
os.listdir("../data/")

['GSM3377675_sample3h.csv',
 'GSM3377682_sample4m.csv',
 'GSM3377678_sample6h.csv',
 'GSM3377679_sample1m.csv',
 'GSM3377681_sample3m.csv',
 'GSE119562_RAW.tar',
 'GSM3377676_sample4h.csv',
 'pooled_data',
 'GSM3377677_sample5h.csv',
 'GSM3377680_sample2m.csv',
 'GSM3377684_sample6m.csv',
 'mouse_pooled.h5ad',
 'GSM3377683_sample5m.csv',
 'GSM3377673_sample1h.csv',
 'mouse_all_pooled.h5ad',
 'GSM3377674_sample2h.csv']

We aim to create 3 different datasets for our analysis.

- Mouse samples for both spleen and blood (every sample whose file name contains 'm.')
- Spleen mouse samples (1m, 2m, 3m)
- Blood mouse samples (4m, 5m, 6m)

In [3]:
def pooling_preprocess(sample, human=False, tissuewise=False, data_path="../data/"):
    """Load one sample's count matrix and apply minimal QC before pooling.

    The CSV is read with ``sc.read_csv`` and transposed, a ``sample`` column
    is added to ``adata.obs``, and genes and cells are filtered with fixed
    thresholds (genes seen in fewer than 10 cells and cells with fewer than
    100 genes are dropped).

    Parameters
    ----------
    sample : str
        File name of the sample inside ``data_path``, e.g.
        ``"GSM3377679_sample1m.csv"``. Characters 6-7 of the part after the
        first underscore (``"1m"`` in the example) are used as the sample ID.
    human : bool, default False
        If True, additionally flag mitochondrial genes (``MT-`` prefix) and
        ribosomal genes (KEGG_RIBOSOME gene set), compute QC metrics, and keep
        only cells whose mitochondrial and ribosomal percentages are below
        ``median + 5 * std``. The gene set is downloaded on every call, so
        network access is required.
    tissuewise : bool, default False
        If True, ``adata.obs['sample']`` is ``'spleen'`` for sample IDs 1-3
        and ``'blood'`` for every other ID; if False it holds the sample ID.
    data_path : str, default "../data/"
        Directory that contains the sample CSV files.

    Returns
    -------
    AnnData
        The filtered dataset. In the ``human`` branch an independent copy is
        returned rather than a view of the unfiltered object.
    """
    adata = sc.read_csv(os.path.join(data_path, sample)).T
    # e.g. "GSM3377679_sample1m.csv" -> "sample1m.csv" -> "1m"
    sample_id = sample.split("_")[1][6:8]

    # add a new feature showing the origin of the sample
    if tissuewise:
        spleen_ids = {'1m', '2m', '3m', '1h', '2h', '3h'}
        adata.obs['sample'] = 'spleen' if sample_id in spleen_ids else 'blood'
    else:
        adata.obs['sample'] = sample_id

    sc.pp.filter_genes(adata, min_cells=10)
    sc.pp.filter_cells(adata, min_genes=100)

    if human:

        # mitochondrial genes
        adata.var["mt"] = adata.var_names.str.startswith("MT-")

        # ribosomal genes (fetched from MSigDB; the first two lines of the
        # file are skipped before the gene names are read)
        ribo_url = "http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=KEGG_RIBOSOME&fileType=txt"
        ribo_genes = pd.read_table(ribo_url, skiprows=2, header=None)
        adata.var['ribo'] = adata.var_names.isin(ribo_genes[0].values)

        sc.pp.calculate_qc_metrics(adata=adata, qc_vars=["mt", "ribo"], percent_top=None, log1p=False, inplace=True)

        # Drop cells with an outlying mitochondrial fraction. Subsetting an
        # AnnData yields a view, so copy to hand back a standalone object.
        mt_pct = adata.obs.pct_counts_mt.values
        upper_lim_mt = np.median(mt_pct) + 5 * np.std(mt_pct)
        adata = adata[mt_pct < upper_lim_mt].copy()

        # Same rule for the ribosomal fraction, computed on the cells that
        # survived the mitochondrial filter.
        rb_pct = adata.obs.pct_counts_ribo.values
        upper_lim_rb = np.median(rb_pct) + 5 * np.std(rb_pct)
        adata = adata[rb_pct < upper_lim_rb].copy()

    return adata

**Pooling of mouse samples by donor**

During this process, a new column named `sample` is added to `adata.obs`. It is filled with the ID of the mouse each sample originates from (1m, 2m, 3m, 4m, 5m, 6m).

In [4]:
# Pool every mouse sample, labelling each cell with its donor ID.
# os.listdir returns names in arbitrary order, so the file list is sorted to
# make the row order of the pooled object (and the suffixes added by
# obs_names_make_unique) reproducible. Matching on the "m.csv" suffix selects
# only the mouse CSV files.
mouse_csv_all = sorted(
    fname for fname in os.listdir("../data/") if fname.endswith("m.csv")
)

mouse_samples_all = []

for sample in mouse_csv_all:
    adata = pooling_preprocess(sample)
    mouse_samples_all.append(adata)

mouse_adata_all = sc.concat(mouse_samples_all)
mouse_adata_all.obs_names_make_unique()
# Export the sample
mouse_adata_all.write_h5ad("../data/pooled_data/mouse_donorwise_pooled.h5ad")

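**Pooling of mouse samples by tissue**

Here the `sample` column records the tissue of origin (spleen or blood) rather than the individual mouse.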

In [5]:
# Pool every mouse sample, labelling each cell with its tissue of origin.
# os.listdir returns names in arbitrary order, so the file list is sorted to
# make the row order of the pooled object (and the suffixes added by
# obs_names_make_unique) reproducible. Matching on the "m.csv" suffix selects
# only the mouse CSV files.
mouse_csv = sorted(
    fname for fname in os.listdir("../data/") if fname.endswith("m.csv")
)

mouse_samples = []

for sample in mouse_csv:
    adata = pooling_preprocess(sample, tissuewise=True)
    mouse_samples.append(adata)

mouse_adata = sc.concat(mouse_samples)

print("This is the pooled dataset for mouse, there is a column specifing whether the sample originates from spleen or blood")
print(mouse_adata)

mouse_adata.obs_names_make_unique()

# Export the sample
mouse_adata.write_h5ad("../data/pooled_data/mouse_tissuewise_pooled.h5ad")

This is the pooled dataset for mouse, there is a column specifing whether the sample originates from spleen or blood
AnnData object with n_obs × n_vars = 8215 × 7760
    obs: 'sample', 'n_genes'


**Pool of blood samples from mice**

In [6]:
# Blood samples are mouse samples 4-6; select their files by name
blood_pattern = re.compile(r'[456][m]')
blood_csv = list(filter(blood_pattern.search, os.listdir("../data/")))
blood_csv

['GSM3377682_sample4m.csv',
 'GSM3377684_sample6m.csv',
 'GSM3377683_sample5m.csv']

In [7]:
# Pre-process each blood sample separately (cells are labelled by donor ID)
blood_samples = [pooling_preprocess(csv_name) for csv_name in blood_csv]

print("Individual datasets before integration")
print(blood_samples)

# Concatenate the samples and make the cell names unique
blood_adata = sc.concat(blood_samples)
blood_adata.obs_names_make_unique()
print(blood_adata)

# Export the sample
blood_adata.write_h5ad("../data/pooled_data/blood_mouse.h5ad")

Individual datasets before integration
[AnnData object with n_obs × n_vars = 1109 × 8023
    obs: 'sample', 'n_genes'
    var: 'n_cells', AnnData object with n_obs × n_vars = 1610 × 9205
    obs: 'sample', 'n_genes'
    var: 'n_cells', AnnData object with n_obs × n_vars = 1223 × 9136
    obs: 'sample', 'n_genes'
    var: 'n_cells']
AnnData object with n_obs × n_vars = 3942 × 7935
    obs: 'sample', 'n_genes'


**Pool spleen samples from mice**

In [8]:
# Spleen samples are mouse samples 1-3; select their files by name
mouse_spleen_pattern = re.compile(r'[123][m]')
mouse_spleen = list(filter(mouse_spleen_pattern.search, os.listdir("../data/")))
mouse_spleen

['GSM3377679_sample1m.csv',
 'GSM3377681_sample3m.csv',
 'GSM3377680_sample2m.csv']

In [9]:
# Pre-process each spleen sample separately (cells are labelled by donor ID)
mouse_sp_samples = [pooling_preprocess(csv_name) for csv_name in mouse_spleen]

print("Individual datasets before integration")
print(mouse_sp_samples)

# Concatenate the samples into a single object
mouse_spleen_adata = sc.concat(mouse_sp_samples)
print("Adata after integration")
print(mouse_spleen_adata)

# Make the cell names unique, then export the pooled spleen data
mouse_spleen_adata.obs_names_make_unique()

mouse_spleen_adata.write_h5ad("../data/pooled_data/spleen_mouse.h5ad")

Individual datasets before integration
[AnnData object with n_obs × n_vars = 1759 × 8870
    obs: 'sample', 'n_genes'
    var: 'n_cells', AnnData object with n_obs × n_vars = 1081 × 9202
    obs: 'sample', 'n_genes'
    var: 'n_cells', AnnData object with n_obs × n_vars = 1433 × 9148
    obs: 'sample', 'n_genes'
    var: 'n_cells']
Adata after integration
AnnData object with n_obs × n_vars = 4273 × 8521
    obs: 'sample', 'n_genes'
