In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import subprocess

import numpy as np
import pandas as pd
from scipy import io 
import csv

# Preprocess the liver organoid data (DesLO)

Here we will preprocess the liver organoid data (DesLO) and obtain the filtered and normalized expression, combined from multiple time points. We format the output expression data in the CSHMM input file format:
    * The 1st row is the header. 
    * Each row is a cell. 
        * The 1st column is the name of the cell. The 2nd column is the sampled time of the cell. The 3rd column is the label (cell type) of the cell. All the other columns are the gene expression.
        
The inputs required are the original count matrices for the scRNA-seq data and can be downloaded from the repository GSE159491 as we noted in the paper. These files have the following structure:
    * sample name (e.g. mmB_DesLO_D17)
        * barcodes.tsv  
        * genes.tsv
        * matrix.mtx

You also need the following files from our GitHub website:
    * The R script read_10x_save_csv.R, since the data are first preprocessed in R (Step 1).
    * .csv files holding the Seurat annotations for each time point, e.g. DesLO_D17_Cluster.csv. There are three of them, one per time point. You can find them in the tutorials/annotations folder.

## Step 1: use Seurat in R to filter and normalize the expression data for each time point

    1. In R, run read_10x_save_csv.R to filter low-quality genes and cells, normalize the data, and write the expression data in .csv format.
        * The R packages Seurat (3.2.2), dplyr (1.0.2) and data.table (1.13.2) are required for this step.
        * The output files are saved in .csv format, e.g. mmB_DesLO_D17.csv. After this step, you should have 3 .csv files: mmB_D5.csv, mmB_DesLO_D11.csv and mmB_DesLO_D17.csv.

## Step 2: assign annotations and combine data from multiple time points 

### 2.1 read in data from each time point and assign annotations to them

In [2]:
# --- switches -------------------------------------------------------------
# When True, the loading loop reads a "<dataname>_Cluster.csv" annotation file
# and keeps only annotated cells.
label_known = True
# When True, sampled days are replaced by the ordinal values in `time2order`.
convert2order = True

# --- locations / file naming ----------------------------------------------
# Root directory; "input/expression/" is looked up underneath it.
parent_path = "~/"
# Extra suffix inserted before ".csv" in the expression file names.
condition = ""
# Gene table; left as None so it is read from disk on first use.
features = None

# --- time points ------------------------------------------------------------
# Experiment prefix -> sampled days belonging to that experiment.
dict_times = {
    '': [5],
    'DesLO_': [11, 17],
}

# Data set name -> ordinal time used when `convert2order` is True.
time2order = {
    'D5': 1,
    'DesLO_D11': 3,
    'DesLO_D17': 4,
}

# Experiment prefixes, in processing order.
experiments = ['', 'DesLO_']

In [3]:
# Machine-specific data root. Set the DESLO_PARENT_PATH environment variable
# to point at your own copy of the data; the original location is kept as the
# default so existing runs are unaffected.
parent_path = os.environ.get(
    "DESLO_PARENT_PATH",
    "/mnt/5eaf9992-fb1d-44ee-8ab5-a16d8b70d7d5/archive/research/cshmm-before-July21/DesLO/temp",
)

In [6]:
# Per-data-set accumulators, filled by the loading loop (one entry per time point):
#   name_list  - original IDs associated with cells
#   id_list    - generated cell IDs
#   time_list  - sampled time of each cell
#   label_list - cell-type label of each cell
#   exp_list   - expression matrices
name_list, id_list, time_list, label_list, exp_list = ([] for _ in range(5))

In [7]:
# Load every time point, attach annotations, and collect per-data-set
# expression / meta data into name_list, id_list, time_list, label_list and
# exp_list (one entry per time point, in processing order).
for experiment in experiments:
    times = dict_times[experiment]

    for time in times:
        dataname = experiment + 'D' + str(time)
        folder = "mmB_" + dataname

        # Expression table for this time point: rows are genes, columns are
        # cells (see n_gene / n_cells below).
        suffix = condition
        filename = "mmB_" + dataname + suffix + ".csv"
        _path = os.path.join(parent_path, "input/expression/")
        expression = pd.read_csv(os.path.join(_path, filename))

        # read in gene names once
        if features is None:
            _path = os.path.join(parent_path, "input/expression/raw")
            try:
                feature_file = os.path.join(_path, folder, "genes.tsv")
                features = pd.read_csv(feature_file, header=None)
            except IOError:
                # genes.tsv not readable: try the alternative file name
                feature_file = os.path.join(_path, folder, "features.tsv")
                features = pd.read_csv(feature_file, header=None)

            # read_csv splits on commas by default, so each tab-separated row
            # stays one string here; it is split on '\t' where it is used.
            features = features.iloc[:, 0].values  # turn to numpy

        if 'Expression' in features[0]:  # remove Gene Expression tag
            features = np.array(['\t'.join(l.split('\t')[:2]) for l in features])

        # prepare labels
        if label_known:
            # read labels
            _path = os.path.join(parent_path, "input/expression/")
            suffix = "Cluster.csv"
            filename = '_'.join([dataname, suffix])
            labels = pd.read_csv(os.path.join(_path, filename))
            labels.columns = ['name', 'label']
            if '-' in labels['name'][0]:
                labels['name'] = [c.split('-')[0] for c in labels['name']]

            # get cell names for the expression
            cell_names = expression.columns.values
            cell_names = np.array([c.split('-') for c in cell_names])
            all_names = pd.DataFrame(cell_names)
            all_names['index'] = all_names.index
            all_names.columns = ['name', 'batch', 'index']
            assert len(set(all_names['batch'])) == 1  # all same batch

            # only use the cells overlapped in both the expression and the annotation file
            merged = pd.merge(all_names, labels, on='name')
            id_selected = merged['name'] + '-' + merged['batch']

            expression = expression[id_selected]

            # same ordering in expression as in meta/label file
            assert all(expression.columns.values == id_selected)

            label_subset = merged['label'].values
        else:
            # No annotation: keep every cell and give it a NaN label.
            # The cell count is taken from the expression table itself because
            # n_cells is only assigned further down.
            merged = None
            id_selected = pd.Series(expression.columns.values)
            label_subset = list(np.full(expression.shape[1], np.nan))

        n_gene = expression.shape[0]
        n_cells = expression.shape[1]

        # prepare meta data
        id_subset = [dataname + '_' + str(i) for i in range(n_cells)]
        if convert2order:
            time_subset = np.repeat(time2order[dataname], n_cells)
        else:
            time_subset = np.repeat(time, n_cells)

        # use WT cells in D5 as D0 (needs labels to identify the WT cells)
        if time == 5 and label_known:
            # Rows of `merged` line up one-to-one with the columns of the
            # subset `expression` (asserted above), so positions are taken
            # from `merged` itself. merged['index'] holds positions in the
            # *unfiltered* table and is wrong once any cell has been dropped.
            idx_d0 = np.flatnonzero((merged['label'] == 'WT').values)

            # mark as D0 instead of D5
            time_subset[idx_d0] = 0

        # cells x genes
        temp = expression.T

        name_list.append(id_selected)
        id_list.append(id_subset)
        time_list.append(list(time_subset))
        label_list.append(label_subset)
        exp_list.append(temp.values)  # list of numpy arrays

### 2.2 process feature names

In [8]:
# Number of expression matrices collected so far (one is appended per time point).
n_datasets = len(exp_list)

In [9]:
# take only gene names; if same gene name then attach with code 
# Each feature entry is "<code>\t<name>"; keep the gene name and, when a name
# has already been seen, make it unique by appending "_<code>".
split_features = [f.split('\t') for f in features]
gene_names = [parts[1] for parts in split_features]
gene_codes = [parts[0] for parts in split_features]

# Show how often the repeated gene names occur.
unique, counts = np.unique(np.array(gene_names), return_counts=True)
print(counts[counts > 1])

gList = []
g_D = {}
for g, c in zip(gene_names, gene_codes):
    first_seen = g not in g_D
    if first_seen:
        g_D[g] = 1
    gList.append(g if first_seen else g + '_' + c)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [10]:
# De-duplicated gene names as an array, so they can be indexed with a boolean mask.
gene_names = np.array(gList)
# Alias of exp_list (same list object); no per-data-set gene filtering is applied here.
exp_list_common = exp_list

### 2.3 combine datasets  and save output
1. remove lowly expressed genes 
2. assemble into CSHMM / SCDIFF format

In [14]:
def flatten(l):
    """Concatenate the items of every sub-sequence of `l` into one flat list."""
    return [item for sublist in l for item in sublist]


def combine_meta_save(subsets, df, nz_genes):
    """Attach ID / time / label columns to an expression frame.

    Parameters
    ----------
    subsets : sequence of int
        Positions into the module-level ``id_list``, ``time_list`` and
        ``label_list`` selecting which data sets the rows of ``df`` come from.
    df : pandas.DataFrame
        Expression values with one column per entry of ``nz_genes``.
        It is modified in place: the three meta columns are appended and all
        columns are renamed.
    nz_genes : sequence of str
        Names for the expression columns of ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with columns ordered as ID, time, label, then the genes.
        Nothing is written to disk here.
    """
    meta_cols = ['ID', 'time', 'label']

    # Gather all meta data first, then attach it to the frame.
    meta_values = [
        [entry for i in subsets for entry in per_dataset[i]]
        for per_dataset in (id_list, time_list, label_list)
    ]
    for col, values in zip(meta_cols, meta_values):
        df[col] = values

    gene_cols = list(nz_genes)
    df.columns = gene_cols + meta_cols

    # Meta columns first, genes afterwards.
    return df[meta_cols + gene_cols]

In [15]:
# Output data sets to build; each name becomes "<name>.txt".
name_subsets = ['DesLO']
# Output name -> positions in exp_list / id_list / time_list / label_list to combine
# (entries are appended in processing order: D5, DesLO_D11, DesLO_D17).
dict_subsets = {'DesLO': [0,1,2]}

In [16]:
# Stack the selected data sets, drop genes with zero total expression, attach
# the meta data and write one tab-separated file per output name.
for name in name_subsets:
    subsets = dict_subsets[name]

    # cells x genes, all selected time points stacked row-wise
    _exp = np.concatenate(list(exp_list_common[i] for i in subsets))

    # gene_names must label the columns of the stacked matrix
    assert _exp.shape[1] == len(gene_names), (
        "expression has %d genes but %d gene names were loaded"
        % (_exp.shape[1], len(gene_names))
    )

    # remove 0 count genes: a single boolean mask selects both the names and
    # the matching columns, so the two cannot get out of step (and it avoids a
    # per-gene search through the full gene list).
    _sum = _exp.sum(axis=0)
    nz_mask = _sum != 0
    zero_genes = gene_names[~nz_mask]
    nz_genes = list(gene_names[nz_mask])
    nz_genes_idx = np.flatnonzero(nz_mask)

    # filter expression
    _exp = pd.DataFrame(_exp[:, nz_mask])

    filename = ''.join([name, '.txt'])
    df = combine_meta_save(subsets, _exp, nz_genes)

    # remove other day 5 cells: time value 1 is the ordinal assigned to D5;
    # the D5 cells labelled WT were re-assigned to time 0 and are kept.
    df = df[df['time'] != 1]
    df.to_csv(os.path.join(parent_path, filename), sep='\t', index=False)