### Concatenating and filtering the samples from all 10 donors

### Convert the combined matrix into a Seurat object

##### hpb29

Date: 2021-01-07

In [None]:
%matplotlib widget

import warnings
warnings.filterwarnings('ignore')

%load_ext rpy2.ipython

import os, sys, json, operator, getpass
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
import matplotlib.pyplot as plt
from ipywidgets import widgets

# Notebook-wide display settings: default matplotlib figure size and
# scanpy's logging verbosity level.
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3

In [None]:
# Load the label file found at the container's fixed location and display
# its 'Version' entry as the cell output.
labels_file = Path('/.singularity.d/labels.json')
singularity = json.loads(labels_file.read_text())

singularity['Version']

In [None]:
# Build the per-user project directory below the current user's home.
home = str(Path.home())
user = getpass.getuser()

basedir = os.path.join(home, 'databoard/users', user, '2020/SLX19841/')

# scanpy's write directory; a later cell derives the matrix output folder
# ('../matrices') from it.
sc.settings.writedir = os.path.join(basedir, 'analysis/h5ad/')

---

# DOD

In [None]:
# Dataset from which the cells labelled 'DOD1' and 'DOD2' are taken below.
# NOTE(review): the name has no file extension; presumably scanpy resolves it
# against sc.settings.writedir set above -- confirm.
dod1_2 = sc.read('SLX14831_12978_filtered_gene_bc_expression_minus_putative_doublets')

In [None]:
# Dataset from which the cells labelled 'KSP29' and 'KSP32' are taken below
# (they are relabelled 'DOD3' and 'DOD4' later on).
dod3_4 = sc.read('SLX19841_DOD_filtered_gene_bc_expression_minus_putative_doublets')

# LD

In [None]:
# Dataset from which the cells labelled 'TQ198', 'BP62j' and 'BP37d' are
# taken below.
ld3_4_5 = sc.read('SLX18808_filtered_gene_bc_expression_minus_putative_doublets')

In [None]:
# Dataset from which only the cells labelled 'BP74' are taken below.
# NOTE(review): the variable name suggests three donors, but a single donor
# label is selected from it -- confirm this is intended.
ld_6_7_8 = sc.read('SLX19286_filtered_gene_bc_expression_minus_putative_doublets' )

In [None]:
# Dataset from which the cells labelled 'BP1c' and 'BP59h' are taken below.
ld9_10 = sc.read('SLX19841_LD_filtered_gene_bc_expression_minus_putative_doublets')

---

Segregate data by donor

In [None]:
def _cells_of(adata, donor_label):
    """Return an independent copy of the cells whose `obs.donor` equals `donor_label`."""
    return adata[adata.obs.donor == donor_label].copy()


# One object per donor, numbered 1-10 in the order used for the rest of
# the notebook.
donor1 = _cells_of(dod1_2, 'DOD1')
donor2 = _cells_of(dod1_2, 'DOD2')

donor3 = _cells_of(dod3_4, 'KSP29')
donor4 = _cells_of(dod3_4, 'KSP32')

donor5 = _cells_of(ld3_4_5, 'TQ198')
donor6 = _cells_of(ld3_4_5, 'BP62j')
donor7 = _cells_of(ld3_4_5, 'BP37d')

donor8 = _cells_of(ld_6_7_8, 'BP74')

donor9 = _cells_of(ld9_10, 'BP1c')
donor10 = _cells_of(ld9_10, 'BP59h')

In [None]:
# Give the first four donors uniform 'DOD<n>' labels; the whole `donor`
# column of each object is overwritten with a single value. Donors 5-10
# keep the labels they were loaded with.
for dod_adata, dod_label in ((donor1, 'DOD1'),
                             (donor2, 'DOD2'),
                             (donor3, 'DOD3'),
                             (donor4, 'DOD4')):
    dod_adata.obs['donor'] = dod_label

In [None]:
# Rename the obs columns of donors 5-8 by position.
# NOTE(review): this assumes each obs table has exactly these five columns
# in this order -- confirm against the input files.
ld_obs_columns = ['batch', 'donor', 'library', 'organ', 'doublet_scores']
for ld_adata in (donor5, donor6, donor7, donor8):
    ld_adata.obs.columns = list(ld_obs_columns)

In [None]:
def adjust_index(adata, prefix):
    """Rename the cells of `adata` in place.

    Every label in `adata.obs.index` has each '-' replaced by '.' and
    `prefix` put in front of it.

    Parameters
    ----------
    adata : object exposing an `obs` table with an `index`
        Its `obs.index` is overwritten with the new labels.
    prefix : str
        Text prepended to every label.

    Returns
    -------
    None
    """
    adata.obs.index = [prefix + label.replace('-', '.')
                       for label in adata.obs.index]

In [None]:
# Prefix every cell name with its zero-padded donor number ('_01_' ... '_10_')
# so names stay distinct once all donors are merged.
donors_in_order = (donor1, donor2, donor3, donor4, donor5,
                   donor6, donor7, donor8, donor9, donor10)
for donor_number, donor_adata in enumerate(donors_in_order, start=1):
    adjust_index(donor_adata, '_%02d_' % donor_number)

In [None]:
# Merge the ten donors into one object. index_unique=None leaves the cell
# names as they are (they already carry a per-donor prefix).
other_donors = [donor2, donor3, donor4, donor5, donor6,
                donor7, donor8, donor9, donor10]
data = donor1.concatenate(*other_donors, index_unique=None)

In [None]:
# The doublet score column is not needed in the merged object.
data.obs.drop(columns=['doublet_scores'], inplace=True)

In [None]:
# Remove the 'feature_types-<n>' / 'genome-<n>' gene annotation columns
# for n = 2..9 from the merged object.
per_batch_var_columns = [
    '%s-%d' % (field, batch_number)
    for batch_number in range(2, 10)
    for field in ('feature_types', 'genome')
]
data.var.drop(per_batch_var_columns, axis=1, inplace=True)

In [None]:
%%time
# Plot scanpy's highest-expressed-genes summary for the merged data (top 20 genes).
sc.pl.highest_expr_genes(data, n_top=20)

In [None]:
# Keep a copy of the current matrix in a layer named 'counts'. No
# normalisation has been applied to `data` in this notebook up to this point.
data.layers['counts'] = data.X.copy()

In [None]:
# Per-cell total counts and their log10, stored as cell annotations.
# .A1 flattens the (n_cells, 1) matrix returned by the row sum to 1-D.
counts_per_cell = np.sum(data.X, axis=1).A1
data.obs['n_counts'] = counts_per_cell
data.obs['n_counts_log'] = np.log10(counts_per_cell)

In [None]:
# Genes whose symbol starts with "MT-". A plain prefix test replaces the
# regular expression: `re` is not imported anywhere in this notebook, so
# the regex version raised NameError on a fresh kernel.
mitoc_genes = np.array([name for name in data.var_names
                        if name.startswith('MT-')])
# for each cell compute fraction of counts in mito genes vs. all genes
# (.A1 flattens both row sums to 1-D, as done for n_counts, so a plain
# 1-D vector is stored in obs)
data.obs['percent_mitoc'] = (np.sum(data[:, mitoc_genes].X, axis=1).A1
                             / np.sum(data.X, axis=1).A1)

In [None]:
# Distribution of the three per-cell QC metrics, one panel each.
qc_metrics = ['n_counts', 'n_counts_log', 'percent_mitoc']
sc.pl.violin(data, qc_metrics, jitter=0.4, multi_panel=True)

In [None]:
# Violin plot of the mitochondrial fraction on its own, with scanpy's
# default plot settings.
sc.pl.violin(data,
             ['percent_mitoc'])

In [None]:
# Total counts per cell against mitochondrial fraction.
sc.pl.scatter(data, x='n_counts', y='percent_mitoc')

In [None]:
# Number of cells at or above the 0.2 mitochondrial-fraction cut-off
# (the filter further down keeps cells strictly below 0.20).
n_high_mito = sum(data.obs['percent_mitoc'] >= 0.2)
print("...filtering %d cells with 20 percent or more mitochondrial reads" % n_high_mito)

Filter:
- genes expressed in fewer than 3 cells
- cells whose log10 UMI count lies more than 3 standard deviations from the mean
- cells expressing fewer than 500 genes
- cells with 20% or more of their counts coming from mitochondrial genes

In [None]:
# Outlier bounds on log10 total counts: mean +/- 3 standard deviations.
log_counts = data.obs['n_counts_log']
mean_counts = np.mean(log_counts)
sd_counts = np.std(log_counts)

min_counts_filter = mean_counts - 3*sd_counts
max_counts_filter = mean_counts + 3*sd_counts

In [None]:
# True for cells whose log10 total counts lie within the bounds (inclusive).
log_counts = data.obs['n_counts_log']
counts_filter = (log_counts >= min_counts_filter) & (log_counts <= max_counts_filter)
n_outlying = data.X.shape[0] - np.sum(counts_filter)
print(n_outlying, 'cells with outlying counts to be filtered')

In [None]:
%%time
# Keep cells that pass both the mitochondrial-fraction cut-off and the
# count-outlier filter, then apply the per-cell gene and per-gene cell minimums.
passes_qc = (data.obs['percent_mitoc'] < 0.20) & counts_filter
data = data[passes_qc, :].copy()
sc.pp.filter_cells(data, min_genes=500)
sc.pp.filter_genes(data, min_cells=3)

In [None]:
# Size of the matrix after filtering (rows = cells, columns = genes).
nCells, nGenes = data.X.shape
print('Data has', nGenes, 'genes in', nCells, 'cells')

Exclude sex-linked genes

In [None]:
# Gene symbols removed from the expression matrix in the next cell.
# NOTE(review): the provenance of this list is not recorded in the notebook --
# add a reference for how it was derived.
sex_linked = ['RPS4Y1','NACA2','RPL10L','TIPIN','ZNF90','UQCRHL','DDX3Y','EIF1AY',
              'MIF-AS1','ATP5L2','GREM1','EDARADD','AC009501.4','NBEAL1','MTRNR2L12',
              'FKBP1C','AC090498.1','NHSL2','LRRC69','MTRNR2L8','HNRNPA1L2','PABPC3',
              'RP11-302B13.5','RP5-940J5.9','EIF5AL','XIST']

In [None]:
# Drop every gene named in `sex_linked`; the remaining genes keep their order.
not_sex_linked = ~data.var.index.isin(sex_linked)
data = data[:, not_sex_linked].copy()

In [None]:
%%time

# Dense genes x cells table of the filtered matrix (rows = gene names,
# columns = cell names). `data` is used directly: re-selecting every gene
# by name, as before, returned the same matrix.
udf = pd.DataFrame(data.X.toarray().T,
                   index=list(data.var.index),
                   columns=list(data.obs.index))

# Write it next to the h5ad folder. The target folder is created first so
# the export does not fail on a clean project tree.
matrix_dir = os.path.join(sc.settings.writedir, '..', 'matrices')
os.makedirs(matrix_dir, exist_ok=True)
udf.to_csv(os.path.join(matrix_dir,
                        'COMBO10_filtered_gene_x_cells_XFILTERED_matrix.txt.gz'),
           sep='\t', quotechar='"')

In [None]:
# Per-cell annotation table restricted to the donor and organ columns,
# copied so it is independent of `data.obs`.
meta = data.obs[['donor', 'organ']].copy()

In [None]:
# Tab-separated, written to the current working directory (no directory
# component, unlike the matrix); the R cell below reads it back by the same
# relative name.
meta.to_csv('COMBO10_filtered_metadata.txt', sep='\t')

### R

In [None]:
%%R

library(Seurat)

# Folder holding the exported matrix; also used later for the .rds output.
# NOTE(review): the Python cells write the matrix under
# '~/databoard/users/<user>/2020/SLX19841/analysis/matrices/', whereas this
# path reads '~/datafloor/users/2020/...' with no user component -- confirm
# both resolve to the same directory.
data.path = '~/datafloor/users/2020/SLX19841/analysis/matrices/'

# Read the gzipped genes x cells table. No header/sep arguments are given, so
# read.table's defaults (whitespace separator, automatic header detection)
# are relied upon.
combo.data <- read.table(file = gzfile( paste0(data.path, 
                                            "COMBO10_filtered_gene_x_cells_XFILTERED_matrix.txt.gz") ) )

# Replace first character (X) with nothing
# read.table (check.names = TRUE by default) prefixes an 'X' to column names
# that are not valid R names; the cell names written by the Python cells start
# with '_'. The pattern '.' matches any single character, so this removes
# whatever the first character is.
colnames(combo.data) <- sub('.', '', colnames(combo.data))

In [None]:
%%R

# Per-cell annotations written by the Python cells; the first column holds
# the cell names and becomes the row names.
metadata <- read.table('COMBO10_filtered_metadata.txt',
                       header = TRUE, sep = '\t', row.names = 1)

# Restrict both tables to the cells they have in common.
common <- intersect(colnames(combo.data), rownames(metadata))

data <- CreateSeuratObject(counts = combo.data[, common],
                           meta.data = metadata[common, ])

# QC showed donor 3 SPL sample to be contaminated, thus we exclude it henceforth
is.contaminated <- data@meta.data$donor == 'DOD3' & data@meta.data$organ == 'SPL'
data <- data[, !is.contaminated]

saveRDS(data, file = paste0(data.path, 'COMBO10_NO_SPL3_filtered_counts_Seurat3_obj.rds'))