# General import functions

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv

import scanpy as sc
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)

from igraph import *
from MulticoreTSNE import MulticoreTSNE as TSNE #faster TSNE alternative
from anndata import read_h5ad
from anndata import read_csv
import anndata

sc.logging.print_versions()

  from pandas.core.index import RangeIndex


scanpy==1.4.5.1 anndata==0.6.22 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==1.0.1 scikit-learn==0.22.2.post1 statsmodels==0.11.1 python-igraph==0.7.1 louvain==0.6.1


In [2]:
pwd

'/home/angela/sapiens/tabula-sapiens/analysis-scripts/pilot2'

# Load the data

List all the data available

In [3]:
path = '/mnt/ibm_lg/angela/sapiens/TSP2/10X/'

In [4]:
import os
pilot2_runs = [x[1] for x in os.walk(path)][0]
pilot2_runs

['TSP2_Blood_NA_1_3',
 'TSP2_Lung_proxmedialdistal_1_1',
 'TSP2_LI_proximal_1_1',
 'TSP2_Thymus_NA_1_3_5prime',
 'TSP2_Spleen_NA_2_1',
 'TSP2_SI_proximal_1_1',
 'TSP2_Kidney_NA_1_2',
 'TSP2_Thymus_NA_1_2',
 'TSP2_Blood_NA_2_1',
 'TSP2_Bladder_NA_1_1',
 'TSP2_Bladder_NA_1_2',
 'TSP2_Spleen_NA_1_1',
 'TSP2_BM_vertebralbody_2_1',
 'TSP2_SI_distal_1_1',
 'TSP2_Trachea_NA_1_2',
 'TSP2_Muscle_diaphragm_1_2',
 'TSP2_Heart_ventricle_1_1',
 'TSP2_Thymus_NA_1_4_5prime',
 'TSP2_BM_vertebralbody_1_1',
 'TSP2_Muscle_rectusabdominus_1_1',
 'TSP2_Blood_NA_1_5_5prime',
 'TSP2_Kidney_NA_1_1',
 'TSP2_Vasculature_Aorta_1_2',
 'TSP2_BM_vertebralbody_1_2_5prime',
 'TSP2_BM_vertebralbody_2_2_5prime',
 'TSP2_Vasculature_Aorta_1_1',
 'TSP2_Lung_proxmedialdistal_1_2',
 'TSP2_LymphNode_NA_1_1',
 'TSP2_LI_distal_1_1',
 'TSP2_Trachea_NA_1_1',
 'TSP2_Thymus_NA_1_1',
 'TSP2_Muscle_diaphragm_1_1',
 'TSP2_Muscle_rectusabdominus_1_2',
 'TSP2_LymphNode_NA_2_1']

In [5]:
adata = sc.AnnData()

for r in pilot2_runs:
    
    print(r)
    adataaux = sc.read_10x_mtx(path+r+'/raw_feature_bc_matrix/',cache=True)
    
    bfcmingenes = adataaux.shape[0]
    sc.pp.filter_cells(adataaux, min_genes=100)
    adataaux.obs['filter_cells_min_genes'] = bfcmingenes-adataaux.shape[0] #no droplets lost
    bfcmincounts = adataaux.shape[0]
    sc.pp.filter_cells(adataaux, min_counts=1000)
    adataaux.obs['filter_cells_min_counts'] = bfcmincounts-adataaux.shape[0] #no droplets lost
#     bfgmincells = adataaux.shape[0]
#     sc.pp.filter_genes(adataaux, min_cells=1)
#     adataaux.obs['filter_genes_min_cells'] = bfgmincells-adataaux.shape[0] #no genes lost
    adataaux.obs['pilot'] = r.split('_')[0]  
    adataaux.obs['tissue'] = r.split('_')[1]  
    adataaux.obs['subtissue'] = r.split('_')[2]  
    adataaux.obs['sample'] = r.split('_')[3]  
    adataaux.obs['replicate'] = r.split('_')[4]
   
    try:
        adataaux.obs['notes'] = r.split('_')[5]
    except:
        print("no notes to add")
    
    try:
        adata = adata.concatenate(adataaux)
        adata.obs = adata.obs.drop('batch',axis=1)
    except:
        adata = adataaux.copy()

adata.write(path+'tsp2_10X_raw.h5ad')
adata

TSP2_Blood_NA_1_3
no notes to add
TSP2_Lung_proxmedialdistal_1_1
no notes to add
TSP2_LI_proximal_1_1
no notes to add
TSP2_Thymus_NA_1_3_5prime
TSP2_Spleen_NA_2_1
no notes to add
TSP2_SI_proximal_1_1
no notes to add
TSP2_Kidney_NA_1_2
no notes to add
TSP2_Thymus_NA_1_2
no notes to add
TSP2_Blood_NA_2_1
no notes to add
TSP2_Bladder_NA_1_1
no notes to add
TSP2_Bladder_NA_1_2
no notes to add
TSP2_Spleen_NA_1_1
no notes to add
TSP2_BM_vertebralbody_2_1
no notes to add
TSP2_SI_distal_1_1
no notes to add
TSP2_Trachea_NA_1_2
no notes to add
TSP2_Muscle_diaphragm_1_2
no notes to add
TSP2_Heart_ventricle_1_1
no notes to add
TSP2_Thymus_NA_1_4_5prime
TSP2_BM_vertebralbody_1_1
no notes to add
TSP2_Muscle_rectusabdominus_1_1
no notes to add
TSP2_Blood_NA_1_5_5prime
TSP2_Kidney_NA_1_1
no notes to add
TSP2_Vasculature_Aorta_1_2
no notes to add
TSP2_BM_vertebralbody_1_2_5prime
TSP2_BM_vertebralbody_2_2_5prime
TSP2_Vasculature_Aorta_1_1
no notes to add
TSP2_Lung_proxmedialdistal_1_2
no notes to add
TS

... storing 'notes' as categorical
... storing 'pilot' as categorical
... storing 'replicate' as categorical
... storing 'sample' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'feature_types' as categorical


AnnData object with n_obs × n_vars = 138171 × 58870 
    obs: 'filter_cells_min_counts', 'filter_cells_min_genes', 'n_counts', 'n_genes', 'notes', 'pilot', 'replicate', 'sample', 'subtissue', 'tissue'
    var: 'gene_ids', 'feature_types'

In [6]:
adata.obs.head()

Unnamed: 0,filter_cells_min_counts,filter_cells_min_genes,n_counts,n_genes,notes,pilot,replicate,sample,subtissue,tissue
AAACCCAAGCATCAAA-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,12795,6776774,6855.0,2108,,TSP2,3,1,,Blood
AAACCCAGTATTGGCT-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,12795,6776774,4840.0,1667,,TSP2,3,1,,Blood
AAACCCATCAACTCTT-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,12795,6776774,1304.0,550,,TSP2,3,1,,Blood
AAACCCATCTAGTTCT-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,12795,6776774,13258.0,300,,TSP2,3,1,,Blood
AAACCCATCTGACCCT-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,12795,6776774,8011.0,2222,,TSP2,3,1,,Blood


In [7]:
adata.var.head()

Unnamed: 0,gene_ids,feature_types
DDX11L1,ENSG00000223972.5,Gene Expression
WASH7P,ENSG00000227232.5,Gene Expression
MIR6859-1,ENSG00000278267.1,Gene Expression
MIR1302-2HG,ENSG00000243485.5,Gene Expression
MIR1302-2,ENSG00000284332.1,Gene Expression


# Lazy filter the data

In [None]:
sc.pl.highest_expr_genes(adata, n_top=20)


In [None]:
mito_genes = adata.var_names.str.startswith('MT-')
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [None]:
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],
             jitter=0.4, multi_panel=True)

Filter out cells with less than 200 genes detected and genes expressed in less than 3 cells

In [None]:
sc.pl.scatter(adata, x='n_counts', y='percent_mito')
sc.pl.scatter(adata, x='n_counts', y='n_genes')

In [None]:
adata.obs

In [None]:
df = adata.obs.copy()
df['tissue'] = df['tissue'].astype(str)
df['subtissue'] = df['subtissue'].astype(str)
df = pd.DataFrame(df.groupby(['tissue','subtissue','sample','replicate'])['pilot'].count())
df = df.rename({'pilot':'n_cells'},axis=1)
df = df.reset_index()
ax = sns.swarmplot(data = df,y='tissue',x='n_cells')#,kind='swarm')
ax = sns.barplot(data = df,y='tissue',x='n_cells')#,kind='bar')